In [1]:
import urllib3
import json
import pandas as pd
from tqdm import tqdm

# Let pandas display up to 4000 rows and 100 columns before truncating output.
pd.set_option('display.max_rows', 4000)
pd.set_option('display.max_columns', 100)


# NOTE(review): this suppresses every warning for the rest of the notebook,
# including pandas warnings about chained assignment — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Raw articles as stored on disk; kept untouched in `df1`.
df1 = pd.read_json('/fastcampus-data/articles/articels_only_contents.json')

# Working frame: title and tag-stripped body joined into one text column,
# reduced to the three columns used by the rest of the notebook.
df = (
    df1
    .assign(full_content=df1['title'] + ' ' + df1['content_tag_removed'])
    .loc[:, ['id', 'title', 'full_content']]
    .reset_index(drop=True)
)

In [3]:
# The ETRI access key is read from the environment so that no credential is
# stored in the notebook. Export ETRI_API_KEY before starting the kernel.
# NOTE(review): keys that were previously committed here should be revoked.
import os

api_key = os.environ.get("ETRI_API_KEY", "")
if not api_key:
    # Printed rather than warned: warnings are globally silenced in this notebook.
    print("WARNING: ETRI_API_KEY is not set; ETRI API requests cannot be authenticated.")

In [4]:
# Language analysis endpoint: choose exactly one of written / spoken language.
# Language analysis (written language)
openApiURL = "http://aiopen.etri.re.kr:8000/WiseNLU" 
# Language analysis (spoken language)
# openApiURL = "http://aiopen.etri.re.kr:8000/WiseNLU_spoken"

In [5]:
class ETRISentenceAnalysis:
  """Small HTTP client for the ETRI language-analysis API.

  Builds the JSON request body, POSTs it to the configured URL and returns
  the ``return_object`` field of the decoded JSON response.
  """

  def __init__(self, api_key: str, url: str):
    """Store the access key and endpoint and create a connection pool.

    Args:
      api_key: Access key sent as ``access_key`` in every request.
      url: Endpoint every request is POSTed to.
    """
    self.api_key = api_key
    self.url = url

    self.http = urllib3.PoolManager()

  def _make_request_json(self, text: str, analysis_code: str) -> dict:
    """Return the request payload for ``text`` and ``analysis_code``."""
    return {
      "access_key": self.api_key,
      "argument": {
        "text": text,
        "analysis_code": analysis_code,
      },
    }

  def _request(self, request_json: dict) -> object:
    """POST ``request_json`` as JSON and return the raw HTTP response."""
    # Always target the endpoint this instance was constructed with rather
    # than a notebook-level global, so the ``url`` argument is honoured.
    return self.http.request(
      "POST",
      self.url,
      headers={"Content-Type": "application/json; charset=UTF-8"},
      body=json.dumps(request_json),
    )

  @staticmethod
  def _get_return_object(response) -> dict:
    """Decode the UTF-8 JSON body of ``response`` and unwrap ``return_object``.

    Returns ``None`` when the decoded body has no ``return_object`` key.
    """
    data = json.loads(str(response.data, 'utf-8'))

    # Strip the outer dictionary layer.
    return data.get('return_object')

  def get_analyzed_sentence(self, sentence: str, analysis_code: str) -> dict:
    """Send ``sentence`` for analysis and return the API's ``return_object``.

    Args:
      sentence: Text to analyse.
      analysis_code: Analysis code placed in the request (e.g. ``'morp'``).

    Returns:
      The ``return_object`` of the response, or ``None`` if it is absent.
      The HTTP status code is not checked.
    """
    request_json = self._make_request_json(sentence, analysis_code)
    response = self._request(request_json)

    return self._get_return_object(response)

def get_dependency_from_text(data: dict) -> dict:
    """Group the dependency entries of each analysed sentence into subjects and objects.

    Args:
        data: Mapping whose ``'sentence'`` entry is a list of sentence dicts,
            each read for its ``'text'`` and ``'dependency'`` entries.

    Returns:
        ``{index: {'text': ..., 'dependency': {'subject': [...], 'object': [...]}}}``
        where subjects are entries labelled ``NP`` or ``NP_SBJ`` and objects
        are entries labelled ``NP_OBJ``.
    """
    subject_labels = ('NP', 'NP_SBJ')
    result = {}

    # Strip one more dictionary layer and walk the sentences one by one.
    for index, sentence in enumerate(data.get('sentence')):
        subjects = [
            entry.get('text')
            for entry in sentence.get('dependency')
            if entry.get('label') in subject_labels
        ]
        objects = [
            entry.get('text')
            for entry in sentence.get('dependency')
            if entry.get('label') == 'NP_OBJ'
        ]
        result[index] = {
            'text': sentence.get('text'),
            'dependency': {'subject': subjects, 'object': objects},
        }

    return result

# only nouns
def get_nouns_from_text(data: dict) -> dict:
    """Collect the common nouns (``NNG`` morphemes) of every analysed sentence.

    Args:
        data: Mapping whose ``'sentence'`` entry is a list of sentence dicts,
            each read for its ``'text'`` and ``'morp'`` entries.

    Returns:
        ``{index: {'text': ..., 'dependency': [noun, ...]}}``. The key name
        ``'dependency'`` is kept for compatibility even though it holds nouns.
        A missing ``'sentence'`` or ``'morp'`` entry is treated as empty.
    """
    def _surface(morpheme: dict):
        # Morpheme entries are read through 'lemma' elsewhere in this
        # notebook; 'text' is kept as a fallback for inputs that provide it.
        lemma = morpheme.get('lemma')
        return lemma if lemma is not None else morpheme.get('text')

    # Strip one more dictionary layer and walk the sentences one by one.
    return {
        index: {
            'text': sentence.get('text'),
            'dependency': [
                _surface(morpheme)
                for morpheme in (sentence.get('morp') or [])
                if morpheme.get('type') == 'NNG'
            ],
        }
        for index, sentence in enumerate(data.get('sentence') or [])
    }

In [6]:
# Single API client built from the access key and endpoint configured above.
etri_sentence_analysis = ETRISentenceAnalysis(api_key, openApiURL)

# preprocessing

In [7]:
# Remove every character that is not Hangul (jamo or syllable) or a space.
import re
_non_korean = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]")
df['full_content'] = df['full_content'].map(lambda text: _non_korean.sub("", text))

In [9]:
# df['full_content']

글자 수가 20자 미만이거나 내용이 NoneType인 아티클의 id를 모아 제거 대상 목록(drop_id_list)을 만든다. 아래 실행 결과 기준으로 총 26개이며, 이 셀은 목록만 만들고 실제 제거는 하지 않는다.

In [10]:
# Ids of articles whose cleaned content is shorter than 20 characters.
id_less_20 = [
    df['id'][row]
    for row in range(len(df['full_content']))
    if len(df['full_content'][row]) < 20
]

print('len(id_less_20) = ',len(id_less_20))
print('id_less_20 = ',id_less_20)

# Row positions of articles whose content was None.
# NOTE(review): these positions are hard-coded — confirm they still match the data.
type_none_idx = [1723, 1724, 1804, 1826, 2245, 2318, 2319, 2377, 2431, 2442, 2455, 2456, 2542, 2561, 2563, 2567, 2655, 2701, 2768, 2832, 2857, 2937, 2938, 2954, 3065, 3088]
new_id = [df.iloc[position].id for position in type_none_idx]

print('len(new_id) = ',len(new_id))
print('new_id = ', new_id)
print()

# Union of both id lists, without duplicates.
drop_id_list = list(set(id_less_20 + new_id))
print('len(drop_id_list) = ',len(drop_id_list))
print('drop_id_list = ', drop_id_list)

len(id_less_20) =  0
id_less_20 =  []
len(new_id) =  26
new_id =  [15107, 15376, 18326, 17194, 22131, 27831, 27888, 24527, 29398, 29528, 28451, 28471, 30618, 29157, 27366, 27451, 29891, 33399, 32309, 33743, 33446, 34361, 34422, 35145, 35259, 36137]

len(drop_id_list) =  26
drop_id_list =  [15107, 15376, 18326, 30618, 28451, 33446, 36137, 17194, 32309, 28471, 27831, 34361, 27451, 35259, 29891, 35145, 24527, 33743, 29398, 29528, 29157, 27366, 27888, 22131, 34422, 33399]


# corpus에 형태소 분석 결과 담기

In [11]:
corpus = []

# The drop_id_list filter is disabled here, so every row of df is analysed.
for con in tqdm(df['full_content'], total=len(df)):
    # Morphological analysis of one article.
    data = etri_sentence_analysis.get_analyzed_sentence(con, 'morp')
    corpus.append(data)

# NOTE(review): the original note expected 3438 - 31 = 3407 entries, but with
# the filter disabled the corpus has one entry per row of df.
len(corpus)

100%|██████████| 3438/3438 [19:41<00:00,  2.91it/s]  


3438

In [27]:
# Initialise the output frame as an independent copy so that adding the
# `tokens` column can never write back into `df`.
new_df = df[['id', 'title', 'full_content']].copy()

# TODO: corpus entries for articles with fewer than 20 characters are not
# removed here, although the original note planned to pop them.


def _nouns_from_analysis(analysis) -> list:
    """Return lemmas of common nouns (type NNG) longer than one character.

    All sentences of the analysis result are scanned, not just the first.
    A missing result, sentence list or morpheme list yields an empty list.
    """
    sentences = (analysis or {}).get('sentence') or []
    return [
        morpheme['lemma']
        for sentence in sentences
        for morpheme in (sentence.get('morp') or [])
        if morpheme['type'] == 'NNG' and len(morpheme['lemma']) > 1
    ]


# corpus entries are matched to rows by position, so the sizes must agree;
# otherwise tokens would end up attached to the wrong articles.
if len(corpus) != len(new_df):
    raise ValueError(
        f'corpus has {len(corpus)} entries but new_df has {len(new_df)} rows'
    )

# Build one token list per article and assign the whole column at once.
# This avoids the chained `new_df['tokens'].iloc[i] = ...` write, which can
# land on a temporary copy, and gives articles without nouns an empty list.
token_lists = [_nouns_from_analysis(analysis) for analysis in tqdm(corpus)]
new_df['tokens'] = pd.Series(token_lists, index=new_df.index, dtype=object)

100%|██████████| 3438/3438 [02:01<00:00, 28.21it/s]


# save to json

In [24]:
# set으로 토큰 중복제거
# new_df['tokens'] = new_df['tokens'].apply(lambda x: list(set(x)))

In [29]:
# Write new_df to a JSON file.
# NOTE(review): the absolute path is machine-specific — consider making the output directory configurable.
new_df.to_json('/home/user_4/CBF/Token/final_tokens_article.json')