<a href="https://colab.research.google.com/github/threegenie/climate_classify/blob/main/climate_labeling_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install KoNLPy; the Okt and Komoran taggers imported later in this notebook come from it.
! pip install konlpy

In [None]:
%%capture
# Install the Nanum font package and refresh the system font cache.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Remove matplotlib's cache directory (presumably so its font list is rebuilt
# with the newly installed fonts — confirm a runtime restart is not also needed).
!rm ~/.cache/matplotlib -rf

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
import os
import tqdm
import urllib.request
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import log_loss, accuracy_score,f1_score
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# pandas' 'mode.chained_assignment' option accepts:
#   'raise' -> SettingWithCopyError
#   'warn'  -> SettingWithCopyWarning (the pandas default)
#   None    -> no check at all
# This notebook turns the check off. The option is set exactly once so the
# value in effect is unambiguous.
pd.set_option('mode.chained_assignment', None)

#### 데이터 업로드

In [None]:
import os
from google.colab import drive

# Mount Google Drive so the CSV files below are readable from the Colab VM.
drive.mount('/content/drive')

# Single place to edit when the data folder moves.
DATA_DIR = '/content/drive/My Drive/open'

# Training rows, test rows, and the label mapping table.
df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
label = pd.read_csv(os.path.join(DATA_DIR, 'labels_mapping.csv'))

In [None]:
# Preview the first rows of the frame loaded from train.csv.
df.head()

In [None]:
# Preview the first rows of the frame loaded from labels_mapping.csv.
label.head()

In [None]:
# # 결측값 처리
# df = df.fillna('내용없음')
# test = test.fillna('내용없음')

In [None]:
# Column dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

#### 필요한 특성만 남기기

In [None]:
# Keep only the 'index' column and the three 요약문_* text columns; every
# other column is dropped from df from this point on.
df = df.loc[
    :,
    ['index', '요약문_연구목표', '요약문_연구내용', '요약문_한글키워드'],
]

In [None]:
# Keep only the 'index' column and the three 요약문_* text columns; every
# other column is dropped from test from this point on.
test = test.loc[
    :,
    ['index', '요약문_연구목표', '요약문_연구내용', '요약문_한글키워드'],
]

#### 데이터 정제 - 훈련, 테스트셋에 모두 적용 -> 정규표현식 사용하여 한글 제외 모두 제거

In [None]:
# df['요약문_연구목표'] = df['요약문_연구목표'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# df['요약문_연구목표'].replace('', np.nan, inplace=True)

# df['요약문_연구내용'] = df['요약문_연구내용'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# df['요약문_연구내용'].replace('', np.nan, inplace=True)

# df['요약문_한글키워드'] = df['요약문_한글키워드'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# df['요약문_한글키워드'].replace('', np.nan, inplace=True)

In [None]:
# test['요약문_연구목표'] = test['요약문_연구목표'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# test['요약문_연구목표'].replace('', np.nan, inplace=True)

# test['요약문_연구내용'] = test['요약문_연구내용'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# test['요약문_연구내용'].replace('', np.nan, inplace=True)

# test['요약문_한글키워드'] = test['요약문_한글키워드'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# test['요약문_한글키워드'].replace('', np.nan, inplace=True)

In [None]:
# df.head()

In [None]:
# test.head()

#### 토큰화, 전처리

In [None]:
from konlpy.tag import Okt
# Okt tagger instance; passed to preprocessing() as its `okt` argument in the
# tokenization cells below.
okt = Okt()

In [None]:
from konlpy.tag import Komoran
# NOTE(review): `komoran` is not referenced anywhere in the visible part of
# this notebook — confirm it is used later, otherwise this cell can go.
komoran = Komoran()

In [None]:
# Tokens removed by preprocessing() when it is called with remove_stopwords=True.
# NOTE(review): preprocessing() tokenizes with okt.morphs(..., stem=True). If
# stemmed verbs/adjectives come back in dictionary form (e.g. '하다', '있다'),
# bare stems such as '하', '있', '되' will never match — verify against real
# tokenizer output.
stop_words=['은','는','이','가', '하','아','것','들','의','있','되','수','보','주','등','한']

In [None]:
# df['연구목표_T'] = None
# df['연구내용_T'] = None
# df['한글키워드_T'] = None

In [None]:
# test['연구목표_T'] = None
# test['연구내용_T'] = None
# test['한글키워드_T'] = None

In [None]:
def preprocessing(text, okt, remove_stopwords=False, stop_words=[]):
    """Strip everything but Hangul from ``text`` and split it into morphemes.

    Args:
        text: Raw input string. A non-string value (for example a missing
            value read by pandas) makes ``re.sub`` raise ``TypeError``.
        okt: Tagger object exposing ``morphs(text, stem=True)``.
        remove_stopwords: When True, tokens contained in ``stop_words`` are
            dropped from the result.
        stop_words: Collection of tokens to drop. It is only read, never
            mutated, so the shared default list is safe.

    Returns:
        The tokens produced by ``okt.morphs``; with ``remove_stopwords=True``
        a new list without the stop words.
    """
    # The character class contains no space, so whitespace is removed too and
    # the tagger receives one unbroken run of Hangul characters.
    text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ]", "", text)
    tokens = okt.morphs(text, stem=True)

    if not remove_stopwords:
        # Return the unfiltered tokens; without this branch the default call
        # would reach the return statement with no result bound.
        return tokens

    return [token for token in tokens if token not in stop_words]

In [None]:
# Empty accumulators for the tokenized text: one list per text column
# (aim / content / key) and per split (train / test).
aim_train_text, aim_test_text = [], []
content_train_text, content_test_text = [], []
key_train_text, key_test_text = [], []

In [None]:
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
aim_train_text = []
for text in tqdm.tqdm(df['요약문_연구목표']):
    try:
        aim_train_text.append(preprocessing(text, okt, remove_stopwords=True, stop_words=stop_words))
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string
        # value) becomes an empty token list so positions stay aligned with
        # df. `except Exception` (not a bare `except`) lets a kernel
        # interrupt stop this long loop.
        aim_train_text.append([])

In [None]:
# Show only the first few token lists; displaying the whole corpus floods the
# cell output and bloats the saved notebook.
aim_train_text[:5]

In [None]:
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
content_train_text = []
for text in tqdm.tqdm(df['요약문_연구내용']):
    try:
        content_train_text.append(preprocessing(text, okt, remove_stopwords=True, stop_words=stop_words))
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string
        # value) becomes an empty token list so positions stay aligned with
        # df. `except Exception` (not a bare `except`) lets a kernel
        # interrupt stop this long loop.
        content_train_text.append([])

In [None]:
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
key_train_text = []
for text in tqdm.tqdm(df['요약문_한글키워드']):
    try:
        key_train_text.append(preprocessing(text, okt, remove_stopwords=True, stop_words=stop_words))
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string
        # value) becomes an empty token list so positions stay aligned with
        # df. `except Exception` (not a bare `except`) lets a kernel
        # interrupt stop this long loop.
        key_train_text.append([])

In [None]:
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
aim_test_text = []
for text in tqdm.tqdm(test['요약문_연구목표']):
    try:
        aim_test_text.append(preprocessing(text, okt, remove_stopwords=True, stop_words=stop_words))
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string
        # value) becomes an empty token list so positions stay aligned with
        # test. `except Exception` (not a bare `except`) lets a kernel
        # interrupt stop this long loop.
        aim_test_text.append([])

In [None]:
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
content_test_text = []
for text in tqdm.tqdm(test['요약문_연구내용']):
    try:
        content_test_text.append(preprocessing(text, okt, remove_stopwords=True, stop_words=stop_words))
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string
        # value) becomes an empty token list so positions stay aligned with
        # test. `except Exception` (not a bare `except`) lets a kernel
        # interrupt stop this long loop.
        content_test_text.append([])

In [None]:
# Start from an empty list so re-running this cell does not append a second
# copy of every row.
key_test_text = []
for text in tqdm.tqdm(test['요약문_한글키워드']):
    try:
        key_test_text.append(preprocessing(text, okt, remove_stopwords=True, stop_words=stop_words))
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string
        # value) becomes an empty token list so positions stay aligned with
        # test. `except Exception` (not a bare `except`) lets a kernel
        # interrupt stop this long loop.
        key_test_text.append([])

In [None]:
# df['연구목표_T'] = df['요약문_연구목표'].apply(okt.morphs)
# df['연구목표_T'] = df['요약문_연구목표'].apply(lambda x: [item for item in x if item not in stop_words])
# df['연구내용_T'] = df['요약문_연구내용'].apply(okt.morphs)
# df['연구내용_T'] = df['요약문_연구내용'].apply(lambda x: [item for item in x if item not in stop_words])
# df['한글키워드_T'] = df['요약문_한글키워드'].apply(okt.morphs)
# df['한글키워드_T'] = df['요약문_한글키워드'].apply(lambda x: [item for item in x if item not in stop_words])

In [None]:
# test['연구목표_T'] = test['요약문_연구목표'].apply(okt.morphs)
# test['연구목표_T'] = test['요약문_연구목표'].apply(lambda x: [item for item in x if item not in stop_words])
# test['연구내용_T'] = test['요약문_연구내용'].apply(okt.morphs)
# test['연구내용_T'] = test['요약문_연구내용'].apply(lambda x: [item for item in x if item not in stop_words])
# test['한글키워드_T'] = test['요약문_한글키워드'].apply(okt.morphs)
# test['한글키워드_T'] = test['요약문_한글키워드'].apply(lambda x: [item for item in x if item not in stop_words])

In [None]:
# First 10 rows of df. The tokenization loops above store their results in the
# separate *_train_text lists, not in df, so no token columns appear here.
df.head(10)

In [None]:
# First 10 rows of test. The tokenization loops above store their results in
# the separate *_test_text lists, not in test, so no token columns appear here.
test.head(10)