In [5]:
import os 
import json
import pickle

from konlpy.tag import Okt
from tqdm import trange

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Bidirectional, Attention, Concatenate, Dense

from konlpy.tag import Okt
from tqdm import trange

from copy import deepcopy
from keras.callbacks import ModelCheckpoint

import re
from datetime import datetime

# 그래프를 위한 라이브러리 및 초기 그래프 설정
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline

matplotlib_inline.backend_inline.set_matplotlib_formats("png2x") # svg, retina, png2x ...
mpl.style.use("seaborn-v0_8")
mpl.rcParams.update({"figure.constrained_layout.use": True})
sns.set_context("paper") 
sns.set_palette("Set2") 
sns.set_style("whitegrid") 

plt.rc("font", family = "Malgun Gothic")
plt.rcParams["axes.unicode_minus"] = False

# 파일 저장시 파일명의 용이성
def now_time():
    now = datetime.now()
    return now.strftime('%Y%m%d_%H_%M_%S')

In [6]:
df_train = pd.read_csv('data/_1_before_prep/unsmile_train_v1.0.tsv', sep='\t')
df_valid = pd.read_csv('data/_1_before_prep/unsmile_valid_v1.0.tsv', sep='\t')

In [7]:
df = pd.concat([df_train,df_valid],axis=0)

In [8]:
# df.to_csv('data/1_before_prep/all_data_unsmile.csv')
df = pd.read_csv('data/_1_before_prep/all_data_unsmile.csv', index_col=0)

In [9]:
label = [[i for i in j if i != ''] for j in (df.values[:,1:] * df.columns[1:])]

In [10]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.index #순차적인 인덱스가 필요 (for문을 위해)

RangeIndex(start=0, stop=18742, step=1)

# 라벨 코퍼스 및 라벨 수치화 작업

## corpus_method_1 를 위한 정답 라벨
- input 문장 : Okp()의 morphs 이용
- output 문장 : 각 카테고리의 묶음을 정답 라벨의 corpus 로 이용

In [11]:
label_corpus_word_index = {'padding':0,'start':1,'end':2}
label_index=3
for i in label:
    for j in i:
        if j not in label_corpus_word_index:
            label_corpus_word_index[j] = label_index
            label_index +=1


labels = []
for i in label:
    single_label=[]
    for j in (i):
        single_label.append(label_corpus_word_index[j])
    labels.append(single_label)


label_corpus_word_index , labels[:10]

({'padding': 0,
  'start': 1,
  'end': 2,
  'clean': 3,
  '종교': 4,
  '여성/가족': 5,
  '인종/국적': 6,
  '지역': 7,
  '기타 혐오': 8,
  '악플/욕설': 9,
  '성소수자': 10,
  '개인지칭': 11,
  '남성': 12,
  '연령': 13},
 [[3], [4], [3], [3], [5], [3], [6, 7, 4, 8], [9], [5], [3]])

### 문장 인덱스화 하기

In [12]:
tokenizer = Okt()

corpus=['padding']
for i in trange(len(df)):
    for j in tokenizer.morphs(df.iloc[i,0]):
        if j not in corpus:
            corpus.append(j)

# 코퍼스 워드투 인덱스 과정
sentences_corpus_word_index = {corpus[i]: i for i in trange(0,len(corpus))}

sentences = []
for i in trange(len(df)):
    a_sentence = [sentences_corpus_word_index[i] for i in tokenizer.morphs(df.iloc[i,0])]
    sentences.append(a_sentence)

100%|██████████| 18742/18742 [01:45<00:00, 178.04it/s]
100%|██████████| 38712/38712 [00:00<00:00, 1858842.55it/s]
100%|██████████| 18742/18742 [01:39<00:00, 187.75it/s]


### 문장 명사와 동사 등 기준으로 인덱스화 하기

- 모프 단위가 아닌 방법으로 해보기

### 저장하기 목록
1. 문장 데이터(시퀀스 데이터) : sentences.pkl
2. 정답 데이터(시퀀스 화 된 데이터) : summarys.pkl
3. 문장에 대한 코퍼스 : corpus_word_index.json
4. 정답에 대한 코퍼스 : corpus_word_index.json

In [13]:
with open("data\_2_after_prep\corpus_method_1\sentences.pkl","wb") as f:
    pickle.dump(sentences, f)
    
with open("data\_2_after_prep\corpus_method_1\labels.pkl","wb") as f:
    pickle.dump(labels, f)

with open("data\_2_after_prep\corpus_method_1\sentences_corpus_word_index.json","w") as f:
    json.dump(sentences_corpus_word_index, f)

with open("data\_2_after_prep\corpus_method_1/label_corpus_word_index.json","w") as f:
    json.dump(label_corpus_word_index, f)

# corpus_method_2 를 위한 정답 라벨
- 정답 라벨을 단어 단위로 분리 시켜 seq2seq 학습에 이용

In [14]:
label_corpus_method_2 = []
for i in label:
    for j in i:
        for k in tokenizer.morphs(j):
            if (k not in label_corpus_method_2) and (k != '/'): 
                label_corpus_method_2.append(k)

label_corpus_word_index_method_2 = {'padding':0,'start':1,'end':2}
label_index=3
for l in label_corpus_method_2:
    label_corpus_word_index_method_2[l] = label_index
    label_index +=1

label_corpus_word_index_method_2

{'padding': 0,
 'start': 1,
 'end': 2,
 'clean': 3,
 '종교': 4,
 '여성': 5,
 '가족': 6,
 '인종': 7,
 '국적': 8,
 '지역': 9,
 '기타': 10,
 '혐오': 11,
 '악플': 12,
 '욕설': 13,
 '성소수자': 14,
 '개인': 15,
 '지칭': 16,
 '남성': 17,
 '연령': 18}

In [15]:
labels_tokened = []
for words in label:
    single_label = []
    for word in words:
        for token in tokenizer.morphs(word):
            if token != '/':
                single_label.append(label_corpus_word_index_method_2[token])
    labels_tokened.append(single_label)
    # single_label.append(label_corpus_word_index_method_2[])

In [16]:
# 토큰화 잘 되었는지 확인
for i,j in zip(labels_tokened[:10] , label[:10]):
    print(i,j)

[3] ['clean']
[4] ['종교']
[3] ['clean']
[3] ['clean']
[5, 6] ['여성/가족']
[3] ['clean']
[7, 8, 9, 4, 10, 11] ['인종/국적', '지역', '종교', '기타 혐오']
[12, 13] ['악플/욕설']
[5, 6] ['여성/가족']
[3] ['clean']


In [17]:
# label 인덱스 토큰과, corpus 저장하기
with open("data/_2_after_prep/corpus_method_2/labels.pkl","wb") as f:
    pickle.dump(labels_tokened, f)

with open("data/_2_after_prep/corpus_method_2/label_corpus_word_index.json","w") as f:
    json.dump(label_corpus_word_index_method_2, f)

###  불러오기 코드

In [18]:
with open("data/_2_after_prep/corpus_method_1/sentences.pkl","rb") as f:
    sentences = pickle.load(f)

with open("data/_2_after_prep/corpus_method_1/labels.pkl","rb") as f:
    labels = pickle.load(f)

with open("data/_2_after_prep/corpus_method_2/labels.pkl","rb") as f:
    labels2 = pickle.load(f)

with open("data/_2_after_prep/corpus_method_1/sentences_corpus_word_index.json","r") as f:
    sentences_corpus_word_index = json.load(f)

with open("data/_2_after_prep/corpus_method_1/label_corpus_word_index.json","r") as f:
    label_corpus_word_index = json.load(f)

with open("data/_2_after_prep/corpus_method_2/label_corpus_word_index.json","r") as f:
    label_corpus_word_index2 = json.load(f)

In [19]:
label_corpus_index_word1 = {label_corpus_word_index[key]:key for key in label_corpus_word_index}
label_corpus_index_word2 = {label_corpus_word_index2[key]:key for key in label_corpus_word_index2}

In [20]:
def seek_(number):   
    print(number,'번째 인덱싱') 
    print("단어 인덱싱 :",sentences[number])
    print("문장의 원본 :",df['문장'][number],end="\n\n")
    print('-'*100)
    print("정답 라벨  :" ,df.columns[1:].to_list())
    print('라벨 방법 1:', labels[number] ,df.values[number,1:].tolist())
    print('라벨 방법 2:', labels[number] ,',14 개의 코퍼스 인덱스 매칭:',[label_corpus_index_word1[i] for i in labels[number]])
    print('라벨 방법 3:', labels2[number],',19 개의 코퍼스 인덱스 매칭:',[label_corpus_index_word2[i] for i in labels2[number]])

In [21]:
print('코퍼스 1   :', label_corpus_word_index)
print('코퍼스 2   :', label_corpus_word_index2)

코퍼스 1   : {'padding': 0, 'start': 1, 'end': 2, 'clean': 3, '종교': 4, '여성/가족': 5, '인종/국적': 6, '지역': 7, '기타 혐오': 8, '악플/욕설': 9, '성소수자': 10, '개인지칭': 11, '남성': 12, '연령': 13}
코퍼스 2   : {'padding': 0, 'start': 1, 'end': 2, 'clean': 3, '종교': 4, '여성': 5, '가족': 6, '인종': 7, '국적': 8, '지역': 9, '기타': 10, '혐오': 11, '악플': 12, '욕설': 13, '성소수자': 14, '개인': 15, '지칭': 16, '남성': 17, '연령': 18}


In [22]:
i = 18000 #+=1
seek_(i)

18000 번째 인덱싱
단어 인덱싱 : [77, 37822, 813, 37823]
문장의 원본 : 여기 해피하우스아니냐

----------------------------------------------------------------------------------------------------
정답 라벨  : ['여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', 'clean', '개인지칭']
라벨 방법 1: [3] [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
라벨 방법 2: [3] ,14 개의 코퍼스 인덱스 매칭: ['clean']
라벨 방법 3: [3] ,19 개의 코퍼스 인덱스 매칭: ['clean']
