# 데이터 전처리

In [31]:
import pandas as pd
import pickle
# from gensim.models import fasttext
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder
from collections import Counter

from torch.utils.data import Dataset, DataLoader

from torchmetrics import F1Score, ConfusionMatrix
import time

import gensim
from gensim.models import fasttext
from sklearn.model_selection import train_test_split
# ft_model = fasttext.load_facebook_model('/content/drive/MyDrive/wiki.ko.bin')
ft_model = gensim.models.fasttext.load_facebook_model('D:\\wiki.ko\\wiki.ko.bin')
# ft_model = gensim.models.fasttext.load_facebook_model('./wiki.ko.bin')

## 데이터 불러오기

## 필요한 데이터 추출

In [76]:
# 저장한 데이터 토큰 피클 파일 불러오기
with open('./dataset/token_NonNP.pkl', 'rb') as f:
    text_data = pickle.load(f)

text_data['token_len'] = [len(x) for x in text_data['token']]

# train용 aug token data
with open('./dataset/augNonNP_df.pkl', 'rb') as f:
    text_data2 = pickle.load(f)

# 다중 레이블 처리한 훈련 데이터셋 가져오기
with open('./dataset/train_origin.pkl', 'rb') as f:
    train_data = pickle.load(f)

# 다중 레이블 처리한 테스트 데이터셋 가져오기
with open('./dataset/test_origin.pkl', 'rb') as f:
    test_data = pickle.load(f)

text_data2.head()

Unnamed: 0,segment_id,emotion,emotion_id,token,token_len,augmented
4432,Sess19_script06_User038F_015,angry,0,"[어, 랑, 친해져서, 그, 친구, 랑, 친해, 지려]",8,"[랑, 어, 친해져서, 그, 친구, 친해, 랑, 지려]"
4528,Sess20_script02_User039M_039,angry,0,"[또, 계산, 안, 해, 가지, 고]",6,"[또, 계산, 안, 고, 가지, 해]"
5640,Sess25_script02_User050F_024,angry,0,"[친구, 도, 아니, 다]",4,"[친구, 도, 아니, 둘]"
3593,Sess16_script06_User031M_020,angry,0,"[아, 욕, 나와, 욕, 나올, 거, 같, 은데, 욕, 나올, 거, 같, 은데, 암...",17,"[거, 아, 나와, 어, 나올, 욕, 나올, 은데, 욕, 같, 거, 같, 욕, 암튼..."
1578,Sess09_script02_User018M_008,angry,0,"[는, 좀, 되게, 화, 가]",5,"[는, 좀, 되게, 화, 와]"


In [77]:
# 필요한 컬럼만 남겨두기
#train_data = train_data.drop(['seconds', 'sess', 'script'], axis=1)
test_data = test_data.drop(['seconds', 'sess', 'script'], axis=1)

# 텍스트에서 필요한 컬럼만 남겨두기 및 컬럼 이름 맞추기
text_data = text_data[['segment_id', 'token','token_len']]
text_data2 = text_data2[['segment_id','emotion','emotion_id','augmented','token_len']]
text_data2 = text_data2.rename(columns={'augmented':'token'})

In [78]:
# 훈련 및 테스트 데이터에 텍스트 데이터 붙이기
train = text_data2
test = pd.merge(test_data, text_data, how='left', on=['segment_id'])

In [79]:
len(train), len(train[train['emotion_id']==4])

(67900, 8750)

In [80]:
import numpy as np
len(np.unique(train['segment_id']))

10140

In [84]:
max_class = len(train_data[train_data['emotion_id']==3])
max_class

977

In [85]:
# 원래 happy 데이터
happy_origin = pd.merge(train_data[train_data['emotion']=='happy'], text_data, how='left', on='segment_id')[['segment_id','emotion','emotion_id', 'token', 'token_len']]

# happy 개수에 맞춰서 증강
for i in [0,1,2,5,6]:
    a = train[train['emotion_id']==i].sample(n=max_class,replace=F, random_state=42)
    if i < 1:
        data = a
    else:
        data = pd.concat([data, a], axis=0)
    print(i,"class : ",len(data[data['emotion_id']==i]))
data = pd.concat([data, happy_origin])
data = pd.concat([data, train[train['emotion_id']==4]], axis=0)
len(data)
print("3 class : ",len(data[data['emotion_id']==3]))
print("4 class : ",len(data[data['emotion_id']==4]))

train = data

0 class :  977
1 class :  977
2 class :  977
5 class :  977
6 class :  977
3 class :  977
4 class :  8750


## 텍스트 임베딩

In [86]:
# 최대 토큰 개수 구하기
max_token = train['token_len'].max()
max_token

150

In [89]:
max_token = int(train.describe()['token_len'].loc['75%'])

In [90]:
def get_embedding(data):
    # 텍스트 임베딩
    text_emb = []
    for tokens in tqdm(data['token']):
        sent_emb = []
        for i in range(max_token):
            # 워드 임베딩 초기화
            word_emb = []

            # 최대 단어 길이(156) 이하면 패딩
            if i >= len(tokens):
                word_emb = [np.array([0.]*300, dtype='float32')]*(max_token-i)
                sent_emb.extend(word_emb)
                break;

            # fasttext 워드 임베딩 가져옴
            word_emb = ft_model.wv[tokens[i]]
            sent_emb.append(word_emb)

        text_emb.append(sent_emb)

    data['text_emb'] = text_emb
    return data['text_emb']

In [91]:
# 데이터셋에 텍스트 임베딩 추가하기
train['text_emb'] = get_embedding(train)
test['text_emb'] = get_embedding(test)

100%|█████████████████████████████████████████████████████████████████████████| 14612/14612 [00:00<00:00, 22377.62it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2614/2614 [00:00<00:00, 23275.08it/s]


In [92]:
# 텍스트 임베딩 결과 확인
train['text_emb'].head()

6146    [[0.18122339, 0.20740172, -0.12993784, 0.01915...
618     [[-0.2459137, 0.14915791, -0.8468651, 0.109163...
6929    [[-0.9367216, 0.85393, -1.3444862, -0.25269663...
3332    [[-0.24933465, 0.42417058, -0.72755027, -0.392...
4528    [[-0.16821234, 0.26754832, -0.41189933, 0.0361...
Name: text_emb, dtype: object

In [93]:
test['text_emb'].head()

0    [[-0.9367216, 0.85393, -1.3444862, -0.25269663...
1    [[-0.19490944, -0.06741688, -0.23153214, 0.168...
2    [[-1.0400262, 0.25054872, -1.7955443, -0.41869...
3    [[-0.004059928, 0.08098799, -0.5521582, 0.2013...
4    [[-0.40613854, 0.21009226, -0.21392642, 0.1807...
Name: text_emb, dtype: object

# 모델링

## 데이터셋 및 모델 설정

In [94]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [95]:
# 레이블에 대해 정수 인코딩
encoder = LabelEncoder()
train_label = encoder.fit_transform(train[['emotion']].to_numpy().reshape(-1))
test_label = encoder.transform(test[['emotion']].to_numpy().reshape(-1))

In [96]:
# 텍스트 데이터셋 만들기
class TextDataset(Dataset):
    def __init__(self, mode):
        self.mode = mode
        X_train, X_valid, y_train, y_valid = train_test_split(train['text_emb'], train_label, test_size=0.1, random_state=42)
        if self.mode == 'train':
            self.x_data = torch.FloatTensor(X_train.array)
            self.y_data = torch.LongTensor(y_train)
        elif self.mode == 'valid':
            self.x_data = torch.FloatTensor(X_valid.array)
            self.y_data = torch.LongTensor(y_valid)
        else:
            self.x_data = torch.FloatTensor(test['text_emb'].array)
            self.y_data = torch.LongTensor(test_label)
    
    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self, idx):
        x = self.x_data[idx]
        y = self.y_data[idx]
        return x, y

In [97]:
# textCNN 모델 구성
class textCNN(nn.Module):
    def __init__(self, dim_channel, kernel_wins, dropout_rate, num_class):
        super(textCNN, self).__init__()
        self.emb_dim = 300
        # 커널 사이즈에 따라 CNN 설정(kernel_wins)
        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, self.emb_dim)) for w in kernel_wins])
        self.dropout = nn.Dropout(dropout_rate)
        # CNN에서 나온 값 합치기
        self.ft_fc = nn.Linear(len(kernel_wins)*dim_channel, 128)
        self.fc = nn.Linear(128, num_class)
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        emb_x = x.unsqueeze(1)
        
        conv_x = [conv(emb_x) for conv in self.convs]
        
        pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in conv_x]
        
        fc_x = torch.cat(pool_x, dim=1)
        fc_x = fc_x.squeeze(-1)
        fc_x = F.relu(fc_x)
        fc_x = self.dropout(fc_x)
        ft_fc = self.ft_fc(fc_x)
        fc_x = self.fc(ft_fc)
        logit = fc_x
        return logit, ft_fc

In [98]:
# 하이퍼파라미터 설정 및 모델 생성
learning_rate = 0.0005
dim_channel = 100
kernel_wins = [4,5]
dropout_rate = 0.4
num_class = len(encoder.classes_)

model = textCNN(dim_channel=dim_channel, kernel_wins=kernel_wins, dropout_rate=dropout_rate, num_class=num_class);
model.to(device)

criterion = nn.CrossEntropyLoss();
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)

textCNN(
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (ft_fc): Linear(in_features=200, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=7, bias=True)
  (softmax): Softmax(dim=1)
)


In [99]:
# 데이터셋 생성(40초 가량 소요)
train_dataset = TextDataset(mode='train')
valid_dataset = TextDataset(mode='valid')
test_dataset = TextDataset(mode='test')

In [100]:
# 데이터로더 생성
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=100)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100)

## 학습하기

In [101]:
model.train()
# 모델 학습
num_epochs = 70

start_time = time.time()
for epoch in range(1,num_epochs+1):
    for x_data, y_data in train_loader:
        x_data, y_data = x_data.to(device), y_data.to(device)
        
        train_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
        labels = Variable(y_data)
        
        outputs, _ = model(train_x)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if not (epoch % 10):
        label_list = []
        predict_list = []
        for x_data, y_data in valid_loader:
            x_data, y_data = x_data.to(device), y_data.to(device)
            
            test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
            outputs, _ = model(test_x)
            predictions = torch.max(outputs,1)[1].to(device)

            label_list.extend(y_data)
            predict_list.extend(predictions)
        
        print('Epoch: {}, Loss: {}'.format(epoch, loss.data))
end_time = time.time()
print('Time:', end_time-start_time)

Epoch: 10, Loss: 0.07535051554441452
Epoch: 20, Loss: 0.03451499342918396
Epoch: 30, Loss: 0.03049006499350071
Epoch: 40, Loss: 0.02703574113547802
Epoch: 50, Loss: 0.002186761237680912
Epoch: 60, Loss: 0.014063133858144283
Epoch: 70, Loss: 0.0007723644375801086
Time: 180.72291898727417


# 성능 확인

## 훈련셋 성능

In [102]:
# 훈련셋에서의 f1스코어
train_predictions_list = []
train_labels_list = []
model.eval()
for x_data, y_data in train_loader:
    x_data, y_data = x_data.to(device), y_data.to(device)
    train_labels_list.extend(y_data)
    test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
    outputs, feature = model(test_x)
    
    train_predictions = torch.max(outputs,1)[1].to(device)
    train_predictions_list.extend(train_predictions)
    
train_predictions_list = torch.tensor(train_predictions_list)
train_labels_list = torch.tensor(train_labels_list)

f1 = F1Score(task='multiclass', num_classes=7)
f1_score = f1(train_predictions_list, train_labels_list)
f1_score

tensor(0.9994)

## 테스트셋 성능

In [103]:
# 테스트셋에서의 f1스코어
test_predictions_list = []
test_labels_list = []
model.eval()
for x_data, y_data in test_loader:
    x_data, y_data = x_data.to(device), y_data.to(device)
    test_labels_list.extend(y_data)
    test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
    outputs, feature = model(test_x)
    
    test_predictions = torch.max(outputs,1)[1].cpu()
    test_predictions_list.extend(test_predictions)
    
test_predictions_list = torch.tensor(test_predictions_list)
test_labels_list = torch.tensor(test_labels_list)

f1 = F1Score(task='multiclass', num_classes=7)
f1_score = f1(test_predictions_list, test_labels_list)
f1_score

tensor(0.7789)

## 클래스별 예측 개수 확인

In [104]:
# 클래스 별로 예측 값 개수 확인
train_label_origin = encoder.inverse_transform(train_labels_list)
train_pred_origin = encoder.inverse_transform(train_predictions_list)

test_label_origin = encoder.inverse_transform(test_labels_list)
test_pred_origin = encoder.inverse_transform(test_predictions_list)

train_label_counter = Counter(train_label_origin)
train_pred_counter = Counter(train_pred_origin)

test_label_counter = Counter(test_label_origin)
test_pred_counter = Counter(test_pred_origin)

print('------- train 레이블 및 예측 클래스 -------')
print("(train)Origin Label:\n", train_label_counter.most_common(), end='\n')
print("(train)Pred:\n", train_pred_counter.most_common(), end='\n\n\n')
print('------- test 레이블 및 예측 클래스 -------')
print("(test)Origin Label:\n", test_label_counter.most_common(), end='\n')
print("(test)Pred:\n", test_pred_counter.most_common(), end='\n\n\n')

train_corrects = [x for x, y in zip(train_label_origin, train_pred_origin) if x==y]
test_corrects = [x for x, y in zip(test_label_origin, test_pred_origin) if x==y]
train_correct_counter = Counter(train_corrects)
test_correct_counter = Counter(test_corrects)

print("(train)accuracy:",len(train_corrects)/len(train), end='\n')
print("(train)F1-score:",f1(train_predictions_list, train_labels_list), end='\n\n')
print("(test)accuracy:",len(test_corrects)/len(test), end='\n')
print("(test)F1-score:",f1(test_predictions_list, test_labels_list), end='\n\n')

------- train 레이블 및 예측 클래스 -------
(train)Origin Label:
 [('neutral', 7846), ('surprise', 890), ('disgust', 886), ('happy', 885), ('angry', 883), ('sad', 881), ('fear', 879)]
(train)Pred:
 [('neutral', 7844), ('surprise', 890), ('happy', 887), ('disgust', 886), ('angry', 883), ('sad', 881), ('fear', 879)]


------- test 레이블 및 예측 클래스 -------
(test)Origin Label:
 [('neutral', 2310), ('happy', 194), ('angry', 38), ('surprise', 27), ('sad', 24), ('disgust', 15), ('fear', 6)]
(test)Pred:
 [('neutral', 2154), ('happy', 349), ('sad', 32), ('angry', 32), ('surprise', 18), ('fear', 17), ('disgust', 12)]


(train)accuracy: 0.8993977552696414
(train)F1-score: tensor(0.9994)

(test)accuracy: 0.7788829380260138
(test)F1-score: tensor(0.7789)



In [105]:
# train 성능
print(encoder.classes_)
confmat = ConfusionMatrix(task='multiclass',num_classes = 7)
print(confmat(train_predictions_list, train_labels_list))
train_f1_score = f1(train_predictions_list, train_labels_list)
print("(train)accuracy:",len(train_corrects)/len(train), end='\n')
print("(train)F1-score:",train_f1_score)

['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']
tensor([[ 883,    0,    0,    0,    0,    0,    0],
        [   0,  886,    0,    0,    0,    0,    0],
        [   0,    0,  879,    0,    0,    0,    0],
        [   0,    0,    0,  882,    3,    0,    0],
        [   0,    0,    0,    5, 7841,    0,    0],
        [   0,    0,    0,    0,    0,  881,    0],
        [   0,    0,    0,    0,    0,    0,  890]])
(train)accuracy: 0.8993977552696414
(train)F1-score: tensor(0.9994)


In [106]:
#test 성능

print(encoder.classes_)
print(confmat(test_predictions_list, test_labels_list))
test_f1_score = f1(test_predictions_list, test_labels_list)
print("(test)accuracy:",len(test_corrects)/len(test), end='\n')
print("(test)F1-score:",test_f1_score)

['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']
tensor([[   3,    0,    0,    5,   29,    0,    1],
        [   1,    0,    1,    4,    8,    1,    0],
        [   1,    0,    3,    0,    2,    0,    0],
        [   3,    1,    1,   61,  124,    3,    1],
        [  24,   11,   12,  266, 1962,   22,   13],
        [   0,    0,    0,    7,   13,    4,    0],
        [   0,    0,    0,    6,   16,    2,    3]])
(test)accuracy: 0.7788829380260138
(test)F1-score: tensor(0.7789)


In [107]:
corrects = [x for x, y in zip(test_label_origin, test_pred_origin) if x==y]

correct_counter = Counter(corrects)
print(correct_counter.most_common())

[('neutral', 1962), ('happy', 61), ('sad', 4), ('surprise', 3), ('fear', 3), ('angry', 3)]


In [108]:
from sklearn  import metrics
print(metrics.classification_report(test_predictions_list, test_labels_list))

              precision    recall  f1-score   support

           0       0.08      0.09      0.09        32
           1       0.00      0.00      0.00        12
           2       0.50      0.18      0.26        17
           3       0.31      0.17      0.22       349
           4       0.85      0.91      0.88      2154
           5       0.17      0.12      0.14        32
           6       0.11      0.17      0.13        18

    accuracy                           0.78      2614
   macro avg       0.29      0.24      0.25      2614
weighted avg       0.75      0.78      0.76      2614



# 앙상블을 위한 모델 결과 추출

In [109]:
prediction_df = pd.DataFrame()
session = []
prediction = []
for idx, (x_data, y_data) in enumerate(test_loader):
    x_data, y_data = x_data.to(device), y_data.to(device)
    test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
    outputs, _ = model(test_x)
    session.extend(test['segment_id'][idx*100:(idx+1)*100])
    prediction.extend(outputs.tolist())

In [110]:
prediction = nn.Softmax(dim=1)(torch.Tensor(prediction)).cpu().detach().numpy()

prediction_df['segment_id'] = session
for idx, emotion in enumerate(encoder.classes_):
    prediction_df[emotion] = prediction[:,idx]
    
prediction_df

Unnamed: 0,segment_id,angry,disgust,fear,happy,neutral,sad,surprise
0,Sess04_script01_User007M_001,2.310783e-21,2.687779e-24,6.301774e-16,3.034450e-09,1.000000,2.695916e-14,1.210288e-19
1,Sess04_script01_User007M_002,9.331641e-13,1.417528e-16,1.644207e-13,2.718976e-08,1.000000,2.557166e-12,7.208035e-12
2,Sess04_script01_User007M_003,1.292569e-12,1.021408e-17,1.759455e-12,1.125723e-08,1.000000,2.710382e-15,1.436499e-16
3,Sess04_script01_User008F_001,1.456021e-11,9.911065e-22,1.762394e-15,3.664366e-13,1.000000,3.492650e-13,7.087115e-08
4,Sess04_script01_User008F_002,2.011627e-09,1.580955e-11,3.528787e-11,3.840004e-03,0.996159,1.301090e-06,2.901958e-12
...,...,...,...,...,...,...,...,...
2609,Sess39_script06_User077F_030,2.919015e-08,2.470039e-14,1.102114e-11,4.943937e-04,0.999506,1.652069e-08,1.121950e-21
2610,Sess39_script06_User077F_031,2.033342e-02,2.018174e-06,5.989259e-07,1.373145e-03,0.978271,1.763372e-05,2.576178e-06
2611,Sess39_script06_User077F_032,2.085477e-04,9.234060e-05,4.475374e-04,2.129416e-03,0.665013,3.321090e-01,4.197914e-09
2612,Sess39_script06_User077F_033,4.975493e-03,5.551174e-04,1.419322e-03,5.080065e-02,0.817444,1.241173e-01,6.884003e-04


In [111]:
prediction_df.to_csv('./textCNN_prediction.csv', index=False)