# 데이터 전처리

In [31]:
import pandas as pd
import pickle
# from gensim.models import fasttext
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder
from collections import Counter

from torch.utils.data import Dataset, DataLoader

from torchmetrics import F1Score, ConfusionMatrix
import time

import gensim
from gensim.models import fasttext
from sklearn.model_selection import train_test_split
# ft_model = fasttext.load_facebook_model('/content/drive/MyDrive/wiki.ko.bin')
ft_model = gensim.models.fasttext.load_facebook_model('D:\\wiki.ko\\wiki.ko.bin')
# ft_model = gensim.models.fasttext.load_facebook_model('./wiki.ko.bin')

## 데이터 불러오기

## 필요한 데이터 추출

In [32]:
# 저장한 데이터 토큰 피클 파일 불러오기
with open('./dataset/token_NonNP.pkl', 'rb') as f:
    text_data = pickle.load(f)

text_data['token_len'] = [len(x) for x in text_data['token']]

# train용 aug token data
with open('./dataset/augNonNP_df.pkl', 'rb') as f:
    text_data2 = pickle.load(f)

# 다중 레이블 처리한 훈련 데이터셋 가져오기
with open('./dataset/train_origin.pkl', 'rb') as f:
    train_data = pickle.load(f)

# 다중 레이블 처리한 테스트 데이터셋 가져오기
with open('./dataset/test_origin.pkl', 'rb') as f:
    test_data = pickle.load(f)

text_data2.head()

Unnamed: 0,segment_id,emotion,emotion_id,token,token_len,augmented
4432,Sess19_script06_User038F_015,angry,0,"[어, 랑, 친해져서, 그, 친구, 랑, 친해, 지려]",8,"[랑, 어, 친해져서, 그, 친구, 친해, 랑, 지려]"
4528,Sess20_script02_User039M_039,angry,0,"[또, 계산, 안, 해, 가지, 고]",6,"[또, 계산, 안, 고, 가지, 해]"
5640,Sess25_script02_User050F_024,angry,0,"[친구, 도, 아니, 다]",4,"[친구, 도, 아니, 둘]"
3593,Sess16_script06_User031M_020,angry,0,"[아, 욕, 나와, 욕, 나올, 거, 같, 은데, 욕, 나올, 거, 같, 은데, 암...",17,"[거, 아, 나와, 어, 나올, 욕, 나올, 은데, 욕, 같, 거, 같, 욕, 암튼..."
1578,Sess09_script02_User018M_008,angry,0,"[는, 좀, 되게, 화, 가]",5,"[는, 좀, 되게, 화, 와]"


In [33]:
# 필요한 컬럼만 남겨두기
#train_data = train_data.drop(['seconds', 'sess', 'script'], axis=1)
test_data = test_data.drop(['seconds', 'sess', 'script'], axis=1)

# 텍스트에서 필요한 컬럼만 남겨두기 및 컬럼 이름 맞추기
text_data = text_data[['segment_id', 'token','token_len']]
text_data2 = text_data2[['segment_id','emotion','emotion_id','augmented','token_len']]
text_data2 = text_data2.rename(columns={'augmented':'token'})

In [34]:
# 훈련 및 테스트 데이터에 텍스트 데이터 붙이기
train = text_data2
test = pd.merge(test_data, text_data, how='left', on=['segment_id'])

In [35]:
len(train), len(train[train['emotion_id']==4])

(67900, 8750)

In [36]:
import numpy as np
len(np.unique(train['segment_id']))

10140

In [37]:
max_class = len(train_data[train_data['emotion_id']==4])
max_class

8750

In [38]:
# 원래 happy 데이터
happy_origin = pd.merge(train_data[train_data['emotion']=='happy'], text_data, how='left', on='segment_id')[['segment_id','emotion','emotion_id', 'token', 'token_len']]

# happy 개수에 맞춰서 증강
for i in [0,1,2,5,6]:
    a = train[train['emotion_id']==i].sample(n=max_class,replace=F, random_state=42)
    if i < 1:
        data = a
    else:
        data = pd.concat([data, a], axis=0)
    print(i,"class : ",len(data[data['emotion_id']==i]))
data = pd.concat([data, happy_origin])
data = pd.concat([data, train[train['emotion_id']==4]], axis=0)
len(data)
print("3 class : ",len(data[data['emotion_id']==3]))
print("4 class : ",len(data[data['emotion_id']==4]))

train = data

0 class :  8750
1 class :  8750
2 class :  8750
5 class :  8750
6 class :  8750
3 class :  977
4 class :  8750


## 텍스트 임베딩

In [39]:
# 최대 토큰 개수 구하기
max_token = train['token_len'].max()
max_token

150

In [40]:
train.describe()

Unnamed: 0,emotion_id,token_len
count,53477.0,53477.0
mean,3.0,16.888176
std,2.140443,15.464082
min,0.0,1.0
25%,1.0,6.0
50%,3.0,12.0
75%,5.0,23.0
max,6.0,150.0


In [41]:
max_token = 21

In [42]:
def get_embedding(data):
    # 텍스트 임베딩
    text_emb = []
    for tokens in tqdm(data['token']):
        sent_emb = []
        for i in range(max_token):
            # 워드 임베딩 초기화
            word_emb = []

            # 최대 단어 길이(156) 이하면 패딩
            if i >= len(tokens):
                word_emb = [np.array([0.]*300, dtype='float32')]*(max_token-i)
                sent_emb.extend(word_emb)
                break;

            # fasttext 워드 임베딩 가져옴
            word_emb = ft_model.wv[tokens[i]]
            sent_emb.append(word_emb)

        text_emb.append(sent_emb)

    data['text_emb'] = text_emb
    return data['text_emb']

In [43]:
# 데이터셋에 텍스트 임베딩 추가하기
train['text_emb'] = get_embedding(train)
test['text_emb'] = get_embedding(test)

100%|█████████████████████████████████████████████████████████████████████████| 53477/53477 [00:02<00:00, 21369.75it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2614/2614 [00:00<00:00, 23793.73it/s]


In [44]:
# 텍스트 임베딩 결과 확인
train['text_emb'].head()

6146    [[0.18122339, 0.20740172, -0.12993784, 0.01915...
618     [[-0.2459137, 0.14915791, -0.8468651, 0.109163...
6929    [[-0.9367216, 0.85393, -1.3444862, -0.25269663...
3332    [[-0.24933465, 0.42417058, -0.72755027, -0.392...
4528    [[-0.16821234, 0.26754832, -0.41189933, 0.0361...
Name: text_emb, dtype: object

In [45]:
test['text_emb'].head()

0    [[-0.9367216, 0.85393, -1.3444862, -0.25269663...
1    [[-0.19490944, -0.06741688, -0.23153214, 0.168...
2    [[-1.0400262, 0.25054872, -1.7955443, -0.41869...
3    [[-0.004059928, 0.08098799, -0.5521582, 0.2013...
4    [[-0.40613854, 0.21009226, -0.21392642, 0.1807...
Name: text_emb, dtype: object

# 모델링

## 데이터셋 및 모델 설정

In [46]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [47]:
# 레이블에 대해 정수 인코딩
encoder = LabelEncoder()
train_label = encoder.fit_transform(train[['emotion']].to_numpy().reshape(-1))
test_label = encoder.transform(test[['emotion']].to_numpy().reshape(-1))

In [48]:
# 텍스트 데이터셋 만들기
class TextDataset(Dataset):
    def __init__(self, mode):
        self.mode = mode
        X_train, X_valid, y_train, y_valid = train_test_split(train['text_emb'], train_label, test_size=0.1, random_state=42)
        if self.mode == 'train':
            self.x_data = torch.FloatTensor(X_train.array)
            self.y_data = torch.LongTensor(y_train)
        elif self.mode == 'valid':
            self.x_data = torch.FloatTensor(X_valid.array)
            self.y_data = torch.LongTensor(y_valid)
        else:
            self.x_data = torch.FloatTensor(test['text_emb'].array)
            self.y_data = torch.LongTensor(test_label)
    
    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self, idx):
        x = self.x_data[idx]
        y = self.y_data[idx]
        return x, y

In [49]:
# textCNN 모델 구성
class textCNN(nn.Module):
    def __init__(self, dim_channel, kernel_wins, dropout_rate, num_class):
        super(textCNN, self).__init__()
        self.emb_dim = 300
        # 커널 사이즈에 따라 CNN 설정(kernel_wins)
        self.convs = nn.ModuleList([nn.Conv2d(1, dim_channel, (w, self.emb_dim)) for w in kernel_wins])
        self.dropout = nn.Dropout(dropout_rate)
        # CNN에서 나온 값 합치기
        self.ft_fc = nn.Linear(len(kernel_wins)*dim_channel, 128)
        self.fc = nn.Linear(128, num_class)
        
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        emb_x = x.unsqueeze(1)
        
        conv_x = [conv(emb_x) for conv in self.convs]
        
        pool_x = [F.max_pool1d(x.squeeze(-1), x.size()[2]) for x in conv_x]
        
        fc_x = torch.cat(pool_x, dim=1)
        fc_x = fc_x.squeeze(-1)
        fc_x = F.relu(fc_x)
        fc_x = self.dropout(fc_x)
        ft_fc = self.ft_fc(fc_x)
        fc_x = self.fc(ft_fc)
        logit = fc_x
        return logit, ft_fc

In [50]:
# 하이퍼파라미터 설정 및 모델 생성
learning_rate = 0.0005
dim_channel = 100
kernel_wins = [4,5]
dropout_rate = 0.4
num_class = len(encoder.classes_)

model = textCNN(dim_channel=dim_channel, kernel_wins=kernel_wins, dropout_rate=dropout_rate, num_class=num_class);
model.to(device)

criterion = nn.CrossEntropyLoss();
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)

textCNN(
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (ft_fc): Linear(in_features=200, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=7, bias=True)
  (softmax): Softmax(dim=1)
)


In [51]:
# 데이터셋 생성(40초 가량 소요)
train_dataset = TextDataset(mode='train')
valid_dataset = TextDataset(mode='valid')
test_dataset = TextDataset(mode='test')

In [52]:
# 데이터로더 생성
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=100)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100)

## 학습하기

In [53]:
model.train()
# 모델 학습
num_epochs = 70

start_time = time.time()
for epoch in range(1,num_epochs+1):
    for x_data, y_data in train_loader:
        x_data, y_data = x_data.to(device), y_data.to(device)
        
        train_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
        labels = Variable(y_data)
        
        outputs, _ = model(train_x)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if not (epoch % 10):
        label_list = []
        predict_list = []
        for x_data, y_data in valid_loader:
            x_data, y_data = x_data.to(device), y_data.to(device)
            
            test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
            outputs, _ = model(test_x)
            predictions = torch.max(outputs,1)[1].to(device)

            label_list.extend(y_data)
            predict_list.extend(predictions)
        
        print('Epoch: {}, Loss: {}'.format(epoch, loss.data))
end_time = time.time()
print('Time:', end_time-start_time)

Epoch: 10, Loss: 0.004533366300165653
Epoch: 20, Loss: 0.0038035784382373095
Epoch: 30, Loss: 0.0003169849223922938
Epoch: 40, Loss: 0.006059729959815741
Epoch: 50, Loss: 0.008041176944971085
Epoch: 60, Loss: 0.00042084703454747796
Epoch: 70, Loss: 0.0018562530167400837
Time: 647.0195119380951


# 성능 확인

## 훈련셋 성능

In [54]:
# 훈련셋에서의 f1스코어
train_predictions_list = []
train_labels_list = []
model.eval()
for x_data, y_data in train_loader:
    x_data, y_data = x_data.to(device), y_data.to(device)
    train_labels_list.extend(y_data)
    test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
    outputs, feature = model(test_x)
    
    train_predictions = torch.max(outputs,1)[1].to(device)
    train_predictions_list.extend(train_predictions)
    
train_predictions_list = torch.tensor(train_predictions_list)
train_labels_list = torch.tensor(train_labels_list)

f1 = F1Score(task='multiclass', num_classes=7)
f1_score = f1(train_predictions_list, train_labels_list)
f1_score

tensor(0.9969)

## 테스트셋 성능

In [55]:
# 테스트셋에서의 f1스코어
test_predictions_list = []
test_labels_list = []
model.eval()
for x_data, y_data in test_loader:
    x_data, y_data = x_data.to(device), y_data.to(device)
    test_labels_list.extend(y_data)
    test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
    outputs, feature = model(test_x)
    
    test_predictions = torch.max(outputs,1)[1].cpu()
    test_predictions_list.extend(test_predictions)
    
test_predictions_list = torch.tensor(test_predictions_list)
test_labels_list = torch.tensor(test_labels_list)

f1 = F1Score(task='multiclass', num_classes=7)
f1_score = f1(test_predictions_list, test_labels_list)
f1_score

tensor(0.8416)

## 클래스별 예측 개수 확인

In [57]:
# 클래스 별로 예측 값 개수 확인
train_label_origin = encoder.inverse_transform(train_labels_list)
train_pred_origin = encoder.inverse_transform(train_predictions_list)

test_label_origin = encoder.inverse_transform(test_labels_list)
test_pred_origin = encoder.inverse_transform(test_predictions_list)

train_label_counter = Counter(train_label_origin)
train_pred_counter = Counter(train_pred_origin)

test_label_counter = Counter(test_label_origin)
test_pred_counter = Counter(test_pred_origin)

print('------- train 레이블 및 예측 클래스 -------')
print("(train)Origin Label:\n", train_label_counter.most_common(), end='\n')
print("(train)Pred:\n", train_pred_counter.most_common(), end='\n\n\n')
print('------- test 레이블 및 예측 클래스 -------')
print("(test)Origin Label:\n", test_label_counter.most_common(), end='\n')
print("(test)Pred:\n", test_pred_counter.most_common(), end='\n\n\n')

train_corrects = [x for x, y in zip(train_label_origin, train_pred_origin) if x==y]
test_corrects = [x for x, y in zip(test_label_origin, test_pred_origin) if x==y]
train_correct_counter = Counter(train_corrects)
test_correct_counter = Counter(test_corrects)

print("(train)accuracy:",len(train_corrects)/len(train), end='\n')
print("(train)F1-score:",f1(train_predictions_list, train_labels_list), end='\n\n')
print("(test)accuracy:",len(test_corrects)/len(test), end='\n')
print("(test)F1-score:",f1(test_predictions_list, test_labels_list), end='\n\n')

------- train 레이블 및 예측 클래스 -------
(train)Origin Label:
 [('disgust', 7918), ('fear', 7895), ('sad', 7867), ('surprise', 7856), ('angry', 7856), ('neutral', 7848), ('happy', 889)]
(train)Pred:
 [('neutral', 7993), ('disgust', 7918), ('fear', 7895), ('sad', 7868), ('surprise', 7856), ('angry', 7856), ('happy', 743)]


------- test 레이블 및 예측 클래스 -------
(test)Origin Label:
 [('neutral', 2310), ('happy', 194), ('angry', 38), ('surprise', 27), ('sad', 24), ('disgust', 15), ('fear', 6)]
(test)Pred:
 [('neutral', 2421), ('sad', 44), ('surprise', 38), ('angry', 34), ('happy', 29), ('fear', 27), ('disgust', 21)]


(train)accuracy: 0.8972081455579034
(train)F1-score: tensor(0.9969)

(test)accuracy: 0.8416220351951033
(test)F1-score: tensor(0.8416)



In [58]:
# train 성능
print(encoder.classes_)
confmat = ConfusionMatrix(task='multiclass',num_classes = 7)
print(confmat(train_predictions_list, train_labels_list))
train_f1_score = f1(train_predictions_list, train_labels_list)
print("(train)accuracy:",len(train_corrects)/len(train), end='\n')
print("(train)F1-score:",train_f1_score)

['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']
tensor([[7856,    0,    0,    0,    0,    0,    0],
        [   0, 7918,    0,    0,    0,    0,    0],
        [   0,    0, 7895,    0,    0,    0,    0],
        [   0,    0,    0,  743,  146,    0,    0],
        [   0,    0,    0,    0, 7846,    1,    1],
        [   0,    0,    0,    0,    0, 7867,    0],
        [   0,    0,    0,    0,    1,    0, 7855]])
(train)accuracy: 0.8972081455579034
(train)F1-score: tensor(0.9969)


In [59]:
#test 성능

print(encoder.classes_)
print(confmat(test_predictions_list, test_labels_list))
test_f1_score = f1(test_predictions_list, test_labels_list)
print("(test)accuracy:",len(test_corrects)/len(test), end='\n')
print("(test)F1-score:",test_f1_score)

['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']
tensor([[   4,    1,    0,    0,   32,    0,    1],
        [   1,    2,    1,    1,    9,    1,    0],
        [   0,    0,    5,    0,    1,    0,    0],
        [   4,    1,    1,    9,  170,    5,    4],
        [  24,   17,   19,   19, 2171,   30,   30],
        [   0,    0,    1,    0,   17,    6,    0],
        [   1,    0,    0,    0,   21,    2,    3]])
(test)accuracy: 0.8416220351951033
(test)F1-score: tensor(0.8416)


In [60]:
corrects = [x for x, y in zip(test_label_origin, test_pred_origin) if x==y]

correct_counter = Counter(corrects)
print(correct_counter.most_common())

[('neutral', 2171), ('happy', 9), ('sad', 6), ('fear', 5), ('angry', 4), ('surprise', 3), ('disgust', 2)]


In [61]:
from sklearn  import metrics
print(metrics.classification_report(test_predictions_list, test_labels_list))

              precision    recall  f1-score   support

           0       0.11      0.12      0.11        34
           1       0.13      0.10      0.11        21
           2       0.83      0.19      0.30        27
           3       0.05      0.31      0.08        29
           4       0.94      0.90      0.92      2421
           5       0.25      0.14      0.18        44
           6       0.11      0.08      0.09        38

    accuracy                           0.84      2614
   macro avg       0.35      0.26      0.26      2614
weighted avg       0.89      0.84      0.86      2614



# 앙상블을 위한 모델 결과 추출

In [63]:
prediction_df = pd.DataFrame()
session = []
prediction = []
for idx, (x_data, y_data) in enumerate(test_loader):
    x_data, y_data = x_data.to(device), y_data.to(device)
    test_x = Variable(x_data.view(x_data.shape[0], max_token, 300))
    outputs, _ = model(test_x)
    session.extend(test['segment_id'][idx*100:(idx+1)*100])
    prediction.extend(outputs.tolist())

In [64]:
prediction = nn.Softmax(dim=1)(torch.Tensor(prediction)).cpu().detach().numpy()

prediction_df['segment_id'] = session
for idx, emotion in enumerate(encoder.classes_):
    prediction_df[emotion] = prediction[:,idx]
    
prediction_df

Unnamed: 0,segment_id,angry,disgust,fear,happy,neutral,sad,surprise
0,Sess04_script01_User007M_001,6.520003e-21,1.137993e-27,1.023544e-19,7.574638e-14,1.000000,1.047191e-08,2.063061e-35
1,Sess04_script01_User007M_002,1.493681e-07,1.561719e-19,8.623955e-13,6.276847e-16,1.000000,4.351435e-13,1.290336e-21
2,Sess04_script01_User007M_003,2.780623e-14,7.769328e-20,2.413651e-17,9.262696e-13,1.000000,1.583426e-15,5.246088e-29
3,Sess04_script01_User008F_001,9.956599e-10,2.030422e-16,6.999223e-18,5.981200e-09,1.000000,1.600428e-13,2.974548e-22
4,Sess04_script01_User008F_002,6.752290e-09,7.723592e-12,1.561404e-12,9.442046e-05,0.999905,9.402771e-10,1.775467e-07
...,...,...,...,...,...,...,...,...
2609,Sess39_script06_User077F_030,1.915243e-01,3.167058e-08,7.088742e-14,2.114390e-10,0.808476,3.843464e-12,4.440940e-18
2610,Sess39_script06_User077F_031,1.370620e-04,8.929949e-07,6.556760e-07,1.181387e-05,0.999799,1.610984e-06,4.881570e-05
2611,Sess39_script06_User077F_032,4.475780e-04,1.996180e-06,7.552893e-04,8.839448e-03,0.985327,4.628805e-03,8.079978e-09
2612,Sess39_script06_User077F_033,3.675811e-03,3.630049e-04,1.060227e-02,5.387709e-02,0.904571,1.460734e-02,1.230337e-02


In [65]:
prediction_df.to_csv('./textCNN_prediction.csv', index=False)