In [1]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install torch
!pip install transformers==3

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/29/bb/54cbabe428351c06d10903c658878d29ee7026efbe45133fd133598d6eb6/mxnet-1.7.0.post1-py2.py3-none-manylinux2014_x86_64.whl (55.0MB)
[K     |████████████████████████████████| 55.0MB 58kB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Installing collected packages: graphviz, mxnet
  Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.7.0.post1
Collecting gluonnlp
[?25l  Downloading https://files.pythonhosted.org/packages/9c/81/a238e47ccba0d7a61dcef4e0b4a7fd4473cb86bed3d84dd4fe28d45a0905/gluonnlp-0.10.0.tar.gz (344kB)
[K     |████████████████████████████████| 348kB 16.5MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluo

In [3]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-vca3g56o
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-vca3g56o
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py) ... [?25l[?25hdone
  Created wheel for kobert: filename=kobert-0.1.2-cp36-none-any.whl size=12734 sha256=4d9d93f61ea9f33c2d6546a57674e0b1a583981437182779829ecac9df90181b
  Stored in directory: /tmp/pip-ephem-wheel-cache-sm_9prac/wheels/a2/b0/41/435ee4e918f91918be41529283c5ff86cd010f02e7525aecf3
Successfully built kobert
Installing collected packages: kobert
Successfully installed kobert-0.1.2


In [4]:
# os path
import os

# data analysis
import tensorflow as tf
import pandas as pd

# nlp
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

In [5]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [6]:
BASE_DIR = "/gdrive/My Drive/datascience/goverment_hackerton/emotion_analysis" # 로컬 경로로 바꿔주어야 로컬에서 작동됨

CODE_DIR = os.path.join(BASE_DIR, "code")
DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "model")

DATA_BINARY_DIR = os.path.join(DATA_DIR, "binary_label")
DATA_MULTI_DIR = os.path.join(DATA_DIR, "multi_label")

TRAIN_DATA_DIR = os.path.join(DATA_MULTI_DIR, "translated_train.csv")
TEST_DATA_DIR = os.path.join(DATA_MULTI_DIR, "translated_test.csv")

In [7]:
data_train = pd.read_csv(TRAIN_DATA_DIR, encoding='utf-8')
data_test = pd.read_csv(TEST_DATA_DIR, encoding='utf-8')

X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()

y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()

data = data_train.append(data_test, ignore_index=True)

class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']

print('size of training set: %s' % (len(data_train['Text'])))
print('size of validation set: %s' % (len(data_test['Text'])))
print(data.Emotion.value_counts())

data.head(10)

size of training set: 7934
size of validation set: 3393
joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: Emotion, dtype: int64


Unnamed: 0,Emotion,Text
0,neutral,내가 더 낫다고 생각하는 다른 그림이 많이 있습니다.
1,sadness,"그러나 개는 늙고 능력이 떨어졌고, 어느 날 길리가 와서 그 개가 뇌졸중을 앓았으므..."
2,fear,티켓을 지불하지 않고 지하철이나 기차에 들어갈 때.
3,fear,이 마지막 부분은 상당한 불안의 근원이 될 수 있으며 처음에는 그러한 명백한 '부도...
4,anger,"그녀는 그가 그들 중 일부에 대해 보여준 친밀함을 싫어했고, 그녀가 일부가 아닌 공..."
5,sadness,우리 가족은 영국에 사는 어머니의 사촌이 임파선 암에 걸렸다 고 우리에게 편지를 썼...
6,joy,내가 중국 실어증에 대한 규범을 수집하도록 선택되었음을 알게 됨 (나는 중국이 신경...
7,anger,"대변인은 ""글렌은 새로운""무정부 상태 ""프로모션이 자신뿐만 아니라 시드 비셔스의 영..."
8,neutral,예 .
9,sadness,화상을 입은 사람을 보면 슬프지만 실제로는 많이 고통을 당해야한다고 생각해서 표현조...


In [8]:
##GPU 사용 시
device = torch.device("cuda:0")

In [9]:
# model load
bertmodel, vocab = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [10]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [16]:
emotion2label = {"neutral" : 0, "sadness" : 1, "fear" : 2, "anger" : 3, "joy" : 4}
label2emotion = { 0 : "neutral", 1 : "sadness",  2 : "fear",  3 : "anger",  4 : "joy"}

In [26]:
class BERTDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        
        texts = dataset["Text"].tolist()
        labels = dataset["Emotion"].tolist()

        self.sentences = [transform([text]) for text in texts]
        self.labels = [np.int32(emotion2label[label]) for label in labels]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


In [27]:
## Setting parameters
max_len = 64 ##############################
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5 ##############################
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [28]:
data_train = BERTDataset(data_train, tok, max_len, True, False) ############################## 0 or 1 수정(원래 0 이 sentence, 1 이 label)
data_test = BERTDataset(data_test, tok, max_len, True, False) ############################## 0 or 1 수정(원래 0 이 sentence, 1 이 label)

In [29]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [30]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 5,
                 dr_rate = None,
                 params = None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

    def predict(self, sentence)

In [31]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [32]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [33]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [34]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [35]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [36]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [69]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        print(label)
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=124.0), HTML(value='')))

tensor([0, 1, 2, 2, 3, 1, 4, 3, 0, 1, 2, 1, 4, 3, 0, 0, 4, 2, 3, 3, 2, 4, 1, 0,
        1, 0, 4, 3, 0, 3, 1, 3, 1, 4, 2, 2, 1, 2, 4, 3, 4, 4, 1, 3, 1, 4, 4, 3,
        1, 2, 0, 3, 3, 2, 0, 1, 3, 4, 1, 4, 3, 1, 4, 4], dtype=torch.int32)
epoch 1 batch id 1 loss 0.4669877290725708 train acc 0.84375
tensor([2, 3, 0, 3, 0, 4, 0, 0, 4, 2, 3, 0, 2, 0, 2, 0, 3, 1, 4, 2, 1, 0, 0, 3,
        1, 4, 0, 4, 3, 4, 1, 2, 3, 3, 3, 3, 2, 4, 1, 1, 1, 4, 4, 2, 3, 2, 1, 2,
        3, 0, 1, 3, 1, 3, 0, 1, 2, 0, 3, 2, 4, 3, 2, 2], dtype=torch.int32)
tensor([4, 1, 2, 3, 2, 1, 4, 3, 3, 0, 1, 1, 1, 0, 0, 0, 3, 1, 3, 1, 2, 3, 4, 1,
        3, 0, 1, 4, 3, 4, 0, 0, 2, 1, 2, 1, 4, 3, 2, 1, 4, 4, 3, 4, 0, 4, 0, 1,
        3, 0, 0, 0, 2, 4, 4, 3, 3, 3, 3, 4, 2, 0, 1, 0], dtype=torch.int32)
tensor([1, 0, 3, 4, 1, 4, 4, 0, 1, 1, 0, 1, 3, 4, 1, 2, 1, 1, 0, 0, 3, 2, 4, 2,
        1, 2, 4, 3, 2, 0, 3, 3, 1, 2, 4, 2, 0, 3, 1, 4, 1, 0, 4, 1, 1, 0, 4, 4,
        1, 3, 0, 1, 0, 1, 4, 3, 3, 2, 3, 2, 1, 4, 0, 0], dtype=torch.in

KeyboardInterrupt: ignored

In [41]:
torch.save(model.state_dict(), os.path.join(MODEL_DIR, "5_label_model.pt"))

# Inference


In [78]:
class BERTDatasetForTest(Dataset):
    def __init__(self, dataset, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        
        texts = dataset["sentence"].tolist()
        # labels = dataset["Emotion"].tolist()

        self.sentences = [transform([text]) for text in texts]
        # self.labels = [np.int32(emotion2label[label]) for label in labels]

    def __getitem__(self, i):
        return (self.sentences[i])

    def __len__(self):
        return (len(self.sentences))


In [79]:
real_test_data = pd.read_csv(os.path.join(DATA_LABEL5_DIR, "pos_neg_test.txt"))
real_test_data.head(10)

Unnamed: 0,sentence
0,오랜만에 만나서 반갑다는 얘기를 할 것 같다.
1,나에 대해 관심이 없어서 얘기할 것 같지 않다.
2,나를 동정할 것 같다. 최근에 안좋은 일이 많았기 때문에..
3,친구랑 놀고 다시 혼자가 되면 머릿속이 복잡해진다.
4,역시 사람을 만나는건 너무 힘들어
5,오늘 친구에게 실수한 것이 없는지 곱씹어보고 괜시리 불안해함
6,나는 너무 부주의해. 매일 실수만 한다.
7,애인이 나에 대해 실망할 것 같아서 슬퍼진다.
8,애인이 알기 전에 새로 사놓아야 겠다
9,그래도 이땐 행복했었구나 하는 생각. 기분이 묘하다


In [80]:
data_real_test = BERTDatasetForTest(real_test_data, tok, max_len, True, False)

In [81]:
real_test_dataloader = torch.utils.data.DataLoader(data_real_test, batch_size=batch_size, num_workers=5)

In [82]:
PATH = os.path.join(MODEL_DIR, os.path.join(MODEL_DIR, "5_label_model.pt"))
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
model.load_state_dict(torch.load(PATH))


<All keys matched successfully>

In [83]:
model.eval()
for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(real_test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    #label = label.long().to(device)
    out = model(token_ids, valid_length, segment_ids)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [86]:
max_vals, max_indices = torch.max(out, 1)

In [89]:
predicted_emotion = list(label2emotion[label] for label in max_indices.tolist())
predicted_emotion

['joy',
 'sadness',
 'sadness',
 'sadness',
 'sadness',
 'fear',
 'sadness',
 'sadness',
 'sadness',
 'joy',
 'joy',
 'anger',
 'sadness',
 'sadness',
 'joy',
 'sadness',
 'sadness',
 'joy',
 'sadness',
 'anger',
 'neutral',
 'fear',
 'joy',
 'sadness',
 'joy',
 'neutral',
 'joy',
 'fear',
 'fear']

In [90]:
inference = pd.DataFrame({"Text" : real_test_data["sentence"].tolist(), "Emotion" : predicted_emotion})
inference

Unnamed: 0,Text,Emotion
0,오랜만에 만나서 반갑다는 얘기를 할 것 같다.,joy
1,나에 대해 관심이 없어서 얘기할 것 같지 않다.,sadness
2,나를 동정할 것 같다. 최근에 안좋은 일이 많았기 때문에..,sadness
3,친구랑 놀고 다시 혼자가 되면 머릿속이 복잡해진다.,sadness
4,역시 사람을 만나는건 너무 힘들어,sadness
5,오늘 친구에게 실수한 것이 없는지 곱씹어보고 괜시리 불안해함,fear
6,나는 너무 부주의해. 매일 실수만 한다.,sadness
7,애인이 나에 대해 실망할 것 같아서 슬퍼진다.,sadness
8,애인이 알기 전에 새로 사놓아야 겠다,sadness
9,그래도 이땐 행복했었구나 하는 생각. 기분이 묘하다,joy
