In [None]:
# Mount Google Drive at /content/gdrive so later cells can read files stored there
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Directory that holds the dataset files read by later cells.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory - presumably /content on Colab; confirm.
DATA_PATH = 'gdrive/My Drive/Colab Notebooks/KU-NLP-2020-1/Data/'
import sys
# Put the data directory on the import path - presumably so the local
# `model` / `multilabel_pipeline` modules imported later resolve from it.
sys.path.append(DATA_PATH)

In [None]:
# Install the TPU client with the torch_xla 1.7 wheel, then the transformers library
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp36-cp36m-linux_x86_64.whl
!pip install transformers --quiet # package installer for python

In [None]:
import os
import torch
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import random
import time
import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_xla
import torch_xla.core.xla_model as xm
from transformers import AdamW, BertConfig
from transformers import BertModel, BertForSequenceClassification, BertTokenizer
from transformers import ElectraTokenizer
from transformers import ElectraModel, ElectraForSequenceClassification
from transformers import get_linear_schedule_with_warmup

from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split



In [None]:
# Choose the compute device: TPU (XLA) when requested, otherwise GPU, otherwise CPU
tpu_use = True
if tpu_use:
    # Acquire the default Cloud TPU core
    device = xm.xla_device()
    print(device)
elif not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

xla:1


In [None]:
# BERT setup: pick the pretrained checkpoint and load the matching tokenizer
pretrained_weights = 'bert-base-uncased' #google/electra-small-generator' 'monologg/bert-base-cased-goemotions-ekman' 'bert-large-cased' 
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
#tokenizer = ElectraTokenizer.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
# Load the dataset (train / dev / test splits)
import json

# One flat list per field and split: the lines of every dialogue are appended
# in file order, so dialogue boundaries are not kept.
data = {'train': {'speaker': [], 'utterance': [], 'emotion': []},
        'dev': {'speaker': [], 'utterance': [], 'emotion': []},
        'test': {'speaker': [], 'utterance': [], 'emotion': []}}

for dtype in ['train', 'dev', 'test']:
  # Context manager closes the file handle as soon as the JSON is parsed
  with open(DATA_PATH + 'friends_' + dtype + '.json') as dataset_file:
    dialogs = json.load(dataset_file)
  for dialog in dialogs:
    for line in dialog:
      data[dtype]['speaker'].append(line['speaker'])
      data[dtype]['utterance'].append(line['utterance'])
      data[dtype]['emotion'].append(line['emotion'])

In [None]:
# Load the Kaggle test set (CSV) and show its shape and first two rows
test_data = pd.read_csv(DATA_PATH + "en_data.csv", sep=',')
print(test_data.shape)
print(test_data[:2])

(1623, 5)
   id  i_dialog  i_utterance speaker                      utterance
0   0         0            0  Phoebe  Alright, whadyou do with him?
1   1         0            1  Monica              Oh! You're awake!


In [None]:
# Emotion name -> integer id, and the inverse mapping.
# The label set is sorted before numbering: iterating a bare set of strings
# depends on per-process hash randomization, which would give every kernel
# restart a different label numbering (and make saved weights unusable).
e2i_dict = dict((emo, i) for i, emo in enumerate(sorted(set(data['train']['emotion']))))
i2e_dict = {i: e for e, i in e2i_dict.items()}
e2i_dict

{'anger': 2,
 'disgust': 7,
 'fear': 5,
 'joy': 0,
 'neutral': 4,
 'non-neutral': 6,
 'sadness': 3,
 'surprise': 1}

In [None]:
# Prepare the additional training data (';'-separated "utterance;emotion" lines)
train_add_data = pd.read_csv(DATA_PATH + "kaggle_train.txt", sep=';', names=['utterance','emotion'])
# Remove the unneeded 'love' rows (the label map displayed in this notebook has no 'love' class)
train_add_data = train_add_data.drop(train_add_data[train_add_data.emotion == 'love'].index)
# These rows carry no speaker information, so use a placeholder name
train_add_data['speaker'] = 'Anonymous'
# Call head() - the bare attribute only displays the bound-method repr
train_add_data.head()

<bound method NDFrame.head of                                                utterance  emotion    speaker
0                                i didnt feel humiliated  sadness  Anonymous
1      i can go from feeling so hopeless to so damned...  sadness  Anonymous
2       im grabbing a minute to post i feel greedy wrong    anger  Anonymous
4                                   i am feeling grouchy    anger  Anonymous
5      ive been feeling a little burdened lately wasn...  sadness  Anonymous
...                                                  ...      ...        ...
19995  i just keep feeling like someone is being unki...    anger  Anonymous
19996  im feeling a little cranky negative after this...    anger  Anonymous
19997  i feel that i am useful to my people and that ...      joy  Anonymous
19998  im feeling more comfortable with derby i feel ...      joy  Anonymous
19999  i feel all weird when i have to meet w people ...     fear  Anonymous

[18359 rows x 3 columns]>

In [None]:
# Append the additional training data to the existing training split.
# Columns are read by name instead of by integer position: integer keys on a
# string-labelled row Series rely on a deprecated positional fallback.
# NOTE: this cell is not idempotent - re-running it appends the rows again.
data['train']['utterance'].extend(train_add_data['utterance'].tolist())
data['train']['emotion'].extend(train_add_data['emotion'].tolist())
data['train']['speaker'].extend(train_add_data['speaker'].tolist())

In [None]:
# Integer label id of every training example
e2i_cnt = [e2i_dict[emotion] for emotion in data['train']['emotion']]

# Check how the training data is distributed over the emotions
e2i_pd = pd.DataFrame({'emotion': e2i_cnt})
e2i_pd["emotion_nm"] = e2i_pd["emotion"].map(i2e_dict)
e2i_pd.groupby(e2i_pd['emotion']).count()

Unnamed: 0_level_0,emotion_nm
emotion,Unnamed: 1_level_1
0,8044
1,1939
2,3222
3,6148
4,4752
5,2558
6,2017
7,240


In [None]:
# BERT model config: number of classes plus the id <-> label mappings
config = BertConfig.from_pretrained(
        pretrained_weights,
        num_labels=len(e2i_dict),
        id2label=i2e_dict,
        label2id=e2i_dict
    )

#print(config.num_labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [None]:
# Print the first five utterances, speakers and emotions of the training split
print("대화", data['train']['utterance'][:5 ]) 
print("화자",data['train']['speaker'][:5] )
print("감정",data['train']['emotion'][:5] )

대화 ['also I was the point person on my company\x92s transition from the KL-5 to GR-6 system.', 'You must\x92ve had your hands full.', 'That I did. That I did.', 'So let\x92s talk a little bit about your duties.', 'My duties?  All right.']
화자 ['Chandler', 'The Interviewer', 'Chandler', 'The Interviewer', 'Chandler']
감정 ['neutral', 'neutral', 'neutral', 'neutral', 'surprise']


In [None]:
# Batch size used for the train / dev / test dataloaders
batch_size = 2
# Our model uses a single sentence. (Take the context into account as well.)
# Our model does not consider speaker information. (Take that information into account.)

# Preprocess utterances into a PyTorch DataLoader of BERT inputs
def Embedding(texts, speakers, labels, batch_size, final_flag=False, max_len=256):
    """Tokenize utterances and wrap them in a DataLoader.

    Every utterance becomes one input string of the form
    "[CLS] <previous utterance>[<speaker>] <utterance>[SEP]", which is
    tokenized with the notebook-global `tokenizer`, converted to ids and
    padded / truncated to `max_len` tokens.

    Args:
        texts: iterable of utterances (each converted with str()).
        speakers: iterable of speaker names, aligned with `texts`.
        labels: when `final_flag` is False, emotion names that are keys of the
            global `e2i_dict`; when True, values stored unchanged as the
            third tensor (the caller in this notebook passes sample ids).
        batch_size: batch size of the returned DataLoader.
        final_flag: False -> labels are mapped through `e2i_dict` and batches
            are drawn with a RandomSampler; True -> labels are kept as-is and
            batches come in input order.
        max_len: number of token ids per sequence after padding / truncation.

    Returns:
        DataLoader yielding (input_ids, attention_mask, labels) batches.

    Raises:
        ValueError: if no utterances are given.
        KeyError: if `final_flag` is False and a label is missing from `e2i_dict`.
    """
    # Build the model input strings.
    # NOTE(review): the "previous utterance" context is taken from the flat
    # input order, so it also crosses dialogue boundaries; the first row gets
    # an empty context. Fixing that needs dialogue boundaries from the caller.
    bf_text = ''
    sentences = []
    for text, speaker in zip(texts, speakers):
        sentences.append("[CLS] " + str(bf_text) + "[" + str(speaker) + "] " + str(text) + "[SEP]")
        bf_text = str(text)

    if not sentences:
        raise ValueError("Embedding() needs at least one utterance")

    # Length statistics, measured in characters of the input string (not tokens)
    text_lengths = [len(sentence) for sentence in sentences]
    print("Max Text Length", max(text_lengths))
    print("Average Text Length", sum(text_lengths) / len(text_lengths))

    # Tokenize and convert the tokens to vocabulary ids
    input_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
                 for sentence in sentences]
    # Cut each sequence to max_len and fill the remainder with padding id 0.
    # Truncation happens at the end, so an over-long input loses its [SEP].
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    if not final_flag:
        labels = [e2i_dict[label] for label in labels]
    else:
        # Materialize positionally so a pandas Series with any index is handled
        labels = list(labels)

    # Convert everything to tensors
    ebd_inputs = torch.tensor(input_ids)
    ebd_labels = torch.tensor(labels)
    # Attention mask: 1.0 for real tokens, 0.0 for padding (id 0)
    ebd_masks = (ebd_inputs > 0).float()

    ebd_data = TensorDataset(ebd_inputs, ebd_masks, ebd_labels)
    # Labelled sets are shuffled; the final set keeps input order
    ebd_sampler = None if final_flag else RandomSampler(ebd_data)
    ebd_dataloader = DataLoader(ebd_data, sampler=ebd_sampler, batch_size=batch_size)

    return ebd_dataloader

# Labelled splits: emotion names are mapped to ids inside Embedding()
train_dataloader = Embedding( data['train']['utterance'], data['train']['speaker'], data['train']['emotion'], batch_size)
validation_dataloader = Embedding( data['dev']['utterance'], data['dev']['speaker'], data['dev']['emotion'], batch_size)
test_dataloader = Embedding( data['test']['utterance'], data['test']['speaker'], data['test']['emotion'], batch_size)
# Kaggle set: final_flag=True stores the row ids in place of labels; batch size 1
final_test_dataloader = Embedding( test_data['utterance'], test_data['speaker'], test_data['id'], 1, True)

Max Text Length 584
Average Text Length 172.7609958506224
Max Text Length 344
Average Text Length 97.78268251273344
Max Text Length 334
Average Text Length 101.58357452966715
Max Text Length 300
Average Text Length 100.6746765249538


In [None]:
#model = BertForSequenceClassification.from_pretrained(pretrained_weights, num_labels = 8)
#model = ElectraForSequenceClassification.from_pretrained(pretrained_weights, num_labels = 8)
#model = BertForMultiLabelClassification.from_pretrained(pretrained_weights, config=config)

In [None]:
# Classification model: BERT encoder with a linear classifier head attached
class Model(nn.Module):
  """BERT encoder followed by a two-layer head on the first-token vector.

  Uses the notebook globals `pretrained_weights` (checkpoint name) and
  `e2i_dict` (its size is the number of output classes).
  """

  def __init__(self):
    super().__init__()
    self.num_labels = len(e2i_dict)
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    # Take the encoder width from the checkpoint's config instead of a
    # hard-coded 768, so other checkpoint sizes work without editing
    self.hidden_size = self.bert_model.config.hidden_size
    self.linear = torch.nn.Linear(self.hidden_size, 256)
    self.dropout = torch.nn.Dropout(0.1)
    self.classifier = torch.nn.Linear(256, self.num_labels)

  def forward(self, b_input_ids, attention_mask, labels=None, weights=None):
    """Compute logits and, when labels are given, the cross-entropy loss.

    Args:
      b_input_ids: token ids passed to the encoder as `input_ids`.
      attention_mask: mask passed to the encoder as `attention_mask`.
      labels: optional class ids; when given, a loss is returned.
      weights: accepted for interface compatibility but NOT applied - the
        loss is unweighted.
        NOTE(review): callers pass class weights here; decide whether they
        should be forwarded as the loss `weight`.

    Returns:
      (loss, logits); loss is None when `labels` is None.
    """
    hidden_tensor = self.bert_model(input_ids=b_input_ids, attention_mask=attention_mask)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)

    x = self.linear(hidden_tensor)
    x = self.dropout(x)
    x = F.gelu(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
    x = self.dropout(x)
    logits = self.classifier(x)

    loss = None
    if labels is not None:
      # Functional form: no loss module rebuilt per call and no dependence
      # on the notebook-global `device`
      loss = F.cross_entropy(logits, labels)

    return loss, logits

model = Model()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
'''
# Electra 등 다른 모델 생성
class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.hidden_size = 512#768 #large 1024    
    self.num_labels = len(e2i_dict)
    #self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_tokenizer = ElectraTokenizer.from_pretrained(pretrained_weights)
    #self.bert_model = BertModel.from_pretrained(pretrained_weights)
    self.bert_model = ElectraModel.from_pretrained(pretrained_weights)
    self.linear = torch.nn.Linear(self.hidden_size, 256) 
    self.dropout = torch.nn.Dropout(0.1)
    self.classifier = torch.nn.Linear(256, self.num_labels)

  def forward(self, b_input_ids, attention_mask, labels=None, weights=None):   

    hidden_tensor = self.bert_model(input_ids=b_input_ids, attention_mask=attention_mask)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)

    x = hidden_tensor
    #x = self.linear(hidden_tensor)    
    #x = self.dropout(x)
    #x = F.gelu(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
    #x = self.dropout(x)
    logits = self.classifier(x)

    #logits = self.linear(hidden_tensor)

    if labels is not None:
      loss_fct = torch.nn.CrossEntropyLoss().to(device) # LogSoftmax & NLLLoss weight=weights      
      #loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      loss = loss_fct(logits, labels)
    else:
      loss = None
    
    return loss, logits

model = Model()    
'''

In [None]:
# Move the model to the selected device (the returned module is displayed as the cell output)
model.to(device) 

Model(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Metric helper 1: per-class precision / recall and micro-averaged F1
def evaluate(true_list, pred_list):
  """Print per-class precision and recall plus the micro-averaged F1 score.

  Args:
    true_list: ground-truth class ids.
    pred_list: predicted class ids, aligned with `true_list`.
  """
  per_class_precision = precision_score(true_list, pred_list, average=None)
  per_class_recall = recall_score(true_list, pred_list, average=None)
  micro_f1 = f1_score(true_list, pred_list, average='micro')

  def as_strings(scores):
    # Four-decimal text form of every per-class score
    return ['%.4f' % score for score in scores]

  print('precision:\t', as_strings(per_class_precision))
  print('recall:\t\t', as_strings(per_class_recall))
  print('micro_f1: %.6f' % micro_f1)

In [None]:
# Metric helper 2: plain accuracy
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of class ids; flattened before comparison.
    """
    predicted_ids = np.argmax(preds, axis=1).flatten()
    expected_ids = labels.flatten()
    hits = predicted_ids == expected_ids
    return np.sum(hits) / len(expected_ids)

In [None]:
# Optimizer setup
optimizer = AdamW(model.parameters(),
                  lr =1e-6, # learning rate; other values tried: 1e-5 (0.00001), 2e-5
                  eps = 1e-8 # epsilon that guards against division by zero
                )

# Number of epochs
epochs = 1

# Total training steps: batches per epoch * epochs
total_steps = len(train_dataloader) * epochs

# Scheduler that changes the learning rate linearly over total_steps (no warmup steps)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
# Time formatting helper
def format_time(elapsed):
    """Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Class imbalance: inverse-frequency class weights (e.g. for weighted cross-entropy).
# The counts are derived from the current training labels in e2i_dict id order,
# so weight i always belongs to class id i. Every class in e2i_dict comes from
# the training labels, so no count is zero.
train_label_ids = [e2i_dict[emotion] for emotion in data['train']['emotion']]
nSamples = np.bincount(train_label_ids, minlength=len(e2i_dict)).tolist()
normedWeights = [sum(nSamples)/x for x in nSamples]
# Keep the weights on the same device as the model and the batches
normedWeights = torch.FloatTensor(normedWeights).to(device)

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Start time of this epoch
    t0 = time.time()

    # Running sum of the batch losses
    total_loss = 0

    model.train()
    for step, batch in enumerate(train_dataloader):
        # Progress report every 1000 batches
        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the training device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; the model returns (loss, logits)
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels,
                        weights=normedWeights)

        # Loss of this batch
        loss = outputs[0]

        # Accumulate the loss
        total_loss += loss.item()

        # Backward pass computes the gradients
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters from the gradients
        optimizer.step()

        # Advance the learning-rate schedule
        scheduler.step()

        # Reset the gradients
        model.zero_grad()

    # Mean loss over the epoch's batches
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # Start time of the validation pass
    t0 = time.time()

    # Switch to evaluation mode
    model.eval()

    # Accuracy is accumulated per example, not per batch, so a smaller last
    # batch does not get the same weight as a full one
    eval_accuracy = 0
    nb_eval_examples = 0

    for batch in validation_dataloader:
        # Move the batch to the device
        batch = tuple(t.to(device) for t in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask)

        # Logits are the second element of the model's (loss, logits) output
        logits = outputs[1]

        # Move results to the CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Batch accuracy, weighted by the number of examples in the batch
        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...


In [None]:
# Evaluate on the test split
# Switch to evaluation mode
model.eval()

# Predicted and ground-truth class ids of all test examples
pred_list, true_list = [], []

for batch in test_dataloader:
    # Move the batch to the device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the batch
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for evaluation
    with torch.no_grad():
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask)

    # Logits are the second element of the model's (loss, logits) output
    logits = outputs[1]

    # Move results to the CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    labels_flat = label_ids.flatten()

    pred_list += pred_flat.tolist()
    true_list += labels_flat.tolist()

# evaluate() takes (true_list, pred_list); the swapped order would report
# precision and recall exchanged
evaluate(true_list, pred_list) # print results

In [None]:
# Predict the Kaggle test set and save the result file
# Start time
t0 = time.time()

# Switch to evaluation mode
model.eval()

# (id, predicted emotion name) pairs in input order
final_result = []

for step, batch in enumerate(final_test_dataloader):
    # Progress report every 1000 batches
    if step % 1000 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(final_test_dataloader), elapsed))

    # Move the batch to the device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the batch; the third tensor carries the row ids, not labels
    b_input_ids, b_input_mask, b_id = batch

    # No gradients are needed for inference
    with torch.no_grad():
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask)

    # Logits are the second element of the model's (loss, logits) output
    logits = outputs[1]

    # Move results to the CPU
    logits = logits.detach().cpu().numpy()

    pred_flat = np.argmax(logits, axis=1).flatten()
    b_id = b_id.cpu().numpy()

    # One row per example: works for any batch size and keeps the ids
    # as integers instead of coercing them to strings
    for sample_id, pred_id in zip(b_id.tolist(), pred_flat.tolist()):
        final_result.append((sample_id, i2e_dict[pred_id]))

# Write the predictions: row id and predicted emotion name
rdf = pd.DataFrame(final_result, columns =['Id', 'Expected'])
rdf.to_csv(DATA_PATH + 'sample_eng.csv', index=False)

final_result[:10]

In [None]:
# Save the model weights.
# The tensors are copied to the CPU first: a state dict saved straight from
# an accelerator device (XLA / CUDA) stores device tensors, which makes the
# checkpoint awkward or impossible to load on a machine without that device.
cpu_state_dict = {name: tensor.cpu() for name, tensor in model.state_dict().items()}
torch.save(cpu_state_dict, DATA_PATH +  "friends_model.pt")