In [1]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification
import tensorflow as tf
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import time
import datetime
import json
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords


I1223 12:47:37.091719 140411319576384 file_utils.py:39] PyTorch version 1.0.1 available.
I1223 12:47:39.098146 140411319576384 file_utils.py:55] TensorFlow version 2.1.0 available.
Using TensorFlow backend.


In [2]:

# ======================== cell 1 =============================

nltk.download('stopwords')
stops = set(stopwords.words('english'))
stemmer = nltk.stem.SnowballStemmer('english')
with open('./friends_train.json') as json_file:
    json_train = json.load(json_file)
with open('./friends_test.json') as json_file:
    json_test = json.load(json_file)
with open('./friends_dev.json') as json_file:
    json_dev = json.load(json_file)

def cleaning(str):
    replaceAll= str
    only_english = re.sub('[^a-zA-Z]', ' ', replaceAll)
    no_capitals = only_english.lower().split()
    no_stops = [word for word in no_capitals if not word in stops]
    stemmer_words = [stemmer.stem(word) for word in no_stops]
    return ' '.join(stemmer_words)

i = 0
train_data=[]
for rows in json_train:
    for row in rows:
        train_data.append( [cleaning(row['utterance']), row['emotion']] )


test_data = []
for rows in json_test:
    for row in rows:
        test_data.append([cleaning(row['utterance']), row['emotion']])
        

def jsonToDf(file_name):
  with open(file_name, encoding = 'utf-8', mode = 'r') as file:
    json_array = json.load(file)
  
  result = pd.DataFrame.from_dict(json_array[0])

  is_first = True
  for array in json_array:
    if is_first:
      is_first = False
      continue
    
    temp_df = pd.DataFrame.from_dict(array)
    result = result.append(temp_df, ignore_index = True)

  return result

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ds_user1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#1216 추가 
train = jsonToDf('friends_train.json')
dev = jsonToDf('friends_dev.json')
train2 = jsonToDf('friends_test.json')

train = train.append(train2, ignore_index = True)

In [4]:
# 1216 추가
def getIndex(dataset):
  data = dataset.copy(deep = True)
  input_index = data.id.tolist()
  return torch.tensor(input_index)

In [5]:
#1216 추가
MAX_LEN = 85

def getInputsAndLabels(dataset):
  data = dataset.copy(deep=True)
  #data['utterance'] = data['utterance'].str.lower()

  utterances = data['utterance']
  utterances = ["[CLS] " + str(utterance) + " [SEP]" for utterance in utterances]
  
  encoder = LabelEncoder()
  labels = data['emotion'].values
  encoder.fit(labels)
  labels = encoder.transform(labels)

  tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
  tokenized_texts = [tokenizer.tokenize(utterance) for utterance in utterances]

  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  attention_masks = []
  for seq in input_ids:
      seq_mask = [float(i>0) for i in seq]
      attention_masks.append(seq_mask)

  return input_ids, labels, attention_masks

In [6]:
#1216 추가
from sklearn.preprocessing import LabelEncoder

In [7]:
#1216 추가
def getInputsFromTest(dataset):
  data = dataset.copy(deep=True)
  #data['utterance'] = data['utterance'].str.lower()

  utterances = data['utterance']
  utterances = ["[CLS] " + str(utterance) + " [SEP]" for utterance in utterances]
  
  tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')
  tokenized_texts = [tokenizer.tokenize(utterance) for utterance in utterances]

  input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  attention_masks = []
  for seq in input_ids:
      seq_mask = [float(i>0) for i in seq]
      attention_masks.append(seq_mask)

  return input_ids, attention_masks

In [8]:
test = pd.read_csv('en_data.csv', encoding = 'utf-8')

In [9]:
#1216 추가
train_inputs, train_labels, train_masks = getInputsAndLabels(train)
dev_inputs, dev_labels, dev_masks = getInputsAndLabels(dev)
test_inputs, test_masks = getInputsFromTest(test)

I1223 12:48:15.189055 140411319576384 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt from cache at /home/ds_user1/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef709635b48b3bed2bbd.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I1223 12:48:19.895386 140411319576384 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt from cache at /home/ds_user1/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef709635b48b3bed2bbd.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I1223 12:48:21.062428 140411319576384 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt from cache at /home/ds_user1/.cache/torch/transformers/ff085885d4c95651587af553adadd34a26de8a663f2cef7096

In [10]:
#1216 추가
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

dev_inputs = torch.tensor(dev_inputs)
dev_labels = torch.tensor(dev_labels)
dev_masks = torch.tensor(dev_masks)

test_index = getIndex(test)
test_inputs = torch.tensor(test_inputs)
test_masks = torch.tensor(test_masks)

In [51]:
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

dev_data = TensorDataset(dev_inputs, dev_masks, dev_labels)
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

test_data = TensorDataset(test_index, test_inputs, test_masks)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [52]:
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-generator', num_labels=8)
model.cuda()

I1223 13:54:15.351282 140411319576384 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json from cache at /home/ds_user1/.cache/torch/transformers/7e5fbda7e75c095166b345fcfd9c212684381b6ecc13e792546398c248871d1f.611e1325a2cceaba4047dcee280bd0ef4175b1e814df696bb3e708b8552e8c74
I1223 13:54:15.352468 140411319576384 configuration_utils.py:321] Model config ElectraConfig {
  "architectures": [
    "ElectraForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (embeddings_project): Linear(in_features=768, out_features=256, bias=True)
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bi

In [63]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8
                )

epochs = 10

total_steps = len(train_dataloader) * epochs

# 학습률을 조금씩 감소시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [64]:
from sklearn.metrics import f1_score

# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def getF1Score(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()

  return f1_score(labels_flat, pred_flat, average = None)

In [60]:
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [61]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 8 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [None]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
             
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

        loss = outputs[0]
        total_loss += loss.item()


        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy, eval_f1 = 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in dev_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():     
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
     
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        # tmp_eval_f1 = getF1Score(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        # eval_f1 += tmp_eval_f1
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    # print("  F1: {0:.2f}".format(eval_f1/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of    833.    Elapsed: 0:00:46.

  Average training loss: 1.42
  Training epcoh took: 0:01:17

Running Validation...
  Accuracy: 0.47
  Validation took: 0:00:01

Training...
  Batch   500  of    833.    Elapsed: 0:00:46.

  Average training loss: 1.41
  Training epcoh took: 0:01:16

Running Validation...
  Accuracy: 0.47
  Validation took: 0:00:01

Training...
  Batch   500  of    833.    Elapsed: 0:00:45.

  Average training loss: 1.42
  Training epcoh took: 0:01:16

Running Validation...
  Accuracy: 0.47
  Validation took: 0:00:01

Training...
  Batch   500  of    833.    Elapsed: 0:00:45.

  Average training loss: 1.38
  Training epcoh took: 0:01:15

Running Validation...
  Accuracy: 0.47
  Validation took: 0:00:01

Training...
  Batch   500  of    833.    Elapsed: 0:00:45.

  Average training loss: 1.35
  Training epcoh took: 0:01:16

Running Validation...
  Accuracy: 0.47
  Validation took: 0:00:01

Training...
  Batch   500  of    833.    Elapsed: 0:00

In [42]:
tmp_test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)
test_result = test.copy(deep = True)
test_result = test_result.drop(columns = ['i_dialog', 'i_utterance', 'speaker'])
test_result['Predicted'] = 'default'

encoder = LabelEncoder()
labels = train['emotion'].values
encoder.fit(labels)
labels = encoder.transform(labels)


for step, batch in enumerate(tmp_test_dataloader):
    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_index, b_input_ids, b_input_mask = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    idx = b_index.item()
    test_result['Predicted'][idx] = encoder.classes_[np.argmax(logits)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [43]:
test_result.head()

Unnamed: 0,id,utterance,Predicted
0,0,"Alright, whadyou do with him?",neutral
1,1,Oh! You're awake!,surprise
2,2,Then you gotta come clean with Ma! This is not...,anger
3,3,"Yeah, but this is",neutral
4,4,I don't wanna hear it! Now go to my room!,non-neutral


In [44]:
dataframe = pd.DataFrame(test_result)
dataframe.to_csv("test-electra-base-batch16.csv", header=False, index=False)