In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import time

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [26]:
# Filesystem locations used by the rest of the notebook.
# The *_folder names hold the bare directories; the *_path names are the same
# directories with a trailing slash so a file name can be appended directly.
source_folder = '/home/Danny/Data-Mining/lab2/kaggle/data'
destination_folder = '/home/Danny/pytorch/model'
data_path = source_folder + '/'
model_path = destination_folder + '/'

In [46]:
# Read the training split into pandas for a quick visual inspection
# (the model itself consumes the CSV through torchtext further down).
train_csv = data_path + 'train.csv'
train_df = pd.read_csv(train_csv, encoding='utf8', engine='python')
train_df

Unnamed: 0,tweet_id,text,emotion,label
0,0x344d1b,@MichaelSpathITH Oh man. I finally get to list...,anticipation,1.0
1,0x34d97c,Why order early if you are last to receive? @P...,sadness,5.0
2,0x388ad7,It's #impossible. To express how much I <LH> ...,joy,4.0
3,0x353a70,"@POTUS Before you speak or tweet, please learn...",joy,4.0
4,0x37f73c,3 day Social Media Detox is ended 🙋😇 I feel re...,trust,7.0
...,...,...,...,...
1049097,0x23c02e,Settling in for #mountainmen and then <LH> two...,sadness,5.0
1049098,0x2de236,False Doctrine of Infallibility: If I find any...,anticipation,1.0
1049099,0x25cc57,#-#EGYPTIAN-#SYMBOLS-#DESIGN-#BLACK-#GOLD-#CAS...,anticipation,1.0
1049100,0x36b83c,#Goodnight everyone. May 2morrow be more lovin...,joy,4.0


In [45]:
# Read the validation split into pandas for a quick visual inspection.
valid_csv = data_path + 'valid.csv'
valid_df = pd.read_csv(valid_csv, encoding='utf8', engine='python')
valid_df

Unnamed: 0,tweet_id,text,emotion,label
0,0x2c5570,POST-SURGURY <LH> TWP: 68 YOF - 498 <LH> #BLAI...,trust,7.0
1,0x2a39c7,@XavierDLeau @Blike_Dante @donnabrazile you al...,disgust,2.0
2,0x252f67,We all know @RealDonalTrump is not bright. He ...,sadness,5.0
3,0x2d569e,@followFALO We would love to collaborate with ...,joy,4.0
4,0x238f8d,Love is amazing and <LH> is amazing. I just <...,joy,4.0
...,...,...,...,...
116576,0x230218,He made me love Aslan before I knew who Aslan ...,anticipation,1.0
116577,0x1fead9,<LH> that two of the places that I most freque...,joy,4.0
116578,0x356d24,"Hahah, Rooney! Only goes and cups his ears af...",anger,0.0
116579,0x1ee4d4,@TrumpTheHill <LH> and continued fear. Thank @...,fear,3.0


In [6]:
import os

def get_gpu():
    """Return the torch device string of the GPU with the most free memory.

    Returns:
        str: ``'cpu'`` when CUDA is unavailable, otherwise ``'cuda:<index>'``
        where ``<index>`` is the position (in ``nvidia-smi`` order) of the GPU
        reporting the most free memory. Falls back to ``'cuda:0'`` when
        ``nvidia-smi`` cannot be run or its output cannot be parsed.

    Side effects:
        Prints the raw free-memory report and the chosen index.
    """
    # Local import keeps this cell self-contained regardless of run order.
    import subprocess

    if not torch.cuda.is_available():
        return 'cpu'

    # Ask nvidia-smi directly for one free-memory figure (MiB) per GPU instead
    # of grepping the human-readable report through a scratch file.
    query = ['nvidia-smi',
             '--query-gpu=memory.free',
             '--format=csv,noheader,nounits']
    try:
        report = subprocess.run(query,
                                stdout=subprocess.PIPE,
                                universal_newlines=True,
                                check=True).stdout
    except (OSError, subprocess.CalledProcessError) as err:
        print('nvidia-smi query failed ({}); falling back to cuda:0'.format(err))
        return 'cuda:0'

    print(report)

    # Keep one entry per reported GPU so list position == GPU index;
    # unparseable values (e.g. "[N/A]") are ranked last with -1.
    entries = [line.strip() for line in report.splitlines() if line.strip()]
    memory_available_list = [int(item) if item.isdigit() else -1 for item in entries]

    if not memory_available_list or max(memory_available_list) < 0:
        print('could not parse free GPU memory; falling back to cuda:0')
        return 'cuda:0'

    # NOTE(review): nvidia-smi lists physical GPUs; if CUDA_VISIBLE_DEVICES
    # re-maps devices, this index may not match torch's numbering - confirm.
    free_gpu_id = int(np.argmax(memory_available_list))
    print(free_gpu_id)
    return 'cuda:{}'.format(free_gpu_id)

In [7]:
# Resolve the compute device once; later cells place the batches and the
# model on it.
device_name = get_gpu()
device = torch.device(device_name)
print(device)

        Free                        : 11008 MiB
        Free                        : 11005 MiB
        Free                        : 1601 MiB

0
cuda:0


In [8]:
# Tokenizer matching the pretrained checkpoint that the model cell loads.
tokenizer_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

In [9]:
# Model / training hyper-parameters
MAX_SEQ_LEN = 48
batch_size = 64
num_epochs = 10

# Ids of the tokenizer's special padding / unknown tokens.
PAD_INDEX, UNK_INDEX = tokenizer.convert_tokens_to_ids(
    [tokenizer.pad_token, tokenizer.unk_token]
)

# Fields
# Labels arrive already numeric, so no vocabulary is built for them.
label_field = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    dtype=torch.float,
)

# Text is converted straight to BERT token ids by the tokenizer, so torchtext
# builds no vocabulary of its own here.
# NOTE(review): fix_length is applied after tokenizer.encode; presumably long
# tweets are cut at MAX_SEQ_LEN and lose their trailing [SEP] id - confirm.
text_field = Field(
    use_vocab=False,
    tokenize=tokenizer.encode,
    lower=False,
    include_lengths=False,
    batch_first=True,
    fix_length=MAX_SEQ_LEN,
    pad_token=PAD_INDEX,
    unk_token=UNK_INDEX,
)

# Columns mapped to None are not loaded.
# NOTE(review): presumably these entries are matched to CSV columns by
# position; the pandas preview of train.csv shows an extra leading
# "Unnamed: 0" column - confirm the files still have this column order.
fields = [
    ('tweet_id', None),
    ('text', text_field),
    ('emotion', None),
    ('label', label_field),
]



In [10]:
%%time
# TabularDataset

# File name of every split inside source_folder.
csv_by_split = {
    'train': 'train.csv',
    'validation': 'valid.csv',
    'test': 'test.csv',
}

# Parse all three CSVs with the shared field definitions; the header row of
# each file is skipped.
train, valid, test = TabularDataset.splits(
    path=source_folder,
    format='CSV',
    fields=fields,
    skip_header=True,
    **csv_by_split
)



CPU times: user 8min 28s, sys: 754 ms, total: 8min 29s
Wall time: 8min 29s


In [11]:
# Sanity check: show the first parsed example of every split
# (token ids plus the raw label string).
for split in (train, valid, test):
    print(vars(split[0]))

{'text': [101, 1030, 17784, 15069, 8939, 2821, 2158, 1012, 1045, 2633, 2131, 2000, 4952, 2000, 1996, 2265, 2023, 2733, 999, 999, 999, 1026, 1048, 2232, 1028, 102], 'label': '1'}
{'text': [101, 2695, 1011, 7505, 27390, 2100, 1026, 1048, 2232, 1028, 1056, 2860, 2361, 1024, 6273, 10930, 2546, 1011, 4749, 2620, 1026, 1048, 2232, 1028, 1001, 10503, 13731, 16428, 102], 'label': '7'}
{'text': [101, 2042, 1037, 1001, 2733, 2085, 1001, 2144, 1045, 1026, 1048, 2232, 1028, 2026, 1001, 3566, 1012, 1045, 1001, 3335, 2014, 1012, 1001, 2016, 2003, 1013, 2001, 1013, 1001, 5091, 1001, 2097, 4783, 2026, 1001, 2190, 19699, 9013, 2094, 1012, 1001, 2941, 4887, 16774, 2594, 102], 'label': '6'}


In [36]:
# Iterators

# sort_key must be supplied: with sort_within_batch=True torchtext sorts the
# examples of each batch, and without a key it would try to compare raw
# Example objects, which fails on the first iteration.
train_iter = BucketIterator(train,
                            batch_size=batch_size,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=True,
                            sort=False,
                            sort_within_batch=True)

# Validation data is only scored, never trained on, so it is flagged
# train=False (no per-epoch shuffling); bucketing by length is kept.
valid_iter = BucketIterator(valid,
                            batch_size=batch_size,
                            sort_key=lambda x: len(x.text),
                            device=device,
                            train=False,
                            sort=False,
                            sort_within_batch=True)

# Test batches keep the file order so predictions can be aligned with rows.
test_iter = Iterator(test,
                     batch_size=batch_size,
                     device=device,
                     train=False,
                     shuffle=False,
                     sort=False
                    )

In [None]:
class BERT(nn.Module):
    """Thin wrapper around a pretrained BERT sequence classifier.

    Args:
        num_labels (int): Size of the classification head. Defaults to 8
            because the label column in this notebook's data takes the values
            0..7; the library default of 2 would reject labels above 1.
        pad_index (int): Token id used for padding. Positions holding this id
            are masked out of attention. Defaults to 0, the [PAD] id of the
            ``bert-base-uncased`` vocabulary.
    """

    def __init__(self, num_labels=8, pad_index=0):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.pad_index = pad_index
        self.encoder = BertForSequenceClassification.from_pretrained(
            options_name, num_labels=num_labels)

    def forward(self, text, label):
        """Run the classifier on a batch and compute its loss.

        Args:
            text: Integer tensor of token ids (padded with ``pad_index``).
            label: Integer tensor of class indices, one per sequence.

        Returns:
            tuple: ``(loss, text_fea)`` - the first two outputs of the
            underlying model, i.e. the classification loss and the logits.
        """
        # Without a mask the encoder would attend to padding positions.
        attention_mask = (text != self.pad_index).long()
        loss, text_fea = self.encoder(text,
                                      attention_mask=attention_mask,
                                      labels=label)[:2]

        return loss, text_fea

In [None]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):
    """Persist the model weights together with the validation loss.

    Args:
        save_path: Destination file; ``None`` turns the call into a no-op.
        model: Module whose ``state_dict()`` is stored.
        valid_loss: Validation loss recorded alongside the weights.

    Returns:
        None
    """
    import os

    if save_path is None:
        return

    # Create the target directory up front so a long training run does not
    # fail at its first checkpoint because the folder is missing.
    if isinstance(save_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(save_path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    """Restore model weights written by ``save_checkpoint``.

    Args:
        load_path: Checkpoint file; ``None`` turns the call into a no-op.
        model: Module that receives the stored ``model_state_dict``.

    Returns:
        The validation loss stored in the checkpoint, or ``None`` when
        ``load_path`` is ``None``.
    """
    if load_path is None:
        return None

    # torch.load unpickles the file: only load checkpoints you trust.
    # Tensors are mapped onto the notebook-level `device`.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Persist the loss curves collected during training.

    Args:
        save_path: Destination file; ``None`` turns the call into a no-op.
        train_loss_list: Average training loss per evaluation point.
        valid_loss_list: Average validation loss per evaluation point.
        global_steps_list: Global step at which each point was recorded.

    Returns:
        None
    """
    import os

    if save_path is None:
        return

    # Create the target directory up front so the curves are not lost because
    # the folder is missing.
    if isinstance(save_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(save_path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # This file holds metrics, not model weights - say so in the log line.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Load the loss curves written by ``save_metrics``.

    Args:
        load_path: Metrics file; ``None`` turns the call into a no-op.

    Returns:
        tuple: ``(train_loss_list, valid_loss_list, global_steps_list)``, or
        ``None`` when ``load_path`` is ``None``.
    """
    if load_path is None:
        return None

    # torch.load unpickles the file: only load files you trust.
    state_dict = torch.load(load_path, map_location=device)
    # This file holds metrics, not model weights - say so in the log line.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [None]:
# Training Function

def train_model(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = num_epochs,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Fine-tune `model`, validating periodically and keeping the best weights.

    Args:
        model: Module called as ``model(token_ids, labels)`` and returning
            ``(loss, logits)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Unused - the loss comes from the model itself. Kept only so
            existing calls remain valid.
        train_loader: Iterable of batches exposing ``.text`` and ``.label``.
        valid_loader: Iterable of batches exposing ``.text`` and ``.label``;
            must support ``len()``.
        num_epochs: Number of passes over ``train_loader``.
        eval_every: Run validation every this many training steps (values
            below 1 are treated as 1).
        file_path: Directory receiving ``model.pt`` and ``metrics.pt``.
        best_valid_loss: Validation loss a checkpoint has to beat to be saved.

    Returns:
        None. Progress is printed; checkpoints and metrics are written to
        ``file_path``.
    """
    # A training set with fewer than two batches would make the default
    # eval_every 0 and the modulo below divide by zero.
    eval_every = max(1, eval_every)

    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    checkpoint_file = file_path + '/' + 'model.pt'
    metrics_file = file_path + '/' + 'metrics.pt'

    # training loop
    model.train()
    for epoch in range(num_epochs):
        start = time.time()

        for batch in train_loader:
            loss = _batch_loss(model, batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            if global_step % eval_every != 0:
                continue

            # evaluation step
            model.eval()
            with torch.no_grad():
                for valid_batch in valid_loader:
                    valid_running_loss += _batch_loss(model, valid_batch).item()

            average_train_loss = running_loss / eval_every
            average_valid_loss = valid_running_loss / len(valid_loader)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0
            valid_running_loss = 0.0
            model.train()

            # elapsed time since the start of the current epoch
            duration = _format_duration(time.time() - start)

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}, Time: {}'
                  .format(epoch+1,
                          num_epochs,
                          global_step,
                          num_epochs*len(train_loader),
                          average_train_loss,
                          average_valid_loss,
                          duration,
                         ))

            # checkpoint
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(checkpoint_file, model, best_valid_loss)
                save_metrics(metrics_file, train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(metrics_file, train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')


def _batch_loss(model, batch):
    """Move one batch onto `device` as int64 tensors and return the model's loss."""
    # The dataset defines exactly two fields, `text` and `label`; read them by
    # name instead of relying on the order in which a batch unpacks.
    labels = batch.label.long().to(device)
    token_ids = batch.text.long().to(device)
    loss, _ = model(token_ids, labels)
    return loss


def _format_duration(second):
    """Render elapsed seconds as '<x> s', or as fractional minutes '<x> m' above 60 s."""
    if second > 60:
        # true division keeps the fraction that the 4-decimal format displays
        return '{:.4f} m'.format(second / 60)
    return '{:.4f} s'.format(second)

In [None]:
%%time
# Build the classifier on the selected device and fine-tune it with Adam.
learning_rate = 2e-5
model = BERT()
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
train_model(model, optimizer, num_epochs=num_epochs)