In [1]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as datasets
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from copy import deepcopy

In [3]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def load_train(file_path = "./data/ISEAR.txt"):
    train_x = []
    train_y = []
    with open(file_path, "r", encoding='utf-8') as f:
        for lines in f:
            y_x = lines.split("|")
            if len(y_x) < 2:
                continue
            train_y.append(y_x[0])
            train_x.append(y_x[1])
    return train_x, train_y

def load_dev(file_path = "./data/test.txt"):
    dev_x = []
    dev_y = []
    with open(file_path, "r", encoding='utf-8') as f:
        for lines in f:
            y_x = lines.split("|")
            if len(y_x) < 2:
                continue
            dev_y.append(y_x[0])
            dev_x.append(y_x[1])
    return dev_x, dev_y

In [5]:
train_x_raw, train_y_raw = load_train()
dev_x_raw, dev_y_raw = load_dev()

y_count = {}
for y in train_y_raw:
    if y in y_count:
        y_count[y] += 1
    else:
        y_count[y] = 1
def sort_key(a):
    return a[1]
label_list = []
for key in y_count:
    item = y_count[key]
    label_list.append((key,item))
label_list.sort(reverse=True,key=sort_key)
label2id = {}
i = 0
for label in label_list:
    label2id[label[0]] = i
    label2id[i] = label[0]
    i+=1

In [6]:
label2id

{'joy': 0,
 0: 'joy',
 'sadness': 1,
 1: 'sadness',
 'anger': 2,
 2: 'anger',
 'fear': 3,
 3: 'fear',
 'shame': 4,
 4: 'shame',
 'disgust': 5,
 5: 'disgust',
 'guilt': 6,
 6: 'guilt'}

In [7]:
train_y = []
dev_y = []
for label in train_y_raw:
    train_y.append(label2id[label])
for label in dev_y_raw:
    dev_y.append(label2id[label])
train_y = np.array(train_y)
dev_y = np.array(dev_y)

In [8]:
train_y.shape

(7000,)

In [9]:
dev_y.shape

(500,)

In [10]:
tokenizer.encode("abc edf.", add_special_tokens=True)

[101, 5925, 3968, 2546, 1012, 102]

In [11]:
def preprocess(sentence_list):
    l = []
    for i in range(len(sentence_list)):
        s = sentence_list[i].lower()
        s = tokenizer.encode(s, add_special_tokens=True)
        l.append(s)
    return l

In [12]:
train_x_tokenized = preprocess(train_x_raw)
dev_x_tokenized = preprocess(dev_x_raw)

In [13]:
max_len = 0
for i in train_x_tokenized:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in train_x_tokenized])
padded_dev = np.array([i + [0]*(max_len-len(i)) for i in dev_x_tokenized])

In [14]:
padded

array([[  101,  2006,  2420, ...,     0,     0,     0],
       [  101,  2296,  2051, ...,     0,     0,     0],
       [  101,  2043,  1045, ...,     0,     0,     0],
       ...,
       [  101,  2023,  3277, ...,     0,     0,     0],
       [  101,  2026, 18328, ...,     0,     0,     0],
       [  101,  2197,  2621, ...,     0,     0,     0]])

In [15]:
padded.shape

(7000, 195)

In [16]:
padded_dev.shape

(500, 195)

In [17]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(7000, 195)

In [18]:
padded_train = []
attention_mask_train = []
for i in range(14):
    padded_train.append(padded[500*i:500*(i+1)])
    attention_mask_train.append(attention_mask[500*i:500*(i+1)])

In [19]:
torch.cuda.empty_cache()

In [20]:
padded_train[0].shape

(500, 195)

In [21]:
train_feature_list = []
for i in range(14):
    torch.cuda.empty_cache()
    input_ids = torch.tensor(padded_train[i]).to(device)
    attention_mask = torch.tensor(attention_mask_train[i]).to(device)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
        
    feature = last_hidden_states[0][:,0,:].cpu().numpy()
    train_feature_list.append(feature)

In [22]:
train_feature_list[0]

array([[-0.10286859, -0.13684382, -0.25244638, ..., -0.17019261,
         0.18526137,  0.08419316],
       [ 0.05663628,  0.09149715, -0.141327  , ..., -0.07937766,
         0.26432273,  0.27011317],
       [-0.10499352, -0.0638291 , -0.17506857, ..., -0.051651  ,
         0.24059753,  0.22042893],
       ...,
       [-0.04100357, -0.27688867, -0.09601278, ..., -0.10334397,
         0.28090087,  0.21965046],
       [ 0.08131988, -0.13469708,  0.08270323, ...,  0.066988  ,
         0.29606506,  0.3803135 ],
       [-0.17046656, -0.56751305, -0.24231999, ..., -0.0667112 ,
         0.43118402, -0.05132144]], dtype=float32)

In [23]:
train_feature_list[0].shape

(500, 768)

In [24]:
attention_mask_dev = np.where(padded_dev != 0, 1, 0)
input_ids_dev = torch.tensor(padded_dev).to(device)
attention_mask_dev = torch.tensor(attention_mask_dev).to(device)

with torch.no_grad():
    last_hidden_states = model(input_ids_dev, attention_mask=attention_mask_dev)
dev_x = last_hidden_states[0][:,0,:].cpu().numpy()

In [25]:
dev_x.shape

(500, 768)

In [26]:
train_x = np.zeros((7000,768))
np_i = 0
for i in range(14): 
    for vec in train_feature_list[i]:
        train_x[np_i] = vec
        np_i += 1

In [27]:
train_x.shape

(7000, 768)

In [28]:
class ClassifierDataset(datasets.Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [63]:
BATCH_SIZE = 512
LEARNING_RATE = 0.01

train_dataset = ClassifierDataset(torch.from_numpy(train_x).float(), torch.from_numpy(train_y).long())
dev_dataset = ClassifierDataset(torch.from_numpy(dev_x).float(), torch.from_numpy(dev_y).long())

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [84]:
class MulticlassClassification(nn.Module):
    def __init__(self):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(768, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, 7) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_out(x)
        
        return x

In [85]:
def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    acc = torch.round(acc * 100)
    
    return acc

class average_meter(object):
    '''Computes and stores the average and current value
    '''
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [86]:
def train(classifier, train_loader, criterion, optimizer):
    train_loss = average_meter()
    classifier.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        pred = classifier(x)
        
        loss = criterion(pred, y)
        train_loss.update(loss.item(),x.size(0))
        loss.backward()
        optimizer.step()
    return train_loss.avg

def validate(classifier, dev_loader, criterion, optimizer):
    valid_loss = average_meter()
    valid_acc = average_meter()
    classifier.eval()
    with torch.no_grad():
        for x, y in dev_loader:
            x, y = x.to(device), y.to(device)
            pred = classifier(x)
            loss = criterion(pred, y)
            acc = multi_acc(pred, y)
            valid_loss.update(loss.item(),x.size(0))
            valid_acc.update(acc.item(),x.size(0))
    return valid_loss.avg, valid_acc.avg

In [87]:
classifier = MulticlassClassification()
classifier.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=LEARNING_RATE)

In [88]:
n_epochs = 50
best_acc = 0
acc_model = None

for epoch in range(n_epochs):
    torch.cuda.empty_cache()
    train_loss = train(classifier, train_dataloader, criterion, optimizer)
    torch.cuda.empty_cache()
    valid_loss, valid_acc = validate(classifier, dev_dataloader, criterion, optimizer)
    if valid_acc > best_acc:
        classifier = deepcopy(classifier)
        best_acc = valid_acc
    print("{}%, Epoch {}, train loss {:.8f}, validate loss {:.8f}, acc {:.8f}".format(100*(epoch+1)/n_epochs,str(epoch).zfill(2),train_loss,valid_loss,valid_acc))

2.0%, Epoch 00, train loss 1.48977905, validate loss 1.56709671, acc 47.00000000
4.0%, Epoch 01, train loss 1.22027572, validate loss 1.35392356, acc 51.00000000
6.0%, Epoch 02, train loss 1.22483490, validate loss 1.33604479, acc 51.00000000
8.0%, Epoch 03, train loss 1.21758670, validate loss 1.33285093, acc 51.00000000
10.0%, Epoch 04, train loss 1.21962523, validate loss 1.33250916, acc 51.00000000
12.0%, Epoch 05, train loss 1.22171966, validate loss 1.33206308, acc 51.00000000
14.0%, Epoch 06, train loss 1.21761187, validate loss 1.33174908, acc 51.00000000
16.0%, Epoch 07, train loss 1.22062011, validate loss 1.33170497, acc 51.00000000
18.0%, Epoch 08, train loss 1.22306282, validate loss 1.33197606, acc 51.00000000
20.0%, Epoch 09, train loss 1.21610779, validate loss 1.33187973, acc 51.00000000
22.0%, Epoch 10, train loss 1.22112705, validate loss 1.33153236, acc 51.00000000
24.0%, Epoch 11, train loss 1.22545358, validate loss 1.33171606, acc 51.00000000
26.0%, Epoch 12, tra