In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertPreTrainedModel, BertModel

from transformers import AutoConfig, AutoTokenizer

In [3]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

## Load data

In [4]:
# Each line of the emotion files is "<sentence>;<emotion>".
# NOTE(review): this assumes the files carry no header line (the notebook names
# the columns by hand) - confirm against the raw files.
# header=None keeps the first record as data instead of consuming it as column
# names; names= labels the two columns right away.
train_df = pd.read_csv('../input/emotions/train.txt', sep=';', header=None, names=['sentence', 'emotion'])
test_df = pd.read_csv('../input/emotions/test.txt', sep=';', header=None, names=['sentence', 'emotion'])
val_df = pd.read_csv('../input/emotions/val.txt', sep=';', header=None, names=['sentence', 'emotion'])

In [5]:
# (rows, columns) of the train, test and validation frames, in that order.
tuple(frame.shape for frame in (train_df, test_df, val_df))

In [6]:
# Give all three frames the same two column names.
train_df.columns = test_df.columns = val_df.columns = ['sentence', 'emotion']

In [7]:
# Preview the first five training rows.
train_df.head(n=5)

## EDA

In [8]:
# Number of training rows per emotion label.
train_df.loc[:, 'emotion'].value_counts()

In [9]:
# Number of test rows per emotion label.
test_df.loc[:, 'emotion'].value_counts()

In [10]:
# Number of validation rows per emotion label.
val_df.loc[:, 'emotion'].value_counts()

In [11]:
def max_len(data):
    """Return the largest whitespace-separated word count in data['sentence'].

    Note that this counts words produced by ``str.split()``, not tokenizer
    tokens.
    """
    def _word_count(text):
        return len(text.split())

    word_counts = data['sentence'].map(_word_count)
    return word_counts.max()

# Longest sentence (in words) per split, then the overall maximum.
max_lens = [max_len(frame) for frame in (train_df, test_df, val_df)]
max(max_lens)

In [12]:
# len(train_df['sentence'].iloc[4].split())

In [13]:
# Per-split maximum word counts, ordered train, test, validation.
max_lens

## Configs

In [14]:

# ---- Paths ----
MODEL_OUT_DIR = '/kaggle/working/models/bert_emotion'
TRAIN_FILE_PATH = '../input/emotions/train.txt'
VALID_FILE_PATH = '../input/emotions/val.txt'
TEST_FILE_PATH = '../input/emotions/test.txt'

# ---- Model configuration ----
# Fixed sequence length (in tokens, including [CLS]/[SEP]) for each split.
MAX_LEN_TRAIN = 68
MAX_LEN_VALID = 68
MAX_LEN_TEST = 68
BATCH_SIZE = 160
LR = 1e-5
NUM_EPOCHS = 10
NUM_THREADS = 1  ## Number of DataLoader worker processes
MODEL_NAME = 'bert-base-uncased'

# Emotion name -> class index. The classifier head in this notebook has six
# outputs, so the mapping covers six classes; the original four indices are
# unchanged. Any label missing from this dict would map to NaN.
# NOTE(review): 'love' and 'surprise' are presumed to be the two remaining
# labels in the data - confirm with the value_counts() output above.
LABEL_DICT = {
    'joy': 0,
    'sadness': 1,
    'anger': 2,
    'fear': 3,
    'love': 4,
    'surprise': 5,
}

# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
os.makedirs(MODEL_OUT_DIR, exist_ok=True)

## Create Dataset

In [15]:
class Emotions_Dataset(Dataset):
    """Dataset yielding fixed-length BERT inputs from a ';'-separated text file.

    Each line of the file is read as ``<sentence>;<emotion>``.
    NOTE(review): the file is assumed to have no header line - confirm.

    Args:
        filename: Path of the ';'-separated file to read.
        maxlen: Length every token sequence is padded or truncated to.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        label_dict: Mapping from emotion name to integer class index.

    Raises:
        ValueError: If the file contains an emotion that is not a key of
            ``label_dict``.
    """

    def __init__(self, filename, maxlen, tokenizer, label_dict):
        # header=None keeps the first record as data rather than treating it
        # as column names; names= labels the two columns directly.
        self.df = pd.read_csv(filename, delimiter=';', header=None,
                              names=['sentence', 'emotion'])
        # Encode the emotion names as class indices. A name missing from
        # label_dict maps to NaN, which cannot be a valid class index, so fail
        # early with a message naming the offending labels.
        encoded = self.df['emotion'].map(label_dict)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = sorted(self.df.loc[unmapped, 'emotion'].astype(str).unique())
            raise ValueError(
                "Labels missing from label_dict in {!r}: {}".format(filename, unknown)
            )
        self.df['emotion'] = encoded.astype('int64')
        # Tokenizer of the transformer model the sequences are prepared for
        self.tokenizer = tokenizer
        # Maximum length of the tokens list to keep all the sequences of fixed size
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` tensors for row ``index``."""
        sentence = self.df.loc[index, 'sentence']
        label = self.df.loc[index, 'emotion']

        # Wrap the word pieces in the BERT special tokens.
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']
        # Too long: cut to maxlen while keeping a terminating [SEP].
        if len(tokens) > self.maxlen:
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']
        num_real = len(tokens)
        # Too short: right-pad up to maxlen.
        tokens = tokens + ['[PAD]'] * (self.maxlen - num_real)

        # Indices of the tokens in the tokenizer vocabulary
        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # 1 for real tokens, 0 for padding. Built from the token count so it
        # does not depend on which vocabulary id '[PAD]' happens to have.
        attention_mask = torch.zeros(self.maxlen, dtype=torch.long)
        attention_mask[:num_real] = 1

        label = torch.tensor(label, dtype=torch.long)

        return input_ids, attention_mask, label

In [16]:
class BertEmotionClassifier(BertPreTrainedModel):
    """BERT encoder followed by a linear layer over the first-token representation.

    ``forward`` returns raw (unnormalized) logits with 6 entries per input
    sequence.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        #The classification layer that takes the [CLS] representation and outputs the logits
        self.cls_layer = nn.Linear(config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            input_ids: Token-id tensor passed to the encoder as ``input_ids``.
            attention_mask: Mask tensor passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear layer applied to position 0 of the
            encoder's first output.
        """
        #Feed the input to Bert model to obtain contextualized representations
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: the encoder may return either a
        # plain tuple or a ModelOutput. Unpacking a ModelOutput iterates over
        # its keys (strings), whereas [0] yields the first output in both cases.
        reps = outputs[0]
        #Obtain the representations of [CLS] heads
        cls_reps = reps[:, 0]
        logits = self.cls_layer(cls_reps)
        return logits

## Training function

In [17]:
def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    """Run the optimisation loop and report accuracy after every epoch.

    For each epoch the model is put in training mode and updated once per
    batch of ``train_loader``; the mean of the per-batch training accuracies is
    printed, then ``evaluate`` is run on ``val_loader`` and its accuracy and
    loss are printed. Nothing is returned and no checkpoint is written.
    """
    for epoch in trange(epochs, desc="Epoch"):
        model.train()
        running_acc = 0

        for batch in train_loader:
            # Move the three batch tensors to the target device.
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            optimizer.zero_grad()
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            running_acc += get_accuracy_from_logits(logits, labels)

        mean_train_acc = running_acc / len(train_loader)
        print(f"Training accuracy is {mean_train_acc}")

        val_acc, val_loss = evaluate(
            model=model, criterion=criterion, dataloader=val_loader, device=device
        )
        print("Epoch {} complete! Validation Accuracy : {}, Validation Loss : {}".format(epoch, val_acc, val_loss))

## Evaluation function

In [18]:
def evaluate(model, criterion, dataloader, device):
    """Return ``(mean accuracy, mean loss)`` averaged over the batches of ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled while
    the batches are processed. Both averages are unweighted means of the
    per-batch values.
    """
    model.eval()
    acc_total, loss_total, num_batches = 0, 0, 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]
            logits = model(input_ids, attention_mask)

            batch_loss = criterion(logits.squeeze(-1), labels)
            loss_total = loss_total + batch_loss.item()
            acc_total = acc_total + get_accuracy_from_logits(logits, labels)
            num_batches = num_batches + 1

    return acc_total / num_batches, loss_total / num_batches

In [19]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        logits: 2-D tensor of class scores, one row per sample.
        labels: 1-D tensor of integer class indices, one per row of ``logits``.

    Returns:
        A 0-dimensional float tensor holding the batch accuracy.
    """
    # argmax is taken on the logits directly: softmax is monotonic, so it never
    # changes the winning class, and skipping it avoids ties that appear when
    # nearly equal probabilities round to the same float.
    predictions = torch.argmax(logits, dim=1)
    return (predictions == labels).float().mean()

## Predict function

In [20]:
def predict(model, dataloader, device):
    """Run the model over ``dataloader`` and collect predictions and labels.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning
            per-class logits; must provide ``eval()``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``(predicted_label, actual_label)``: two lists of 0-dimensional CPU
        tensors, one entry per sample, in iteration order. Note the order:
        predictions first, ground truth second.
    """
    # Inference must run in evaluation mode, otherwise layers such as dropout
    # keep perturbing the outputs.
    model.eval()

    predicted_label = []
    actual_label = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            logits = model(input_ids, attention_mask)

            # The predicted class is the index of the largest logit.
            output = torch.argmax(logits, dim=1)

            # Store per-sample scalars on the CPU so results do not hold on to
            # device memory.
            predicted_label.extend(output.cpu().unbind(0))
            actual_label.extend(labels.cpu().unbind(0))

    return predicted_label, actual_label

In [21]:
## Use the first GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Pretrained configuration and tokenizer for MODEL_NAME
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## Build the classifier from the pretrained checkpoint and move it to the device
model = BertEmotionClassifier.from_pretrained(MODEL_NAME, config=config).to(device)

## Multi-class cross-entropy computed from the raw logits
criterion = nn.CrossEntropyLoss()

## Adam over all model parameters at learning rate LR
optimizer = optim.Adam(params=model.parameters(), lr=LR)

In [22]:
## Datasets for the three splits
train_set = Emotions_Dataset(filename=TRAIN_FILE_PATH, maxlen=MAX_LEN_TRAIN, tokenizer=tokenizer, label_dict=LABEL_DICT)
valid_set = Emotions_Dataset(filename=VALID_FILE_PATH, maxlen=MAX_LEN_VALID, tokenizer=tokenizer, label_dict=LABEL_DICT)
test_set = Emotions_Dataset(filename=TEST_FILE_PATH, maxlen=MAX_LEN_TEST, tokenizer=tokenizer, label_dict=LABEL_DICT)


## Data Loaders
# Only the training loader is shuffled, so each epoch sees the samples in a new
# order; validation and test keep file order so results stay aligned with the
# source rows.
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_THREADS)
valid_loader = DataLoader(dataset=valid_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_THREADS)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_THREADS)

# print(len(train_loader))

In [23]:
# Fine-tune the model. The run length is the literal 5 passed here, not the
# NUM_EPOCHS constant from the configuration cell.
train(
    model,
    criterion,
    optimizer,
    train_loader=train_loader,
    val_loader=valid_loader,
    epochs=5,
    device=device,
)

In [25]:
# predict() returns (predicted, actual) in that order. Unpack in the same order
# so that every metric below receives the ground truth first and the
# predictions second; swapping them exchanges precision and recall in the
# report and transposes the confusion matrix.
predicted_label, actual_label = predict(model, test_loader, device=device)

# Convert the per-sample 0-d tensors into plain integer arrays for scikit-learn.
actual_label = np.array([item.item() for item in actual_label])
predicted_label = np.array([item.item() for item in predicted_label])

print("Accuracy :",metrics.accuracy_score(actual_label, predicted_label))
print("f1 score macro :",metrics.f1_score(actual_label, predicted_label, average = 'macro'))
print("f1 scoore micro :",metrics.f1_score(actual_label, predicted_label, average = 'micro'))
print("Hamming loss :",metrics.hamming_loss(actual_label, predicted_label))
print("Classification Report: \n", classification_report(actual_label, predicted_label,digits=4))
print("Confusion Matrix: \n", confusion_matrix(actual_label, predicted_label))