# Classify emotions in text with BERT NLP model 

In [39]:
# Walk the /kaggle/input directory tree and print the full path of every
# file found beneath it (run the cell with "Run" or Shift+Enter).

import os

for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
        

In [40]:
# Install the Hugging Face `transformers` package; the import cell below pulls the
# BERT tokenizer, model, AdamW optimizer and LR scheduler from it.
# NOTE(review): the version is not pinned, so a re-run may install a release whose
# API differs from what this notebook expects — consider pinning a known-good version.
!pip install transformers

In [41]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io
%matplotlib inline

In [42]:
# Pick the GPU when one is present, otherwise fall back to the CPU.
# Batches are moved onto this device later, inside the training loop.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Only query the GPU name when CUDA exists: get_device_name(0) raises on a
# CPU-only machine, and as a bare mid-cell expression its value was never shown.
if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available; running on CPU")

SEED = 19

# Seed every random number generator used by the notebook
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

In [43]:
# Keep the CPU fallback: hard-coding "cuda" here would make every later
# `.to(device)` call fail on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

`BertTokenizer` runs end-to-end tokenization: punctuation splitting followed by WordPiece. 
`BertForSequenceClassification` is the BERT transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output). 
`BertConfig` is the configuration class that stores model configurations. 
`AdamW` implements the Adam optimization algorithm with decoupled weight decay. Adam is a variant of stochastic gradient descent with momentum, where the momentum is a moving average of the gradient rather than the gradient itself; it also scales each step by a moving average of the squared gradient.
`get_linear_schedule_with_warmup` creates a schedule in which the learning rate increases linearly during a warm-up period and then decreases linearly.

In [44]:
# Each file holds one "sentence;label" pair per line and has no header row,
# so the column names are supplied explicitly.
_read_opts = dict(delimiter=';', header=None, names=['sentence','label'])

df_train = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/train.txt", **_read_opts)
df_test = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/test.txt", **_read_opts)
df_val = pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/val.txt", **_read_opts)

In [45]:
# Pool the three provided files into one frame (it is re-split further down).
# ignore_index=True gives the pooled rows a fresh 0..n-1 index; without it each
# file's own row numbers are repeated, so index labels are no longer unique.
df = pd.concat([df_train, df_test, df_val], ignore_index=True)

In [46]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from emotion name to integer id, then store the ids
# in a new 'label_enc' column next to the original text labels.
labelencoder = LabelEncoder()
labelencoder.fit(df['label'])
df['label_enc'] = labelencoder.transform(df['label'])

In [47]:
# After this cell 'label' holds the integer ids and 'label_desc' the emotion names.
# Both renames are applied in one call; the mapping is resolved against the
# current column names, so the result matches two sequential renames.
df.rename(columns={'label': 'label_desc', 'label_enc': 'label'}, inplace=True)

In [48]:
## Raw sentences and their integer class ids, as numpy arrays
sentences = df.sentence.values
labels = df.label.values

# Length, in token ids, that every encoded sequence is padded to.
# NOTE(review): an earlier comment claimed the longest training sequence is 47
# tokens; that is not checked anywhere in this notebook — confirm before shrinking MAX_LEN.
MAX_LEN = 256
from transformers import BertTokenizer, BertModel
## Tokenizer that converts text into the token ids expected by 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True)
    for sent in sentences
]

## Attention mask: 1.0 where the token id is positive, 0.0 where it is 0
## (id 0 is assumed to be the padding token — TODO confirm against tokenizer.pad_token_id)
attention_masks = [[1.0 if token_id > 0 else 0.0 for token_id in seq] for seq in input_ids]

## Dataset Prep for training

#### Split into a training set and a validation set with a random 90/10 hold-out split

In [50]:
# Hold out 10% of the rows for validation. Token ids, attention masks and labels
# are split in a single call so the three stay row-aligned by construction.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=41, test_size=0.1)

In [51]:
# Convert every split to torch tensors, the data type the model consumes.
# torch.as_tensor (rather than torch.tensor) keeps this cell re-runnable: an input
# that is already a tensor is passed through instead of triggering the
# "copy construct from a tensor" warning.
train_inputs = torch.as_tensor(train_inputs)
validation_inputs = torch.as_tensor(validation_inputs)
train_labels = torch.as_tensor(train_labels)
validation_labels = torch.as_tensor(validation_labels)
train_masks = torch.as_tensor(train_masks)
validation_masks = torch.as_tensor(validation_masks)

# Number of sequences per batch
batch_size = 32

# Training batches are drawn in random order each epoch
train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

# Validation needs no shuffling: a sequential sampler gives a fixed batch order
# and does not consume random numbers between epochs.
validation_data = TensorDataset(validation_inputs,validation_masks,validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

In [52]:
# Optimisation hyper-parameters
lr = 2e-5
adam_epsilon = 1e-8

# Number of passes over the training data
epochs = 3

# No warm-up phase; the schedule spans one step per training batch per epoch
num_warmup_steps = 0
num_training_steps = epochs * len(train_dataloader)

# Pretrained BERT with a 6-label sequence classification head, moved to the chosen device
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
model = model.to(device)

### In Transformers, the optimizer and the schedule are created separately:
# correct_bias=False reproduces BertAdam-specific behavior
optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)  # PyTorch scheduler

# Train the Model 

In [54]:
## Per-epoch history of the mean training loss and the learning rate, kept for plotting
train_loss_set = []
learning_rate = []

# Gradients accumulate by default, so start from a clean slate
model.zero_grad()

# tnrange is a tqdm wrapper around the normal python range.
# The loop variable is named `epoch` (not `_`) because its value is used below.
for epoch in tnrange(1,epochs+1,desc='Epoch'):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    # ---------------- Training ----------------
    # Running sum of the per-batch losses for this epoch
    batch_loss = 0

    # Training mode (as opposed to evaluation mode); validation below switches it back
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Move the batch to the target device and unpack it
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass; the first output is used as the loss
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]

        # Backward pass
        loss.backward()

        # Clip the gradient norm to 1.0 (the optimizer does not do this itself)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Clear the gradients before the next batch
        optimizer.zero_grad()

        batch_loss += loss.item()

    # Mean loss over all training batches of this epoch
    avg_train_loss = batch_loss / len(train_dataloader)

    # Record the learning rate reached at the end of the epoch
    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ",param_group['lr'])
        learning_rate.append(param_group['lr'])

    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')

    # ---------------- Validation ----------------
    model.eval()

    # Collect predictions and labels for the WHOLE validation set, so the metrics
    # are computed once over all examples instead of averaging per-batch scores
    # (a per-batch average over-weights a short final batch, and MCC is unstable
    # on very small batches).
    epoch_preds, epoch_labels = [], []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation; this saves memory and time
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move logits and labels to CPU
        logits = logits[0].to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()

        epoch_preds.append(np.argmax(logits, axis=1).flatten())
        epoch_labels.append(label_ids.flatten())

    pred_flat = np.concatenate(epoch_preds)
    labels_flat = np.concatenate(epoch_labels)

    # Full validation predictions of the current epoch (tagged with the epoch
    # number that produced them, not the total number of epochs)
    df_metrics=pd.DataFrame({'Epoch':epoch,'Actual_class':labels_flat,'Predicted_class':pred_flat})

    eval_accuracy = accuracy_score(labels_flat,pred_flat)
    eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)

    print(F'\n\tValidation Accuracy: {eval_accuracy}')
    print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy}')

In [55]:
# Show each (integer id, emotion name) pair once, keeping the first occurrence
df.loc[:, ['label','label_desc']].drop_duplicates()

In [56]:
## Emotion name -> integer class id (same pairs as shown in the table of the previous cell)
label2int = dict(
    sadness=4,
    joy=2,
    love=3,
    anger=0,
    fear=1,
    surprise=5,
)

# Save the models for future use 

In [78]:
model_save_folder = 'model/'
tokenizer_save_folder = 'tokenizer/'

# Output directories; these two names keep pointing at the directories for the
# rest of the notebook (they are not reused for file paths further down).
path_model = F'/kaggle/working/{model_save_folder}'
path_tokenizer = F'/kaggle/working/{tokenizer_save_folder}'

# Create the directories in pure Python rather than through a shell command
os.makedirs(path_model, exist_ok=True)
os.makedirs(path_tokenizer, exist_ok=True)

## Save the model and tokenizer in the Hugging Face directory format
model.save_pretrained(path_model)
tokenizer.save_pretrained(path_tokenizer)

# Additionally store the raw state dict inside the model directory ...
model_save_name = 'fineTuneModel.pt'
path = os.path.join(path_model, model_save_name)
torch.save(model.state_dict(), path)

# ... and once more at the top of the working directory
model_save_name = 'fineTuneModel.bin'
path = os.path.join('/kaggle/working', model_save_name)
torch.save(model.state_dict(), path)

In [79]:
# Reload the tokenizer from the directory it was saved to. Using the absolute
# `path_tokenizer` avoids depending on the notebook's current working directory.
token = BertTokenizer.from_pretrained(path_tokenizer)

In [81]:
from transformers import BertModel
# Rebuild the architecture, then restore the fine-tuned weights saved above.
# map_location remaps the stored tensors onto the active device, so weights
# saved from a GPU session can still be loaded on a CPU-only machine.
r_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
r_model.load_state_dict(torch.load("./fineTuneModel.bin", map_location=device))

# Find the level boundaries for each emotion

In [82]:
# Emotion names indexed by integer class id
Emo = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

def result(sentence):
    """Classify each encoded sequence and bucket its winning logit by predicted class.

    Args:
        sentence: iterable of token-id sequences, one per sentence (for example
            the rows of a 2-D tensor, or a list of lists). Ids equal to 0 get an
            attention-mask value of 0.0, every positive id gets 1.0.

    Returns:
        (value, emotion) where `value` has one sub-list per entry of `Emo`
        holding the highest logit of every sequence predicted as that class,
        and `emotion` lists the predicted emotion names in input order.

    Side effects:
        Puts the global `model` into evaluation mode.
    """
    value = [[] for _ in Emo]
    emotion = []

    # Inference only: evaluation mode, and no autograd graph is built
    model.eval()
    with torch.no_grad():
        for f in sentence:
            # Accepts a tensor row or a plain list; adds the batch dimension
            seq = torch.as_tensor(f, dtype=torch.long).unsqueeze(0).to(device)
            attention = (seq > 0).float()
            logits = model(seq, attention)[0][0].cpu().numpy()

            # Index of the largest logit (first one wins on ties)
            idx = int(np.argmax(logits))

            value[idx].append(logits[idx])
            emotion.append(Emo[idx])
    return value, emotion

In [83]:
# Run the model over both splits. Each call returns the winning logits grouped by
# predicted class and the list of predicted emotion names (see `result`).
train_value, train_emotion = result(train_inputs)
val_value, val_emotion = result(validation_inputs)

In [88]:
def ComputeBoundary(train_value):
    """Compute the two thresholds that split each class's scores into thirds.

    Args:
        train_value: one list of scores per class.

    Returns:
        A list with one `[lower, upper]` pair per class: the values found at
        one third and two thirds of that class's scores in ascending order.
        A class without any scores gets `[nan, nan]`, because no threshold can
        be derived for it (every `>` comparison against nan is False).

    The input lists are left untouched; sorting happens on a copy.
    """
    boundary = []
    for scores in train_value:
        if len(scores) == 0:
            boundary.append([float("nan"), float("nan")])
            continue
        ordered = sorted(scores)
        one = int(len(ordered)/3)
        two = one*2
        boundary.append([ordered[one], ordered[two]])
    return boundary

def analyze(sent, boundary):
    """Print the predicted emotion of one sentence and its intensity level.

    Args:
        sent: the raw sentence text.
        boundary: one `[lower, upper]` threshold pair per class, indexed by
            class id (the structure returned by `ComputeBoundary`).

    Prints the emotion name, then "level_3" if the winning logit exceeds the
    upper threshold, "level_2" if it exceeds only the lower one, and
    "level_1" otherwise. Returns nothing.

    Side effects:
        Puts the global `model` into evaluation mode.
    """
    seq = token.encode(sent, add_special_tokens=True,max_length=MAX_LEN,pad_to_max_length=True)
    seq = torch.tensor([seq]).to(device)
    # 1.0 for every positive token id, 0.0 for id 0
    attention = (seq > 0).float()

    # Inference only: evaluation mode, and no autograd graph is built
    model.eval()
    with torch.no_grad():
        logits = model(seq, attention)[0][0].cpu().numpy()

    # Index of the largest logit (first one wins on ties)
    idx = int(np.argmax(logits))
    print(Emo[idx])

    score = logits[idx]
    if score > boundary[idx][1]:
        print("level_3")
    elif score > boundary[idx][0]:
        print("level_2")
    else:
        print("level_1")

# Simple Test Sentence

In [101]:
# Derive the per-class thresholds from the training scores, then classify one sentence.
Boundary = ComputeBoundary(train_value)
test_sentence = "He is mad "
# Pass the computed thresholds, not the raw per-class score lists
analyze(test_sentence, Boundary)

# Evaluation (Microf, Macrof, Accuracy)

In [91]:
def micro_f(predicts, labels, emotions):
    """Micro-averaged precision, recall and F1 over the given classes.

    Args:
        predicts: predicted class for each sample.
        labels: correct class for each sample, aligned with `predicts`.
        emotions: the classes to evaluate.

    Returns:
        (precision, recall, f1) computed from the true-positive, false-positive
        and false-negative counts summed over all classes. A ratio whose
        denominator is zero is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    TP = 0
    FP = 0
    FN = 0
    n = len(predicts)

    for emotion in emotions:
        for i in range(n):
            if predicts[i] == emotion:
                # Predicted this class: correct -> TP, wrong -> FP
                if predicts[i] == labels[i]:
                    TP += 1
                else:
                    FP += 1
            elif labels[i] == emotion:
                # Should have been this class but something else was predicted
                FN += 1

    precision = TP / (TP+FP) if (TP+FP) else 0.0
    recall = TP / (TP+FN) if (TP+FN) else 0.0
    f1 = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
    return (precision, recall, f1)

def macro_f(predicts, labels, emotions):
    """Macro-averaged precision, recall and F1 over the given classes.

    Args:
        predicts: predicted class for each sample.
        labels: correct class for each sample, aligned with `predicts`.
        emotions: the classes to evaluate.

    Returns:
        (precision, recall, f1). Precision and recall are the unweighted means
        of the per-class values; f1 is the harmonic mean of those two averages
        (not the mean of per-class F1 scores). A per-class ratio whose
        denominator is zero counts as 0.0, and an empty `emotions` yields
        (0.0, 0.0, 0.0), instead of raising ZeroDivisionError.
    """
    if len(emotions) == 0:
        return (0.0, 0.0, 0.0)

    precision = 0
    recall = 0
    n = len(predicts)

    for emotion in emotions:
        # Per-class true-positive, false-positive and false-negative counts
        tp = 0
        fp = 0
        fn = 0
        for i in range(n):
            if predicts[i] == emotion:
                if predicts[i] == labels[i]:
                    tp += 1
                else:
                    fp += 1
            elif labels[i] == emotion:
                fn += 1
        # A class that was never predicted (or never occurs) contributes 0
        precision += tp/(tp+fp) if (tp+fp) else 0.0
        recall += tp/(tp+fn) if (tp+fn) else 0.0

    precision /= len(emotions)
    recall /= len(emotions)
    f1 = 2*precision*recall / (precision+recall) if (precision+recall) else 0.0
    return (precision, recall, f1)

def accuracy(predicts, labels):
    """Fraction of samples whose prediction equals the correct label.

    Args:
        predicts: predicted class for each sample.
        labels: correct class for each sample, aligned with `predicts`.

    Returns:
        The share of matching positions, or 0.0 when `predicts` is empty
        (instead of raising ZeroDivisionError).
    """
    n = len(predicts)
    if n == 0:
        return 0.0

    correct = sum(1 for i in range(n) if predicts[i] == labels[i])
    return correct / n

def PrintEvaluation(predicts, labels, emotions):
    """Print micro-averaged scores, macro-averaged scores and accuracy.

    All three metrics are computed first, then printed one per line in the
    order micro, macro, accuracy.
    """
    report = (
        ("Microf: ", micro_f(predicts, labels, emotions)),
        ("Macrof: ", macro_f(predicts, labels, emotions)),
        ("Accuracy: ", accuracy(predicts, labels)),
    )
    for caption, score in report:
        print(caption, score)

# The Visualization of confusion matrix

In [92]:
from sklearn.metrics import confusion_matrix,classification_report
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    With `normalize=True` every row is divided by its row sum before plotting
    and cells are shown with two decimals; otherwise raw counts are shown.
    A one-line note stating which variant is drawn is printed first.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_color = "white" if cm[row, col] > midpoint else "black"
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Output the Final Evaluation

In [100]:
# The predictions are emotion names, while the stored labels are integer class
# ids. Map the ids to names through Emo (whose index is the class id) so both
# sides are compared in the same representation.
train_label_names = [Emo[int(label)] for label in train_labels]
validation_label_names = [Emo[int(label)] for label in validation_labels]

print("Train:")
PrintEvaluation(train_emotion, train_label_names, Emo)

print("\nValidation:")
PrintEvaluation(val_emotion, validation_label_names, Emo)

In [None]:
# Convert the integer class ids to emotion names so they are comparable with the
# predicted names; labels=Emo fixes the row/column order to match the tick labels.
cm = confusion_matrix([Emo[int(label)] for label in train_labels], train_emotion, labels=Emo)
plot_confusion_matrix(cm, Emo)

In [None]:
# val_emotion holds predictions for the hold-out split (validation_inputs), so the
# ground truth must come from validation_labels — not from df_val, which is the
# original val.txt file and is not row-aligned with those predictions.
cm = confusion_matrix([Emo[int(label)] for label in validation_labels], val_emotion, labels=Emo)
plot_confusion_matrix(cm, Emo)

# Version

In [75]:
# Record the interpreter and library versions this notebook was run with
!python --version
print("numpy:",np.__version__)
print("pandas:",pd.__version__)
print("torch:",torch.__version__)
print("seaborn:",sns.__version__)