# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
!pip install contractions
import contractions
import re
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [2]:
from platform import python_version

# Print the version of the Python interpreter running this notebook.
print(python_version())

In [3]:
# Display the installed TensorFlow version string.
tf.version.VERSION

In [4]:
import transformers
# Print the installed transformers library version.
print(transformers.__version__)

In [5]:
# Each split is a ';'-separated text file without a header row, so the column
# names are supplied explicitly.
DATA_DIR = '/kaggle/input/emotions'
COLUMNS = ['text', 'emotion']

train_data, val_data, test_data = (
    pd.read_csv(f'{DATA_DIR}/{split}.txt', names=COLUMNS, sep=';')
    for split in ('train', 'val', 'test')
)
train_data.head()

In [6]:
# Count how often each distinct (text, emotion) row occurs in the training split.
train_data.value_counts()

In [7]:
# Count how often each distinct (text, emotion) row occurs in the test split.
test_data.value_counts()

In [8]:
# Count how often each distinct (text, emotion) row occurs in the validation split.
val_data.value_counts()

# EDA

**Check for null values in train, val and test dataset**

In [9]:
# Map a display label to each split; the EDA cells iterate over this dict.
data = {
    'Train Data': train_data,
    'Validation Data': val_data,
    'Test Data': test_data,
}
for split_name, frame in data.items():
    # Per-column count of missing values for this split.
    print(split_name)
    print(frame.isnull().sum())
    print('*' * 20)

**Class Distribution in Train, val and test dataset**

In [10]:
# One count plot per split, side by side, showing how many rows each emotion has.
fig, axes = plt.subplots(1, 3, figsize=(30, 10))
for axis, (split_name, frame) in zip(axes, data.items()):
    sns.countplot(ax=axis, x='emotion', data=frame)
    axis.set_title(split_name + ' Class Frequency', size=14)
    axis.set_ylabel('Frequency', size=14)
    axis.set_xlabel(split_name + ' Class', size=14)

It is evident that the dataset is highly imbalanced: the "joy" class has the highest frequency and the "surprise" class has the lowest frequency in all three datasets.

**Word Cloud** to get most frequent words.

In [11]:
def plot_cloud(wordcloud, temp):
    """Show a word-cloud image on a new 10x10 inch figure.

    Parameters
    ----------
    wordcloud
        Image-like object accepted by ``imshow`` (this notebook passes a
        generated ``WordCloud``).
    temp : str
        Label prepended to ``' Word Cloud'`` to form the figure title.
    """
    _, axis = plt.subplots(figsize=(10, 10))
    axis.set_title(temp + ' Word Cloud', size=16)
    axis.imshow(wordcloud)
    # Hide ticks and frame: axis details carry no meaning for a word cloud.
    axis.axis("off")

In [12]:
for split_name, frame in data.items():
    # Join every sentence of the split into one string for the cloud generator.
    corpus = ' '.join(frame.text)
    cloud = WordCloud(width=600, height=600).generate(corpus)
    plot_cloud(cloud, split_name)

# Pre-Processing

**Preprocessing includes:**
1. Removing stopwords (without removing negative words)
2. Expand Contractions
3. Lemmatization

**Note:** Negation words are removed from the set of stopwords so that they are kept in the text; otherwise "I am not happy" would become "happy" after preprocessing. In short, dropping them can change the semantic meaning of a sentence and lead to incorrect training.

In [13]:
# Negation / contrast words that must survive stop-word removal so that
# e.g. "i am not happy" does not collapse to "happy".
_NEGATION_WORDS = frozenset([
    'not', 'neither', 'nor', 'but', 'however',
    'although', 'nonetheless', 'despite', 'except',
    'even though', 'yet',
])

# Lazily built resources shared by every preprocess() call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = set(stopwords.words('english')) - _NEGATION_WORDS
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess(sentence):
    """Clean one sentence for the emotion classifier.

    Steps, in order:
      1. expand contractions on the raw text,
      2. replace every non-letter character with a space,
      3. lower-case and split on whitespace,
      4. drop English stop words (negation words are kept),
      5. lemmatize each remaining token.

    Parameters
    ----------
    sentence : str
        Raw input text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; an empty
        string when nothing is left.
    """
    stop_words, lemmatizer = _preprocess_resources()

    # Expand first: once apostrophes are stripped a contraction such as
    # "can't" can no longer be recognised.
    expanded = contractions.fix(sentence)

    # '[^A-Za-z]' rather than '[^A-z]': the A-z range also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would slip through.
    letters_only = re.sub('[^A-Za-z]', ' ', expanded)

    # Lower-case before the stop-word test so capitalised stop words
    # ("The", "I") are removed as well.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in letters_only.lower().split()
        if token not in stop_words
    ]
    return ' '.join(tokens)


In [14]:
# Run the same preprocess() pipeline over the text column of every split.
# The column is overwritten in place, so this cell should run once per kernel session.
for frame in (train_data, val_data, test_data):
    frame['text'] = frame['text'].apply(preprocess)

**Note:** Because the class imbalance is evident, RandomOverSampler is used to add data (by repeating samples) to every class except the most frequent one.

In [15]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the training split by repeating existing rows; the fixed
# random_state keeps the resampling reproducible.
ros = RandomOverSampler(random_state=0)
texts_2d = train_data['text'].to_numpy().reshape(-1, 1)
labels_2d = train_data['emotion'].to_numpy().reshape(-1, 1)
train_x, train_y = ros.fit_resample(texts_2d, labels_2d)

# train_x comes back with one column; unwrap it into plain strings again.
train = pd.DataFrame(
    list(zip((row[0] for row in train_x), train_y)),
    columns=['text', 'emotion'],
)

Apply OneHotEncoder to the target column of every dataset.

In [16]:
from sklearn import preprocessing

# Fit the encoder once, on the training labels only, and reuse that fitted
# mapping for test/val. Re-fitting on each split (as before) lets the one-hot
# column order depend on which labels happen to occur in that split; with a
# single fit, an unseen label raises instead of silently shifting columns.
le = preprocessing.OneHotEncoder()
y_train = le.fit_transform(np.array(train['emotion']).reshape(-1, 1)).toarray()
y_test = le.transform(np.array(test_data['emotion']).reshape(-1, 1)).toarray()
y_val = le.transform(np.array(val_data['emotion']).reshape(-1, 1)).toarray()

# Encoding

There is a very helpful function called encode_plus provided in the Tokenizer class. It can seamlessly perform the following operations:

* Tokenize the text and Add special tokens - [CLS] and [SEP]
* create input IDs
* Pad the sentences to a maximum length
* Create attention masks for the above PAD tokens

**Note:** RoBERTa uses byte-level Byte-Pair Encoding (BPE), in contrast to BERT's WordPiece tokenization.

In [17]:
from transformers import RobertaTokenizerFast
# Load the pretrained "roberta-base" tokenizer; the encode helpers read this global.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [18]:
def roberta_encode(data, maximum_length):
    """Tokenize every sentence in ``data.text`` into fixed-length model inputs.

    Uses the notebook-level ``tokenizer``.

    Parameters
    ----------
    data
        Object with a ``text`` attribute holding the sentences (the notebook
        passes DataFrames with a ``text`` column).
    maximum_length : int
        Length every sequence is padded or truncated to, special tokens included.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, attention_masks)``, one row per sentence.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values rather than data.text[i]: label-based lookup
    # only works when the index is exactly 0..n-1.
    for sentence in data.text:
        encoded = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=maximum_length,
            # padding='max_length' replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            # Request truncation explicitly so over-long sentences are cut to
            # maximum_length and every row has the same width.
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

In [19]:
# Sequence length = word count of the longest (preprocessed) training sentence.
# NOTE(review): this counts whitespace-separated words, not RoBERTa sub-word
# tokens, and leaves no room for the special tokens, so the longest sentences
# may get truncated — confirm this is acceptable.
max_len = max(len(sentence.split()) for sentence in train_data['text'])

train_input_ids, train_attention_masks = roberta_encode(train, max_len)
test_input_ids, test_attention_masks = roberta_encode(test_data, max_len)
val_input_ids, val_attention_masks = roberta_encode(val_data, max_len)

# Create Model

**How is RoBERTa better than BERT?**

Changes in Pre-Training:
* without NSP objective 
* with dynamic mask generation

Changes in Data:
* Trained on more data (16GB BERT vs 160GB RoBERTa)
* Trained on large batches

**Note:** A prerequisite is to be familiar with BERT.

References:
* Bert: https://arxiv.org/pdf/1810.04805.pdf
* The Illustrated BERT, ELMo, and co. (How NLP Cracked Transfer Learning): http://jalammar.github.io/illustrated-bert/
* The Illustrated Transformer: https://jalammar.github.io/illustrated-transformer/
* Attention Is All You Need: https://arxiv.org/pdf/1706.03762.pdf
* Query,key,value vector: https://stats.stackexchange.com/questions/421935/what-exactly-are-keys-queries-and-values-in-attention-mechanisms

In [24]:
def create_model(bert_model, max_len, num_classes=4):
    """Build and compile a softmax classifier on top of a transformer encoder.

    Parameters
    ----------
    bert_model
        Callable Keras-compatible encoder taking ``[input_ids, attention_masks]``
        (the notebook passes a ``TFRobertaModel``).
    max_len : int
        Fixed sequence length of both inputs.
    num_classes : int, default 4
        Number of output units. Must equal the number of columns of the
        one-hot targets used for training.
        NOTE(review): the default keeps the previously hard-coded value;
        verify it against the label encoder's category count.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam (learning rate 1e-5), categorical
        cross-entropy loss and an accuracy metric.
    """
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')

    # Element 1 of the encoder output is fed to the classification head
    # (for TFRobertaModel this is the pooled output — confirm for other encoders).
    pooled = bert_model([input_ids, attention_masks])[1]

    probabilities = tf.keras.layers.Dense(num_classes, activation='softmax')(pooled)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=probabilities)
    model.compile(Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [25]:
from transformers import TFRobertaModel
# Load the pretrained "roberta-base" encoder that create_model() wraps.
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

In [26]:
# Build the classifier for sequences of length max_len and print its layer summary.
model = create_model(roberta_model, max_len)
model.summary()

# Model Training

In [27]:
# Fine-tune for 2 epochs in batches of 100; validation_data is passed so
# history also records the val_* metrics plotted in the following cells.
train_inputs = [train_input_ids, train_attention_masks]
val_inputs = [val_input_ids, val_attention_masks]
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=2,
    batch_size=100,
)

**Plotting Accuracy and Loss (Training and Validation)**

In [None]:
# Training vs. validation accuracy per epoch.
fig, axis = plt.subplots()
axis.plot(history.history['accuracy'], label='train')
axis.plot(history.history['val_accuracy'], label='val')
axis.set_title('model accuracy')
axis.set_ylabel('accuracy')
axis.set_xlabel('epoch')
axis.legend(loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
fig, axis = plt.subplots()
axis.plot(history.history['loss'], label='train')
axis.plot(history.history['val_loss'], label='val')
axis.set_title('model loss')
axis.set_ylabel('loss')
axis.set_xlabel('epoch')
axis.legend(loc='upper left')
plt.show()

**RoBERTa Model prediction on Test Data**

In [29]:
# Softmax outputs for every test sentence, one row per sentence.
result = model.predict([test_input_ids, test_attention_masks])

# Turn each row into a one-hot vector marking its arg-max class, so the
# predictions have the same layout as y_test.
predicted_class = result.argmax(axis=1)
y_pred = np.eye(result.shape[1], dtype=result.dtype)[predicted_class]

**Accuracy and F1 Score of Model**

In [30]:
from sklearn.metrics import accuracy_score, f1_score

# Compare the one-hot predictions with the one-hot ground truth of the test split.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print('Accuracy ', accuracy)
print('F1 Score :', f1)

In [32]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix

# The columns of y_test / y_pred follow the fitted encoder's category order,
# so the row labels are read from the encoder. A hand-written list in a
# different order (or of a different length) would mislabel the rows.
class_names = [str(name) for name in le.categories_[0]]
print(classification_report(y_test, y_pred, target_names=class_names, digits=4))

In [33]:
# Save the fine-tuned model's weights to 'my_checkpoint.h5' in the working directory;
# inference() loads them back from the same file name.
model.save_weights('my_checkpoint.h5')

In [None]:
import os
# './' is the current directory, so this leaves the working directory unchanged.
os.chdir(r'./')

In [None]:
from IPython.display import FileLink
# Render a clickable link to the saved checkpoint file in the notebook output.
FileLink(r'./my_checkpoint.h5')

# Model Inference

In [None]:
def plot_result(result):
    """Draw a bar chart of the confidence assigned to each emotion category.

    Parameters
    ----------
    result : pandas.DataFrame
        Table with a ``'Category'`` column (x axis) and a ``'Confidence'``
        column (bar height).
    """
    axis = sns.barplot(x='Category', y='Confidence', data=result)
    axis.set_xlabel('Categories', size=14)
    axis.set_ylabel('Confidence', size=14)
    axis.set_title('Emotion Classification', size=16)

In [None]:
def roberta_inference_encode(data, maximum_length):
    """Tokenize a single sentence into a batch of one fixed-length model input.

    Uses the notebook-level ``tokenizer``.

    Parameters
    ----------
    data : str
        The (already preprocessed) sentence to encode.
    maximum_length : int
        Length the sequence is padded or truncated to, special tokens included.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, attention_masks)``, each with a single row.
    """
    encoded = tokenizer.encode_plus(
        data,
        add_special_tokens=True,
        max_length=maximum_length,
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        # Request truncation explicitly so an over-long sentence is cut to
        # maximum_length and matches the model's fixed input width.
        truncation=True,
        return_attention_mask=True,
    )

    # Wrap each sequence in a list to add the leading batch dimension.
    return np.array([encoded['input_ids']]), np.array([encoded['attention_mask']])

In [None]:
def inference(text_sentence, max_len):
    """Classify the emotion of one sentence and plot the per-class confidence.

    The sentence is preprocessed and encoded, then a model is built, the
    weights saved in ``'my_checkpoint.h5'`` are loaded into it, and the
    prediction is plotted. The model is rebuilt and the weights reloaded on
    every call.

    Parameters
    ----------
    text_sentence : str
        Raw input sentence.
    max_len : int
        Sequence length used both to pad the input and to build the model.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Category'`` and ``'Confidence'`` (percentage rounded to
        two decimals), one row per class.
    """
    preprocessed_text = preprocess(text_sentence)
    input_ids, attention_masks = roberta_inference_encode(preprocessed_text, maximum_length=max_len)

    # Build the model for the same length the input was padded to. The former
    # hard-coded 35 only worked when the caller happened to pass max_len == 35.
    model = create_model(roberta_model, max_len)
    model.load_weights('my_checkpoint.h5')
    probabilities = model.predict([input_ids, attention_masks])[0]

    # Class names come from the fitted one-hot encoder, in its column order
    # (the original author noted:
    #  le.categories_[0] = ['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']).
    labels = list(le.categories_[0])
    confidences = [round(x * 100, 2) for x in probabilities]
    result = pd.DataFrame(list(zip(labels, confidences)), columns=['Category', 'Confidence'])

    plot_result(result)
    return result

In [None]:
# Bare name: this only displays the function object; call inference(text, max_len) to get a prediction.
inference