In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os 
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Importing Libraries
#!pip install nlp
#!pip install datasets
import tensorflow as tf
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nlp
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers import GlobalAvgPool1D

In [None]:
# Load the three dataset splits (read in the order: training, test, validation)
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/emotion-dataset/training.csv",
        "/kaggle/input/emotion-dataset/test.csv",
        "/kaggle/input/emotion-dataset/validation.csv",
    )
)

In [None]:
# Preview the first rows of the freshly loaded training frame
train.head()

In [None]:
# Integer class id -> emotion name (the id is the position in this list)
labels_dict = dict(enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

# Add a human-readable emotion column next to the numeric label
train['description'] = train['label'].map(labels_dict)

train.head()

# Tokenizing with NLTK

In [None]:
def tokenization(inputs):
    """Split one text string into a list of tokens using NLTK's word_tokenize."""
    tokens = word_tokenize(inputs)  # REFERENCE[1]
    return tokens


# Tokenize the raw text of the training and validation splits
for split_df in (train, val):
    split_df['text_tokenized'] = split_df['text'].apply(tokenization)

In [None]:
# Preview the training frame, which now also holds the 'text_tokenized' column
train.head()

Tokenization splits each text sample into its individual words (tokens). It is one of the key first steps in NLP applications, because the later stages (stop-word removal, lemmatization) operate on these tokens.

# Stopwords Removal

In [None]:
# English stop words from NLTK, held in a set for fast membership checks
stop_words = {*stopwords.words('english')}

def stopwords_remove(inputs):
    """Return the tokens of `inputs` that are not in `stop_words`, in their original order.

    The comparison is an exact string match against the module-level `stop_words` set.
    """
    kept_tokens = []
    for token in inputs:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Drop stop words from the tokenized text of the training and validation splits
for split_df in (train, val):
    split_df['text_stop'] = split_df['text_tokenized'].apply(stopwords_remove)

train.head()

# Lemmatization

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# One shared lemmatizer instance, reused for every token
lemmatizer = WordNetLemmatizer()

def lemmatization(inputs):
    """Lemmatize every token in `inputs`, treating each one as a verb (pos='v').

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for token in inputs:
        lemmas.append(lemmatizer.lemmatize(word=token, pos='v'))
    return lemmas

# Lemmatize the stop-word-free tokens of the training and validation splits
for split_df in (train, val):
    split_df['text_lemmatized'] = split_df['text_stop'].apply(lemmatization)

train.head()

# Joining Tokens into Sentences

In [None]:
# Turn each list of lemmas back into a single space-separated string
for split_df in (train, val):
    split_df['text_cleaned'] = split_df['text_lemmatized'].str.join(' ')

train.head() # Final form of the dataset

In [None]:
# Build the word cloud from the whole cleaned training corpus.
# - The result is stored under its own name so the imported `WordCloud` class is
#   not overwritten; otherwise this cell would fail when run a second time.
# - The texts are joined explicitly: str(series) would only give pandas' truncated
#   printout (index numbers, "...", dtype footer), not the actual text.
corpus_text = ' '.join(train['text_cleaned'])

word_cloud = WordCloud(max_words=100,
                       random_state=30,
                       collocations=True).generate(corpus_text)

plt.figure(figsize=(15, 8))
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Tokenizing with TensorFlow

In [None]:
# Vocabulary size limit passed to the tokenizer (and later to the embedding layer)
num_words = 10_000

# Words that the tokenizer does not keep are encoded with the '<OOV>' token
tokenizer = Tokenizer(oov_token='<OOV>', num_words=num_words)

# The vocabulary is learned from the cleaned training text only
tokenizer.fit_on_texts(train['text_cleaned'])

# Mapping from word to integer index, as built by fit_on_texts
word_index = tokenizer.word_index

In [None]:
# Encode the cleaned text of each split as lists of integer word indices
Tokenized_train, Tokenized_val = (
    tokenizer.texts_to_sequences(split_df['text_cleaned'])
    for split_df in (train, val)
)

# Padding

In [None]:
# Every sequence is brought to this fixed length
maxlen = 40

# Pad at the front ('pre') so that all rows have exactly `maxlen` entries
Padded_train, Padded_val = (
    pad_sequences(sequences, maxlen=maxlen, padding='pre')
    for sequences in (Tokenized_train, Tokenized_val)
)

# Show two training rows before and after padding
for position, row in enumerate((0, 10)):
    if position:
        print('--'*50)
    print('Non-padded Version: ', tokenizer.texts_to_sequences([train['text_cleaned'][row]]))
    print('Padded Version: ', Padded_train[row])

# Creating the Model

In [None]:
# Stacked bidirectional-LSTM classifier over the padded word-index sequences.
#
# Fixes compared with the previous version of this cell:
# - The Bidirectional(LSTM(...)) layers were only constructed and never passed to
#   model.add(), so the trained network contained no recurrent layers at all.
# - GlobalAvgPool1D directly after the embedding averaged away the time axis, which
#   the LSTM layers need as input. It is dropped here; the last LSTM (no
#   return_sequences) already reduces each sequence to a single vector.
# - Layers now come from the same `keras` import as Sequential instead of mixing in
#   `tf.keras` layers.
model = Sequential()

# Word indices -> 16-dimensional embeddings, one per time step
model.add(Embedding(num_words, 16, input_length=maxlen))

# The first two recurrent layers return the full sequence so the next LSTM can consume it
model.add(Bidirectional(LSTM(50, return_sequences=True, activation='relu')))
model.add(Dropout(0.3))

model.add(Bidirectional(LSTM(40, activation='relu', return_sequences=True)))
model.add(Dropout(0.3))

# The last recurrent layer returns only its final output (one vector per sample)
model.add(Bidirectional(LSTM(40, activation='relu')))
model.add(Dropout(0.3))

# One softmax output per emotion class (6 classes)
model.add(Dense(6, activation='softmax'))

# Labels are plain integer class ids, hence the sparse loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Number of passes over the training data
epochs = 20

# Train on the padded training sequences and report loss/accuracy on the
# validation split after every epoch; `hist` keeps the per-epoch metrics
hist = model.fit(
    Padded_train,
    train['label'],
    validation_data=(Padded_val, val['label']),
    epochs=epochs,
)

# Train and Validation Loss Graphs

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit
plt.figure(figsize=(5, 4))
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(hist.history[history_key], label=curve_label)
plt.title('Train and Validation Loss Graphs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Preparing the Test Data

In [None]:
# Run the test split through the same preprocessing steps as the training data:
# tokenize -> remove stop words -> lemmatize -> join back into one string
for new_column, source_column, step in (
    ('text_tokenized', 'text', tokenization),
    ('text_stop', 'text_tokenized', stopwords_remove),
    ('text_lemmatized', 'text_stop', lemmatization),
):
    test[new_column] = test[source_column].apply(step)
test['text_cleaned'] = test['text_lemmatized'].str.join(' ')

# Encode with the tokenizer fitted on the training text, then pad to `maxlen`
Tokenized_test = tokenizer.texts_to_sequences(test['text_cleaned'])
Padded_test = pad_sequences(Tokenized_test, maxlen=maxlen, padding='pre')

# Evaluate the trained model on the test split
test_evaluate = model.evaluate(Padded_test, test['label'])

# Making Predictions on the Test Data

In [None]:
# Integer class id -> emotion name (the id is the position in this list)
labels_dict = dict(enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

# Add a human-readable emotion column to the test frame
test['description'] = test['label'].map(labels_dict)

In [None]:
# Preview the test frame, including the preprocessing columns and 'description'
test.head()

In [None]:
def make_predictions(text_input):
    """Predict the emotion of a single piece of text and print it.

    The input is converted to `str` and passed through the same steps used for the
    datasets: tokenization, stop-word removal, lemmatization, re-joining, encoding
    with the fitted `tokenizer`, and front-padding to `maxlen`.

    Prints 'Predicted Emotion: <Name>' and returns the predicted class index
    (the argmax of the model output).
    """
    # Class index -> message; any index not listed falls back to Surprise
    messages = {
        0: 'Predicted Emotion: Sadness',
        1: 'Predicted Emotion: Joy',
        2: 'Predicted Emotion: Love',
        3: 'Predicted Emotion: Anger',
        4: 'Predicted Emotion: Fear',
    }

    tokens = lemmatization(stopwords_remove(tokenization(str(text_input))))
    encoded = tokenizer.texts_to_sequences([' '.join(tokens)])
    padded = pad_sequences(encoded, maxlen=maxlen, padding='pre')
    predicted_class = np.argmax(model.predict(padded))

    print(messages.get(int(predicted_class), 'Predicted Emotion: Surprise'))
    return predicted_class

import random

# Show the model's prediction for two consecutive test rows, starting at a random
# position. randint includes both end points, so the start is capped such that the
# second row (start + 1) always exists; the previous upper bound of len(test) - 1
# could select the last row and then fail when reading row i + 1.
rows_to_show = min(2, len(test))
first_pos = random.randint(0, len(test) - rows_to_show)

for offset in range(rows_to_show):
    pos = first_pos + offset
    if offset:
        print('-'*50)
    # .iloc selects by position, so this works regardless of the frame's index labels
    sample_text = test['text'].iloc[pos]
    print('Test Text:', sample_text)
    print(' ')
    print('Actual Emotion:', test['description'].iloc[pos])
    make_predictions(sample_text)

# Some Fun

In [None]:
# Classify a hand-written example sentence; the function prints the predicted
# emotion and its returned class index is shown as the cell output
make_predictions("Grandpa was very proud of me when I got a promotion at work. He took me out to dinner to celebrate.")

In [None]:
# Classify a second hand-written example sentence; the function prints the predicted
# emotion and its returned class index is shown as the cell output
make_predictions("Sometimes the people who appear to be the most confident are actually afraid of their own shadows.")