## Importing libraries

In [1]:
import numpy as np
import pandas as pd 
import os
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

## Preparing dataset

In [2]:
train=pd.read_csv(
    os.path.join(os.getcwd(),"../static/dataset/train.txt"),
    sep=";",
    names=["Text","Emotion"],
    encoding='utf-8')

validate=pd.read_csv(
   os.path.join(os.getcwd(),"../static/dataset/val.txt"),
    sep=";",
    names=["Text","Emotion"],
    encoding='utf-8')

test=pd.read_csv( 
    os.path.join(os.getcwd(),"../static/dataset/test.txt"),
    sep=";",
    names=["Text","Emotion"],
    encoding='utf-8')

merged = pd.concat([train, validate, test], ignore_index=True)

datasets = [train, validate, test]
datasetnames = ['Train', 'Validate', 'Test']


In [3]:
train.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


### Removing stopwords

In [4]:
from nltk.corpus import stopwords

train['Text'] = train['Text'].apply(lambda x: ' '.join([item for item in str(x).split() if item not in stopwords.words('english')]))
validate['Text'] = validate['Text'].apply(lambda x: ' '.join([item for item in str(x).split() if item not in stopwords.words('english')]))

In [5]:
train.head()

Unnamed: 0,Text,Emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger


Converting words to lower case

In [6]:
train['Text'] = train['Text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train

Unnamed: 0,Text,Emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger
...,...,...
15995,brief time beanbag said anna feel like beaten,sadness
15996,turning feel pathetic still waiting tables sub...,sadness
15997,feel strong good overall,joy
15998,feel like rude comment im glad,anger


Removing Punctuation, Symbols

In [7]:
train['Text'] = train['Text'].str.replace('[^\w\s]',' ')
train

Unnamed: 0,Text,Emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger
...,...,...
15995,brief time beanbag said anna feel like beaten,sadness
15996,turning feel pathetic still waiting tables sub...,sadness
15997,feel strong good overall,joy
15998,feel like rude comment im glad,anger


Lemmatisation

In [8]:
train['Text'] = train['Text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train

Unnamed: 0,Text,Emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger
...,...,...
15995,brief time beanbag said anna feel like beaten,sadness
15996,turning feel pathetic still waiting table subb...,sadness
15997,feel strong good overall,joy
15998,feel like rude comment im glad,anger


Correcting Letter Repetitions

In [9]:
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

train['Text'] = train['Text'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))
train


Unnamed: 0,Text,Emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger
...,...,...
15995,brief time beanbag said anna feel like beaten,sadness
15996,turning feel pathetic still waiting table subb...,sadness
15997,feel strong good overall,joy
15998,feel like rude comment im glad,anger


## 1. Deep learning approach

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_texts = train['Text']

tokenizer = Tokenizer(15212,lower=True,oov_token='UNK')

tokenizer.fit_on_texts(train_texts)

print('Found %d unique words.' % len(tokenizer.word_index))

# texts_to_sequences: Transforms each text in texts to a sequence of integers. 
# It basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary.

train_texts_sequences = tokenizer.texts_to_sequences(train_texts)

# pad_sequences: Ensure that all sequences in a list have the same length. 
train_texts_pad_sequences = pad_sequences(train_texts_sequences, maxlen=80, padding='post') 

Found 13454 unique words.


In [11]:
train.Emotion.unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [12]:
from tensorflow.keras.utils import to_categorical
emotions = {'sadness': 0, 'joy': 1, 'surprise': 2, 'love': 3, 'anger': 4, 'fear': 5}

# Step 1: Replace all emotion values with integers
train['Emotion'] = train.Emotion.replace(emotions)
train_emotion_integers = train['Emotion'].values

# Step 2: Changing the integers to binary
train_emotion_categorical = to_categorical(train_emotion_integers)

train_emotion_categorical[:6] 

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.]], dtype=float32)

In [13]:
validate_texts = validate['Text']
validate_emotion_integers = validate.Emotion.replace(emotions)
validate_texts_sequences = tokenizer.texts_to_sequences(validate_texts)
validate_texts_pad_sequences = pad_sequences(validate_texts_sequences, maxlen=80, padding='post')
validate_emotion_categorical = to_categorical(validate_emotion_integers.values)
validate_emotion_categorical[:6]

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]], dtype=float32)

In [14]:
import tensorflow as tf
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  print("All devices: ", tf.config.list_logical_devices('TPU'))
  tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
  tpu_strategy = tf.distribute.get_strategy() 

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
    model=Sequential()
    model.add(Embedding(15212,64,input_length=80))
    model.add(Dropout(0.6))
    model.add(Bidirectional(LSTM(80,return_sequences=True)))
    model.add(Bidirectional(LSTM(160)))
    model.add(Dense(len(emotions),activation='softmax'))
    print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 64)            973568    
                                                                 
 dropout (Dropout)           (None, 80, 64)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 80, 160)          92800     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 320)              410880    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 6)                 1926      
                                                                 
Total params: 1,479,174
Trainable params: 1,479,174
Non-

In [16]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
hist=model.fit(train_texts_pad_sequences, train_emotion_categorical, epochs=10, validation_data = (validate_texts_pad_sequences, validate_emotion_categorical))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
test['Text'] = test['Text'].apply(lambda x: ' '.join([item for item in str(x).split() if item not in stopwords.words('english')]))

test_texts = test['Text']
test_emotion_integers = test.Emotion.replace(emotions)
test_texts_sequences = tokenizer.texts_to_sequences(test_texts)
test_texts_pad_sequences = pad_sequences(test_texts_sequences, maxlen=80, padding='post')
test_emotion_categorical = to_categorical(test_emotion_integers.values)
test_emotion_categorical[:7]

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [19]:
model.evaluate(test_texts_pad_sequences, test_emotion_categorical)



[0.24799197912216187, 0.9194999933242798]

In [20]:
from nltk.tokenize import word_tokenize

def get_key(value):
    for key,val in emotions.items():
          if (val==value):
            return key

def remove_stopwords(sentence):
    text_tokens = word_tokenize(sentence)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]
    return (" ").join(tokens_without_sw)
        
def predict(sentence):
    sentence = remove_stopwords(sentence.lower())
    print(sentence)
    sentence_lst=[]
    sentence_lst.append(sentence)
    sentence_seq=tokenizer.texts_to_sequences(sentence_lst)
    sentence_padded=pad_sequences(sentence_seq,maxlen=80,padding='post')
    certaintyprediction = model.predict(sentence_padded)[0]
    for key,val in emotions.items():
          print(key + ': ' + str(round(certaintyprediction[val]*100, 2)) + ' %')
    bestpredictionindex = np.argmax(certaintyprediction)
    certainty = str(round(certaintyprediction[bestpredictionindex]*100, 2))
    print('\nI am '+ certainty + ' % sure the emotion is ' + get_key(bestpredictionindex) + '.')

In [21]:
predict("You are being very rude.")

rude .
sadness: 0.37 %
joy: 0.03 %
surprise: 0.02 %
love: 0.05 %
anger: 99.44 %
fear: 0.09 %

I am 99.44 % sure the emotion is anger.


In [22]:
predict("I surprised my dog")

surprised dog
sadness: 6.32 %
joy: 6.01 %
surprise: 60.18 %
love: 3.81 %
anger: 2.04 %
fear: 21.64 %

I am 60.18 % sure the emotion is surprise.


## 2. Machine learning models Approach 

In [23]:
X_train, X_val, y_train, y_val = train_test_split(train.Text.values, train.Emotion, random_state=42, test_size=0.1)

In [24]:
from sklearn.pipeline import Pipeline

In [25]:
exm = ['I am very happy today! The atmosphere looks cheerful',
        'Things are looking great. It was such a good day',
        'Success is right around the corner. Lets celebrate this victory',
        'Everything is more beautiful when you experience them with a smile!',
        'Now this is my worst, okay? But I am gonna get better.',
        'I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain. I am tired of all the pain I feel',
        'This is quite depressing. I am filled with sorrow',
        'His death broke my heart. It was a sad day', 
        'i hate this',
        'I dont love you anymore..!',
        'This looks so impressive',
        'surprised',
        'like']

In [26]:
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

In [27]:
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(train['Text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

### 2.1 Logistic regression 

#### 2.1.1 Logistic regression with tfidf

In [28]:
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('logistic regression tfidf accuracy %s' % accuracy_score(y_pred, y_val))

logistic regression tfidf accuracy 0.283125


#### 2.1.2 Logistic regression with count vectors

In [29]:
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('logistic regression count vectors accuracy %s' % accuracy_score(y_pred, y_val))

logistic regression count vectors accuracy 0.90375


### 2.2  Multinomial naive bayes

#### 2.2.1 Multinomial naive bayes with tfdif

In [30]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes tfidf accuracy 0.289375


#### 2.2.2  Multinomial naive bayes with count vectors

In [31]:
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vectors accuracy 0.795625


### 2.3 Multilayer perceptron

#### 2.3.1 Multilayer perceptron with tfidf

In [32]:
mlp = MLPClassifier()
mlp.fit(X_train_tfidf, y_train)
y_pred = mlp.predict(X_val_tfidf)
print('MLP tfidf accuracy %s' % accuracy_score(y_pred, y_val))

MLP tfidf accuracy 0.265


#### 2.3.2 Multilayer perceptron with count vectors

In [33]:
mlp = MLPClassifier()
mlp.fit(X_train_count, y_train)
y_pred = mlp.predict(X_val_count)
print('MLP count vectors accuracy %s' % accuracy_score(y_pred, y_val))

MLP count vectors accuracy 0.864375


### 2.4 Linear SVM

#### 2.4.1  LSVM with tfdif

In [34]:
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('lsvm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using tfidf accuracy 0.28375


#### 2.4.2  LSVM with count vectors

In [35]:
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using count vectors accuracy 0.90125


-----------------------------------------------------------------------------

### Model Storing

In [37]:
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense
import numpy
import os
model_json = model.to_json()
with open("../static/model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("../static/model/model.h5")
print("Saved model to disk")

Saved model to disk


In [39]:
import pickle
# dump information to that file
pickle.dump(tokenizer, open('../static/model/tokenizer.pkl', 'wb'))

### Model Testing

In [40]:
import numpy as np
from flask import Flask, request, render_template
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import model_from_json
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')
import pickle

In [41]:
def get_key(value):
    for key,val in emotions.items():
          if (val==value):
            return key

def remove_stopwords(sentence):
    text_tokens = word_tokenize(sentence)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]
    return (" ").join(tokens_without_sw)
        
def predict_deep(sentence,model):
    sentence = remove_stopwords(sentence.lower())
    sentence_lst=[]
    sentence_lst.append(sentence)
    sentence_seq=tokenizer.texts_to_sequences(sentence_lst)
    sentence_padded=pad_sequences(sentence_seq,maxlen=80,padding='post')
    certaintyprediction = model.predict(sentence_padded)[0]
    rescertainity = [round(x*100) for x in certaintyprediction]
    #print(rescertainity)
    bestpredictionindex = np.argmax(certaintyprediction)
    
    certainty = str(round(certaintyprediction[bestpredictionindex]*100, 2))
    return [rescertainity, get_key(bestpredictionindex)]

In [44]:
emotions = {'sadness': 0, 'joy': 1, 'surprise': 2, 'love': 3, 'anger': 4, 'fear': 5}

tokenizer_file = open("../static/model/tokenizer.pkl","rb")
tokenizer = pickle.load(tokenizer_file)
tokenizer_file.close()

json_file = open('../static/model/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("../static/model/model.h5")

In [45]:
sentence = "I love animals"
percents, mood = predict_deep(sentence,loaded_model)
print(percents)

[11, 25, 9, 12, 15, 28]


### Conclusion
Successfully implemented emotion detection for text documents.