In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, SpatialDropout1D, Dropout, Activation
from tensorflow.keras.optimizers import Adam

In [3]:
from text_cleaning import text_cleaning

# Cleaning of data

In [28]:
data = pd.read_csv('tweet_emotions.csv')
data = data.drop(columns=['tweet_id'])
data

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,neutral,@JohnLloydTaylor
39996,love,Happy Mothers Day All my love
39997,love,Happy Mother's Day to all the mommies out ther...
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [29]:
# removing duplicate values

a = len(data)
data = data.drop_duplicates(subset=['content']).reset_index(drop=True)
b = len(data)
print('number of duplicate values removed: ', a-b)

number of duplicate values removed:  173


In [30]:
# removing rows with nan values

a = len(data)
data = data.dropna()
data = data.reset_index(drop=True)
b = len(data)
print('number of rows with NaN values removed: ', a-b)

number of rows with NaN values removed:  0


In [31]:
# shuffling the dataset

data = data.sample(frac=1, random_state=123).reset_index(drop=True)   
data

Unnamed: 0,sentiment,content
0,sadness,@fossiloflife was being sarcy as usual
1,worry,@evatweets feel better soon! Your immune syste...
2,worry,@TrevorAB aww you poor dear! But it was awesom...
3,worry,"it would sicken most, the number of ways I am ..."
4,fun,@lauramorris1983 good luck with it..Do well!! ...
...,...,...
39822,worry,@kakoivisto checked with the hubster and I thi...
39823,surprise,@LoriBartolozzi Wow That had to be difficult
39824,love,@swtcupcake Not sure it didn't say it was 2 bi...
39825,love,"@brinahaha i'm just starting it brinn, guess w..."


In [32]:
data.sentiment.nunique()

13

In [33]:
data.sentiment.value_counts()

neutral       8598
worry         8437
happiness     5184
sadness       5154
love          3785
surprise      2181
fun           1775
relief        1522
hate          1322
empty          822
enthusiasm     758
boredom        179
anger          110
Name: sentiment, dtype: int64

In [34]:
'''
clean_data = data.copy()

for i in tqdm(range(len(data))):
    clean_data.loc[i, 'content'] = str(text_cleaning(data.loc[i, 'content']))
    
clean_data.to_csv('clean_data.csv', index=False)
    '''

clean_data = pd.read_csv('clean_data.csv')

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 39827/39827 [01:55<00:00, 346.06it/s]


In [35]:
clean_data

Unnamed: 0,sentiment,content
0,sadness,sarcy usual
1,worry,feel better soon immune system must away holid...
2,worry,aww poor dear awesome came race
3,worry,would sicken number ways able tweet
4,fun,good luck itdo well best wishes
...,...,...
39822,worry,checked hubster think well pass feeling lazy c...
39823,surprise,wow difficult
39824,love,sure didnt say 2 big jst saw pics u ur last bd...
39825,love,im starting brinn guess get laptop back cant u...


In [36]:
type(clean_data.content[39824])

str

# Tokenization of tweets

In [37]:
vocab_size = 10000
max_length = 20
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [38]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(clean_data.content.to_list())

In [39]:
tokenizer.word_index.items()



In [40]:
len(tokenizer.word_index.items())

35375

In [211]:
clean_data.content[345]

'jealousur koi fish getting ur attention im notlmao im kidding'

In [41]:
data_sequences = tokenizer.texts_to_sequences(clean_data.content.to_list())
data_sequences[345]

[1, 5688, 1495, 66, 111, 1647, 2, 1, 2, 1049]

In [42]:
data_padded = pad_sequences(data_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
data_padded[345]

array([   1, 5688, 1495,   66,  111, 1647,    2,    1,    2, 1049,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

# Tokenization of labels

In [43]:
label_maping = {'neutral':0,
                'worry':1,
                'happiness':2,
                'sadness':3,
                'love':4,
                'surprise':5,
                'fun':6,
                'relief':7,
                'hate':8,
                'empty':9,
                'enthusiasm':10,
                'boredom':11,
                'anger':12 }
y_data = list()
for i in range(len(clean_data)):
    y_data.append(np.eye(13)[label_maping[clean_data.sentiment[i]]])

y_data = np.array(y_data, dtype=int)
y_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

# Mapping tweets with glove word embeddings

In [44]:
embeddings = {}
glove = open('glove.6B.200d.txt', 'r', encoding = 'utf-8').read()

In [45]:
for line in tqdm(glove.split("\n")):
    values = line.split(" ")
    word = values[0]
    indices = np.asarray(values[1: ], dtype='float32')
    embeddings[word] = indices

100%|█████████████████████████████████████████████████████████████████████████████████████████| 400001/400001 [00:21<00:00, 18722.57it/s]


In [46]:
embedding_dim = 200
emb_matrix = np.zeros((vocab_size+1, embedding_dim)) 
for word, i in tokenizer.word_index.items():
    if i<vocab_size+1:
        embedding_vec = embeddings.get(word) 
        if embedding_vec is not None: 
            emb_matrix[i] = embedding_vec 
        
emb_matrix.shape

(10001, 200)

In [47]:
emb_matrix[2]

array([ 0.63559997, -0.061983  ,  0.68572998,  0.35927001,  0.77188998,
       -0.17863999,  0.42192999,  0.36329001,  0.17679   ,  0.39195001,
       -0.34388   ,  0.061045  ,  0.51356   ,  0.0051416 ,  0.41281   ,
        0.13254   ,  0.24344   , -0.0025233 ,  0.096909  ,  0.035513  ,
       -0.32201001,  0.66012001, -0.027213  , -0.68074   , -0.0055015 ,
       -0.067808  ,  0.59719002,  0.59227002, -0.087605  ,  0.73422003,
       -0.25418001,  0.083747  , -0.61884999,  0.01206   ,  0.53399003,
        0.48798001, -0.89778   , -0.44477999, -0.66118002,  0.41505   ,
       -0.43263   ,  0.34933001, -0.81779999,  0.065648  , -0.39328   ,
       -0.37671   ,  0.24209   , -0.70943999, -0.17702   , -0.080336  ,
       -0.22130001,  0.52446002,  0.92953998, -0.27930999, -0.48824   ,
        0.19778   ,  0.55462998,  0.78573   , -1.13349998, -0.10334   ,
        0.55264997,  0.69869   , -0.13896   ,  0.68254   , -0.14053001,
       -0.067644  ,  0.060842  , -0.67729002,  0.17878   ,  0.08

# Splitting the dataset

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_padded, y_data, test_size=0.1, random_state=42)

In [49]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(35844, 20) (35844, 13)
(3983, 20) (3983, 13)


# Model training

In [258]:
model = Sequential([
    Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights= [emb_matrix], trainable=False),
    LSTM(128, activation='tanh'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(13, activation='softmax')
    ])


optimizer = Adam(learning_rate=0.001)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 20, 200)           2000200   
_________________________________________________________________
lstm_20 (LSTM)               (None, 128)               168448    
_________________________________________________________________
dropout_30 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_31 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 13)                845       
Total params: 2,177,749
Trainable params: 2,177,749
Non-trainable params: 0
___________________________________________

In [259]:
history = model.fit(X_train, y_train, validation_split=0.2, epochs = 10, batch_size=16)

Train on 28675 samples, validate on 7169 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')

plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [246]:
score = model.evaluate(X_test, y_test, verbose=1)
print("Test Loss:", score[0])
print("Test Accuracy:", score[1])

Test Loss: 2.123980106434689
Test Accuracy: 0.322119


In [249]:
i = 564
b = list([X_test[i]])
print(tokenizer.sequences_to_texts(b))

['hey david im wondering received letter song <OOV> please reply nice day <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>']


In [250]:
inv_map = {v: k for k, v in label_maping.items()}

myList = predictions[i]
a = np.flipud(np.array([i for i in sorted(enumerate(myList), key=lambda x:x[1])]))
print(inv_map[np.argmax(y_test[i])]  , '\n')
print(inv_map[a[0, 0]], ': ', a[0, 1]*100, '%')
print(inv_map[a[1, 0]], ': ', a[1, 1]*100, '%')
print(inv_map[a[2, 0]], ': ', a[2, 1]*100, '%')
print(inv_map[a[3, 0]], ': ', a[3, 1]*100, '%')
print(inv_map[a[4, 0]], ': ', a[4, 1]*100, '%')

happiness 

worry :  19.655297696590424 %
happiness :  17.645500600337982 %
love :  16.457316279411316 %
sadness :  13.430427014827728 %
neutral :  8.293849974870682 %


In [253]:
text = ['I am very happy today']
data_sequences = tokenizer.texts_to_sequences(text)
data_padded = pad_sequences(data_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
p = model.predict(data_padded)

myList = p[0]
a = np.flipud(np.array([i for i in sorted(enumerate(myList), key=lambda x:x[1])]))

print(data_sequences, '\n')
print(inv_map[a[0, 0]], ': ', a[0, 1]*100, '%')
print(inv_map[a[1, 0]], ': ', a[1, 1]*100, '%')
print(inv_map[a[2, 0]], ': ', a[2, 1]*100, '%')
print(inv_map[a[3, 0]], ': ', a[3, 1]*100, '%')
print(inv_map[a[4, 0]], ': ', a[4, 1]*100, '%')

[[1, 1, 1, 15, 11]] 

love :  27.83176302909851 %
happiness :  25.67422389984131 %
neutral :  12.188868224620819 %
relief :  9.449155628681183 %
worry :  9.38776209950447 %
