In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

# nlp library
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer

### deep learning 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN
from sklearn.model_selection import train_test_split
import re
import string
from textblob import TextBlob
import joblib 


In [24]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yasha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
test_data=open(r'Data\test.txt','r').readlines()
train_data=open(r'Data\train.txt','r').readlines()
val_data=open(r'Data\val.txt','r').readlines()

In [26]:
complete_data=val_data+train_data+test_data

In [27]:
len(complete_data)

20000

In [28]:
complete_data

['im feeling quite sad and sorry for myself but ill snap out of it soon;sadness\n',
 'i feel like i am still looking at a blank canvas blank pieces of paper;sadness\n',
 'i feel like a faithful servant;love\n',
 'i am just feeling cranky and blue;anger\n',
 'i can have for a treat or if i am feeling festive;joy\n',
 'i start to feel more appreciative of what god has done for me;joy\n',
 'i am feeling more confident that we will be able to take care of this baby;joy\n',
 'i feel incredibly lucky just to be able to talk to her;joy\n',
 'i feel less keen about the army every day;joy\n',
 'i feel dirty and ashamed for saying that;sadness\n',
 'i feel bitchy but not defeated yet;anger\n',
 'i was dribbling on mums coffee table looking out of the window and feeling very happy;joy\n',
 'i woke up often got up around am feeling pukey radiation and groggy;sadness\n',
 'i was feeling sentimental;sadness\n',
 'i walked out of there an hour and fifteen minutes later feeling like i had been beaten 

In [29]:
x=[]
y=[]
for item in complete_data:
    text,label=item.split(";")
    label=label.replace('\n','')
    x.append(text)
    y.append(label)

In [32]:
stem =PorterStemmer()
def text_preprocessing(text):
    clean_Text = []
    for sent in text:
        lower_Sent=sent.lower()
        word_tokenise=word_tokenize(lower_Sent)
        remove_stopwords=[word for word in word_tokenise if word not in stopwords.words('english')]
        stemmed_sent =  [stem.stem(word)for word in remove_stopwords]
        clean_test=" ".join(stemmed_sent)
        clean_Text.append(clean_test)

    return clean_Text

In [34]:
clean_Text=text_preprocessing(text=x)

In [35]:
Tokenizer = Tokenizer(oov_token="<nothing>")

In [36]:
Tokenizer.fit_on_texts(clean_Text)

In [37]:
Tokenizer.word_index

{'<nothing>': 1,
 'feel': 2,
 'like': 3,
 'im': 4,
 'get': 5,
 'time': 6,
 'know': 7,
 'realli': 8,
 'make': 9,
 'go': 10,
 'want': 11,
 'love': 12,
 'littl': 13,
 'think': 14,
 'peopl': 15,
 'day': 16,
 'thing': 17,
 'one': 18,
 'would': 19,
 'even': 20,
 'still': 21,
 'ive': 22,
 'life': 23,
 'bit': 24,
 'way': 25,
 'need': 26,
 'someth': 27,
 'much': 28,
 'dont': 29,
 'work': 30,
 'start': 31,
 'could': 32,
 'say': 33,
 'look': 34,
 'see': 35,
 'tri': 36,
 'back': 37,
 'good': 38,
 'pretti': 39,
 'come': 40,
 'right': 41,
 'alway': 42,
 'help': 43,
 'also': 44,
 'today': 45,
 'year': 46,
 'take': 47,
 'friend': 48,
 'use': 49,
 'around': 50,
 'cant': 51,
 'person': 52,
 'made': 53,
 'though': 54,
 'hate': 55,
 'well': 56,
 'got': 57,
 'happi': 58,
 'thought': 59,
 'someon': 60,
 'didnt': 61,
 'never': 62,
 'felt': 63,
 'find': 64,
 'write': 65,
 'lot': 66,
 'hope': 67,
 'quit': 68,
 'live': 69,
 'week': 70,
 'everi': 71,
 'sure': 72,
 'less': 73,
 'read': 74,
 'enough': 75,
 'give':

In [38]:
Tokenizer.document_count

20000

In [39]:
sequence =Tokenizer.texts_to_sequences(clean_Text)

In [44]:
sequence[0:5]
#output

[[4, 2, 68, 134, 262, 120, 1868, 458],
 [2, 3, 21, 34, 508, 2082, 508, 592, 887],
 [2, 3, 329, 6002],
 [2, 509, 1017],
 [573, 2, 534]]

In [45]:
sequence

[[4, 2, 68, 134, 262, 120, 1868, 458],
 [2, 3, 21, 34, 508, 2082, 508, 592, 887],
 [2, 3, 329, 6002],
 [2, 509, 1017],
 [573, 2, 534],
 [31, 2, 274, 143, 189],
 [2, 240, 113, 47, 96, 293],
 [2, 431, 373, 113, 83],
 [2, 73, 861, 2083, 71, 16],
 [2, 550, 333, 33],
 [2, 522, 496, 154],
 [4452, 1229, 1199, 1200, 34, 1061, 2, 58],
 [339, 155, 57, 50, 2, 6003, 2375, 514],
 [2, 786],
 [141, 196, 2844, 432, 638, 2, 3, 547, 1142, 126, 3184, 1499],
 [62, 127, 2, 148, 897, 144, 574, 373, 1685, 4453, 4454, 1610, 4455, 144],
 [2, 575, 68, 535, 53, 16, 13, 85],
 [7, 2, 3, 110, 2376, 1869, 3185, 976, 6004],
 [44,
  12,
  32,
  8,
  2,
  898,
  6005,
  202,
  3,
  109,
  3186,
  6006,
  2217,
  350,
  1093,
  1269,
  4456,
  2377,
  81,
  6],
 [1500, 197, 48, 414, 106, 4, 2, 481],
 [7, 298, 10, 2, 482],
 [2, 36, 86, 4, 653, 86, 4, 1062, 1393, 918, 787, 100],
 [2, 27, 62, 8, 551],
 [2, 3, 682, 472, 937, 6],
 [2, 395, 693, 320, 93, 178, 162, 145, 179, 1230, 788, 182],
 [247,
  2084,
  3696,
  4,
  101,


In [47]:
sequences=pad_sequences(sequence,maxlen=35,padding='post')

In [48]:
sequences[0:5]
# OUTPUT:

array([[   4,    2,   68,  134,  262,  120, 1868,  458,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   2,    3,   21,   34,  508, 2082,  508,  592,  887,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   2,    3,  329, 6002,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   2,  509, 1017,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [ 573,    2,  534,   

In [49]:
unique_labels=list(pd.Series(np.array(y)).unique())

In [50]:
unique_labels

['sadness', 'love', 'anger', 'joy', 'fear', 'surprise']

In [51]:
label_dict={
    'sadness':0,'love':1,'anger':2,'joy':3,'fear':4,'surprise':5,
}

In [52]:
y =np.array(y)

In [53]:

def label_encoding(labels):
    labals = []
    for x in labels:
        labals.append(label_dict[x])
        labels =np.array(labals)
    return labels

In [54]:
label_encoding(labels=y)

array([0, 0, 1, ..., 3, 3, 4])

In [55]:
type(y )

numpy.ndarray

In [57]:
def label_encoding(labels):
    label=[]
    
    for x in labels:
        x_clean = x.strip()
        label.append(label_dict[x_clean])
    labels=np.array(label)
    return label

In [58]:
labels=label_encoding(labels=y)

In [59]:
labels

[0,
 0,
 1,
 2,
 3,
 3,
 3,
 3,
 3,
 0,
 2,
 3,
 0,
 0,
 0,
 3,
 0,
 2,
 1,
 3,
 4,
 0,
 3,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 1,
 0,
 3,
 0,
 0,
 3,
 0,
 0,
 2,
 3,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 1,
 0,
 0,
 3,
 3,
 0,
 3,
 3,
 2,
 0,
 0,
 1,
 1,
 3,
 0,
 3,
 3,
 0,
 0,
 1,
 4,
 0,
 3,
 3,
 3,
 4,
 1,
 3,
 0,
 2,
 0,
 0,
 0,
 2,
 2,
 0,
 2,
 1,
 3,
 3,
 1,
 2,
 2,
 3,
 1,
 2,
 4,
 3,
 2,
 2,
 1,
 0,
 2,
 1,
 1,
 0,
 3,
 2,
 0,
 3,
 0,
 0,
 3,
 3,
 0,
 0,
 0,
 0,
 3,
 0,
 2,
 2,
 2,
 3,
 0,
 1,
 4,
 4,
 3,
 0,
 4,
 0,
 0,
 3,
 2,
 0,
 4,
 4,
 1,
 3,
 3,
 0,
 3,
 0,
 3,
 2,
 5,
 1,
 4,
 1,
 0,
 1,
 3,
 4,
 2,
 3,
 2,
 3,
 5,
 2,
 1,
 0,
 4,
 2,
 3,
 0,
 3,
 5,
 3,
 4,
 4,
 0,
 0,
 5,
 0,
 3,
 4,
 3,
 4,
 0,
 1,
 1,
 0,
 2,
 2,
 3,
 3,
 2,
 2,
 0,
 0,
 0,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 3,
 4,
 3,
 2,
 0,
 4,
 3,
 0,
 3,
 2,
 2,
 3,
 4,
 0,
 4,
 3,
 3,
 0,
 0,
 0,
 3,
 5,
 4,
 0,
 0,
 3,
 5,
 0,
 4,
 0,
 5,
 4,
 3,
 2,
 4,
 3,
 3,
 2,
 5,
 3,
 1,
 3,
 3,
 4,
 3,
 1,
 4,
 0,
 0,
 0,
 3,
 3,
 5,
