# Import The Necessary Modules

In [1]:
import os
import tensorflow
# Select the Keras backend through the environment before `import keras` below.
# NOTE(review): Keras is documented to read KERAS_BACKEND when it is first
# imported, so this assignment should stay above the keras imports — confirm
# for the installed Keras version.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
# `colored` wraps status messages in ANSI colour codes for the cell output.
from termcolor import colored



# Loading Train and Test Data

In [3]:
# Read the two tweet CSV files into DataFrames:
# the first is used for model training, the second for model testing.
print(colored("Loading train and test data", "yellow"))
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('clean_train.csv', 'clean_test.csv')
)
print(colored("Data loaded", "yellow"))



[33mLoading train and test data[0m
[33mData loaded[0m


In [5]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,Tweet,Sentiment,Clean_tweet
0,@SweetCandiesXXX if u came to visit here in 17...,0,came visit choic
1,thanks @Just4Julia! good advice for this day ...,4,thank good advic day quotsmil fear sorrow smil...
2,"@RogJ Thank you, Roger! Oh, and very nice to ...",4,thank roger Oh nice see
3,"@MattMazur Hi Matt, how are you today? I am im...",4,Hi matt today improv french tweet
4,"@MrsNickJonass that's cool, i like it",4,that' cool like


In [6]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,Tweet,Sentiment,Clean_tweet
0,@nicholasmw 1 day u will find that girl worry,4,day find girl worri
1,there is nothing on tv and im so desperate to ...,0,noth tv im desper entertain im watch hihow sad
2,Very excited that greys is on tonight. Not so ...,0,veri excit grey tonight not happi season
3,2pm's again and again is a great song. Nichkhu...,4,pm' great song nichkhun hwait
4,My teeth hurt,0,My teeth hurt


# Convert the Tweets Into Sequences of Integer Word Indices

In [7]:
print(colored("Tokenizing and padding data", "yellow"))
# Word-level tokenizer that splits on single spaces. `num_words` caps the
# vocabulary used when texts are later converted to index sequences
# (per the Keras docs, only the most frequent words under that cap are kept).
tokenizer = Tokenizer(
    split=' ',
    num_words=2000,
)

[33mTokenizing and padding data[0m


In [8]:
# Learn the word index from the training tweets only.
# Missing values (presumably tweets whose cleaned text was empty in the CSV)
# are replaced with '' first: a bare `.astype(str)` would turn NaN into the
# literal word "nan" and count it as a real vocabulary entry.
tokenizer.fit_on_texts(train_data['Clean_tweet'].fillna('').astype(str).values)


In [10]:
# Display the cleaned-tweet column of the training frame.
train_data.loc[:, 'Clean_tweet']

0                                           came visit choic
1          thank good advic day quotsmil fear sorrow smil...
2                                    thank roger Oh nice see
3                          Hi matt today improv french tweet
4                                            that' cool like
                                 ...                        
1279995    alreadi saw mtv movi award adv global tvbut te...
1279996                         So appar wait fuel truck min
1279997    happi bank holiday weekend saturday open shop ...
1279998                                             i'm cold
1279999    just dropp sister train station she' go back u...
Name: Clean_tweet, Length: 1280000, dtype: object

In [12]:
# Convert every training tweet into a list of integer word indices using the
# fitted tokenizer. Missing values are mapped to '' before the string cast so
# that NaN does not become the literal token "nan".
train_tweets = tokenizer.texts_to_sequences(train_data['Clean_tweet'].fillna('').astype(str).values)
# Show the first five encoded tweets.
train_tweets[:5]

[[352, 449, 1017],
 [13, 6, 1581, 5, 1703, 482, 167, 45],
 [13, 38, 71, 20],
 [219, 1501, 10, 1847, 867, 84],
 [82, 139, 8]]

In [11]:
# Length of the longest encoded training tweet; the padding cells use it as
# the common sequence length.
max_len = max(map(len, train_tweets))
max_len

40

In [13]:
# Bring every training sequence to `max_len` entries. 'pre' is the Keras
# default for both options and is spelled out here only for readability:
# shorter sequences get zeros on the left, longer ones lose leading tokens.
train_tweets = pad_sequences(
    train_tweets,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)
train_tweets[0:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  352,  449, 1017],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   13,
           6, 1581,    5, 1703,  482,  167,   45],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   13,   38,   71,   20],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  

In [14]:
# Encode the test tweets with the tokenizer fitted on the training data.
# Missing values are mapped to '' before the string cast so that NaN does not
# become the literal token "nan".
test_tweets = tokenizer.texts_to_sequences(test_data['Clean_tweet'].fillna('').astype(str).values)
# Pad/truncate to the same length as the training sequences so both share one input shape.
test_tweets = pad_sequences(test_tweets, maxlen = max_len)
print(colored("Tokenizing and padding complete", "yellow"))


[33mTokenizing and padding complete[0m


In [15]:
# Show the first five padded test sequences.
test_tweets[0:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    5,  120,  128,  386],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  197,
         346,   23, 1729, 1308,   23,   28,   51],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         544,  166, 1776,   67,    1,   58,  450],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  

# Building The LSTM model Architecture

In [10]:
# Building the model
print(colored("Creating the LSTM model", "yellow"))
model = Sequential()
# Size the embedding table from the tokenizer's own vocabulary cap rather than
# repeating the literal, so the two cannot drift apart. The sequence length is
# taken from the padded training array.
model.add(Embedding(tokenizer.num_words, 128, input_length = train_tweets.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(256, dropout = 0.2))
# Two softmax outputs, matching the two one-hot label columns used for training.
model.add(Dense(2, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()



[33mCreating the LSTM model[0m
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 40, 128)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 256)               394240    
                                                                 
 dense (Dense)               (None, 2)                 514       
                                                                 
Total params: 650,754
Trainable params: 650,754
Non-trainable params: 0
_________________________________________________________________


# Training the LSTM Model for 10 Epochs

In [19]:
import numpy as np
# Toy illustration of one-hot encoding: each distinct value becomes a column.
li = list('sat')
print(pd.get_dummies(data=li))

   a  s  t
0  0  1  0
1  1  0  0
2  0  0  1


In [16]:
# One-hot encode the training sentiment labels (one column per distinct label value).
pd.get_dummies(data=train_data['Sentiment'])

Unnamed: 0,0,4
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
1279995,1,0
1279996,1,0
1279997,0,1
1279998,1,0


In [11]:
# Training the model
print(colored("Training the LSTM model", "green"))
# One-hot encode the labels as float32 explicitly: recent pandas versions
# return boolean dummy columns by default, which is not the numeric target
# the categorical cross-entropy loss is meant to receive.
train_labels = pd.get_dummies(train_data['Sentiment'], dtype='float32').values
# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows before shuffling — confirm the CSV is not ordered by label.
history = model.fit(train_tweets, train_labels, epochs = 10, batch_size = 128, validation_split = 0.2)
# Printing the History object itself only shows its repr, so report the
# metrics recorded for the final epoch instead.
final_metrics = {name: values[-1] for name, values in history.history.items()}
print(colored("Final epoch metrics: {}".format(final_metrics), "green"))



[32mTraining the LSTM model[0m
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[32m<keras.callbacks.History object at 0x0000024C807BACD0>[0m


# Evaluating How Accurate Our Model Is

In [12]:
# Testing the model
print(colored("Testing the LSTM model", "green"))
# Encode the test labels against the label values seen in training, so the
# one-hot columns have the same count and order as the training targets even
# if a class happens to be absent from the test set. A test label that never
# occurred in training becomes an all-zero row.
label_classes = sorted(train_data['Sentiment'].dropna().unique())
test_labels = pd.get_dummies(
    pd.Categorical(test_data['Sentiment'], categories=label_classes),
    dtype='float32',
).values
# With metrics=['accuracy'], evaluate returns the loss first, then accuracy.
score, accuracy = model.evaluate(test_tweets, test_labels, batch_size = 128)
print("Test accuracy: {}".format(accuracy))

[32mTesting the LSTM model[0m
Test accuracy: 0.787987470626831
