# Preprocessing

In [1]:
# Import required Libs for Preprocessing

import pandas as pd
from sklearn.model_selection import train_test_split

df= pd.read_csv('Tweets.csv', sep=',')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [2]:
# Select Required Columns Only

tweet_df = df[['text','airline_sentiment']]

In [3]:
tweet_df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [4]:
# Select only Positive and Negative Reviews

tweet_df = tweet_df[tweet_df['airline_sentiment'] != 'neutral']

In [5]:
# Attribute and Labels
X = tweet_df.text
y = tweet_df.airline_sentiment

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

In [6]:
# Checking Train Set

print("Training Set X Items: " + str(len(X_train)))
print("Training Set y Items: " + str(len(y_train)))

Training Set X Items: 9232
Training Set y Items: 9232


In [7]:
# Checking Test Set

print("Test Set X Items: " + str(len(X_test)))
print("Test Set y Items: " + str(len(y_test)))

Test Set X Items: 2309
Test Set y Items: 2309


In [8]:
# Getting required labels only and encoding

review_labels = y_train.factorize()

In [76]:
review_labels[0]

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [9]:
# Check Review Labels
review_labels[1]

Index(['negative', 'positive'], dtype='object')

# Next

In [10]:
# Importing required tf modules
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
tweet = X_train.values
tweet

array(['@USAirways Another dead end.  They only handle AA L&amp;F.  They gave me the same failed # I already had. 610-362-7498(99) VM full.  #lost',
       '@USAirways #2066. Was on plane from PBI to CLT and knew about the frozen water. Also saw a plane to NYC take off at the gate next door!',
       '@USAirways waiting for bags now over 25min in Phl bag claim!',
       ...,
       'Lovely! RT @JetBlue: Our fleet’s on fleek. http://t.co/Hi6Fl1AX9E',
       "@united Okay thanks if you could please update me. I was told at the airport someone would call me today but they haven't.",
       '@USAirways IS THIS RINGLING BROTHERS BARNUM AND BAILEY???  SHOULD I KEEP MY EYES PEELED FOR THE CLOWN CAR???'],
      dtype=object)

In [12]:
# Vectorize a text corpus, by turning each text into sequence of integers

tokenizer = Tokenizer(num_words=7000)
tokenizer.fit_on_texts(tweet)

vocab_size = len(tokenizer.word_index) + 1

print(tokenizer)
print(vocab_size)

<keras_preprocessing.text.Tokenizer object at 0x00000223D4A89790>
11635


In [79]:
tokenizer.word_index

{'to': 1,
 'the': 2,
 'i': 3,
 'a': 4,
 'you': 5,
 'united': 6,
 'for': 7,
 'flight': 8,
 'and': 9,
 'on': 10,
 'my': 11,
 'usairways': 12,
 'americanair': 13,
 'is': 14,
 'in': 15,
 'southwestair': 16,
 'of': 17,
 'jetblue': 18,
 'me': 19,
 'your': 20,
 'it': 21,
 'was': 22,
 'not': 23,
 'no': 24,
 'have': 25,
 'at': 26,
 'that': 27,
 'with': 28,
 'this': 29,
 'get': 30,
 'but': 31,
 'cancelled': 32,
 'be': 33,
 'thanks': 34,
 'now': 35,
 'we': 36,
 'from': 37,
 'are': 38,
 'service': 39,
 'an': 40,
 'been': 41,
 'just': 42,
 'so': 43,
 '2': 44,
 'can': 45,
 'help': 46,
 't': 47,
 'time': 48,
 'co': 49,
 'customer': 50,
 'http': 51,
 'up': 52,
 'hours': 53,
 'do': 54,
 'hold': 55,
 'they': 56,
 'out': 57,
 'amp': 58,
 "i'm": 59,
 'plane': 60,
 'all': 61,
 'us': 62,
 'will': 63,
 'thank': 64,
 'why': 65,
 'delayed': 66,
 'our': 67,
 'still': 68,
 'what': 69,
 'when': 70,
 'one': 71,
 'how': 72,
 'call': 73,
 'hour': 74,
 'gate': 75,
 'flights': 76,
 "can't": 77,
 'bag': 78,
 'flightled

In [69]:
len(tokenizer.word_index)

11634

In [13]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Store and Padding Converted Sequences

tweet_seqs = tokenizer.texts_to_sequences(tweet)

padded_sequence = pad_sequences(tweet_seqs, maxlen=200)
print(padded_sequence)

[[   0    0    0 ... 3559  363  140]
 [   0    0    0 ...   75  195  753]
 [   0    0    0 ...  303   78  353]
 ...
 [   0    0    0 ...   51   47   49]
 [   0    0    0 ...   31   56  352]
 [   0    0    0 ...    2 4196  568]]


In [15]:
print(tokenizer.word_index)



In [16]:
# Check padded sequence element

print(padded_sequence[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0   12  142 1589  548   56  121  659  220 2104
   58  751   56  427   19    2  256  854    3  223   80 5033 3558 5034
 2829 

In [60]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding

embedding_vector_length = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,     
                                     input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           372320    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 388,971
Trainable params: 388,971
Non-trainable params: 0
_________________________________________________________________
None


# Train Model

In [59]:
# history = model.fit(padded_sequence,review_label[0],
#                   validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# Encoding and Padding Test Data to Check Accuracy

encoded_docs = tokenizer.texts_to_sequences(X_test)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)
print(padded_sequence)

[[   0    0    0 ...  988    9   17]
 [   0    0    0 ...   51   47   49]
 [   0    0    0 ...    1 6449  528]
 ...
 [   0    0    0 ...   98    2  313]
 [   0    0    0 ...   90 1541   90]
 [   0    0    0 ...  199  105   29]]


In [18]:
sentiment_label_test = y_test.factorize()
sentiment_label_test[0]

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [19]:
score = model.evaluate(padded_sequence,sentiment_label_test[0],verbose=0)

NameError: name 'model' is not defined

In [68]:
print("Accuracy: {}".format(score[1]))

Accuracy: 0.9216110706329346


# Load Model

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models
from tensorflow.keras.datasets import imdb




In [22]:
#Code to load the saved model
model = models.load_model('sentiment_analysis.h5')
print("Model Loaded")
model.summary()

Model Loaded
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           372320    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 388,971
Trainable params: 388,971
Non-trainable params: 0
_________________________________________________________________


In [71]:
score = model.evaluate(padded_sequence,sentiment_label_test[0],verbose=0)

In [72]:
print("Accuracy: {}".format(score[1]))

Accuracy: 0.9216110706329346


# Test Run

In [47]:
# Test Review Sentence
test_word ="""
Good headset but the mic quality is terrible... it is worse than my ear bud mic
"""

# To
tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=200)

tw

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [64]:
prediction = int(model.predict(tw).round().item())
outcome = (review_labels[1][prediction]).capitalize()

print("Actual Review: " + test_word)
print("\nSentiment Analysis Outcome ==> The review shows " + (review_labels[1][prediction]).capitalize() + " sentiment.")
print("\n======================================================================================")

print("\nAccuracy Criteria \n\nProbability Closer to 0 == Negative Sentiment\nProbability Closer to 1 == Positive Sentiment")

prob = model.predict(tw)[0][0]

print("\n ==> Probability is " + str(prob)+ " (" + outcome + ")")

Actual Review: 
Good headset but the mic quality is terrible... it is worse than my ear bud mic


Sentiment Analysis Outcome ==> The review shows Negative sentiment.


Accuracy Criteria 

Probability Closer to 0 == Negative Sentiment
Probability Closer to 1 == Positive Sentiment

 ==> Probability is 0.014747739 (Negative)
