# 
1. Basics of LSTM
2. Types of RNN
3. Use case: Sentiment Analysis on the Twitter data set

In [2]:
import pandas as pd #Basic packages for creating dataframes and loading dataset
import numpy as np

import matplotlib.pyplot as plt #Package for visualization

import re #importing package for Regular expression operations

from sklearn.model_selection import train_test_split #Package for splitting the data

from sklearn.preprocessing import LabelEncoder #Package for conversion of categorical to Numerical

from keras.preprocessing.text import Tokenizer #Tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences #Add zeros or crop based on the length
from keras.models import Sequential #Sequential Neural Network
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D #For layers in Neural Network
from keras.utils.np_utils import to_categorical


import warnings
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd

# Load the dataset as a Pandas DataFrame
dataset = pd.read_csv('Sentiment (1).csv')

# Select only the necessary columns 'text' and 'sentiment'
mask = dataset.columns.isin(['text', 'sentiment'])
data = dataset.loc[:, mask]

# Keeping only the necessary columns
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [4]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') #Removing Retweets
    max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ') #Maximum words is 2000 to tokenize sentence
tokenizer.fit_on_texts(data['text'].values) 
X = tokenizer.texts_to_sequences(data['text'].values) #taking values to feature matrix
X = pad_sequences(X) #Padding the feature matrix

embed_dim = 128 #Dimension of the Embedded layer
lstm_out = 196 #Long short-term memory (LSTM) layer neurons
def createmodel():
    model = Sequential() #Sequential Neural Network
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) #input dimension 2000 Neurons, output dimension 128 Neurons
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) #Drop out 20%, 196 output Neurons, recurrent dropout 20%
    model.add(Dense(3,activation='softmax')) #3 output neurons[positive, Neutral, Negative], softmax as activation
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #Compiling the model
    return model
# print(model.summary())
labelencoder = LabelEncoder() #Applying label Encoding on the label matrix
integer_encoded = labelencoder.fit_transform(data['sentiment']) #fitting the model
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42) #67% training data, 33% test data split
batch_size = 32 #Batch size 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) #evaluating the model
print(score)
print(acc)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/1
 - 14s - loss: 0.8250 - acc: 0.6423
0.7661085366330432
0.6730013106159896


In [5]:
print(model.metrics_names) #metrics of the model

['loss', 'acc']


In [6]:
model.save('sentimentAnalysis.h5') #Saving the model

In [7]:
from keras.models import load_model #Importing the package for importing the saved model
model= load_model('sentimentAnalysis.h5') #loading the saved model

In [8]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


Predicting on the text data

In [9]:
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence) # Tokenizing the sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) # Padding the sentence
sentiment_probs = model.predict(sentence, batch_size=1, verbose=2)[0] # Predicting the sentence text
sentiment = np.argmax(sentiment_probs)

print(sentiment_probs)
if sentiment == 0:
    print("Neutral")
elif sentiment < 0:
    print("Negative")
elif sentiment > 0:
    print("Positive")
else:
    print("Cannot be determined")

[0.6813752  0.1598489  0.15877591]
Neutral


GridSearchCV on the source code

In [10]:
from keras.wrappers.scikit_learn import KerasClassifier #importing Keras classifier
from sklearn.model_selection import GridSearchCV #importing Grid search CV

model = KerasClassifier(build_fn=createmodel,verbose=2) #initiating model to test performance by applying multiple hyper parameters
batch_size= [10, 20, 40] #hyper parameter batch_size
epochs = [1, 2] #hyper parameter no. of epochs
param_grid= {'batch_size':batch_size, 'epochs':epochs} #creating dictionary for batch size, no. of epochs
grid  = GridSearchCV(estimator=model, param_grid=param_grid) #Applying dictionary with hyper parameters
grid_result= grid.fit(X_train,Y_train) #Fitting the model
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) #best score, best hyper parameters

Epoch 1/1
 - 25s - loss: 0.8306 - acc: 0.6464
Epoch 1/1
 - 26s - loss: 0.8286 - acc: 0.6465
Epoch 1/1
 - 32s - loss: 0.8302 - acc: 0.6437
Epoch 1/1
 - 32s - loss: 0.8336 - acc: 0.6393
Epoch 1/1
 - 42s - loss: 0.8266 - acc: 0.6482
Epoch 1/2
 - 44s - loss: 0.8314 - acc: 0.6419
Epoch 2/2
 - 42s - loss: 0.6883 - acc: 0.7085
Epoch 1/2
 - 49s - loss: 0.8255 - acc: 0.6442
Epoch 2/2
 - 49s - loss: 0.6913 - acc: 0.7092
Epoch 1/2
 - 55s - loss: 0.8228 - acc: 0.6447
Epoch 2/2
 - 51s - loss: 0.6844 - acc: 0.7121
Epoch 1/2
 - 56s - loss: 0.8314 - acc: 0.6440
Epoch 2/2
 - 55s - loss: 0.6849 - acc: 0.7107
Epoch 1/2
 - 59s - loss: 0.8204 - acc: 0.6449
Epoch 2/2
 - 57s - loss: 0.6755 - acc: 0.7131
Epoch 1/1
 - 36s - loss: 0.8408 - acc: 0.6411
Epoch 1/1
 - 52s - loss: 0.8293 - acc: 0.6454
Epoch 1/1
 - 50s - loss: 0.8350 - acc: 0.6431
Epoch 1/1
 - 42s - loss: 0.8364 - acc: 0.6385
Epoch 1/1
 - 62s - loss: 0.8354 - acc: 0.6409
Epoch 1/2
 - 73s - loss: 0.8361 - acc: 0.6402
Epoch 2/2
 - 71s - loss: 0.6902 - 

KeyboardInterrupt: 