In [17]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
import re

In [19]:
#Reading the dataset
data = pd.read_csv('Sentiment.csv') 
# Keeping only the neccessary columns
data = data[['text','sentiment']] 
#converting to lower case
data['text'] = data['text'].apply(lambda x: x.lower()) 
# Removing special characters from the data
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))) 
#Removing Retweets
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') 

In [20]:
#taking 2000 as max value to tokenize sentence
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values) 
# values for feature matrix
X = tokenizer.texts_to_sequences(data['text'].values) 
#Padding the feature matrix
X = pad_sequences(X) 
embed_dim = 128 
lstm_out = 196 

In [22]:
#Defining function to create model using Sequential Neural Network
def createmodel():
    model = Sequential() 
    #taking input dimension 2000 Neurons, output dimension 128 Neurons
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) 
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
    #3 output neurons[positive, Neutral, Negative], softmax as activation
    model.add(Dense(3,activation='softmax'))
    #Compiling the model
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) 
    return model


In [23]:
#Applying label Encoding
labelencoder = LabelEncoder() 
integer_encoded = labelencoder.fit_transform(data['sentiment']) 
y = to_categorical(integer_encoded)
#Splitting data into 67% training data, 33% test data 
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [27]:
# Model Creation
batch_size = 32 
#Function call to Sequential Neural Network
model = createmodel() 
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) 
#model evaluation
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) 
print(score)
print(acc)
print(model.metrics_names) #metrics of the model

291/291 - 43s - loss: 0.8261 - accuracy: 0.6412
144/144 - 2s - loss: 0.7544 - accuracy: 0.6695
0.7544063329696655
0.669506311416626
['loss', 'accuracy']


# 1. Save the model and use the saved model to predict on new text data 

In [28]:
from keras.models import load_model 

#Saving the model
model.save('sentimentAnalysis.h5') 
#loading the saved model
model= load_model('sentimentAnalysis.h5') 
print(data['sentiment'])

0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [31]:
#predicting on the text data
textData = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
#tokenizing given textData
sentence = tokenizer.texts_to_sequences(textData) 
#padding the tokenized sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) 
#predicting the text
sentimentPrediction = model.predict_classes(sentence,batch_size=1,verbose = 2)[0] 
if sentimentPrediction == 0:
  print("Neutral")
elif sentimentPrediction < 0:
  print("Negative")
elif sentimentPrediction > 0:
  print("Positive")
else:
  print("Can not be determined")

1/1 - 0s
Neutral




# 2. Apply GridSearchCV on the source code provided in the class

In [39]:
from keras.wrappers.scikit_learn import KerasClassifier 
from sklearn.model_selection import GridSearchCV

#Applying multiple hyper parameters on the model
model = KerasClassifier(build_fn=createmodel,verbose=2) 
#batch size
batch_size= [10, 20, 40] 
#epochs
epochs = [1, 2]
param_grid= {'batch_size':batch_size, 'epochs':epochs} 
grid  = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result= grid.fit(X_train,Y_train) 
# summarizing results
print("Best Score: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) 

744/744 - 93s - loss: 0.8235 - accuracy: 0.6473
186/186 - 2s - loss: 0.7492 - accuracy: 0.6783
744/744 - 93s - loss: 0.8157 - accuracy: 0.6494
186/186 - 2s - loss: 0.7965 - accuracy: 0.6643
744/744 - 94s - loss: 0.8314 - accuracy: 0.6426
186/186 - 2s - loss: 0.7709 - accuracy: 0.6783
744/744 - 93s - loss: 0.8216 - accuracy: 0.6477
186/186 - 2s - loss: 0.7385 - accuracy: 0.6814
744/744 - 94s - loss: 0.8196 - accuracy: 0.6457
186/186 - 2s - loss: 0.7726 - accuracy: 0.6658
Epoch 1/2
744/744 - 92s - loss: 0.8247 - accuracy: 0.6455
Epoch 2/2
744/744 - 90s - loss: 0.6787 - accuracy: 0.7107
186/186 - 2s - loss: 0.7520 - accuracy: 0.6837
Epoch 1/2
744/744 - 94s - loss: 0.8218 - accuracy: 0.6466
Epoch 2/2
744/744 - 92s - loss: 0.6834 - accuracy: 0.7137
186/186 - 2s - loss: 0.7386 - accuracy: 0.6643
Epoch 1/2
744/744 - 93s - loss: 0.8199 - accuracy: 0.6484
Epoch 2/2
744/744 - 90s - loss: 0.6718 - accuracy: 0.7174
186/186 - 2s - loss: 0.7595 - accuracy: 0.6772
Epoch 1/2
744/744 - 93s - loss: 0.82