In [1]:

import pandas as pd # packages for creating dataframes and loading dataset
import numpy as np
# package for plotting
import matplotlib.pyplot as plt
# for regular expressions
import re
# for implementing machine learning functions
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
# for deep learning models and functions
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical


In [2]:
import pandas as pd
# Load the dataset as a Pandas DataFrame
dataset = pd.read_csv('Sentiment.csv')

# Select only the necessary columns 'text' and 'sentiment'
mask = dataset.columns.isin(['text', 'sentiment'])
data = dataset.loc[:, mask]
# Keeping only the necessary columns



In [3]:
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))


In [4]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ') # removing retweets
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')   #Maximum words is 2000 to tokenize sentence
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values) #taking values to feature matrix
X = pad_sequences(X) #Padding the feature matrix
embed_dim = 128 #Dimension of the Embedded layer
lstm_out = 196 #Long short-term memory (LSTM) layer neurons
def createmodel():
    model = Sequential() #Sequential Neural Network
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) #input dimension 2000 Neurons, output dimension 128 Neurons
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) #Drop out 20%, 196 output Neurons, recurrent dropout 20%
    model.add(Dense(3,activation='softmax')) #3 output neurons[positive, Neutral, Negative], softmax as activation
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #Compiling the model
    return model
# print(model.summary())
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)
batch_size = 32 #Batch size 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 1, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) #evaluating the modelprint(score)
print(acc)

291/291 - 64s - loss: 0.8256 - accuracy: 0.6436 - 64s/epoch - 220ms/step
144/144 - 3s - loss: 0.7431 - accuracy: 0.6774 - 3s/epoch - 22ms/step
0.7431066632270813
0.677370011806488


In [5]:
print(model.metrics_names)#metrics of the model

['loss', 'accuracy']


Save the model and use the saved model to predict on new text data (ex, “A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)

In [6]:
model.save('sentimentAnalysis.h5') #Saving the model
from keras.models import load_model #Importing the package for importing the saved model
model= load_model('sentimentAnalysis.h5') #loading the saved model

In [7]:
print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [8]:
# Predicting on the text data
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence) # Tokenizing the sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) # Padding the sentence
sentiment_probs = model.predict(sentence, batch_size=1, verbose=2)[0] # Predicting the sentence text
sentiment = np.argmax(sentiment_probs)

print(sentiment_probs)
if sentiment == 0:
    print("Neutral")
elif sentiment < 0:
    print("Negative")
elif sentiment > 0:
    print("Positive")
else:
    print("Cannot be determined")

1/1 - 1s - 507ms/epoch - 507ms/step
[0.65985817 0.09624108 0.2439008 ]
Neutral


Apply GridSearchCV on the source code provided in the class

In [9]:
from keras.wrappers.scikit_learn import KerasClassifier #importing Keras classifier
from sklearn.model_selection import GridSearchCV #importing Grid search CV
model = KerasClassifier(build_fn=createmodel,verbose=2) #initiating model to test performance by applying multiple hyper parameters
batch_size= [10, 20, 40] #hyper parameter batch_size
epochs = [1, 2] #hyper parameter no. of epochs
param_grid= {'batch_size':batch_size, 'epochs':epochs} #creating dictionary for batch size, no. of epochs
grid  = GridSearchCV(estimator=model, param_grid=param_grid) #Applying dictionary with hyper parameters
grid_result= grid.fit(X_train,Y_train) #Fitting the model
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) #best score, best hyper parameters

  model = KerasClassifier(build_fn=createmodel,verbose=2)


744/744 - 106s - loss: 0.8309 - accuracy: 0.6431 - 106s/epoch - 142ms/step
186/186 - 3s - loss: 0.7543 - accuracy: 0.6616 - 3s/epoch - 17ms/step
744/744 - 99s - loss: 0.8231 - accuracy: 0.6466 - 99s/epoch - 133ms/step
186/186 - 4s - loss: 0.7693 - accuracy: 0.6756 - 4s/epoch - 21ms/step
744/744 - 105s - loss: 0.8283 - accuracy: 0.6439 - 105s/epoch - 141ms/step
186/186 - 4s - loss: 0.7757 - accuracy: 0.6724 - 4s/epoch - 20ms/step
744/744 - 109s - loss: 0.8259 - accuracy: 0.6444 - 109s/epoch - 146ms/step
186/186 - 4s - loss: 0.7440 - accuracy: 0.6787 - 4s/epoch - 21ms/step
744/744 - 103s - loss: 0.8211 - accuracy: 0.6429 - 103s/epoch - 138ms/step
186/186 - 3s - loss: 0.7779 - accuracy: 0.6717 - 3s/epoch - 15ms/step
Epoch 1/2
744/744 - 103s - loss: 0.8267 - accuracy: 0.6466 - 103s/epoch - 138ms/step
Epoch 2/2
744/744 - 97s - loss: 0.6817 - accuracy: 0.7082 - 97s/epoch - 131ms/step
186/186 - 3s - loss: 0.7488 - accuracy: 0.6783 - 3s/epoch - 14ms/step
Epoch 1/2
744/744 - 99s - loss: 0.8231 