In [38]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.initializers import Constant
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import re

from sklearn.preprocessing import LabelEncoder


In [23]:
data = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [24]:
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt',' ')

  row[0] = row[0].replace('rt',' ')
  row[0] = row[0].replace('rt',' ')


In [25]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ') #Maximum words is 2000 to tokenize sentence
tokenizer.fit_on_texts(data['text'].values) 
X = tokenizer.texts_to_sequences(data['text'].values) #taking values to feature matrix


In [34]:
X = pad_sequences(X) #Padding the feature matrix

embed_dim = 128 #Dimension of the Embedded layer
lstm_out = 196 #Long short-term memory (LSTM) layer neurons

def createmodel():
    model = Sequential() #Sequential Neural Network
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1])) #input dimension 2000 Neurons, output dimension 128 Neurons
    # Adding an additional LSTM layer with regularization
    # model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2, return_sequences=True, kernel_regularizer=l2(0.001)))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) #Drop out 20%, 196 output Neurons, recurrent dropout 20%
    model.add(Dense(3,activation='softmax')) #3 output neurons[positive, Neutral, Negative], softmax as activation
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) #Compiling the model
    return model
# print(model.summary())

In [35]:
labelencoder = LabelEncoder() #Applying label Encoding on the label matrix
integer_encoded = labelencoder.fit_transform(data['sentiment']) #fitting the model
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size = 0.33, random_state = 42) #67% training data, 33% test data split
     

In [36]:
batch_size = 32 #Batch size 32
model = createmodel() #Function call to Sequential Neural Network
model.fit(X_train, Y_train, epochs = 2, batch_size=batch_size, verbose = 2) #verbose the higher, the more messages
score,acc = model.evaluate(X_test,Y_test,verbose=2,batch_size=batch_size) #evaluating the model
print(score)
print(acc)

Epoch 1/2
291/291 - 15s - loss: 0.8193 - accuracy: 0.6465 - 15s/epoch - 53ms/step
Epoch 2/2
291/291 - 32s - loss: 0.6790 - accuracy: 0.7126 - 32s/epoch - 111ms/step
144/144 - 3s - loss: 0.7427 - accuracy: 0.6828 - 3s/epoch - 23ms/step
0.7426655292510986
0.6828309297561646


In [37]:
print(model.metrics_names) #metrics of the model

['loss', 'accuracy']


In [30]:

model.save('sentimentAnalysis.keras') #Saving the model

In [31]:
from keras.models import load_model #Importing the package for importing the saved model
model= load_model('sentimentAnalysis.keras') #loading the saved model

In [32]:

print(integer_encoded)
print(data['sentiment'])

[1 2 1 ... 2 0 2]
0         Neutral
1        Positive
2         Neutral
3        Positive
4        Positive
           ...   
13866    Negative
13867    Positive
13868    Positive
13869    Negative
13870    Positive
Name: sentiment, Length: 13871, dtype: object


In [33]:

# Predicting on the text data
sentence = ['A lot of good things are happening. We are respected again throughout the world, and that is a great thing.@realDonaldTrump']
sentence = tokenizer.texts_to_sequences(sentence) # Tokenizing the sentence
sentence = pad_sequences(sentence, maxlen=28, dtype='int32', value=0) # Padding the sentence
sentiment_probs = model.predict(sentence, batch_size=1, verbose=2)[0] # Predicting the sentence text
sentiment = np.argmax(sentiment_probs)

print(sentiment_probs)
print(sentiment)
if sentiment == 0:
    print("Neutral")
elif sentiment < 0:
    print("Negative")
elif sentiment > 0:
    print("Positive")
else:
    print("Cannot be determined")


1/1 - 1s - 1s/epoch - 1s/step
[0.63978845 0.14338629 0.21682528]
0
Neutral


2. Apply GridSearchCV on the source code provided in the class

In [13]:
# !pip install scikeras
from scikeras.wrappers import KerasClassifier
# from keras.wrappers.scikit_learn import KerasClassifier #importing Keras classifier
from sklearn.model_selection import GridSearchCV #importing Grid search CV

model = KerasClassifier(model=createmodel,verbose=2) #initiating model to test performance by applying multiple hyper parameters
batch_size= [10, 20, 40] #hyper parameter batch_size
epochs = [1, 2] #hyper parameter no. of epochs
param_grid= {'batch_size':batch_size, 'epochs':epochs} #creating dictionary for batch size, no. of epochs
grid  = GridSearchCV(estimator=model, param_grid=param_grid) #Applying dictionary with hyper parameters
grid_result= grid.fit(X_train,Y_train) #Fitting the model
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) #best score, best hyper parameters
     

744/744 - 33s - loss: 0.8260 - accuracy: 0.6499 - 33s/epoch - 44ms/step
186/186 - 2s - 2s/epoch - 10ms/step
744/744 - 27s - loss: 0.8198 - accuracy: 0.6468 - 27s/epoch - 36ms/step
186/186 - 2s - 2s/epoch - 9ms/step
744/744 - 28s - loss: 0.8269 - accuracy: 0.6447 - 28s/epoch - 38ms/step
186/186 - 2s - 2s/epoch - 11ms/step
744/744 - 36s - loss: 0.8297 - accuracy: 0.6409 - 36s/epoch - 49ms/step
186/186 - 3s - 3s/epoch - 14ms/step
744/744 - 33s - loss: 0.8223 - accuracy: 0.6496 - 33s/epoch - 45ms/step
186/186 - 2s - 2s/epoch - 12ms/step
Epoch 1/2
744/744 - 33s - loss: 0.8322 - accuracy: 0.6434 - 33s/epoch - 44ms/step
Epoch 2/2
744/744 - 28s - loss: 0.6805 - accuracy: 0.7092 - 28s/epoch - 38ms/step
186/186 - 2s - 2s/epoch - 11ms/step
Epoch 1/2
744/744 - 37s - loss: 0.8194 - accuracy: 0.6462 - 37s/epoch - 50ms/step
Epoch 2/2
744/744 - 29s - loss: 0.6844 - accuracy: 0.7078 - 29s/epoch - 39ms/step
186/186 - 2s - 2s/epoch - 13ms/step
Epoch 1/2
744/744 - 33s - loss: 0.8245 - accuracy: 0.6480 - 3