In [1]:
import pandas as pd
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize
import itertools
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import codecs
from tqdm import tqdm
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional,GRU
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn import metrics

## Loading a labelled dataset for sentiment classification testing of models.

In [2]:
df=pd.read_csv("sentiment_data.csv",usecols=["Sentiment","Tweet"])

In [3]:
df["Sentiment"].value_counts()

1.0    282794
0.0    282794
Name: Sentiment, dtype: int64

In [4]:
df

Unnamed: 0,Sentiment,Tweet
0,0.0,hahahahaha haha dude always eaat much fckin br...
1,0.0,question make feel pressured thats told u yest...
2,0.0,quot sorry moo mean snob quot
3,0.0,back beach waiting turn shower ugggghhh b cold...
4,0.0,gargh look time still awake good
...,...,...
565583,1.0,scare chad amazing sweet dream
565584,0.0,got mosquito bite west nile still threat
565585,1.0,friggin amazing let u know
565586,1.0,quite delicious easy froze one loaf make reapp...


## Finding tf-idf scores for machine learning models.

In [5]:
count_vect = CountVectorizer()
data= count_vect.fit_transform(df.Tweet.values)
#data
trainx=data[:int(0.8*df.shape[0])+1]
testx=data[int(0.8*df.shape[0])+1:]
tfidf_transformer = TfidfTransformer()
trainx_tfidf = tfidf_transformer.fit_transform(trainx)
testx_tfidf = tfidf_transformer.fit_transform(testx)
y_train=df.loc[:int(0.8*df.shape[0]),"Sentiment"]
y_test=df.loc[int(0.8*df.shape[0])+1:,"Sentiment"]

In [6]:
y_train.shape

(452471,)

## Training a Naive Bayes classification model for labelled data.

In [7]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(trainx_tfidf,y_train)
y_pred_nb = naive_bayes_classifier.predict(testx_tfidf)
score_nb = metrics.accuracy_score(y_test, y_pred_nb)
print("Accuracy of the trained Naive Bayes model is: "+str(100*score_nb)+"%")

Accuracy of the trained Naive Bayes model is: 75.83475516500614%


## Training a KNN classification model for labelled data.

In [8]:
knn = KNeighborsClassifier(n_neighbors=7)
clf = knn.fit(trainx_tfidf,y_train)
y_pred_knn = clf.predict(testx)
score_knn = metrics.accuracy_score(y_test, y_pred_knn)
print("Accuracy of the trained KNN model is: "+str(100*score_knn )+"%")

Accuracy of the trained KNN model is: 69.82681648204957%


## Training a Rocchio classification model for labelled data.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
metric="euclidean"
vectorizer=TfidfVectorizer()
trainX = vectorizer.fit_transform(df.loc[:int(0.8*df.shape[0])+1,"Tweet"].values) 
trainy = df.loc[:int(0.8*df.shape[0]),"Sentiment"].values
n_samples, n_features = trainX.shape
le = LabelEncoder()
y_indices = le.fit_transform(trainy)
classes = le.classes_
n_classes = classes.size
centroids = np.empty((n_classes, n_features), dtype=np.float64)
n_cluster = np.zeros(n_classes)
for current_class in range(n_classes):
    center_mask = y_indices == current_class
    n_cluster[current_class] = np.sum(center_mask)
    centroids[current_class] = trainX[center_mask].mean(axis=0)
def get_vectorizer_array(query):
    return vectorizer.transform([query]).toarray()
def pred(X):
    return classes[pairwise_distances(X, centroids, metric=metric).argmin(axis=1)]
testdata = [[a_, b_] for a_, b_ in zip(df.loc[int(0.8*df.shape[0])+1:,"Tweet"],df.loc[int(0.8*df.shape[0]):,"Sentiment"])]
y_pred_rcc=[pred(get_vectorizer_array(testcase[0]))[0] for testcase in testdata]
score_rcc = metrics.accuracy_score(y_test, y_pred_rcc)

print("Accuracy of the trained Rocchio Classification model is: "+str(100*score_rcc)+"%")

Accuracy of the trained Rocchio Classification model is: 72.05106217456262%


In [10]:
train_embed=df.loc[:int(0.8*df.shape[0])]
test_embed=df.loc[int(0.8*df.shape[0]):]

In [11]:
y_train_embed = pd.get_dummies(train_embed['Sentiment']).values
y_test_embed= pd.get_dummies(test_embed['Sentiment']).values

## Loading pre trained word embeddings for embedding layer of neural network.

In [12]:
li_train1=[]
for i in train_embed["Tweet"]:
    words=word_tokenize(i)
    li_train1.append(words)
li_train = list(itertools.chain(*li_train1))
li_test1=[]
for i in test_embed["Tweet"]:
    words=word_tokenize(i)
    li_test1.append(words)
li_test = list(itertools.chain(*li_test1))
li_all=li_train+li_test
li_uniquetrain=set(li_train)
li_uniquetest=set(li_test)
li_unique=set(li_train+li_test)
tokenizer = Tokenizer(num_words=len(li_unique), lower=True, char_level=False)
tokenizer.fit_on_texts(li_all)
word_seq_train = tokenizer.texts_to_sequences(li_train1)
word_seq_test = tokenizer.texts_to_sequences(li_test1)
word_index = tokenizer.word_index
li_all1=li_train1+li_test1
max=0
for i in li_all1:
    if len(i)>max:
        max=len(i)
train=pad_sequences(word_seq_train, maxlen=100)
test=pad_sequences(word_seq_test, maxlen=100)
embeddings_index = {}
f = codecs.open('glove.twitter.27B.200d.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
words_not_found = []
nb_words = min(li_unique.__len__(), len(word_index)+1)
embedding_matrix = np.zeros((nb_words, 200))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

1193515it [02:40, 7445.52it/s] 


## Embedding layer

In [6]:
embedding_layer = Embedding(len(li_unique) ,
                            200,
                            weights=[embedding_matrix],
                            input_length=100,
                            trainable=False)

## LSTM RNN.

In [7]:
embedding_dim = 64
model1 = Sequential([
    embedding_layer,
  Bidirectional(LSTM(embedding_dim, return_sequences=True)),
  Bidirectional(LSTM(embedding_dim,)),
  Dense(6, activation='relu'),
  Dense(2, activation='sigmoid')
])
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          28996800  
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 128)          135680    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 6)                 774       
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 14        
Total params: 29,232,084
Trainable params: 235,284
Non-trainable params: 28,996,800
_________________________________________________________________


## GRU RNN

In [8]:
embedding_dim = 64
model2 = Sequential([
    embedding_layer,
  Bidirectional(GRU(embedding_dim, return_sequences=True)),
  Bidirectional(GRU(embedding_dim,)),
  Dense(6, activation='relu'),
  Dense(2, activation='sigmoid')
])
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          28996800  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          102144    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               74496     
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 14        
Total params: 29,174,228
Trainable params: 177,428
Non-trainable params: 28,996,800
_________________________________________________________________


## CNN

In [9]:
from keras.layers import Flatten
embedding_dim = 64
model3 = Sequential([
    embedding_layer,
  Conv1D(filters=embedding_dim, kernel_size=5, activation='relu'),
  MaxPooling1D(pool_size=2),
  Flatten(),
  #Dense(6, activation='relu'),
  Dense(2, activation='sigmoid')
])
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model3.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          28996800  
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 64)            64064     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 48, 64)            0         
_________________________________________________________________
flatten (Flatten)            (None, 3072)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 6146      
Total params: 29,067,010
Trainable params: 70,210
Non-trainable params: 28,996,800
_________________________________________________________________


In [10]:
num_epochs = 5
history = model1.fit(train,y_train_embed, epochs=num_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
model1.save("lstm_sentiment1.h5")

In [12]:
num_epochs = 5
history = model2.fit(train, y_train_embed, epochs=num_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
model2.save("gru_sentiment1.h5")

In [14]:
num_epochs = 5
history = model3.fit(train,y_train_embed,epochs=num_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
model3.save("cnn_sentiment1.h5")

In [13]:
from keras.models import load_model
model1 = load_model("lstm_sentiment1.h5")
model2 = load_model("gru_sentiment1.h5")
model3 = load_model("cnn_sentiment1.h5")

## Neural network predicting sentiment classes for test data

In [27]:
y_lstm = model1.predict(test)

In [16]:
y_lstm

array([[0.72291374, 0.282772  ],
       [0.31963602, 0.69048655],
       [0.00905776, 0.9916127 ],
       ...,
       [0.01855001, 0.9812727 ],
       [0.3229503 , 0.6856467 ],
       [0.02841553, 0.97106516]], dtype=float32)

In [17]:
y_gru = model2.predict(test)

In [18]:
y_cnn = model3.predict(test)

In [29]:
fin_lstm=[]
for i in range(len(y_lstm)):
    if y_lstm[i][0]>y_lstm[i][1]:
        fin_lstm.append(0)
    else:
        fin_lstm.append(1)

In [33]:
fin_gru=[]
for i in range(len(y_gru)):
    if y_gru[i][0]>y_gru[i][1]:
        fin_gru.append(0)
    else:
        fin_gru.append(1)

In [34]:
fin_cnn=[]
for i in range(len(y_cnn)):
    if y_cnn[i][0]>y_cnn[i][1]:
        fin_cnn.append(0)
    else:
        fin_cnn.append(1)

In [45]:
y_actual=list(test_embed["Sentiment"].values)

In [46]:
y_actual

[0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0

## Classification report of each neural network

In [59]:
from sklearn.metrics import classification_report
target_names = ["positive","negative"]
print(classification_report(y_test, y_pred_knn, target_names=target_names))
score_lstm = metrics.accuracy_score(y_test, y_pred_knn)
print("Accuracy of the trained KNN model is: "+str(100*score_lstm )+"%")

              precision    recall  f1-score   support

    positive       0.72      0.65      0.68     56587
    negative       0.68      0.75      0.71     56530

    accuracy                           0.70    113117
   macro avg       0.70      0.70      0.70    113117
weighted avg       0.70      0.70      0.70    113117

Accuracy of the trained KNN model is: 69.82681648204957%


In [60]:
from sklearn.metrics import classification_report
target_names = ["positive","negative"]
print(classification_report(y_test, y_pred_nb, target_names=target_names))
score_lstm = metrics.accuracy_score(y_test, y_pred_nb)
print("Accuracy of the trained Naive Bayes model is: "+str(100*score_lstm )+"%")

              precision    recall  f1-score   support

    positive       0.75      0.77      0.76     56587
    negative       0.76      0.75      0.76     56530

    accuracy                           0.76    113117
   macro avg       0.76      0.76      0.76    113117
weighted avg       0.76      0.76      0.76    113117

Accuracy of the trained Naive Bayes model is: 75.83475516500614%


In [61]:
from sklearn.metrics import classification_report
target_names = ["positive","negative"]
print(classification_report(y_test, y_pred_rcc, target_names=target_names))
score_lstm = metrics.accuracy_score(y_test, y_pred_rcc)
print("Accuracy of the trained Rocchio Classification model is: "+str(100*score_lstm )+"%")

              precision    recall  f1-score   support

    positive       0.72      0.73      0.72     56587
    negative       0.72      0.71      0.72     56530

    accuracy                           0.72    113117
   macro avg       0.72      0.72      0.72    113117
weighted avg       0.72      0.72      0.72    113117

Accuracy of the trained Rocchio Classification model is: 72.05106217456262%


In [54]:
from sklearn.metrics import classification_report
target_names = ["positive","negative"]
print(classification_report(y_actual, fin_lstm, target_names=target_names))
score_lstm = metrics.accuracy_score(y_actual, fin_lstm)
print("Accuracy of the trained LSTM model is: "+str(100*score_lstm )+"%")

              precision    recall  f1-score   support

    positive       0.79      0.80      0.79     56588
    negative       0.80      0.78      0.79     56530

    accuracy                           0.79    113118
   macro avg       0.79      0.79      0.79    113118
weighted avg       0.79      0.79      0.79    113118

Accuracy of the trained LSTM model is: 79.07229618628335%


In [55]:
target_names = ["positive","negative"]
print(classification_report(y_actual, fin_gru, target_names=target_names))
score_lstm = metrics.accuracy_score(y_actual, fin_gru)
print("Accuracy of the trained GRU model is: "+str(100*score_lstm )+"%")

              precision    recall  f1-score   support

    positive       0.78      0.81      0.79     56588
    negative       0.80      0.77      0.78     56530

    accuracy                           0.79    113118
   macro avg       0.79      0.79      0.79    113118
weighted avg       0.79      0.79      0.79    113118

Accuracy of the trained GRU model is: 78.91759048073693%


In [56]:

target_names = ["positive","negative"]
print(classification_report(y_actual, fin_cnn, target_names=target_names))
score_lstm = metrics.accuracy_score(y_actual, fin_cnn)
print("Accuracy of the trained CNN model is: "+str(100*score_lstm )+"%")

              precision    recall  f1-score   support

    positive       0.78      0.76      0.77     56588
    negative       0.76      0.78      0.77     56530

    accuracy                           0.77    113118
   macro avg       0.77      0.77      0.77    113118
weighted avg       0.77      0.77      0.77    113118

Accuracy of the trained CNN model is: 77.00277586237381%


## LSTM gives the highest accuracy so this RNN was chosen for future sentiment classification