# User Sentiment Classification on Coronavirus Tweets

In [16]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import \
classification_report, accuracy_score,confusion_matrix 

# Model using tf-idf and bow

In [10]:
dataset= pd.read_csv(
    '/Documents/kaggleDatasets/coronavirus_tweets/Corona_NLP_test.csv')

In [14]:
dataset.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [15]:
#tokenization
tokenizer =ToktokTokenizer()
#stopwords removal
stopword_list= nltk.corpus.stopwords.words('english')

In [18]:
#remving hmtl strips
def strip_html(text):
    soup= BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_special_characters(text):
    pattern= r'[^a-zA-Z0-0\s]'
    text=re.sub(pattern, '', text)
    return text
def simple_stemmer(text):
    ps= nltk.porter.PorterStemmer()
    text=' '.join([ps.stem(word) for word in \
                   text.split()])
    return text

stop= set(stopwords.words("english"))

def remove_stopwords(text):
    tokens= tokenizer.tokenize(text)
    tokens= [token.strip() for token in tokens]
    filtered_tokens= [token for token in tokens \
                      if token \
                      not in stopword_list]
    filtered_text= ' '.join(filtered_tokens)
    return filtered_text


In [67]:
#apply preprocessing functions
df=dataset.copy()

#do convert to lowercase before preprocessing
df['text']=df['OriginalTweet'].str.lower()

df['text']=df['text'].apply(strip_html)

df['text']=df['text'].apply(remove_special_characters)

df['text']=df['text'].apply(simple_stemmer)

df['text']=df['text'].apply(remove_stopwords)

#df.text has now all the preprocessing steps applied sequentially 


In [68]:
#label encode the taregt column, it has 5 classes
df['Sentiment']=df['Sentiment'].astype('category')
df['Sentiment_coded']= df.Sentiment.cat.codes

In [110]:
df.shape

(3798, 8)

In [194]:
#shuffle dataset at random
df= df.sample(df.shape[0])

#train tets split
xtrain, ytrain, xtest, ytest = df.text[:3000], \
            df.Sentiment_coded[:3000],\
            df.text[3000:], \
            df.Sentiment_coded[3000:]

            

In [195]:
#count vectorizer for bag of words
cv= CountVectorizer(min_df=0, max_df=1 ,\
                    ngram_range=(1,3))

#transformed train data
cv_train= cv.fit_transform(xtrain)

#transform test data
cv_test= cv.transform(xtest)

print('bow_cv_train',cv_train.shape)
print('bow_cv_test', cv_test.shape)

bow_cv_train (3000, 94019)
bow_cv_test (798, 94019)


In [196]:
#tfidf vectorizer
tv= TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))
#transform train data
tv_train= tv.fit_transform(xtrain)

#transform text data
tv_test= tv.transform(xtest)


print('tfidf_train',tv_train.shape)
print('tfids_test', tv_test.shape)

tfidf_train (3000, 94019)
tfids_test (798, 94019)


In [None]:
#training model
lr= LogisticRegression(penalty='l2',max_iter=50,\
                       C=1, random_state=42)

#fitting model for bow
lr_bow= lr.fit(cv_train, ytrain)
# print(lr_bow)

#fititng model for tfidf
lr_tfidf= lr.fit(tv_train, ytrain)
# print(lr_tfidf)

In [123]:
#predicting model for bow
lr_bow_predict= lr_bow.predict(cv_test)

#predict for tfidf
lr_tfidf_predict= lr_tfidf.predict(tv_test)

In [124]:
#classification report 
report_bow= classification_report(ytest,\
                                  lr_bow_predict)

print(report_bow)

report_tfidf= classification_report(ytest,\
                                    lr_tfidf_predict)
print(report_tfidf)

              precision    recall  f1-score   support

           0       0.86      0.05      0.09       130
           1       0.42      0.04      0.08       114
           2       0.30      0.75      0.43       224
           3       0.57      0.06      0.11       133
           4       0.30      0.32      0.31       197

    accuracy                           0.31       798
   macro avg       0.49      0.24      0.20       798
weighted avg       0.45      0.31      0.24       798

              precision    recall  f1-score   support

           0       1.00      0.02      0.05       130
           1       1.00      0.02      0.03       114
           2       0.29      0.85      0.43       224
           3       0.83      0.04      0.07       133
           4       0.28      0.19      0.23       197

    accuracy                           0.30       798
   macro avg       0.68      0.22      0.16       798
weighted avg       0.60      0.30      0.20       798



In [125]:
cm_bow=confusion_matrix(ytest, lr_bow_predict)
print(cm_bow)

cm_tfidf= confusion_matrix(ytest, lr_tfidf_predict)
print(cm_tfidf)

[[  6   0 101   2  21]
 [  0   5  65   1  43]
 [  0   4 167   2  51]
 [  0   1  92   8  32]
 [  1   2 130   1  63]]
[[  3   0 117   0  10]
 [  0   2  82   0  30]
 [  0   0 190   0  34]
 [  0   0 103   5  25]
 [  0   0 158   1  38]]


# Neural Network for tfidf 

In [258]:
#convert sparse representaion to dense for tfidf matrix
xtrain = scipy.sparse.csr_matrix.todense(tv_train)
xtest = scipy.sparse.csr_matrix.todense(tv_test)
# ytrain= ytrain
# ytest=ytest

In [259]:
x_train.shape

(3000, 94019)

In [260]:
#with scipy.sparse representaion no need to specify the dim of input layer

nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(48, activation='relu', 
                    input_shape=(tv_train.shape[1],)),
   tf.keras.layers.Dropout(0.2),
   tf.keras.layers.Dense(24, activation='relu'),
   tf.keras.layers.Dropout(0.1),
   tf.keras.layers.Dense(5, 
                         activation='softmax')
])
#last layer is softmax as it is multiclass classification
#numbr of nodes in last layer= number of classes in target

In [261]:
#sparse crosentropy loss as most values in tfidf matrix is zero

nn_model.compile(loss='sparse_categorical_crossentropy'
              , optimizer='adam',
              metrics=['accuracy'])

In [262]:
history = nn_model.fit(x_train, y_train, epochs=10,
                    validation_data=(x_test, y_test), verbose=2)

Epoch 1/10
94/94 - 14s - loss: 1.5968 - accuracy: 0.2637 - val_loss: 1.5799 - val_accuracy: 0.2820
Epoch 2/10
94/94 - 3s - loss: 1.4564 - accuracy: 0.3947 - val_loss: 1.5594 - val_accuracy: 0.2957
Epoch 3/10
94/94 - 4s - loss: 0.7890 - accuracy: 0.8023 - val_loss: 1.5696 - val_accuracy: 0.3108
Epoch 4/10
94/94 - 5s - loss: 0.1992 - accuracy: 0.9847 - val_loss: 1.6455 - val_accuracy: 0.3045
Epoch 5/10
94/94 - 5s - loss: 0.0535 - accuracy: 0.9967 - val_loss: 1.7167 - val_accuracy: 0.3070
Epoch 6/10
94/94 - 5s - loss: 0.0307 - accuracy: 0.9970 - val_loss: 1.7630 - val_accuracy: 0.2982
Epoch 7/10
94/94 - 6s - loss: 0.0217 - accuracy: 0.9970 - val_loss: 1.8006 - val_accuracy: 0.3008
Epoch 8/10
94/94 - 4s - loss: 0.0150 - accuracy: 0.9977 - val_loss: 1.8554 - val_accuracy: 0.2932
Epoch 9/10
94/94 - 4s - loss: 0.0138 - accuracy: 0.9970 - val_loss: 1.8782 - val_accuracy: 0.2970
Epoch 10/10
94/94 - 6s - loss: 0.0126 - accuracy: 0.9973 - val_loss: 1.9225 - val_accuracy: 0.2870


In [None]:
#getting model prediction
test_pred_nn_tfidf =  nn_model.predict(xtest)

#getting index of maximum probability
test_pred_nn_tfidf=[ np.argmax(x)\
                    for x in test_pred_nn_tfidf]
#classification report
report_nn_vector= classification_report(ytest,\
                                test_pred_nn_tfidf)

print(report_nn_vector)

#confusion matrix
cm_nn_vector= confusion_matrix(ytest, test_pred_nn1)

print(cm_nn_vector)

# Model using pretrained models and logistic

In [164]:
model_url=
"https://tfhub.dev/google/universal-sentence-encoder/4"
model= hub.load(model_url)

In [204]:
def embed(input):
    return model([input])

#making one copy of dataset for pretrained models
df1= dataset.copy()



In [205]:
#vectorizing tweet column using universal snetence encoder
df1['text_vector']= df1.OriginalTweet.map(embed)
df1['text_vector']= df1.text_vector.map(np.array)

#save transformed data locally 
pickle.dump(df1, open('dataset_covid_tweets.pkl'))

In [206]:
#encoding dsentiment column 
df1['Sentiment']=df1['Sentiment'].astype('category')
df1['Sentiment_coded']= df1.Sentiment.cat.codes

In [240]:
#shuffle dataset at random
df1= df1.sample(df1.shape[0])

#spiltting test and train data
xtrain1, ytrain1, xtest1, ytest1 = 
            df1['text_vector'][:3000], \
            df1.Sentiment_coded[:3000],\
            df1['text_vector'][3000:], \
            df1.Sentiment_coded[3000:]

In [241]:
#convert 2d list to 1 d list for text vector
import itertools
xtrain1= list(itertools.chain(*xtrain1))
print(len(xtrain1))
xtrain1=np.mat(xtrain1)
print(xtrain1.shape)
xtest1= list(itertools.chain(*xtest1))
xtest1=np.mat(xtest1)

#xtrain1 and xtest1 are lists of vectors

3000
(3000, 512)


In [134]:
lr= LogisticRegression(penalty='l2',\
                max_iter=500,C=1, random_state=42)

#fitting model for embedding vectors
lr_vector= lr.fit( xtrain1, ytrain1)

In [135]:
#predicting model for vector embedding
lr_vector_predict= lr_vector.predict(xtest1)

#classification report
report_lr_vector= classification_report(ytest1,\
                                lr_vector_predict)

print(report_lr_vector)

#confusion matrix
cm_lr_vector= confusion_matrix(ytest1,\
                            lr_vector_predict)

print(cm_lr_vector)

              precision    recall  f1-score   support

           0       0.50      0.42      0.46       120
           1       0.53      0.42      0.47       123
           2       0.35      0.48      0.40       201
           3       0.56      0.53      0.54       140
           4       0.40      0.36      0.38       214

    accuracy                           0.44       798
   macro avg       0.47      0.44      0.45       798
weighted avg       0.45      0.44      0.44       798

[[51  2 54  2 11]
 [ 9 52 23  5 34]
 [29 11 96 20 45]
 [ 5  5 34 74 22]
 [ 9 28 70 31 76]]


# NN Model on encoding

In [227]:
#if data has (bacthsize*d2) then input layer should be of dim- d2
#data here has 512 features , so input size is set to 512
nn_model1 = tf.keras.Sequential([
   tf.keras.layers.InputLayer(input_shape=(512,)),
   tf.keras.layers.Dense(20, activation='relu', 
                input_shape=(tv_train.shape[1],)),
   tf.keras.layers.Dropout(0.2),
   tf.keras.layers.Dense(24, activation='relu'),
   tf.keras.layers.Dropout(0.1),
   tf.keras.layers.Dense(5, 
                         activation='softmax')
])

In [228]:
nn_model1.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_46 (Dense)             (None, 20)                10260     
_________________________________________________________________
dropout_26 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_47 (Dense)             (None, 24)                504       
_________________________________________________________________
dropout_27 (Dropout)         (None, 24)                0         
_________________________________________________________________
dense_48 (Dense)             (None, 5)                 125       
Total params: 10,889
Trainable params: 10,889
Non-trainable params: 0
_________________________________________________________________


In [229]:
nn_model1.compile(loss='sparse_categorical_crossentropy'
              , optimizer='adam',
              metrics=['accuracy'])

In [230]:
history1= nn_model1.fit(xtrain1,
                        ytrain1, epochs=10,
                    validation_data=(xtest1, ytest1), verbose=2)

Epoch 1/10
94/94 - 2s - loss: 1.5795 - accuracy: 0.2557 - val_loss: 1.5451 - val_accuracy: 0.2907
Epoch 2/10
94/94 - 0s - loss: 1.5170 - accuracy: 0.3100 - val_loss: 1.4812 - val_accuracy: 0.3471
Epoch 3/10
94/94 - 0s - loss: 1.4341 - accuracy: 0.3743 - val_loss: 1.4109 - val_accuracy: 0.3759
Epoch 4/10
94/94 - 0s - loss: 1.3561 - accuracy: 0.4213 - val_loss: 1.3701 - val_accuracy: 0.3922
Epoch 5/10
94/94 - 0s - loss: 1.3066 - accuracy: 0.4517 - val_loss: 1.3518 - val_accuracy: 0.4148
Epoch 6/10
94/94 - 0s - loss: 1.2765 - accuracy: 0.4550 - val_loss: 1.3481 - val_accuracy: 0.4185
Epoch 7/10
94/94 - 0s - loss: 1.2590 - accuracy: 0.4620 - val_loss: 1.3491 - val_accuracy: 0.3972
Epoch 8/10
94/94 - 0s - loss: 1.2302 - accuracy: 0.4760 - val_loss: 1.3460 - val_accuracy: 0.4135
Epoch 9/10
94/94 - 0s - loss: 1.2176 - accuracy: 0.4990 - val_loss: 1.3461 - val_accuracy: 0.4085
Epoch 10/10
94/94 - 0s - loss: 1.2136 - accuracy: 0.4847 - val_loss: 1.3535 - val_accuracy: 0.4135


In [251]:
#getting model prediction
test_pred_nn =  nn_model1.predict(xtest1)

#getting index of maximum probability
test_pred_nn1=[ np.argmax(x) for x in test_pred_nn]
#classification report
report_nn_vector= classification_report(ytest1,\
                                        test_pred_nn1)

print(report_nn_vector)

#confusion matrix
cm_nn_vector= confusion_matrix(ytest1, test_pred_nn1)

print(cm_nn_vector)

              precision    recall  f1-score   support

           0       0.59      0.46      0.51       120
           1       0.62      0.42      0.50       114
           2       0.46      0.61      0.53       219
           3       0.58      0.55      0.57       134
           4       0.48      0.47      0.48       211

    accuracy                           0.52       798
   macro avg       0.54      0.50      0.52       798
weighted avg       0.53      0.52      0.51       798

[[ 55   1  56   5   3]
 [  3  48  18   1  44]
 [ 28   6 134  22  29]
 [  0   1  26  74  33]
 [  8  22  56  25 100]]
