# Covid-19 Twitter Sentiment Analysis - NLP Project

In [30]:
# Importing the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
# Read the labelled tweets from the training CSV into a DataFrame and preview the first rows

data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [15]:
# Dimensions of the dataset as (rows, columns)

n_rows, n_columns = data.shape
(n_rows, n_columns)

(31962, 2)

In [16]:
# Column labels currently present in the dataset

column_index = data.columns
column_index

Index(['label', 'tweet'], dtype='object')

In [63]:
# Class balance: number of tweets carrying each value of the "label" column

label_counts = data["label"].value_counts()
label_counts

0    29720
1     2242
Name: label, dtype: int64

In [17]:
# Concise summary of the dataset: index range, column names, non-null counts, dtypes and memory usage

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 2 columns):
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(1), object(1)
memory usage: 499.5+ KB


In [6]:
# Drop the "id" column: it only identifies rows and carries no signal for the analysis.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent, so re-running it after "id" is already gone no longer raises a KeyError.

data = data.drop("id", axis=1, errors="ignore")
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [18]:
# Strip '@' mentions (an '@' followed by word characters) from every tweet

import warnings
# NOTE(review): this blanket filter is kept so the rest of the session behaves as before,
# but it silences every warning from here on - consider removing it.
warnings.filterwarnings("ignore")

# Vectorised column assignment instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].str.replace(r"@\w+", "", regex=True)

data.head()

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for #lyft credit i can't use cause th...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [19]:
# Strip '#' hashtags (a '#' followed by word characters) from every tweet

# Vectorised column assignment instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].str.replace(r"#\w+", "", regex=True)

data.head()

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for credit i can't use cause they do...
2,0,bihday your majesty
3,0,i love u take with u all the time in urð±...
4,0,factsguide: society now


In [23]:
# Keep only ASCII letters: every other character (digits, punctuation, emoji bytes, ...)
# is replaced by a single space so neighbouring words stay separated

# Vectorised column assignment instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].str.replace('[^a-zA-Z]', ' ', regex=True)

data.head()

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for credit i can t use cause they do...
2,0,bihday your majesty
3,0,i love u take with u all the time in ur ...
4,0,factsguide society now


In [26]:
# Convert every tweet to lowercase

# Vectorised column assignment instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].str.lower()

data.head()

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for credit i can t use cause they do...
2,0,bihday your majesty
3,0,i love u take with u all the time in ur ...
4,0,factsguide society now


In [27]:
# Tokenization - split each tweet string into a list of word tokens

# Column-wise apply instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].apply(nltk.word_tokenize)

data.head()

Unnamed: 0,label,tweet
0,0,"[when, a, father, is, dysfunctional, and, is, ..."
1,0,"[thanks, for, credit, i, can, t, use, cause, t..."
2,0,"[bihday, your, majesty]"
3,0,"[i, love, u, take, with, u, all, the, time, in..."
4,0,"[factsguide, society, now]"


In [34]:
# English stop words shipped with NLTK, i.e. very common words of the language

english_stopword_list = stopwords.words('english')
english_stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [35]:
# Lemmatization - drop English stop words and reduce every remaining token to its canonical form

lemmatizer = WordNetLemmatizer()

# Build the stop-word set once; the previous version rebuilt it for every single token.
english_stopwords = set(stopwords.words('english'))

# Column-wise apply instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords]
)

data.head()

Unnamed: 0,label,tweet
0,0,"[father, dysfunctional, selfish, drag, kid, dy..."
1,0,"[thanks, credit, use, cause, offer, wheelchair..."
2,0,"[bihday, majesty]"
3,0,"[love, u, take, u, time, ur]"
4,0,"[factsguide, society]"


In [36]:
# Join the separate tokens of each tweet back into one space-delimited string

# Column-wise apply instead of `data.tweet[i] = ...` inside a Python loop:
# that element-wise chained assignment can trigger SettingWithCopyWarning and does not
# write back to `data` when pandas Copy-on-Write is enabled.
data["tweet"] = data["tweet"].apply(" ".join)

data.head()

Unnamed: 0,label,tweet
0,0,father dysfunctional selfish drag kid dysfunction
1,0,thanks credit use cause offer wheelchair van pdx
2,0,bihday majesty
3,0,love u take u time ur
4,0,factsguide society


In [39]:
# Feature extraction - turn the cleaned tweets into TF-IDF vectors (vocabulary capped at 5000 terms)
# NOTE(review): the vectorizer is fitted on all tweets before the train/test split, so its
# vocabulary and IDF weights also see the rows later used for testing - consider fitting on
# the training portion only.

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(data["tweet"])

# Densify the result into a plain NumPy array
X = tfidf_matrix.toarray()

In [47]:
# Shape of the dense TF-IDF feature matrix: (number of tweets, number of TF-IDF features)
X.shape

(31962, 5000)

In [65]:
# Target vector: the "label" column of the dataset
Y = data["label"]
Y.shape

(31962,)

In [51]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is reproducible
# NOTE(review): the split is not stratified although the label counts are heavily skewed -
# consider passing stratify=Y.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=1,
)

# Report the shape of every partition, one per line
for partition in (X_train, Y_train, X_test, Y_test):
    print(partition.shape)

In [56]:
# Model building - fit two Naive Bayes classifiers on the training split and predict the test split

from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Gaussian Naive Bayes (better suited for continuous data)
gaussianNB_model = GaussianNB().fit(X_train, Y_train)  # fit() returns the fitted estimator
Y_pred1 = gaussianNB_model.predict(X_test)

# Multinomial Naive Bayes (suited to discrete features such as word counts in text classification)
multinomialNB_model = MultinomialNB().fit(X_train, Y_train)
Y_pred2 = multinomialNB_model.predict(X_test)

In [57]:
# Model evaluation - confusion matrices and accuracy scores for both classifiers

from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix for the GaussianNB model (rows: true label, columns: predicted label)

confusion_matrix(y_true=Y_test, y_pred=Y_pred1)

array([[3756, 2204],
       [ 155,  278]], dtype=int64)

In [64]:
# Confusion matrix for the MultinomialNB model (rows: true label, columns: predicted label)

multinomial_confusion = confusion_matrix(y_true=Y_test, y_pred=Y_pred2)
print(multinomial_confusion)

[[5947   13]
 [ 336   97]]


In [61]:
# Accuracy on the test split for both Naive Bayes models
# NOTE(review): with labels this imbalanced, accuracy alone can look good while most
# label-1 tweets are still missed - read it together with the confusion matrices.

gaussian_accuracy = accuracy_score(Y_test, Y_pred1)
multinomial_accuracy = accuracy_score(Y_test, Y_pred2)

print("Accuracy score for GaussianNB model : ", gaussian_accuracy)
print("Accuracy score for MultinomialNB model : ", multinomial_accuracy)

Accuracy score for GaussianNB model :  0.6310026591584545
Accuracy score for MultinomialNB model :  0.9454090411387455
