<a href="https://colab.research.google.com/github/vikram0050/Coronavirus-Tweet-Sentiment-Analysis/blob/main/Vikramaditya_Sah_Coronavirus_Tweet_Sentiment_Analysis_Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <b><u> Project Title : Sentiment Analysis : Predicting sentiment of COVID-19 tweets</u></b>

## <b> Problem Description </b>

### This challenge asks you to build a classification model to predict the sentiment of COVID-19 tweets. The tweets were pulled from Twitter and then tagged manually.

### The names and usernames have been given codes to avoid any privacy concerns.

### You are given the following information:
1. Location
2. Tweet At
3. Original Tweet
4. Label

In [None]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Mount Google Drive at /content/drive so the dataset can be read from it
# (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the tweets CSV from the mounted Drive folder, decoding it as latin-1.
df=pd.read_csv("/content/drive/MyDrive/corona/Coronavirus Tweets.csv" , encoding='latin-1')

# DATA OVERVIEW

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Preview the last rows of the raw frame.
df.tail()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
df.describe(include='all')

In [None]:
# Per-column dtype and non-null count; shows which columns have missing values.
df.info()

In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Distinct labels present in the Sentiment column.
df.Sentiment.unique()

**DATA INFORMATION**

We have 41157 rows and 6 columns in our dataset, with null values present only in Location; this will not affect our model, as we will not be using that feature. Our target variable is Sentiment, which has 5 unique values: 'Neutral', 'Positive', 'Extremely Negative', 'Negative' and 'Extremely Positive'. Of the other 5 features, the only column we really need for this classification project is OriginalTweet.

In [None]:
# Work on a copy so the original frame `df` stays unchanged and can be
# compared against the cleaned text later on.
df1= df.copy()

# EDA

In [None]:
# Number of rows that exactly repeat an earlier row
int(df1.duplicated().sum())

In [None]:
# Number of tweets carrying each sentiment label
df1['Sentiment'].value_counts()

In [None]:
# Bar chart of the number of tweets per sentiment label.
# The column is passed as `x=`: from seaborn 0.12 on, everything after `data`
# is keyword-only, so a positional column name collides with `data=df1`.
sns.catplot(x="Sentiment", data=df1, kind="count", height=7, aspect=1.2)

In [None]:
# Ten most frequent tweet locations. Computed once and kept in `count`,
# which the bar-chart cell plots; the bare name displays it here.
count = df1.Location.value_counts().head(10)
count

In [None]:
# Bar chart of the ten most frequent tweet locations held in `count`
fig, ax = plt.subplots(figsize=(10, 5))
count.plot(kind='bar', ax=ax)
ax.set(title='Location wise Tweet Count', xlabel='Location', ylabel='Tweet Count')
plt.show()

The largest numbers of tweets come from London and the US.

# TEXT PREPROCESSING

**REMOVING LINKS/URLs**

In [None]:
import re

# Replace every http/https link with a single space; each cell is stringified first.
df1['OriginalTweet'] = df1['OriginalTweet'].map(
    lambda tweet: re.sub('https?://[A-Za-z0-9./]+', ' ', str(tweet))
)

In [None]:
# Tweet 5 from the original, unmodified frame
df.OriginalTweet[5]

In [None]:
# The same tweet in the working copy, after URL removal
df1.OriginalTweet[5]

In [None]:
# Replace @mentions with a space. The pattern is a raw string so that `\w`
# reaches the regex engine verbatim instead of being an invalid string escape
# (which triggers a SyntaxWarning on Python 3.12+).
df1['OriginalTweet'] = df1['OriginalTweet'].apply(lambda x: re.sub(r'@\w*', ' ', str(x)))

In [None]:
# Tweet 5 after @mention removal
df1.OriginalTweet[5]

In [None]:
# Blank out every character that is not an ASCII letter (digits, punctuation, symbols).
df1['OriginalTweet'] = df1['OriginalTweet'].map(
    lambda tweet: re.sub('[^a-zA-Z]', ' ', str(tweet))
)

In [None]:
# Tweet 5 after everything except letters has been blanked out
df1.OriginalTweet[5]

In [None]:
# Download the NLTK stop-word corpus and load the English list into `stop`,
# which remove_stopwords() reads.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop = stopwords.words('english')

In [None]:
def remove_stopwords(text):
    """Tokenize a tweet and drop stop words.

    Parameters
    ----------
    text : str
        Text to split on whitespace.

    Returns
    -------
    list of str
        The lower-cased words of `text`, in order, excluding every word
        found in the module-level `stop` list.
    """
    # A set gives constant-time membership tests instead of scanning the list per word.
    stop_words = frozenset(stop)
    # Lower-case each word once, then filter.
    lowered = (word.lower() for word in text.split())
    return [word for word in lowered if word not in stop_words]

In [None]:
# Each tweet becomes a list of lower-cased, non-stop-word tokens
df1['OriginalTweet'] = df1['OriginalTweet'].apply(remove_stopwords)

In [None]:
# Tweet 5 after stop-word removal (now a list of lower-cased tokens)
df1.OriginalTweet[5]

**REMOVING SHORT WORDS**

In [None]:
# Keep only tokens longer than 3 characters. A list (not a generator) is stored
# so each cell holds concrete values that can be inspected or iterated more than once.
df1['OriginalTweet'] = df1['OriginalTweet'].apply(lambda x: [w for w in x if len(w)>3])

**STEMMING**

In [None]:
# Import only the stemmer class; a wildcard import would pull every public
# name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance used by stemming().
stemmer = PorterStemmer()

In [None]:
def stemming(text):
    """Stem every token in `text` with the shared `stemmer` and join the stems with single spaces."""
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

In [None]:
# Collapse each token list into one space-separated string of stems
df1['OriginalTweet'] = df1['OriginalTweet'].apply(stemming)

In [None]:
# Tweet 5 after stemming (tokens re-joined into a single string)
df1.OriginalTweet[5]

**DATA ENCODING**

We will now encode our target variable - Sentiment.

In [None]:
#function to encode 

def encode(sentiment):
    """Collapse the five sentiment labels into three integer classes.

    Returns 0 for 'Neutral', 1 for 'Positive' or 'Extremely Positive',
    and -1 for any other value (which covers 'Negative' and
    'Extremely Negative').
    """
    if sentiment == 'Neutral':
        return 0
    if sentiment in ('Positive', 'Extremely Positive'):
        return 1
    # Everything else falls through to the negative class.
    return -1
    

In [None]:
# Replace the text labels with the integer classes from encode().
# Not idempotent: running this cell a second time feeds the already-encoded
# integers through encode()'s else-branch, turning every label into -1.
df1['Sentiment'] = df1['Sentiment'].apply(encode)

In [None]:
# First five rows after text cleaning and label encoding
df1.head(5)

In [None]:
# Class balance after the five labels were collapsed into three.
# The column is passed as `x=`: from seaborn 0.12 on, everything after `data`
# is keyword-only, so a positional column name collides with `data=df1`.
plt.figure(figsize=(10,5))
sns.countplot(x='Sentiment', data=df1)
plt.title("Counts of Sentiments after encoding")

# WORD CLOUD

In [None]:
from wordcloud import WordCloud, STOPWORDS
# NOTE(review): this rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` reader imported in the stop-word cell. The
# vectorization cell re-imports the reader, but re-running the NLTK stop-word
# cell after this one would fail because a set has no `.words`.
stopwords = set(STOPWORDS)

In [None]:
# Word cloud of all tweets labelled neutral (Sentiment == 0)
words = ' '.join(df1.loc[df1['Sentiment'] == 0, 'OriginalTweet'])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(words)

# Render the cloud on a square canvas with no axes and no padding
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Word cloud of all tweets labelled positive (Sentiment == 1)
words = ' '.join(df1.loc[df1['Sentiment'] == 1, 'OriginalTweet'])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(words)

# Render the cloud on a square canvas with no axes and no padding
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Word cloud of all tweets labelled negative (Sentiment == -1)
words = ' '.join(df1.loc[df1['Sentiment'] == -1, 'OriginalTweet'])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=10,
).generate(words)

# Render the cloud on a square canvas with no axes and no padding
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

# **VECTORIZATION AND DATA SPLIT**

In [None]:
# Keep only the cleaned text and the encoded label for modelling
df2 = df1.loc[:, ['OriginalTweet', 'Sentiment']]

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the encoded sentiment so both parts keep the class mix
train, test = train_test_split(
    df2,
    test_size=0.2,
    random_state=0,
    stratify=df2['Sentiment'].values,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Bag-of-words counts; the vocabulary is learned from the training tweets only
stop = list(stopwords.words('english'))
vectorizer = CountVectorizer(decode_error='replace', stop_words=stop)

# Targets
y_train = train['Sentiment'].values
y_test = test['Sentiment'].values

# Features: fit on train, then reuse the fitted vocabulary for test
x_train = vectorizer.fit_transform(train['OriginalTweet'].values)
x_test = vectorizer.transform(test['OriginalTweet'].values)


In [None]:
# Report the dimensions of each piece of the split
for label, part in (
    ("X_train.shape : ", x_train),
    ("X_test.shape : ", x_test),
    ("y_train.shape : ", y_train),
    ("y_test.shape : ", y_test),
):
    print(label, part.shape)

**DATAFRAME TO STORE EVALUATION METRICS**

I will store the evaluation metrics for each model into this data frame to compare at the end.

In [None]:
# `error_df` collects one row of evaluation metrics per model;
# `i` is the row label that the next model's metrics are written to.
i=0
error_df=pd.DataFrame()

# LOGISTIC REGRESSION

From this point we'll fit the data in various models and get our output.

In [None]:
# L1-regularised logistic regression with fixed (not searched) hyperparameters.
# random_state pins liblinear's internal data shuffling so the reported scores
# are reproducible from run to run.
logr = LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=0)

In [None]:
# Train the logistic regression on the vectorized training tweets
logr.fit(x_train, y_train)

In [None]:
# Predicted classes for the test features; `y_pred` is read by the metrics cell
y_pred = logr.predict(x_test)

In [None]:
# Score the current `y_pred` against the test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print('Accuracy :', acc)
print('Precision :', prec)
print('Recall :', recall)
print('F1 score :', f1score)

In [None]:
# Write this model's rounded metrics into row `i` of the comparison table,
# one column at a time, then move the row counter on.
row = {
    "Model_Name": 'LOGISTIC REGRESSION',
    "Accuracy": round(acc, 4),
    "Precision": round(prec, 4),
    "Recall": round(recall, 4),
    "F1 score": round(f1score, 4),
}
for column, value in row.items():
    error_df.loc[i, column] = value

i += 1

# XGBoost CLASSIFIER

Note: no hyperparameter tuning is done here, because the manually chosen settings I tried gave worse results than the default parameters.

In [None]:
# Fit XGBoost with its default hyperparameters.
# NOTE(review): y_train holds the labels -1/0/1 produced by encode(); recent
# xgboost releases are understood to require class labels 0..n_classes-1 —
# confirm against the installed version.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [None]:
# Predicted classes for the test features; `y_pred` is read by the metrics cell
y_pred = xgb.predict(x_test)

In [None]:
# Score the current `y_pred` against the test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print('Accuracy :', acc)
print('Precision :', prec)
print('Recall :', recall)
print('F1 score :', f1score)

In [None]:
# Write this model's rounded metrics into row `i` of the comparison table,
# one column at a time, then move the row counter on.
row = {
    "Model_Name": 'XGBOOST CLASSIFIER',
    "Accuracy": round(acc, 4),
    "Precision": round(prec, 4),
    "Recall": round(recall, 4),
    "F1 score": round(f1score, 4),
}
for column, value in row.items():
    error_df.loc[i, column] = value

i += 1


# KNN CLASSIFIER


In [None]:
# Candidate neighbourhood sizes 1 through 8 for the grid search
param = {'n_neighbors': list(range(1, 9))}
reg = KNeighborsClassifier()

In [None]:
# Cross-validated grid search over `param`; `knn` is the fitted search object,
# which the prediction cell calls predict() on.
knn = GridSearchCV(estimator=reg,param_grid=param)
knn.fit(x_train, y_train)

In [None]:
# Predicted classes for the test features; `y_pred` is read by the metrics cell
y_pred = knn.predict(x_test)

In [None]:
# Score the current `y_pred` against the test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print('Accuracy :', acc)
print('Precision :', prec)
print('Recall :', recall)
print('F1 score :', f1score)


In [None]:
# Write this model's rounded metrics into row `i` of the comparison table,
# one column at a time, then move the row counter on.
row = {
    "Model_Name": 'KNN CLASSIFIER',
    "Accuracy": round(acc, 4),
    "Precision": round(prec, 4),
    "Recall": round(recall, 4),
    "F1 score": round(f1score, 4),
}
for column, value in row.items():
    error_df.loc[i, column] = value

i += 1

# SVM CLASSIFIER

In [None]:
# Grid of RBF-kernel settings (C x gamma) wrapped in a GridSearchCV.
# NOTE(review): the search object assigned to `svm` here is never fitted —
# the next cell rebinds `svm` to a plain SVC() before calling fit().
reg = SVC()
param = {     'C': [0.1, 1, 10, 1000],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf']
        }
svm = GridSearchCV(reg,param)

In [None]:
# Fit an SVC with default hyperparameters.
# NOTE(review): this rebinding replaces the GridSearchCV built in the previous
# cell, so the SVM metrics reported below are for an untuned SVC().
svm = SVC()
svm.fit(x_train, y_train)

In [None]:
# Predicted classes for the test features; `y_pred` is read by the metrics cell
y_pred = svm.predict(x_test)

In [None]:
# Score the current `y_pred` against the test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print('Accuracy :', acc)
print('Precision :', prec)
print('Recall :', recall)
print('F1 score :', f1score)

In [None]:
# Write this model's rounded metrics into row `i` of the comparison table,
# one column at a time, then move the row counter on.
row = {
    "Model_Name": 'SVM CLASSIFIER',
    "Accuracy": round(acc, 4),
    "Precision": round(prec, 4),
    "Recall": round(recall, 4),
    "F1 score": round(f1score, 4),
}
for column, value in row.items():
    error_df.loc[i, column] = value

i += 1

# RANDOM FOREST CLASSIFIER

In [None]:
# Randomized search over forest size and tree depth with 5-fold CV on all cores.
# Both the forest and the search are seeded: without random_state the sampled
# parameter combinations and the bootstrapped trees differ on every run, so the
# reported scores could not be reproduced.
classifier = RandomForestClassifier(random_state=0)
parameters = {'n_estimators':[100, 200, 300], 'max_depth':[80, 90, 100, 110]}
rf = RandomizedSearchCV(classifier, param_distributions=parameters, cv=5, n_jobs=-1, random_state=0)

In [None]:
# Run the randomized search on the vectorized training tweets
rf.fit(x_train, y_train)

In [None]:
# Predicted classes for the test features; `y_pred` is read by the metrics cell
y_pred = rf.predict(x_test)

In [None]:
# Score the current `y_pred` against the test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1score = f1_score(y_test, y_pred, average='weighted')

print('Accuracy :', acc)
print('Precision :', prec)
print('Recall :', recall)
print('F1 score :', f1score)

In [None]:
# Write this model's rounded metrics into row `i` of the comparison table,
# one column at a time, then move the row counter on.
row = {
    "Model_Name": 'RANDOM FOREST CLASSIFIER',
    "Accuracy": round(acc, 4),
    "Precision": round(prec, 4),
    "Recall": round(recall, 4),
    "F1 score": round(f1score, 4),
}
for column, value in row.items():
    error_df.loc[i, column] = value

i += 1

# MODEL COMPARISON

We will now compare performance of all the classification models-

In [None]:
# Rank the models by weighted F1, best first, and renumber the rows from 0.
# reset_index(drop=True) discards the old row labels outright, so no stray
# 'index' column is left behind in error_df and the cell can be re-run without
# reset_index failing on an already-existing 'index' column.
error_df = error_df.sort_values(by='F1 score', ascending=False).reset_index(drop=True)
error_df

# CONCLUSION
We can conclude that logistic regression is the best model for our dataset, followed closely by the SVM classifier and the random forest classifier. The XGBoost and KNN classifiers did not perform as well as the others.