In [28]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc, plot_precision_recall_curve, make_scorer, recall_score, brier_score_loss, precision_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# libraries for cleaning
import re
import nltk
nltk.download("stopwords") # helps us get rid of stop words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# tensorflow stuff
import tensorflow as tf


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\titom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Load the preprocessed tweets and keep only the text and label columns.
df = pd.read_csv("text_preprocessed.csv")[["text", "label"]]

In [30]:
df.head()

Unnamed: 0,text,label
0,coronavirus outbreak live updates odisha conf...,0
1,for today may here s how to follow the lat...,0
2,coronavirus outbreak round the clock updates ...,0
3,watch live gov phil murphy will speak at p...,0
4,to all my friends in the people s republic of ...,0


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221461 entries, 0 to 221460
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    221461 non-null  object
 1   label   221461 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.4+ MB


# VaderSentiment 

I will be using the VADER sentiment package (`vaderSentiment`) to perform some quick sentiment analysis. The GitHub repository is here: https://github.com/cjhutto/vaderSentiment#python-demo-and-code-examples

VaderSentiment is a rule-based sentiment analysis tool designed specifically for social media text, so it is a good fit for our Twitter data.


For activating the virtual environment in Visual Studio Code, see: https://stackoverflow.com/questions/58433333/auto-activate-virtual-environment-in-visual-studio-code

In [32]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [33]:
model_sm = SentimentIntensityAnalyzer()

In [34]:
# Score every tweet with the analyzer and keep only the "compound" entry
# of each result, in the same order as the rows of df.
X = df.text
final_scores = [model_sm.polarity_scores(phrase)["compound"] for phrase in X]

In [35]:
# Peek at the first ten compound scores, then report the dataset-wide mean.
print(final_scores[:10])
mean_sentiment = np.mean(final_scores)
print(f"The average sentiment of the dataset is {mean_sentiment}")

[-0.0772, 0.0, 0.0, 0.0, 0.4767, 0.0, 0.0, 0.1779, 0.0, 0.0]
The average sentiment of the dataset is 0.08583677035685744


In research, the compound score is typically used and then binned as positive, negative, or neutral. The boundaries for the binning are as follows:

- positive sentiment: score >= 0.05
- neutral sentiment: -0.05 < score < 0.05
- negative sentiment: score <= -0.05


In [36]:
# make the scores a new column 
df["raw_sentiment"] = final_scores
df.head()

Unnamed: 0,text,label,raw_sentiment
0,coronavirus outbreak live updates odisha conf...,0,-0.0772
1,for today may here s how to follow the lat...,0,0.0
2,coronavirus outbreak round the clock updates ...,0,0.0
3,watch live gov phil murphy will speak at p...,0,0.0
4,to all my friends in the people s republic of ...,0,0.4767


In [37]:
# Build one boolean mask per sentiment class from the compound score, using
# the thresholds listed in the markdown above:
#   positive: score >= 0.05
#   neutral : -0.05 < score < 0.05
#   negative: score <= -0.05
# The outer classes are inclusive at the boundary, so a score of exactly
# 0.05 or -0.05 is claimed by a mask instead of falling through all three.
pos_bool = df.raw_sentiment >= 0.05
neutral_bool = (df.raw_sentiment > -0.05) & (df.raw_sentiment < 0.05)
negative_bool = df.raw_sentiment <= -0.05

In [38]:
df["sentiment"] = 0
df.head()

Unnamed: 0,text,label,raw_sentiment,sentiment
0,coronavirus outbreak live updates odisha conf...,0,-0.0772,0
1,for today may here s how to follow the lat...,0,0.0,0
2,coronavirus outbreak round the clock updates ...,0,0.0,0
3,watch live gov phil murphy will speak at p...,0,0.0,0
4,to all my friends in the people s republic of ...,0,0.4767,0


In [39]:
# Write the integer class code for each mask into the "sentiment" column
# (1 = positive, 0 = neutral, -1 = negative), then preview the result.
for mask, code in ((pos_bool, 1), (neutral_bool, 0), (negative_bool, -1)):
    df.loc[mask, "sentiment"] = code
df.head()

Unnamed: 0,text,label,raw_sentiment,sentiment
0,coronavirus outbreak live updates odisha conf...,0,-0.0772,-1
1,for today may here s how to follow the lat...,0,0.0,0
2,coronavirus outbreak round the clock updates ...,0,0.0,0
3,watch live gov phil murphy will speak at p...,0,0.0,0
4,to all my friends in the people s republic of ...,0,0.4767,1


In [40]:
# Export the scored frame to CSV; df.info() is shown first as a last look at
# the columns and dtypes being written.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm downstream readers
# expect (or strip) it.
df.info()
df.to_csv("text_preprocessed_sentiment.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221461 entries, 0 to 221460
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   text           221461 non-null  object 
 1   label          221461 non-null  int64  
 2   raw_sentiment  221461 non-null  float64
 3   sentiment      221461 non-null  int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 6.8+ MB


In [41]:
# probabilities merged with sentiment 
probs = pd.read_csv("rnn_probabilities_final.csv")

In [42]:
probs.head()

Unnamed: 0.1,Unnamed: 0,text,label,probabilities
0,0,coronavirus outbreak live updates odisha conf...,0,0.00049
1,1,for today may here s how to follow the lat...,0,0.011021
2,2,coronavirus outbreak round the clock updates ...,0,0.015877
3,3,watch live gov phil murphy will speak at p...,0,0.001323
4,4,to all my friends in the people s republic of ...,0,0.001574


In [43]:
df["probability"] = probs.probabilities
df.head()

Unnamed: 0,text,label,raw_sentiment,sentiment,probability
0,coronavirus outbreak live updates odisha conf...,0,-0.0772,-1,0.00049
1,for today may here s how to follow the lat...,0,0.0,0,0.011021
2,coronavirus outbreak round the clock updates ...,0,0.0,0,0.015877
3,watch live gov phil murphy will speak at p...,0,0.0,0,0.001323
4,to all my friends in the people s republic of ...,0,0.4767,1,0.001574


In [44]:
# Load the second table (df2) and inspect its columns.
# NOTE(review): lineterminator='\n' makes the parser split rows on "\n" only —
# presumably because some text fields contain bare "\r" characters; confirm.
df2 = pd.read_csv("for_tobi_f.csv",lineterminator='\n')
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221461 entries, 0 to 221460
Data columns (total 37 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  221461 non-null  int64  
 1   created_at                  221461 non-null  object 
 2   hashtags                    40892 non-null   object 
 3   media                       33124 non-null   object 
 4   urls                        104470 non-null  object 
 5   favorite_count              221461 non-null  int64  
 6   id                          221461 non-null  int64  
 7   in_reply_to_screen_name     107089 non-null  object 
 8   in_reply_to_status_id       105071 non-null  float64
 9   in_reply_to_user_id         107089 non-null  float64
 10  lang                        221461 non-null  object 
 11  place                       4724 non-null    object 
 12  possibly_sensitive          118759 non-null  object 
 13  retweet_count 

In [45]:
# Keep only the rows whose language code is "en", then re-inspect the frame.
lang_bool = df2["lang"].eq("en")
df2 = df2[lang_bool]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221461 entries, 0 to 221460
Data columns (total 37 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Unnamed: 0                  221461 non-null  int64  
 1   created_at                  221461 non-null  object 
 2   hashtags                    40892 non-null   object 
 3   media                       33124 non-null   object 
 4   urls                        104470 non-null  object 
 5   favorite_count              221461 non-null  int64  
 6   id                          221461 non-null  int64  
 7   in_reply_to_screen_name     107089 non-null  object 
 8   in_reply_to_status_id       105071 non-null  float64
 9   in_reply_to_user_id         107089 non-null  float64
 10  lang                        221461 non-null  object 
 11  place                       4724 non-null    object 
 12  possibly_sensitive          118759 non-null  object 
 13  retweet_count 

In [46]:
# Remove the leftover "Unnamed: 0" index column that came in from the CSV.
# It is dropped by name rather than by position so the cell is safe to re-run:
# a second execution is a no-op instead of silently discarding the next real
# column (created_at).
df2 = df2.drop(columns=["Unnamed: 0"], errors="ignore")
df2.head()

Unnamed: 0,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,...,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified,label,month,year
0,2020-05-01 03:24:58+00:00,,https://twitter.com/worldnews911/status/125606...,https://news.familysouq.net/coronavirus-outbre...,0,1256062026260123648,,,,en,...,,The Medical Weight Loss,MedicalWL,62227,,https://themedicalweightloss.com/,False,0,5,2020
1,2020-05-01 11:35:09+00:00,,https://twitter.com/nytimes/status/12561853840...,https://nyti.ms/2SG4zSv https://nyti.ms/2YlMmN...,89,1256185384050065408,,,,en,...,New York City,The New York Times,nytimes,435062,,http://www.nytimes.com/,True,0,5,2020
2,2020-05-01 06:56:33+00:00,,,https://www.thisdaylive.com/index.php/2020/05/...,2,1256115272114675712,,,,en,...,Nigeria,THISDAY LIVE,THISDAYLIVE,118195,,http://www.thisdaylive.com,False,0,5,2020
3,2020-05-01 14:28:45+00:00,,,https://trib.al/shMX8l9,0,1256229075687014400,,,,en,...,"Belleville, NJ",Belleville NJ Patch,BellevillePatch,14869,,https://patch.com/new-jersey/belleville,False,0,5,2020
4,2020-04-30 17:57:21+00:00,,,https://www.boston.com/news/coronavirus/2020/0...,0,1255919179984785410,,,,en,...,Florida,June,June13462534,2361,,,False,0,4,2020


In [47]:
# Combine the sentiment/probability columns in df with the remaining columns
# of df2. "text" and "label" already exist in df, so they are removed from
# df2 first. A new frame is bound instead of mutating the filtered frame in
# place.
df2 = df2.drop(columns=["text", "label"])

# pd.concat(axis=1) matches rows by index label and pads any mismatch with
# NaN, so fail loudly if the two frames do not cover exactly the same rows.
if not df.index.equals(df2.index):
    raise ValueError(
        "df and df2 have different row indexes; a column-wise concat would misalign rows"
    )

final = pd.concat([df, df2], axis=1)
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
final.info()
final.to_csv("for_tobi_sentiment.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221461 entries, 0 to 221460
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   text                        221461 non-null  object 
 1   label                       221461 non-null  int64  
 2   raw_sentiment               221461 non-null  float64
 3   sentiment                   221461 non-null  int64  
 4   probability                 221461 non-null  float64
 5   created_at                  221461 non-null  object 
 6   hashtags                    40892 non-null   object 
 7   media                       33124 non-null   object 
 8   urls                        104470 non-null  object 
 9   favorite_count              221461 non-null  int64  
 10  id                          221461 non-null  int64  
 11  in_reply_to_screen_name     107089 non-null  object 
 12  in_reply_to_status_id       105071 non-null  float64
 13  in_reply_to_us

In [48]:
print(f"The median sentiment of the dataset is {np.median(final_scores)}")

The median sentiment of the dataset is 0.0


Now the same thing, but for the second dataset.

In [51]:
# Load the second dataset and encode its string labels as integers:
# "fake" -> 1, anything else -> 0.
df = pd.read_csv("english_test_with_labels.csv")
df = df.loc[:, ["tweet", "label"]]
label_change = lambda x: 1 if x == "fake" else 0
y = df["label"]
# Assign by column name so the column is replaced outright and takes the
# integer dtype of the mapped values. A positional `df.iloc[:, 1] = ...`
# tries to write into the existing object-dtype column on pandas >= 2.0,
# which would leave the labels stored as Python objects.
df["label"] = y.map(label_change)

In [52]:
df.head()

Unnamed: 0,tweet,label
0,Our daily update is published. States reported...,0
1,Alfalfa is the only cure for COVID-19.,1
2,President Trump Asked What He Would Do If He W...,1
3,States reported 630 deaths. We are still seein...,0
4,This is the sixth time a global health emergen...,0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2140 entries, 0 to 2139
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   2140 non-null   object
 1   label   2140 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 33.6+ KB


In [55]:
# Repeat the scoring for the second dataset: build an analyzer, take the
# "compound" entry for every tweet, then show the first ten scores and the mean.
model_sm = SentimentIntensityAnalyzer()
X = df.tweet
final_scores = [model_sm.polarity_scores(phrase)["compound"] for phrase in X]
print(final_scores[:10])
mean_sentiment = np.mean(final_scores)
print(f"The average sentiment of the dataset is {mean_sentiment}")

[0.0, 0.0, 0.0, -0.9042, 0.3182, -0.6369, -0.5504, 0.0, 0.3182, -0.5719]
The average sentiment of the dataset is 0.02262051401869159


In [56]:
# make the scores a new column 
df["raw_sentiment"] = final_scores
df.head()

Unnamed: 0,tweet,label,raw_sentiment
0,Our daily update is published. States reported...,0,0.0
1,Alfalfa is the only cure for COVID-19.,1,0.0
2,President Trump Asked What He Would Do If He W...,1,0.0
3,States reported 630 deaths. We are still seein...,0,-0.9042
4,This is the sixth time a global health emergen...,0,0.3182


In [59]:
# probabilities merged with sentiment 
probs = pd.read_csv("dataset2_probabilities.csv")
probs.head()

Unnamed: 0.1,Unnamed: 0,tweet,label,probabilities
0,0,Our daily update is published. States reported...,0,0.02232
1,1,Alfalfa is the only cure for COVID-19.,1,0.880705
2,2,President Trump Asked What He Would Do If He W...,1,0.90461
3,3,States reported 630 deaths. We are still seein...,0,0.007191
4,4,This is the sixth time a global health emergen...,0,0.033745


In [60]:
df["probability"] = probs.probabilities
df.head()

Unnamed: 0,tweet,label,raw_sentiment,probability
0,Our daily update is published. States reported...,0,0.0,0.02232
1,Alfalfa is the only cure for COVID-19.,1,0.0,0.880705
2,President Trump Asked What He Would Do If He W...,1,0.0,0.90461
3,States reported 630 deaths. We are still seein...,0,-0.9042,0.007191
4,This is the sixth time a global health emergen...,0,0.3182,0.033745


In [61]:
np.corrcoef(x = df.raw_sentiment, y = df.probability)

array([[ 1.        , -0.08427296],
       [-0.08427296,  1.        ]])

In [62]:
print(f"The median sentiment of the dataset is {np.median(final_scores)}")

The median sentiment of the dataset is 0.0


In [63]:
# Save the second dataset together with its raw_sentiment and probability columns.
# NOTE(review): no index=False is passed, so the row index is written as an
# extra unnamed first column — confirm downstream readers expect it.
df.to_csv("sentiment_probs_df2.csv")