# Data Processing

## Import libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import requests

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old names; use whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

import warnings
# Installs a blanket 'ignore' filter: every warning category is suppressed from
# here on, which includes pandas' SettingWithCopyWarning and deprecation notices.
warnings.filterwarnings('ignore')

## Import csv to DataFrame

#### Tweets data

In [3]:
# Load the raw tweet export and preview its first two rows.
df_tweets = pd.read_csv(filepath_or_buffer='api_csv/tweets_results.csv')
df_tweets.head(n=2)

Unnamed: 0,item.id,item.author_id,item.created_at,item.source,"item.public_metrics[""retweet_count""]","item.public_metrics[""reply_count""]","item.public_metrics[""like_count""]","item.public_metrics[""like_count""].1",item.text
0,1487185499915706371,613649581,2022-01-28 22:07:08+00:00,Twitter Web App,0,2,2,2,@FamiLee_Farm @0xBingBong I first dissented in...
1,1487185295757905920,19721574,2022-01-28 22:06:19+00:00,Twitter for iPhone,22,7,105,105,A disaster.\n\nThe sooner aid agencies worldwi...


In [4]:
# Summarise column names, non-null counts and dtypes of the raw tweet frame.
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4766 entries, 0 to 4765
Data columns (total 9 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   item.id                               4766 non-null   int64 
 1   item.author_id                        4766 non-null   int64 
 2   item.created_at                       4766 non-null   object
 3   item.source                           4766 non-null   object
 4   item.public_metrics["retweet_count"]  4766 non-null   int64 
 5   item.public_metrics["reply_count"]    4766 non-null   int64 
 6   item.public_metrics["like_count"]     4766 non-null   int64 
 7   item.public_metrics["like_count"].1   4766 non-null   int64 
 8   item.text                             4766 non-null   object
dtypes: int64(6), object(3)
memory usage: 335.2+ KB


In [5]:
# Remove the duplicated like_count column (the one carrying the ".1" suffix).
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError once the column is already gone.
df_tweets = df_tweets.drop(columns=['item.public_metrics["like_count"].1'], errors='ignore')

In [6]:
# Replace the API-style headers with short snake_case names.
# The assignment is positional, so the order must match the frame's columns.
df_tweets.columns = [
    'tweet_id', 'author_id', 'created_at', 'source',
    'retweet_count', 'reply_count', 'like_count', 'text',
]
df_tweets.head()

Unnamed: 0,tweet_id,author_id,created_at,source,retweet_count,reply_count,like_count,text
0,1487185499915706371,613649581,2022-01-28 22:07:08+00:00,Twitter Web App,0,2,2,@FamiLee_Farm @0xBingBong I first dissented in...
1,1487185295757905920,19721574,2022-01-28 22:06:19+00:00,Twitter for iPhone,22,7,105,A disaster.\n\nThe sooner aid agencies worldwi...
2,1487184488719269890,21230289,2022-01-28 22:03:07+00:00,Twitter Web App,17,7,40,With the very real threat of the government tr...
3,1487184473670180870,970207298,2022-01-28 22:03:03+00:00,Twitter Web App,192,233,886,Bitcoin mining's energy use has more than trip...
4,1487184321022767115,3367334171,2022-01-28 22:02:27+00:00,Zapier.com,21,11,49,Tori Zero NFT Project Launched a Joint Coopera...


#### Authors data

In [10]:
# Load the raw author export and preview its first two rows.
df_author = pd.read_csv(filepath_or_buffer='api_csv/author_results.csv')
df_author.head(n=2)

Unnamed: 0,item.name,item.id,item.username,"item.public_metrics[""followers_count""]","item.public_metrics[""following_count""]","item.public_metrics[""tweet_count""]","item.public_metrics[""listed_count""]"
0,Hester Peirce,613649581,HesterPeirce,79198,917,1562,851
1,Alex Gladstein 🌋 ⚡,19721574,gladstein,159098,2704,76435,2001


In [11]:
# Summarise column names, non-null counts and dtypes of the raw author frame.
df_author.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3736 entries, 0 to 3735
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   item.name                               3736 non-null   object
 1   item.id                                 3736 non-null   int64 
 2   item.username                           3736 non-null   object
 3   item.public_metrics["followers_count"]  3736 non-null   int64 
 4   item.public_metrics["following_count"]  3736 non-null   int64 
 5   item.public_metrics["tweet_count"]      3736 non-null   int64 
 6   item.public_metrics["listed_count"]     3736 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 204.4+ KB


In [12]:
# Update column names (assigned positionally, in the frame's column order).
# 'username' must not carry a trailing space: with 'username ' the column could
# not be reached as df_author['username'] and the stray space ended up in the
# header of the exported CSV.
df_author.columns = ['name',
                     'author_id',
                     'username',
                     'followers_count',
                     'following_count',
                     'tweet_count',
                     'listed_count']
df_author.head()

Unnamed: 0,name,author_id,username,followers_count,following_count,tweet_count,listed_count
0,Hester Peirce,613649581,HesterPeirce,79198,917,1562,851
1,Alex Gladstein 🌋 ⚡,19721574,gladstein,159098,2704,76435,2001
2,Rodney Glassman,21230289,rodneyglassman,1983,1424,546,107
3,Elizabeth Warren,970207298,SenWarren,7030559,515,9191,19890
4,Bitcoin News,3367334171,BTCTN,2312348,921,24849,10327


In [13]:
# Persist the renamed author table: comma-separated, UTF-8, without the index.
df_author.to_csv(
    'data/tweet_author.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)

#### Tweet counts

In [14]:
# Read the tweet-count export under explicit column names and preview it.
# No `header=` is passed, so the file's own header line arrives as data row 0.
df_counts = pd.read_csv(
    'api_csv/tweets_counts.csv',
    names=['start', 'end', 'count'],
)
df_counts.head(n=2)

Unnamed: 0,start,end,count
0,item.start,item.end,item.tweet_count
1,2022-01-22T00:09:51.000Z,2022-01-22T01:00:00.000Z,16166


In [15]:
# Remove the file's own header line, which was read as a data row because
# explicit `names` were supplied to read_csv. Filtering by value rather than by
# row label 0 keeps the cell safe to re-run (no KeyError on a second execution)
# and can never discard a real observation.
df_counts = df_counts[df_counts['start'] != 'item.start']
df_counts.head()

Unnamed: 0,start,end,count
1,2022-01-22T00:09:51.000Z,2022-01-22T01:00:00.000Z,16166
2,2022-01-22T01:00:00.000Z,2022-01-22T02:00:00.000Z,18475
3,2022-01-22T02:00:00.000Z,2022-01-22T03:00:00.000Z,16978
4,2022-01-22T03:00:00.000Z,2022-01-22T04:00:00.000Z,16762
5,2022-01-22T04:00:00.000Z,2022-01-22T05:00:00.000Z,16834


In [16]:
# Persist the cleaned counts table: comma-separated, UTF-8, without the index.
df_counts.to_csv(
    'data/tweet_counts.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)

Rationale for scoring the raw tweet text without preprocessing (excerpt from the article linked below; the box plots and Yelp data it refers to are in that article, not in this notebook):

> The box plots on the left compared the TextBlob and VADER polarity score results with and without removing StopWords and Lemmatization for the 12,642 Yelp Review data points. The r’s in the plots represent the correlation coefficient between the actual Yelp review ratings and the polarity scores. **Based upon the r values alone, we can conclude that VADER sentiment analysis with zero text preprocessing appears to perform the best**. The VADER sentiment polarity compound scores slightly outperformed the TextBlob pattern analyzer polarity scores. This could be because VADER tends to take punctuation such as “!”, capitalization, ex. “GREAT” vs. “great”, preceding trigrams, and StopWords such as “but” into consideration as well as the fact that TextBlob doesn’t punish negation. One interesting thing to note is that both libraries tend to perform quite poorly for negative scores (Yelp reviews < 3). Moreover, removing StopWords and Lemmatization actually made identifying negative reviews more difficult. Intuitively, it actually makes perfect sense. StopWords such as “not”, “very”, and “but” can be quite helpful when it comes to identifying negative emotions. Words with the same base roots such as “worse” and “bad” or “better” and “good” exhibit different emotional intensities but will be ignored after Lemmatization. For these reasons, it is not always a great idea to remove StopWords or perform Lemmatization for sentiment analysis. The limitations of the dataset as mentioned earlier could also be negatively impacting the polarity score prediction results.

https://medium.com/data-science-blogs/stopwords-and-lexicon-normalization-for-sentiment-analysis-f9f10f0d4108

In [17]:
# Instantiate the VADER sentiment analyzer once; the helper function and the
# per-tweet scoring cell both call analyser.polarity_scores on this instance.
analyser = SentimentIntensityAnalyzer()

In [18]:
def sentiment_analyzer_scores(sentence):
    """Print `sentence` followed by its VADER polarity scores.

    The sentence is left-aligned and padded with '-' to a minimum width of 40
    characters, then the result of `analyser.polarity_scores(sentence)` is
    appended after a space. Nothing is returned.
    """
    polarity = analyser.polarity_scores(sentence)
    print(f"{sentence:-<40} {polarity}")

In [19]:
# Sanity check: print the VADER scores for the tweet at row label 0.
sentiment_analyzer_scores(df_tweets['text'][0])

@FamiLee_Farm @0xBingBong I first dissented in 2018 from a spot bitcoin ETP denial.  It's 4 years later, and still no approval.  The SEC remains mired in "the unbounded, dangerous territory of merit regulation for which the Commission is ill-equipped." https://t.co/stmEtRqPQn {'neg': 0.124, 'neu': 0.814, 'pos': 0.062, 'compound': -0.4318}


In [20]:
# Sanity check: print the VADER scores for the tweet at row label 1.
sentiment_analyzer_scores(df_tweets['text'][1])

A disaster.

The sooner aid agencies worldwide study, learn about, and adopt Bitcoin, the better. https://t.co/wQX4qyXgbM {'neg': 0.189, 'neu': 0.599, 'pos': 0.212, 'compound': -0.128}


In [23]:
# Apply the VADER analyzer to each tweet's text and keep the full score dict.
# .copy() makes df_tweet_text an independent frame, so the column assignments
# that follow do not write into a slice of df_tweets (the chained-assignment
# situation pandas reports as SettingWithCopyWarning).
df_tweet_text = df_tweets[['created_at', 'text']].copy()
df_tweet_text['sentiment'] = df_tweet_text['text'].apply(analyser.polarity_scores)
df_tweet_text.head()

Unnamed: 0,created_at,text,sentiment
0,2022-01-28 22:07:08+00:00,@FamiLee_Farm @0xBingBong I first dissented in...,"{'neg': 0.124, 'neu': 0.814, 'pos': 0.062, 'co..."
1,2022-01-28 22:06:19+00:00,A disaster.\n\nThe sooner aid agencies worldwi...,"{'neg': 0.189, 'neu': 0.599, 'pos': 0.212, 'co..."
2,2022-01-28 22:03:07+00:00,With the very real threat of the government tr...,"{'neg': 0.12, 'neu': 0.88, 'pos': 0.0, 'compou..."
3,2022-01-28 22:03:03+00:00,Bitcoin mining's energy use has more than trip...,"{'neg': 0.082, 'neu': 0.794, 'pos': 0.125, 'co..."
4,2022-01-28 22:02:27+00:00,Tori Zero NFT Project Launched a Joint Coopera...,"{'neg': 0.0, 'neu': 0.829, 'pos': 0.171, 'comp..."


In [24]:
# Pull the scalar 'compound' value out of each score dict into its own column.
df_tweet_text['compound'] = df_tweet_text['sentiment'].apply(
    lambda scores: scores['compound']
)

In [25]:
# Preview the frame with the new 'compound' column.
df_tweet_text.head()

Unnamed: 0,created_at,text,sentiment,compound
0,2022-01-28 22:07:08+00:00,@FamiLee_Farm @0xBingBong I first dissented in...,"{'neg': 0.124, 'neu': 0.814, 'pos': 0.062, 'co...",-0.4318
1,2022-01-28 22:06:19+00:00,A disaster.\n\nThe sooner aid agencies worldwi...,"{'neg': 0.189, 'neu': 0.599, 'pos': 0.212, 'co...",-0.128
2,2022-01-28 22:03:07+00:00,With the very real threat of the government tr...,"{'neg': 0.12, 'neu': 0.88, 'pos': 0.0, 'compou...",-0.5688
3,2022-01-28 22:03:03+00:00,Bitcoin mining's energy use has more than trip...,"{'neg': 0.082, 'neu': 0.794, 'pos': 0.125, 'co...",-0.0258
4,2022-01-28 22:02:27+00:00,Tori Zero NFT Project Launched a Joint Coopera...,"{'neg': 0.0, 'neu': 0.829, 'pos': 0.171, 'comp...",0.3182


The VADER compound score is also useful for researchers who would like to set standardized thresholds for classifying sentences as either positive, neutral, or negative. Typical threshold values (used in the literature cited in the VADER documentation) are:

- positive sentiment: compound score >= 0.05
- neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
- negative sentiment: compound score <= -0.05

In [26]:
# Function returns the sentiment coefficients without the compound score
def no_compound(x):
    """Return a copy of the score mapping `x` without its 'compound' entry.

    Parameters
    ----------
    x : dict
        Score mapping, e.g. {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}.

    Returns
    -------
    dict
        A new dict holding every item of `x` except the 'compound' key.

    The input is not modified, and a mapping that has no 'compound' key is
    returned unchanged (as a copy), so applying the function twice is harmless.
    """
    return {key: value for key, value in x.items() if key != 'compound'}

In [27]:
# Strip the 'compound' entry from every score dict in the 'sentiment' column.
df_tweet_text['sentiment'] = df_tweet_text['sentiment'].apply(no_compound)

In [28]:
# Preview the frame after 'compound' was removed from the sentiment dicts.
df_tweet_text.head()

Unnamed: 0,created_at,text,sentiment,compound
0,2022-01-28 22:07:08+00:00,@FamiLee_Farm @0xBingBong I first dissented in...,"{'neg': 0.124, 'neu': 0.814, 'pos': 0.062}",-0.4318
1,2022-01-28 22:06:19+00:00,A disaster.\n\nThe sooner aid agencies worldwi...,"{'neg': 0.189, 'neu': 0.599, 'pos': 0.212}",-0.128
2,2022-01-28 22:03:07+00:00,With the very real threat of the government tr...,"{'neg': 0.12, 'neu': 0.88, 'pos': 0.0}",-0.5688
3,2022-01-28 22:03:03+00:00,Bitcoin mining's energy use has more than trip...,"{'neg': 0.082, 'neu': 0.794, 'pos': 0.125}",-0.0258
4,2022-01-28 22:02:27+00:00,Tori Zero NFT Project Launched a Joint Coopera...,"{'neg': 0.0, 'neu': 0.829, 'pos': 0.171}",0.3182


In [29]:
#Function determines sentiment based on compound coefficient value
def sentiment_label(x):
    """Map a compound score to 'positive', 'negative' or 'neutral'.

    Thresholds follow the convention quoted in this notebook:
      * positive: x >= 0.05
      * negative: x <= -0.05
      * neutral:  -0.05 < x < 0.05

    Parameters
    ----------
    x : float
        Compound sentiment score.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.
    """
    # Bounds are inclusive so that scores of exactly +/-0.05 are not
    # classified as neutral.
    if x >= 0.05:
        return 'positive'
    if x <= -0.05:
        return 'negative'
    return 'neutral'

In [30]:
# Turn each compound score into its categorical sentiment label.
df_tweet_text['label'] = df_tweet_text['compound'].apply(sentiment_label)

In [31]:
# Count how many tweets fall into each sentiment label.
df_tweet_text['label'].value_counts()

positive    2081
neutral     1378
negative    1307
Name: label, dtype: int64

In [32]:
# Keep only created_at, text and label for export. Rebinding the result instead
# of using inplace=True, together with errors='ignore', lets the cell be re-run
# without a KeyError once the helper columns are already gone.
df_tweet_text = df_tweet_text.drop(columns=['sentiment', 'compound'], errors='ignore')

In [33]:
#Check the final results
df_tweet_text.head()

Unnamed: 0,created_at,text,label
0,2022-01-28 22:07:08+00:00,@FamiLee_Farm @0xBingBong I first dissented in...,negative
1,2022-01-28 22:06:19+00:00,A disaster.\n\nThe sooner aid agencies worldwi...,negative
2,2022-01-28 22:03:07+00:00,With the very real threat of the government tr...,negative
3,2022-01-28 22:03:03+00:00,Bitcoin mining's energy use has more than trip...,neutral
4,2022-01-28 22:02:27+00:00,Tori Zero NFT Project Launched a Joint Coopera...,positive


In [34]:
# Write the labelled tweets to disk: comma-separated, UTF-8, without the index.
df_tweet_text.to_csv(
    'data/tweet_labeled.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)