# Split files and compute sentiment scores

In [12]:
from time import sleep
import json
import pandas as pd
import io
from tqdm import tqdm

PRICE_FOLDER    = "data/price/"
TWITTER_FOLDER  = "data/twitter/"
tweets_raw_file = './data/twitter/bitcoin_tweets_raw.csv'
tweets_clean_file = './data/twitter/bitcoin_tweets_clean.csv'


In [20]:
df_clean = pd.read_csv(tweets_clean_file,low_memory=False)
print(df_clean.shape)
df_clean.head(5)

(635826, 9)


Unnamed: 0.1,Unnamed: 0,ID,Tweets,Likes,Retweets,Followers,CreatedAt,Hour,NextHour
0,0.0,1.43721e+18,nioctiBmaxi Gib me cheaper bitcoin Baki,0.0,0,206.0,13/9/21 0:40,13/9/21 0:00,13/9/21 1:00
1,1.0,1.43723e+18,BARBARABURGSTA1 john_faidutti So much pain las...,0.0,0,3546.0,13/9/21 1:40,13/9/21 1:00,13/9/21 2:00
2,1.0,1.43724e+18,📄 Transaction Report\n🖥 GMO Coin\n🕚 2021-09-13...,0.0,0,83.0,13/9/21 2:40,13/9/21 2:00,13/9/21 3:00
3,0.0,1.43726e+18,ZaringDavid 3% of Tether reserves is cash. Pe...,0.0,0,15.0,13/9/21 3:40,13/9/21 3:00,13/9/21 4:00
4,0.0,1.43728e+18,United States Money supply since 1960 - 2021 ....,0.0,0,53.0,13/9/21 4:40,13/9/21 4:00,13/9/21 5:00


In [33]:
# Drop na values
df_clean = df_clean.dropna()
# Manage data types
df_clean = df_clean.astype({"Likes": int, "Retweets": float, "Followers": int})
df_clean = df_clean.astype({"Retweets": int})

# Sentiment analysis

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media.

VADER takes into account 
- negations and contractions (not good, wasn’t good)
- Punctuation (good!!!), CAPS, emotes :), emojis 
- Intensificators (very, kind of), acronyms ‘lol’
- Scores between -1.0 (negative) and 1.0 (positive)

We will use this sentiment analysis of the tweets to calculate a score that will represent the importance of each tweet.

In [22]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [23]:
analyzer = SentimentIntensityAnalyzer()
compound = []
for i,s in enumerate(tqdm(df_clean['Tweets'])):
    vs = analyzer.polarity_scores(s)
    compound.append(vs["compound"])
df_clean["compound"] = compound
df_clean.head(5)

100%|██████████| 635821/635821 [02:40<00:00, 3971.15it/s]


Unnamed: 0.1,Unnamed: 0,ID,Tweets,Likes,Retweets,Followers,CreatedAt,Hour,NextHour,compound
0,0.0,1.43721e+18,nioctiBmaxi Gib me cheaper bitcoin Baki,0.0,0,206.0,13/9/21 0:40,13/9/21 0:00,13/9/21 1:00,0.0
1,1.0,1.43723e+18,BARBARABURGSTA1 john_faidutti So much pain las...,0.0,0,3546.0,13/9/21 1:40,13/9/21 1:00,13/9/21 2:00,-0.5407
2,1.0,1.43724e+18,📄 Transaction Report\n🖥 GMO Coin\n🕚 2021-09-13...,0.0,0,83.0,13/9/21 2:40,13/9/21 2:00,13/9/21 3:00,0.0
3,0.0,1.43726e+18,ZaringDavid 3% of Tether reserves is cash. Pe...,0.0,0,15.0,13/9/21 3:40,13/9/21 3:00,13/9/21 4:00,0.0953
4,0.0,1.43728e+18,United States Money supply since 1960 - 2021 ....,0.0,0,53.0,13/9/21 4:40,13/9/21 4:00,13/9/21 5:00,0.2467


## Calculate a score for each tweet

To calculate the score for each tweet, we use different variables to which we had a weight based on its importance.

The compound column represents the sentiment of the tweets and its value is between -1 and 1.

We also use the number of retweets, the number of likes, and the number of users that follow the tweet's author.

In [34]:
import math

scores = []

for i, s in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    scores.append(s["compound"] * ((int(s["Followers"]))) * ((int(s["Likes"])+1)) * ((int(s["Retweets"])+1)))

df_clean["score"] = scores
df_clean.head(20)

100%|██████████| 635821/635821 [01:46<00:00, 5970.94it/s]


Unnamed: 0.1,Unnamed: 0,ID,Tweets,Likes,Retweets,Followers,CreatedAt,Hour,NextHour,compound,score
0,0.0,1.43721e+18,nioctiBmaxi Gib me cheaper bitcoin Baki,0,0,206,13/9/21 0:40,13/9/21 0:00,13/9/21 1:00,0.0,0.0
1,1.0,1.43723e+18,BARBARABURGSTA1 john_faidutti So much pain las...,0,0,3546,13/9/21 1:40,13/9/21 1:00,13/9/21 2:00,-0.5407,-1917.3222
2,1.0,1.43724e+18,📄 Transaction Report\n🖥 GMO Coin\n🕚 2021-09-13...,0,0,83,13/9/21 2:40,13/9/21 2:00,13/9/21 3:00,0.0,0.0
3,0.0,1.43726e+18,ZaringDavid 3% of Tether reserves is cash. Pe...,0,0,15,13/9/21 3:40,13/9/21 3:00,13/9/21 4:00,0.0953,1.4295
4,0.0,1.43728e+18,United States Money supply since 1960 - 2021 ....,0,0,53,13/9/21 4:40,13/9/21 4:00,13/9/21 5:00,0.2467,13.0751
5,5.0,1.43727e+18,i'm more excited about $ADA ecosystem integrat...,0,0,157,13/9/21 4:40,13/9/21 4:00,13/9/21 5:00,0.4005,62.8785
6,1.0,1.43729e+18,"janeygak Follow diopfode , he will launch a co...",0,0,452,13/9/21 5:40,13/9/21 5:00,13/9/21 6:00,0.0772,34.8944
7,0.0,1.43731e+18,"Bitcoin Millionaires Purchase 50,000 $BTC in t...",0,0,6521,13/9/21 6:40,13/9/21 6:00,13/9/21 7:00,0.0,0.0
8,32.0,1.4373e+18,markjeffrey I don’t want to sound rude but it’...,1,0,971,13/9/21 6:39,13/9/21 6:00,13/9/21 7:00,-0.2144,-416.3648
9,0.0,1.43732e+18,vuyo_gabriel Hey I joined a network called Sur...,0,0,387,13/9/21 7:40,13/9/21 7:00,13/9/21 8:00,0.8118,314.1666


## Group by

In [35]:
# Group by hourly sum of scores
sent = df_clean.groupby('Hour')['score'].sum()
comp = df_clean.groupby('Hour')['compound'].sum()
n = df_clean.groupby('Hour')['Tweets'].count()
likes = df_clean.groupby('Hour')['Likes'].sum()
retweets = df_clean.groupby('Hour')['Retweets'].sum()

sent = pd.Series.to_frame(sent)
comp = pd.Series.to_frame(comp)
n = pd.Series.to_frame(n)
likes = pd.Series.to_frame(likes)
retweets = pd.Series.to_frame(retweets)

conc = pd.concat([n, likes, retweets, comp, sent], axis=1)

In [36]:
# Set index as datetime
conc.index = pd.to_datetime(conc.index)
# Sort by time
conc = conc.sort_index()

## Export to csv

In [37]:
twitter_sentiment_file = './data/twitter/bitcoin_twitter_sentiment.csv'

conc.to_csv(twitter_sentiment_file, mode='a', encoding='utf-8',index=True,header=True)