In [5]:
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
from sklearn import preprocessing

### Data Preparation

Data for this project was created from the following datasets

- https://www.kaggle.com/datasets/prasoonkottarathil/btcinusd?select=BTC-Hourly.csv

To run the data preparation download the listed files and place them in *raw_data* directory.

<sub>The files needed to run the project are already provided inside *data* directory however.</sub>

In [6]:
data = pd.read_csv('./data/btc_tweets_2018-2022.csv')
data.head()

Unnamed: 0,timestamp,user,replies,likes,retweets,text
0,2022-05-20 22:08:42+00:00,lumberhawk,1,0,0,I need to get better at slow-pilling. I feel l...
1,2022-05-20 22:08:23+00:00,takethatcdc,0,1,0,Elon Musk dazzles world with plan to manufactu...
2,2022-05-20 22:08:21+00:00,dbonatoliv,0,1,0,"@BitcoinMagazine Its fine, we don't need more ..."
3,2022-05-20 22:08:20+00:00,theincomeblog,0,1,0,Bitmain Antminer APW7 PSU 1800W Power Supply f...
4,2022-05-20 22:07:55+00:00,doctoryev,0,1,0,"""Web3"" is uptrending the last 2 months. Other..."


In [7]:
analyzer = SentimentIntensityAnalyzer()
polarity = []
for i,s in enumerate(tqdm(data['text'], position=0, leave=True)):
    vs = analyzer.polarity_scores(str(s))
    polarity.append(vs["compound"])
data["polarity"] = polarity

influence = []
for i, s in tqdm(data.iterrows(), total=data.shape[0], position=0, leave=True):
    try:
        influence.append(s["replies"] + s["likes"] + s["retweets"] + 1)
    except:
        influence.append(np.nan)
data["influence"] = influence

data["influence"] = data["influence"] / data["influence"].max()

score = []
for i, s in tqdm(data.iterrows(), total=data.shape[0], position=0, leave=True):
    try:
        score.append(s["polarity"] * s["influence"])
    except:
        score.append(np.nan)
data["score"] = score

data = data.drop_duplicates()
data.head()

100%|██████████| 2209088/2209088 [05:00<00:00, 7361.89it/s] 
100%|██████████| 2209088/2209088 [01:32<00:00, 23859.95it/s]
100%|██████████| 2209088/2209088 [01:24<00:00, 26061.85it/s]


Unnamed: 0,timestamp,user,replies,likes,retweets,text,polarity,influence,score
0,2022-05-20 22:08:42+00:00,lumberhawk,1,0,0,I need to get better at slow-pilling. I feel l...,0.6705,6e-06,4e-06
1,2022-05-20 22:08:23+00:00,takethatcdc,0,1,0,Elon Musk dazzles world with plan to manufactu...,0.0,6e-06,0.0
2,2022-05-20 22:08:21+00:00,dbonatoliv,0,1,0,"@BitcoinMagazine Its fine, we don't need more ...",0.3607,6e-06,2e-06
3,2022-05-20 22:08:20+00:00,theincomeblog,0,1,0,Bitmain Antminer APW7 PSU 1800W Power Supply f...,0.0,6e-06,0.0
4,2022-05-20 22:07:55+00:00,doctoryev,0,1,0,"""Web3"" is uptrending the last 2 months. Other...",0.6705,6e-06,4e-06


In [17]:
data.set_index('timestamp', inplace=True)
data.to_csv('./data/tweets_sentiment_2018-2022.csv')

data.index = pd.to_datetime(data.index)
tweets_grouped = data.resample('1h').sum()

tweets_grouped.to_csv('./data/tweets_hourly_2018-2022.csv')

tweets_grouped.head()

Unnamed: 0_level_0,replies,likes,retweets,polarity,influence,score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01 00:00:00+00:00,9,67,52,1.8279,0.000387,0.000188
2018-01-01 01:00:00+00:00,6,288,54,0.1164,0.001007,-0.000354
2018-01-01 02:00:00+00:00,7,31,6,0.6754,0.000153,3.6e-05
2018-01-01 03:00:00+00:00,2,17,5,-0.1935,9.6e-05,-3e-06
2018-01-01 04:00:00+00:00,2,23,8,1.8697,0.000119,2.8e-05
