In [1]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
df = pd.read_csv('bitcoin_filtered.csv')
ir_df = pd.read_csv('10y_IR.csv')

df1 = df.set_index('date').join(ir_df[['Date', 'Adj Close']].set_index('Date'), 
                               how='left', rsuffix='_ir')
df1.ffill(inplace=True)
df1['rf_rate'] = df1['Adj Close']/100
df1.drop('Adj Close', axis=1, inplace=True)
df1['momentum'] = np.log(1+df1.ret).rolling(12).sum()

In [2]:
df1['vol_factor'] = df1['volatility'] - df1['volatility'].shift(-1)
df1['volume_factor'] = df1['btc_volume']-df1['btc_volume'].rolling(7).mean()

In [3]:
ioo_df = pd.read_csv('IOO.csv')
ioo_df['ret'] = ioo_df['Adj Close'].pct_change()
df1['ioo_ret'] = ioo_df.set_index('Date')['ret']

In [4]:
data = df1[['ret', 'volatility', 'momentum', 'vol_factor', 'volume_factor', 'ioo_ret', 'rf_rate']].copy()
data['ioo_ret'].ffill(inplace=True)
data['rf_rate'].ffill(inplace=True)
data['market_factor'] = data['ioo_ret'] - data['rf_rate']
data.dropna(inplace=True)

In [5]:
data[['ret', 'momentum', 'vol_factor', 'volume_factor','market_factor']].to_csv('ts_data.csv')

In [6]:
data

Unnamed: 0_level_0,ret,volatility,momentum,vol_factor,volume_factor,ioo_ret,rf_rate,market_factor
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-12,-0.029353,0.000910,0.009148,-0.000208,-1454.940328,0.007199,0.02102,-0.013821
2016-01-13,-0.005082,0.001117,-0.002724,0.000416,1206.122598,-0.018670,0.02066,-0.039330
2016-01-14,-0.007060,0.000702,-0.008782,-0.001558,-2360.948100,0.019025,0.02098,-0.001955
2016-01-15,-0.161983,0.002260,-0.180097,0.000192,26010.648049,-0.029901,0.02033,-0.050231
2016-01-16,0.074750,0.002068,-0.115244,0.000677,11661.408502,-0.029901,0.02033,-0.050231
...,...,...,...,...,...,...,...,...
2019-04-26,0.004411,0.000956,-0.001460,0.000408,6306.335564,0.002442,0.02505,-0.022608
2019-04-27,0.002661,0.000548,0.027777,0.000030,-4711.508900,0.002442,0.02505,-0.022608
2019-04-28,-0.001534,0.000517,-0.007999,-0.000116,-4695.631618,0.002442,0.02505,-0.022608
2019-04-29,-0.002300,0.000634,-0.016298,0.000039,-2723.048777,0.002639,0.02536,-0.022721


In [7]:
tw_df = pd.read_csv('twitter.csv')
tw = tw_df.groupby('date').agg({'pos': 'mean', 'neg': 'mean', 'neu': 'count'})
tw['sentiment'] = 2 * (tw['pos']/(tw['pos']+tw['neg']) - 0.5)
tw.sort_index()

Unnamed: 0_level_0,pos,neg,neu,sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,0.283000,0.000000,2,1.000000
2017-01-02,0.165000,0.164250,4,0.002278
2017-01-03,0.225600,0.000000,5,1.000000
2017-01-04,0.218000,0.059000,2,0.574007
2017-01-05,0.212500,0.044500,4,0.653696
...,...,...,...,...
2019-04-27,0.144538,0.026067,119,0.694414
2019-04-28,0.140233,0.041628,86,0.542199
2019-04-29,0.148932,0.040000,147,0.576567
2019-04-30,0.158429,0.033173,133,0.653730


In [8]:
tw['neu'].corr(df1['usd_volume'])

0.1415755006010707

In [9]:
data['tw_sentiment'] = tw['sentiment']

# Reddit Data

In [10]:
reddit = pd.read_csv('reddit.csv')
reddit['date'] = reddit['timestamp'].str[:10]
reddit['weighted_sentiment'] = reddit['score'] * reddit['sentiment']
reddit['abs_score'] = reddit['score'].abs()
re = reddit.groupby('date').agg({'weighted_sentiment': 'sum', 'abs_score': 'sum', 'sentiment': 'mean'})
re['reaction'] = re['weighted_sentiment'] / re['abs_score']
re['std_reaction'] = 2 * (re['reaction'] - 0.5)
re['std_sentiment'] = 2 * (re['sentiment'] - 0.5)

In [11]:
re

Unnamed: 0_level_0,weighted_sentiment,abs_score,sentiment,reaction,std_reaction,std_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-01,213.50,1086.0,0.299451,0.196593,-0.606814,-0.401099
2016-01-02,178.75,1066.0,0.271959,0.167683,-0.664634,-0.456081
2016-01-03,336.75,1747.0,0.289655,0.192759,-0.614482,-0.420690
2016-01-04,343.00,1616.0,0.310897,0.212252,-0.575495,-0.378205
2016-01-05,396.50,1548.0,0.290584,0.256137,-0.487726,-0.418831
...,...,...,...,...,...,...
2018-12-27,37.75,147.0,0.289474,0.256803,-0.486395,-0.421053
2018-12-28,22.50,111.0,0.291667,0.202703,-0.594595,-0.416667
2018-12-29,18.25,81.0,0.319444,0.225309,-0.549383,-0.361111
2018-12-30,243.25,595.0,0.277027,0.408824,-0.182353,-0.445946


In [12]:
re['std_sentiment'].corr(df1['ret'])

0.009689794255111763

In [14]:
re['std_reaction'].corr(df1['ret'])

-0.024842351444302493

In [15]:
re['std_sentiment'].corr(tw['sentiment'])

0.01581844568897427

# Linear Regression 

In [17]:
data2 = data.copy()
data2['reddit'] = re['std_reaction']
data2.dropna(inplace=True)

In [18]:
data2

Unnamed: 0_level_0,ret,volatility,momentum,vol_factor,volume_factor,ioo_ret,rf_rate,market_factor,tw_sentiment,reddit
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-01,0.032547,0.000625,0.222580,-0.000381,369.973147,-0.001562,0.02446,-0.026022,1.000000,-0.432039
2017-01-02,0.014823,0.001006,0.196501,-0.000070,-44.407256,-0.001562,0.02446,-0.026022,0.002278,-0.469816
2017-01-03,0.022419,0.001077,0.185503,-0.000909,144.294628,0.008212,0.02450,-0.016288,1.000000,-0.483740
2017-01-04,0.076968,0.001986,0.198707,-0.001582,11539.164556,0.004266,0.02452,-0.020254,0.574007,-0.474194
2017-01-05,-0.098823,0.003568,0.118334,0.000899,21783.611347,0.003862,0.02368,-0.019818,0.653696,-0.431111
...,...,...,...,...,...,...,...,...,...,...
2018-12-27,-0.057302,0.001114,0.121886,-0.000212,1094.126136,0.003336,0.02743,-0.024094,0.640775,-0.486395
2018-12-28,0.081655,0.001326,0.195910,0.000096,3053.702914,0.003088,0.02736,-0.024272,0.747919,-0.594595
2018-12-29,-0.041053,0.001230,0.063367,0.000001,-2908.250167,0.003088,0.02736,-0.024272,0.269185,-0.549383
2018-12-30,0.029610,0.001229,0.044958,0.000060,-4411.503184,0.003088,0.02736,-0.024272,0.565696,-0.182353


In [19]:
from sklearn.model_selection import train_test_split
X = data2[['momentum', 'vol_factor', 'market_factor', 'volume_factor', 'tw_sentiment' , 'reddit']]
X = sm.add_constant(X)
Y = data2['ret']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [20]:
model = sm.OLS(Y_train, X_train).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    ret   R-squared:                       0.124
Model:                            OLS   Adj. R-squared:                  0.109
Method:                 Least Squares   F-statistic:                     8.264
Date:                Fri, 15 Dec 2023   Prob (F-statistic):           2.20e-08
Time:                        22:38:54   Log-Likelihood:                 606.15
No. Observations:                 358   AIC:                            -1198.
Df Residuals:                     351   BIC:                            -1171.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.0033      0.011     -0.300

In [21]:
data1 = data.copy().dropna()
from sklearn.model_selection import train_test_split
X = data1[['momentum', 'volume_factor', 'tw_sentiment']]
X = sm.add_constant(X)
Y = data1['ret']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=21)
model = sm.OLS(Y_train, X_train).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    ret   R-squared:                       0.106
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     26.37
Date:                Fri, 15 Dec 2023   Prob (F-statistic):           4.00e-16
Time:                        22:38:58   Log-Likelihood:                 1153.1
No. Observations:                 670   AIC:                            -2298.
Df Residuals:                     666   BIC:                            -2280.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.0030      0.003     -1.094

In [23]:
Y_pred = model.predict(X_test)
from sklearn.metrics import r2_score
r2 = r2_score(Y_test, Y_pred)
print(f"R2 of the model is {r2}")

R2 of the model is 0.15845259579605397


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
