In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import shapiro
import warnings
import statsmodels.api as sm

This notebook covers the modelling exercise of this research. Its goal is to take the final modelling dataset and run it through the HAR model.

Inputs:
- __"finaldataset.csv"__ = Final modelling dataset

Outputs: 
- None; the result tables are exported to the thesis.

# Data Prep

In [2]:
# Load the final modelling dataset and index it by the 'Date' column.
# A Series (rather than a column label) is handed to set_index, and the
# positional slice afterwards discards the first two columns of the frame
# (presumably a saved row-number column and 'Date' itself -- TODO confirm
# against the CSV header).
df = pd.read_csv('finaldataset.csv')
df = df.set_index(df['Date']).iloc[:, 2:]
df

Unnamed: 0_level_0,BTCLogRets,ActAddresses,Hashrate,BTCFees,Inflation,Velocity,VIX,SP500Lret,EURUSDLret,GoogleBitcoin,...,PressRelease,Speech,Statement,Covid,NegGenAICovidECB,NegFinBERTCovidECB,SentimentGenAIPositive,SentimentGenAINegative,SentimentFinBERTPositive,SentimentFinBERTNegative
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-03,0.396975,-0.120920,-0.129225,0.307013,-0.992455,0.247973,-0.052300,0.685162,0.298854,0.000594,...,1,0,1,0,0,0,0,1,0,1
2018-01-04,0.077014,1.271817,0.089652,-0.497525,0.786636,-0.295083,0.004269,0.204528,-0.956008,-0.996124,...,1,0,1,0,0,0,1,0,0,1
2018-01-05,2.894889,-1.903824,-0.225532,4.948268,-1.774724,0.183500,-0.001470,0.524581,1.431492,0.854924,...,0,0,0,0,0,0,0,0,0,0
2018-01-06,0.131462,0.971415,0.212224,-1.552675,1.782556,0.198691,-0.757367,-0.007251,-1.077399,-0.711348,...,0,0,0,0,0,0,0,0,0,0
2018-01-07,-1.287011,-0.340983,-0.059185,-1.598510,-0.422937,-0.281547,-0.001470,-0.007251,-0.001410,-1.366334,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-17,-0.317178,-0.794833,0.911715,-0.219948,0.354112,0.257195,0.172338,-1.024068,0.836383,-0.132302,...,0,0,0,1,0,0,0,0,0,0
2021-09-18,0.483123,-1.280530,0.661250,-0.079588,0.258882,0.543734,-1.707568,-0.007251,-1.108472,-0.530989,...,0,0,0,1,0,0,0,0,0,0
2021-09-19,-0.520001,-1.089008,0.410786,-0.304335,0.163669,0.980595,-0.001470,-0.007251,-0.001410,0.563819,...,0,0,0,1,0,0,0,0,0,0
2021-09-20,-2.368014,3.105516,0.327298,0.181992,0.131921,-5.284473,2.106353,-1.278718,0.463241,2.025672,...,0,1,1,1,0,0,1,0,1,0


In [3]:
df.columns

Index(['BTCLogRets', 'ActAddresses', 'Hashrate', 'BTCFees', 'Inflation',
       'Velocity', 'VIX', 'SP500Lret', 'EURUSDLret', 'GoogleBitcoin',
       'GoogleCrypto', 'WikiBitcoin', 'RealizedVariance', 'RealizedVolatility',
       'DailyVolatility', 'WeeklyVolatility', 'MonthlyVolatility',
       'SentimentFinBERT', 'IsNegativeFinBERT', 'IsPositiveFinBERT',
       'SentimentGenAI', 'IsNegativeGenAI', 'IsPositiveGenAI', 'ECB', 'FED',
       'MonetaryDecision', 'PressRelease', 'Speech', 'Statement', 'Covid',
       'NegGenAICovidECB', 'NegFinBERTCovidECB', 'SentimentGenAIPositive',
       'SentimentGenAINegative', 'SentimentFinBERTPositive',
       'SentimentFinBERTNegative'],
      dtype='object')

# HAR

## Volatility-only model

In [14]:
# Baseline HAR specification: RealizedVolatility regressed on the daily,
# weekly and monthly volatility components plus an intercept.
y = df['RealizedVolatility']
Xbasic = sm.add_constant(
    df.loc[:, ['DailyVolatility', 'WeeklyVolatility', 'MonthlyVolatility']]
)
HARbaseline = sm.OLS(endog=y, exog=Xbasic).fit()
print(HARbaseline.summary())

                            OLS Regression Results                            
Dep. Variable:     RealizedVolatility   R-squared:                       0.560
Model:                            OLS   Adj. R-squared:                  0.559
Method:                 Least Squares   F-statistic:                     574.6
Date:                Sun, 20 Oct 2024   Prob (F-statistic):          8.17e-241
Time:                        00:46:58   Log-Likelihood:                -1367.7
No. Observations:                1358   AIC:                             2743.
Df Residuals:                    1354   BIC:                             2764.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                -0.0001      0.01

## Volatility + BTC

In [13]:
# HAR specification extended with the non-sentiment control variables;
# the volatility components stay last so the summary table keeps its row order.
y = df['RealizedVolatility']
Xnosent = sm.add_constant(
    df.loc[:, [
        'Inflation', 'VIX', 'SP500Lret',
        'GoogleBitcoin', 'GoogleCrypto',
        'DailyVolatility', 'WeeklyVolatility', 'MonthlyVolatility',
    ]]
)
HARWithBTC = sm.OLS(endog=y, exog=Xnosent).fit()
print(HARWithBTC.summary())

                            OLS Regression Results                            
Dep. Variable:     RealizedVolatility   R-squared:                       0.686
Model:                            OLS   Adj. R-squared:                  0.684
Method:                 Least Squares   F-statistic:                     368.3
Date:                Sun, 20 Oct 2024   Prob (F-statistic):               0.00
Time:                        00:46:51   Log-Likelihood:                -1138.9
No. Observations:                1358   AIC:                             2296.
Df Residuals:                    1349   BIC:                             2343.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.0003      0.01

In [11]:
# Final specification: the exogenous controls, the lagged daily/weekly/monthly
# volatility terms, and the two negative-sentiment x Covid x ECB interactions.
y = df['RealizedVolatility']
Xfinal = sm.add_constant(  # prepends the intercept column
    df.loc[:, [
        'Inflation', 'VIX', 'SP500Lret',
        'GoogleBitcoin', 'GoogleCrypto',
        'DailyVolatility', 'WeeklyVolatility', 'MonthlyVolatility',
        'NegFinBERTCovidECB', 'NegGenAICovidECB',
    ]]
)
HARFinalModel = sm.OLS(endog=y, exog=Xfinal).fit()
print(HARFinalModel.summary())

                            OLS Regression Results                            
Dep. Variable:     RealizedVolatility   R-squared:                       0.687
Model:                            OLS   Adj. R-squared:                  0.685
Method:                 Least Squares   F-statistic:                     296.0
Date:                Sun, 20 Oct 2024   Prob (F-statistic):               0.00
Time:                        00:46:30   Log-Likelihood:                -1136.0
No. Observations:                1358   AIC:                             2294.
Df Residuals:                    1347   BIC:                             2351.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -0.0038      0

In [12]:
# P-values of the fitted final HAR model, keyed by regressor name (including 'const').
pvalues = HARFinalModel.pvalues
def assign_stars(p):
    """Translate a p-value into a significance marker.

    Returns '***' when p < 0.01, '**' when p < 0.05, '*' when p < 0.1,
    and an empty string for anything else (including NaN, which fails
    every comparison).
    """
    # Cut-offs are checked from strictest to loosest; the first match wins.
    for cutoff, stars in ((0.01, '***'), (0.05, '**'), (0.1, '*')):
        if p < cutoff:
            return stars
    return ''

# Tabulate each regressor's p-value next to its significance marker.
# The stars Series is kept as a Series inside the dict, so the resulting frame
# is indexed by regressor name (the 'Variable' column repeats that index).
pvalues_df = pd.DataFrame(
    {
        'Variable': pvalues.index,
        'P-value': pvalues.values,
        'Significance Stars': pvalues.apply(assign_stars),
    }
)

# Show the table as the cell output
pvalues_df

Unnamed: 0,Variable,P-value,Significance Stars
const,const,0.8045917,
Inflation,Inflation,0.05147223,*
VIX,VIX,0.06156448,*
SP500Lret,SP500Lret,0.004475304,***
GoogleBitcoin,GoogleBitcoin,6.2103539999999996e-58,***
GoogleCrypto,GoogleCrypto,0.006384268,***
DailyVolatility,DailyVolatility,2.55291e-48,***
WeeklyVolatility,WeeklyVolatility,2.659976e-51,***
MonthlyVolatility,MonthlyVolatility,0.06273469,*
NegFinBERTCovidECB,NegFinBERTCovidECB,0.3880803,


## Alternative Variables

In [48]:
# Extra sentiment x Covid (x central bank) interaction terms that the main
# specification does not use. Each three-way term multiplies the two-way
# product stored on the line above it by the central-bank flag, which is the
# same left-to-right product as writing all three factors out.
df['NegGenAICovid'] = df['SentimentGenAINegative'].mul(df['Covid'])
df['NegGenAICovidFED'] = df['NegGenAICovid'].mul(df['FED'])

df['NegFinBERTCovid'] = df['SentimentFinBERTNegative'].mul(df['Covid'])
df['NegFinBERTCovidFED'] = df['NegFinBERTCovid'].mul(df['FED'])

df['PosGenAICovid'] = df['SentimentGenAIPositive'].mul(df['Covid'])
df['PosGenAICovidECB'] = df['PosGenAICovid'].mul(df['ECB'])
df['PosGenAICovidFED'] = df['PosGenAICovid'].mul(df['FED'])

df['PosFinBERTCovid'] = df['SentimentFinBERTPositive'].mul(df['Covid'])
df['PosFinBERTCovidECB'] = df['PosFinBERTCovid'].mul(df['ECB'])
df['PosFinBERTCovidFED'] = df['PosFinBERTCovid'].mul(df['FED'])

# Candidate regressors, each tried on its own on top of the control set
additional_vars = [
    'NegGenAICovidECB', 'NegGenAICovid', 'SentimentFinBERTNegative',
    'NegFinBERTCovidFED', 'SentimentGenAINegative', 'NegFinBERTCovid',
    'PosGenAICovidECB', 'NegGenAICovidFED', 'SentimentGenAIPositive',
    'SentimentFinBERTPositive', 'NegFinBERTCovidECB', 'PosGenAICovidFED',
    'PosFinBERTCovidECB', 'PosFinBERTCovid', 'PosFinBERTCovidFED', 'PosGenAICovid'
]

# One summary record per candidate is collected here
results = []

# The dependent variable is the same for every candidate model
y = df['RealizedVolatility']

for var in additional_vars:
    # Control set + volatility components + the single candidate, with an intercept
    Xalt = sm.add_constant(
        df[['Inflation', 'VIX', 'SP500Lret', 'GoogleBitcoin', 'GoogleCrypto',
            'DailyVolatility', 'WeeklyVolatility', 'MonthlyVolatility', var]]
    )

    HARAltModel = sm.OLS(y, Xalt).fit()

    # Skip candidates that did not end up among the fitted parameters
    if var not in HARAltModel.params:
        continue

    result = {
        'Variable': var,
        'Coefficient': HARAltModel.params[var],
        'Standard Error': HARAltModel.bse[var],
        'P-value': HARAltModel.pvalues[var],
        'Significance Stars': assign_stars(HARAltModel.pvalues[var]),
        'R²': HARAltModel.rsquared
    }
    results.append(result)

# Assemble the per-candidate records into a table and show it rounded to 3 decimals
results_alt_df = pd.DataFrame(results)
results_alt_df.round(3)

Unnamed: 0,Variable,Coefficient,Standard Error,P-value,Significance Stars,R²
0,NegGenAICovidECB,0.215,0.096,0.024,**,0.687
1,NegGenAICovid,0.16,0.076,0.035,**,0.687
2,SentimentFinBERTNegative,0.072,0.038,0.059,*,0.687
3,NegFinBERTCovidFED,0.042,0.064,0.508,,0.686
4,SentimentGenAINegative,0.038,0.043,0.373,,0.686
5,NegFinBERTCovid,0.05,0.059,0.403,,0.686
6,PosGenAICovidECB,0.036,0.056,0.518,,0.686
7,NegGenAICovidFED,0.049,0.087,0.575,,0.686
8,SentimentGenAIPositive,0.015,0.034,0.662,,0.686
9,SentimentFinBERTPositive,-0.011,0.034,0.754,,0.686
