# HAR Modelling implementation

### In-sample:

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

# Load and preprocess the data
full_df = pd.read_csv("DF_TS.csv")
df = full_df[['daily_vol', 'mean_polarity_vader']] # In this df we have daily_vol and mean_polarity_vader which is alredy lagged
df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility

# Specify the variable of interest (realized volatility) and the external variable
rv = df['daily_vol']  # Replace 'RealizedVolatility' with the column name of your realized volatility variable
pol = df['mean_polarity_vader']  # Replace 'ExternalVariable' with the column name of your external variable

# Specify the lagged realized volatilities at different frequencies
rv_daily = rv.shift(1)  # Lagged realized volatility at daily frequency
rv_weekly = rv.shift(5)  # Lagged realized volatility at weekly frequency (assuming 5 trading days per week)
rv_monthly = rv.shift(22)  # Lagged realized volatility at monthly frequency (assuming 22* trading days per month)


# Combine the lagged realized volatilities and the external variable into a DataFrame
df_for_reg = pd.DataFrame({'RV': rv, 'RV_Daily': rv_daily, 'RV_Weekly': rv_weekly, 'RV_Monthly': rv_monthly,
                   'Pol_Daily': pol}).dropna()

# Prepare the model inputs
X = df_for_reg[['RV_Daily', 'RV_Weekly', 'RV_Monthly', 'Pol_Daily']]
X = sm.add_constant(X)  # Include a constant term
y = df_for_reg['RV']

# Fit the HAR-RV model
model = sm.OLS(y, X)
olsres = model.fit()

# Export the results in LaTeX format
summary_tex = olsres.summary().as_latex()

# Save the LaTeX summary to a file
with open('HAR_vader_model_results.tex', 'w') as f:
    f.write(summary_tex)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility


In [2]:
# Combine the lagged realized volatilities and the external variable into a DataFrame
df_for_reg = pd.DataFrame({'RV': rv, 'RV_Daily': rv_daily, 'RV_Weekly': rv_weekly, 'RV_Monthly': rv_monthly,
                   'ABS_Pol_Daily': np.abs(pol)}).dropna()

# Prepare the model inputs
X = df_for_reg[['RV_Daily', 'RV_Weekly', 'RV_Monthly', 'ABS_Pol_Daily']]
X = sm.add_constant(X)  # Include a constant term
y = df_for_reg['RV']

# Fit the HAR-RV model
model = sm.OLS(y, X)
olsres = model.fit()

# Export the results in LaTeX format
summary_tex = olsres.summary().as_latex()

# Save the LaTeX summary to a file
with open('ABS_HAR_vader_model_results.tex', 'w') as f:
    f.write(summary_tex)

In [3]:
# Changing to text blob

df = full_df[['daily_vol', 'mean_polarity_textblob']] # In this df we have daily_vol and mean_polarity_vader which is alredy lagged
df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility

# Specify the variable of interest (realized volatility) and the external variable
rv = df['daily_vol']  # Replace 'RealizedVolatility' with the column name of your realized volatility variable
pol = df['mean_polarity_textblob']  # Replace 'ExternalVariable' with the column name of your external variable

# Specify the lagged realized volatilities at different frequencies
rv_daily = rv.shift(1)  # Lagged realized volatility at daily frequency
rv_weekly = rv.shift(5)  # Lagged realized volatility at weekly frequency (assuming 5 trading days per week)
rv_monthly = rv.shift(22)  # Lagged realized volatility at monthly frequency (assuming 22* trading days per month)


# Combine the lagged realized volatilities and the external variable into a DataFrame
df_for_reg = pd.DataFrame({'RV': rv, 'RV_Daily': rv_daily, 'RV_Weekly': rv_weekly, 'RV_Monthly': rv_monthly,
                   'Pol_Daily': pol}).dropna()

# Prepare the model inputs
X = df_for_reg[['RV_Daily', 'RV_Weekly', 'RV_Monthly', 'Pol_Daily']]
X = sm.add_constant(X)  # Include a constant term
y = df_for_reg['RV']

# Fit the HAR-RV model
model = sm.OLS(y, X)
olsres = model.fit()

# Export the results in LaTeX format
summary_tex = olsres.summary().as_latex()

# Save the LaTeX summary to a file
with open('HAR_textblob_model_results.tex', 'w') as f:
    f.write(summary_tex)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility


In [4]:
# Combine the lagged realized volatilities and the external variable into a DataFrame
df_for_reg = pd.DataFrame({'RV': rv, 'RV_Daily': rv_daily, 'RV_Weekly': rv_weekly, 'RV_Monthly': rv_monthly,
                   'ABS_Pol_Daily': np.abs(pol)}).dropna()

# Prepare the model inputs
X = df_for_reg[['RV_Daily', 'RV_Weekly', 'RV_Monthly', 'ABS_Pol_Daily']]
X = sm.add_constant(X)  # Include a constant term
y = df_for_reg['RV']

# Fit the HAR-RV model
model = sm.OLS(y, X)
olsres = model.fit()

# Export the results in LaTeX format
summary_tex = olsres.summary().as_latex()

# Save the LaTeX summary to a file
with open('ABS_HAR_textblob_model_results.tex', 'w') as f:
    f.write(summary_tex)

### HAR rolling out-of-sample predictions

In [5]:
# Load and preprocess the data
full_df = pd.read_csv("DF_TS.csv")
df = full_df[['daily_vol', 'mean_polarity_vader']] # In this df we have daily_vol and mean_polarity_vader which is alredy lagged
df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility

window_size = 136
horizon = 1

# HAR features
df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
df['RV_w'] = df['daily_vol'].rolling(window=5).mean().shift(horizon) # Weekly lag
df['RV_m'] = df['daily_vol'].rolling(window=22).mean().shift(horizon) # Monthly lag

df.dropna(inplace=True) # Drop rows with missing values

y_pred_list = []
y_true_list = []

for i in range(window_size, len(df) - horizon):

    X_train = df.iloc[i-window_size:i, 1:]
    y_train = df.iloc[i-window_size:i, 0]
    
    X_test = df.iloc[i:i+horizon, 1:]
    y_test = df.iloc[i:i+horizon, 0]

    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_true = y_test.values

    y_pred_list.extend(y_pred)
    y_true_list.extend(y_true)

out_of_sample_r2 = r2_score(y_true_list, y_pred_list)

print(f'Out of sample R^2: {out_of_sample_r2:.4f}')
print(model.coef_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RV_w'] = df['daily_vol'].rolling(window=5).mean().shift(horizon) # Weekly lag


Out of sample R^2: 0.0555
[-0.01340789  0.06026658  0.26150597  0.37510502]


In [6]:
# Load and preprocess the data
full_df = pd.read_csv("DF_TS.csv")
df = full_df[['daily_vol', 'mean_polarity_vader']] # In this df we have daily_vol and mean_polarity_vader which is alredy lagged
df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility

window_size = 136
horizon = 1

# HAR features
df['mean_polarity_vader'] = df['mean_polarity_vader'].abs() # absolute value
df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
df['RV_w'] = df['daily_vol'].rolling(window=5).mean().shift(horizon) # Weekly lag
df['RV_m'] = df['daily_vol'].rolling(window=22).mean().shift(horizon) # Monthly lag

df.dropna(inplace=True) # Drop rows with missing values

y_pred_list = []
y_true_list = []

for i in range(window_size, len(df) - horizon):

    X_train = df.iloc[i-window_size:i, 1:]
    y_train = df.iloc[i-window_size:i, 0]
    
    X_test = df.iloc[i:i+horizon, 1:]
    y_test = df.iloc[i:i+horizon, 0]

    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_true = y_test.values

    y_pred_list.extend(y_pred)
    y_true_list.extend(y_true)

out_of_sample_r2 = r2_score(y_true_list, y_pred_list)

print(f'Out of sample R^2: {out_of_sample_r2:.4f}')
print(model.coef_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mean_polarity_vader'] = df['mean_polarity_vader'].abs() # absolute value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
A val

Out of sample R^2: 0.0556
[-0.01049658  0.05802263  0.26586437  0.37119371]


In [7]:
# Load and preprocess the data
full_df = pd.read_csv("DF_TS.csv")
df = full_df[['daily_vol', 'mean_polarity_textblob']] # In this df we have daily_vol and mean_polarity_vader which is alredy lagged
df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility

window_size = 136
horizon = 1

# HAR features
df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
df['RV_w'] = df['daily_vol'].rolling(window=5).mean().shift(horizon) # Weekly lag
df['RV_m'] = df['daily_vol'].rolling(window=22).mean().shift(horizon) # Monthly lag

df.dropna(inplace=True) # Drop rows with missing values

y_pred_list = []
y_true_list = []

for i in range(window_size, len(df) - horizon):

    X_train = df.iloc[i-window_size:i, 1:]
    y_train = df.iloc[i-window_size:i, 0]
    
    X_test = df.iloc[i:i+horizon, 1:]
    y_test = df.iloc[i:i+horizon, 0]

    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_true = y_test.values

    y_pred_list.extend(y_pred)
    y_true_list.extend(y_true)

out_of_sample_r2 = r2_score(y_true_list, y_pred_list)

print(f'Out of sample R^2: {out_of_sample_r2:.4f}')
print(model.coef_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RV_w'] = df['daily_vol'].rolling(window=5).mean().shift(horizon) # Weekly lag


Out of sample R^2: 0.0506
[-0.02557559  0.0576311   0.26755211  0.3934308 ]


In [8]:
# Load and preprocess the data
full_df = pd.read_csv("DF_TS.csv")
df = full_df[['daily_vol', 'mean_polarity_textblob']] # In this df we have daily_vol and mean_polarity_vader which is alredy lagged
df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility

window_size = 136
horizon = 1

# HAR features
df['mean_polarity_textblob'] = df['mean_polarity_textblob'].abs() # absolute value
df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag
df['RV_w'] = df['daily_vol'].rolling(window=5).mean().shift(horizon) # Weekly lag
df['RV_m'] = df['daily_vol'].rolling(window=22).mean().shift(horizon) # Monthly lag

df.dropna(inplace=True) # Drop rows with missing values

y_pred_list = []
y_true_list = []

for i in range(window_size, len(df) - horizon):

    X_train = df.iloc[i-window_size:i, 1:]
    y_train = df.iloc[i-window_size:i, 0]
    
    X_test = df.iloc[i:i+horizon, 1:]
    y_test = df.iloc[i:i+horizon, 0]

    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_true = y_test.values

    y_pred_list.extend(y_pred)
    y_true_list.extend(y_true)

out_of_sample_r2 = r2_score(y_true_list, y_pred_list)

print(f'Out of sample R^2: {out_of_sample_r2:.4f}')
print(model.coef_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'daily_vol'] *= np.sqrt(252) #annualize daily volatility
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mean_polarity_textblob'] = df['mean_polarity_textblob'].abs() # absolute value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RV_d'] = df['daily_vol'].shift(horizon) # Daily lag

Out of sample R^2: 0.0485
[-0.03306675  0.05715429  0.27248819  0.4041364 ]
