In [1]:
import pandas as pd
from bahc import BAHC
import numpy as np
from numpy import linalg as LA
from sklearn.preprocessing import StandardScaler
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Show floats in scientific notation with two decimals in pandas output
pd.set_option("display.float_format", "{:.2e}".format)

In [3]:
# Read the cleaned parquet file into memory and preview its first rows
data = pd.read_parquet(path="data/clean_full_bbo_data.parquet")
data.head()

Unnamed: 0,time,Stock,bid_vwa,ask_vwa,vwap_mid_price
0,2008-04-23 14:26,PG.N,67.1,67.1,67.1
1,2008-08-07 13:28,C.N,18.9,18.9,18.9
2,2008-08-25 15:49,RTN.N,59.8,59.8,59.8
3,2008-11-25 10:38,RTN.N,46.3,46.4,46.4
4,2008-01-15 12:58,TWX.N,16.0,16.0,16.0


In [4]:
# Parse the timestamps so that sorting and pivoting operate on datetimes
data['time'] = pd.to_datetime(data['time'])

data = data.sort_values(by=["time", "Stock"])

# Wide table: one row per timestamp, one column per stock, VWAP mid price as value
pivot_table = data.pivot(index="time", columns="Stock", values="vwap_mid_price")

pivot_table = pivot_table.sort_index(axis=1)

# Fill gaps along time: backward fill first, then forward fill whatever is
# still missing (i.e. trailing gaps). `fillna(method=...)` is deprecated in
# pandas, so the dedicated `bfill` / `ffill` methods are used instead.
# NOTE(review): filling backward first copies *future* prices into earlier
# rows (look-ahead); confirm this order is intended rather than ffill-then-bfill.
pivot_table = pivot_table.bfill(axis=0)
pivot_table = pivot_table.ffill(axis=0)

  pivot_table = pivot_table.fillna(method='bfill', axis=0)
  pivot_table = pivot_table.fillna(method='ffill', axis=0)


In [5]:
# Simple (arithmetic) returns between consecutive rows of the price table
returns = pivot_table.pct_change()

# Convert to log returns; rows without a defined return (e.g. the first one,
# which has no predecessor) are set to zero
log_rets = np.log(1 + returns).fillna(0)

log_rets

Stock,C.N,EMC.N,HPQ.N,MDT.N,MO.N,NKE.N,PG.N,RTN.N,SO.N,TWX.N,WFC.N
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-01-02 09:30:00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00,0.00e+00
2008-01-02 09:31:00,9.12e-04,-9.63e-04,1.76e-03,1.33e-04,-4.60e-04,0.00e+00,0.00e+00,0.00e+00,-3.61e-03,8.11e-04,-9.38e-04
2008-01-02 09:32:00,3.02e-03,9.37e-04,2.08e-04,2.86e-03,2.82e-03,-8.76e-04,-3.09e-03,1.10e-03,9.64e-04,1.27e-03,-6.11e-04
2008-01-02 09:33:00,-3.20e-03,3.83e-03,1.26e-04,2.19e-04,-2.21e-03,-4.35e-04,-2.16e-03,1.35e-03,1.74e-03,4.58e-04,-1.48e-03
2008-01-02 09:34:00,-3.45e-03,-1.28e-03,-3.53e-05,9.95e-04,-1.03e-03,-2.42e-03,-1.42e-03,-1.55e-03,1.00e-04,8.83e-05,7.96e-04
...,...,...,...,...,...,...,...,...,...,...,...
2008-12-31 15:56:00,1.59e-03,8.25e-04,1.58e-03,2.36e-03,7.13e-04,2.66e-03,7.91e-04,1.69e-03,1.35e-03,1.97e-03,2.25e-03
2008-12-31 15:57:00,1.97e-03,-5.82e-05,5.70e-04,5.58e-04,6.10e-04,-1.19e-03,-3.42e-05,2.05e-04,-2.03e-04,1.97e-03,1.47e-03
2008-12-31 15:58:00,7.97e-04,8.02e-04,-2.69e-04,-9.59e-05,6.90e-04,2.13e-04,4.26e-04,-5.08e-04,8.11e-04,1.39e-03,1.47e-03
2008-12-31 15:59:00,-5.95e-03,-1.90e-03,-4.29e-03,-1.15e-03,-2.45e-03,-2.12e-03,-4.52e-03,-1.36e-03,-1.67e-03,-2.68e-03,-1.82e-03


### In-Sample

In [6]:
# In-sample rolling-window configuration.
# All values below are row positions / row counts into `log_rets`.

T_in = 60                       # length of the rolling window in minutes
start_period_in_sample = 8151   # first window start (1st February 2008 per the author; TODO confirm)
end_period_in_sample = 32417    # exclusive stop (last day of April 2008 per the author; TODO confirm)
dT = 5                          # step between two consecutive window starts

# Start position of every in-sample window
t0s = np.arange(start_period_in_sample, end_period_in_sample, dT)
t0s

array([ 8151,  8156,  8161, ..., 32406, 32411, 32416], shape=(4854,))

In [7]:
def compute_weights_GVM(covariance_matrix):
    """
    Compute the Global Minimum Variance (GMV) portfolio weights.

    The weights are ``w = Sigma^-1 1 / (1' Sigma^-1 1)``, where ``Sigma`` is
    the covariance matrix and ``1`` a vector of ones. They sum to one and may
    contain negative entries (short positions).

    Args:
        covariance_matrix (array-like, shape (n, n)): Covariance matrix of
            the asset returns. Must be square and invertible.

    Returns:
        numpy.ndarray: 1-D array with the n portfolio weights.

    Raises:
        numpy.linalg.LinAlgError: If the matrix is singular or not square.
    """
    sigma = np.asarray(covariance_matrix)
    ones = np.ones(sigma.shape[0])
    # Solving Sigma x = 1 gives the row sums of the inverse without forming
    # the inverse explicitly, which is more stable for ill-conditioned input.
    unnormalized = LA.solve(sigma, ones)
    return unnormalized / unnormalized.sum()

In [8]:
# Per-window accumulators; one entry is appended for every start in `t0s`
riks_in_sample = []
weights = []
means_in_sample = []

for t0 in t0s:
    # The window spans rows [t0, t0 + T_in) of the log-return table
    t1 = t0 + T_in
    log_rets_cut = log_rets.iloc[t0:t1]

    # Average log return of each stock over the window
    mean_values = log_rets_cut.mean()

    # Remove the column means (no rescaling), then pass the transposed
    # window to BAHC to obtain the filtered covariance matrix
    demeaner = StandardScaler(with_mean=True, with_std=False)
    log_rets_cut_centered = demeaner.fit_transform(log_rets_cut)
    bahc_log_rets = BAHC(
        log_rets_cut_centered.T,
        K=1,
        Nboot=100,
        method='near',
        filter_type='covariance',
    )
    bahc_covariance_matrix_array = bahc_log_rets.filter_matrix()

    # Same matrix, labelled with the stock names on both axes
    stock_labels = log_rets_cut.columns
    bahc_covariance_matrix = pd.DataFrame(
        bahc_covariance_matrix_array,
        index=stock_labels,
        columns=stock_labels,
    )

    # Minimum-variance weights for this window's filtered covariance
    weights_GVM = np.array(compute_weights_GVM(bahc_covariance_matrix_array))

    # Portfolio variance w' * Sigma * w under the filtered covariance
    sigma_times_w = np.dot(bahc_covariance_matrix, weights_GVM)
    risk = np.dot(weights_GVM.T, sigma_times_w)

    # Store this window's results
    means_in_sample.append(mean_values)
    riks_in_sample.append(risk)
    weights.append(weights_GVM)
  

In [9]:
# Collect the per-window results into DataFrames.
# Every row is labelled with the timestamp found T_in rows after the window start.
in_sample_index = log_rets.index[t0s + T_in]

moving_avg_in_sample = pd.DataFrame(
    means_in_sample,
    index=in_sample_index,
    columns=pivot_table.columns,
)
risks_df_in_sample = pd.DataFrame(
    riks_in_sample,
    index=in_sample_index,
    columns=['Risk'],
)
weights_df = pd.DataFrame(
    weights,
    index=in_sample_index,
    columns=pivot_table.columns,
)


In [10]:
# Display the in-sample portfolio risk, one row per rolling window
risks_df_in_sample

Unnamed: 0_level_0,Risk
time,Unnamed: 1_level_1
2008-02-01 09:30:00,2.94e-07
2008-02-01 09:35:00,3.53e-07
2008-02-01 09:40:00,3.94e-07
2008-02-01 09:45:00,3.90e-07
2008-02-01 09:50:00,4.04e-07
...,...
2008-05-01 09:33:00,1.08e-07
2008-05-01 09:38:00,1.11e-07
2008-05-01 09:43:00,9.81e-08
2008-05-01 09:48:00,1.17e-07


In [11]:
# Display the in-sample portfolio weights, one row per window and one column per stock
weights_df

Stock,C.N,EMC.N,HPQ.N,MDT.N,MO.N,NKE.N,PG.N,RTN.N,SO.N,TWX.N,WFC.N
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-02-01 09:30:00,-2.03e-01,-2.02e-02,-1.63e-01,4.39e-01,2.83e-01,1.49e-01,1.69e-01,1.62e-01,1.18e-01,1.19e-01,-5.18e-02
2008-02-01 09:35:00,-1.84e-01,3.31e-02,-5.70e-02,2.98e-01,1.91e-01,8.74e-02,3.02e-01,2.17e-01,1.83e-01,-5.47e-02,-1.53e-02
2008-02-01 09:40:00,-1.54e-01,5.42e-02,-9.97e-03,1.72e-01,2.10e-01,9.03e-02,3.00e-01,2.13e-01,1.92e-01,-6.01e-02,-7.58e-03
2008-02-01 09:45:00,-1.34e-01,4.14e-02,-2.65e-02,1.60e-01,2.19e-01,8.72e-02,2.90e-01,2.31e-01,1.88e-01,-5.75e-02,2.47e-03
2008-02-01 09:50:00,-1.20e-01,4.13e-02,-1.35e-02,1.67e-01,2.13e-01,9.32e-02,2.57e-01,2.32e-01,1.94e-01,-5.52e-02,-8.80e-03
...,...,...,...,...,...,...,...,...,...,...,...
2008-05-01 09:33:00,5.09e-01,-5.54e-02,-3.54e-02,2.07e-01,1.40e-02,2.92e-02,8.53e-02,2.15e-01,-3.52e-02,8.56e-02,-1.89e-02
2008-05-01 09:38:00,4.46e-01,-4.14e-02,-2.41e-02,2.23e-01,1.37e-02,6.02e-02,6.94e-02,2.41e-01,-3.09e-02,4.67e-02,-3.37e-03
2008-05-01 09:43:00,2.83e-01,-4.30e-02,8.07e-05,2.48e-01,-4.73e-03,1.09e-01,6.58e-02,2.80e-01,-1.49e-02,2.95e-02,4.76e-02
2008-05-01 09:48:00,2.29e-01,-4.42e-02,-1.46e-02,2.34e-01,1.40e-02,1.16e-01,9.31e-02,2.97e-01,-4.83e-03,3.98e-02,3.90e-02


### Out-of-Sample

In [12]:
T_in = 60  # length of the rolling window in minutes

# The out-of-sample period starts where the in-sample one stops and covers
# the same number of rows
in_sample_span = end_period_in_sample - start_period_in_sample
start_period_out_sample = end_period_in_sample
end_period_out_sample = start_period_out_sample + in_sample_span

# Start position of every out-of-sample window (same step dT as in-sample)
t0s_out_sample = np.arange(start_period_out_sample, end_period_out_sample, dT)
t0s_out_sample

array([32417, 32422, 32427, ..., 56672, 56677, 56682], shape=(4854,))

In [13]:
# Per-window accumulators for the out-of-sample period
riks_out_sample = []
moving_average_out_sample = []

for position, window_start in enumerate(t0s_out_sample):
    # The window spans rows [window_start, window_start + T_in)
    window_end = window_start + T_in
    log_rets_cut = log_rets.iloc[window_start:window_end]

    # Mean log return and sample covariance over the window
    mean_values = log_rets_cut.mean()
    covariance_matrix = log_rets_cut.cov()

    # Out-of-sample risk: the weights stored at the same position in
    # `weights` are applied to this window's sample covariance
    window_weights = weights[position]
    sigma_times_w = np.dot(covariance_matrix, window_weights)
    risk = np.dot(window_weights.T, sigma_times_w)

    # Store this window's results
    moving_average_out_sample.append(mean_values)
    riks_out_sample.append(risk)
  

In [14]:
# Each out-of-sample window is labelled with the timestamp found T_in rows
# after its start. Keep only the label positions that exist in the index.
valid_indices = t0s_out_sample + T_in
valid_indices = valid_indices[valid_indices < len(log_rets.index)]
out_sample_index = log_rets.index[valid_indices]

# `t0s_out_sample` is increasing, so out-of-range labels can only belong to
# the last windows: truncating the result lists keeps rows and labels aligned.
n_valid = len(valid_indices)

# Out-of-sample moving averages
moving_avg_out_sample = pd.DataFrame(
    moving_average_out_sample[:n_valid],
    index=out_sample_index,
    columns=pivot_table.columns
)

# Out-of-sample risks, guarded the same way as the moving averages so that
# both frames share one index and neither raises on out-of-range positions
risks_df_out_sample = pd.DataFrame(
    riks_out_sample[:n_valid],
    index=out_sample_index,
    columns=['Risk']
)

In [15]:
# Preview the first rows of the out-of-sample risk
risks_df_out_sample.head()

Unnamed: 0_level_0,Risk
time,Unnamed: 1_level_1
2008-05-01 09:54:00,2.15e-07
2008-05-01 09:59:00,1.85e-07
2008-05-01 10:04:00,1.83e-07
2008-05-01 10:09:00,1.73e-07
2008-05-01 10:14:00,1.44e-07


In [16]:
# Preview the first rows of the in-sample risk, for comparison with the out-of-sample values
risks_df_in_sample.head()

Unnamed: 0_level_0,Risk
time,Unnamed: 1_level_1
2008-02-01 09:30:00,2.94e-07
2008-02-01 09:35:00,3.53e-07
2008-02-01 09:40:00,3.94e-07
2008-02-01 09:45:00,3.9e-07
2008-02-01 09:50:00,4.04e-07


### Out-of-sample moving averages

In [17]:
# Display the out-of-sample mean log returns, one row per window and one column per stock
moving_avg_out_sample

Stock,C.N,EMC.N,HPQ.N,MDT.N,MO.N,NKE.N,PG.N,RTN.N,SO.N,TWX.N,WFC.N
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2008-05-01 09:54:00,1.12e-04,5.92e-06,1.70e-04,6.56e-05,8.07e-05,-2.50e-05,-9.03e-05,9.17e-06,-2.31e-04,6.51e-05,1.35e-04
2008-05-01 09:59:00,1.45e-04,5.24e-05,2.96e-04,7.99e-05,1.02e-04,2.74e-05,-5.24e-05,2.19e-05,-1.71e-04,1.10e-04,1.58e-04
2008-05-01 10:04:00,1.40e-04,4.27e-05,3.04e-04,5.78e-05,1.20e-04,3.44e-05,-7.17e-05,2.00e-05,-1.38e-04,1.41e-04,2.04e-04
2008-05-01 10:09:00,1.26e-04,1.77e-05,2.80e-04,6.88e-05,1.16e-04,1.69e-05,-1.11e-04,1.66e-05,-1.53e-04,1.33e-04,2.06e-04
2008-05-01 10:14:00,1.26e-04,1.03e-05,2.38e-04,8.45e-05,1.08e-04,-2.88e-06,-9.11e-05,1.51e-05,-1.22e-04,1.21e-04,2.73e-04
...,...,...,...,...,...,...,...,...,...,...,...
2008-07-30 12:54:00,-3.11e-04,-2.00e-04,-1.61e-04,-3.29e-05,-1.99e-04,-8.50e-05,-1.40e-04,-2.90e-05,-4.10e-06,-1.62e-04,-1.05e-04
2008-07-30 12:59:00,-3.25e-04,-2.75e-04,-1.06e-04,-2.11e-05,-1.98e-04,1.10e-05,-9.88e-05,-1.89e-05,1.27e-05,-1.59e-04,-7.31e-05
2008-07-30 13:04:00,-2.39e-04,-1.27e-04,-1.08e-04,-2.13e-05,-2.27e-04,-7.77e-05,-1.15e-04,-3.94e-05,2.22e-05,-1.63e-04,-1.49e-05
2008-07-30 13:09:00,-2.72e-04,-1.60e-04,-1.34e-04,-3.47e-05,-2.10e-04,-1.42e-04,-1.00e-04,-5.03e-05,-2.50e-05,-1.95e-04,-3.36e-05


### Export into CSV

In [18]:
# Destination directory for the exported CSV files
folder_name = "data/bahc/full"

# Create the directory (with any missing parents) unless it is already there,
# and report which of the two happened
if os.path.exists(folder_name):
    print(f"Folder '{folder_name}' already exists.")
else:
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created.")


Folder 'data/bahc/small' already exists.


In [19]:
# File name -> DataFrame to write; files are written in this order
csv_exports = {
    'risks_in_sample.csv': risks_df_in_sample,
    'risk_out_sample.csv': risks_df_out_sample,
    'weights.csv': weights_df,
    'moving_avg_in_sample.csv': moving_avg_in_sample,
    'moving_avg_out_sample.csv': moving_avg_out_sample,
}

for csv_name, frame in csv_exports.items():
    frame.to_csv(f'{folder_name}/{csv_name}')