# Import Packages

In [1]:
# Method #2 HAR-X Model
import os
import random as rd
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.display import display, HTML
from sklearn.metrics import mean_squared_error
from statsmodels.regression.linear_model import OLS

from set_params import func_train_test_split, count_train_test, vif_check, vif_dynamic_check

def calculate_iqr(values):
    """
    Return the interquartile range (75th minus 25th percentile) of ``values``.
    """
    # Both quartiles come from a single percentile call.
    lower_quartile, upper_quartile = np.percentile(values, [25, 75])
    return upper_quartile - lower_quartile

def detect_outliers_iqr(values):
    """
    Compute the IQR fences used to flag outliers in ``values``.

    Despite its name, this function does not return a boolean mask: it returns
    the two bounds, and the caller decides which values fall outside them.

    Parameters
    ----------
    values : array-like
        Numeric observations.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` where ``lower_bound = Q1 - 1.5 * IQR``
        and ``upper_bound = Q3 + 1.5 * IQR``.
    """
    # Compute both quartiles once; the IQR follows directly from them.
    Q1, Q3 = np.percentile(values, [25, 75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

def vis_line_plot_results(y_pred, y_test, model = 'HAR', name = 'BARCLAYS', r = 1, dataset = 'm1'):
    """
    Save a line plot comparing true and predicted volatility for one asset.

    The figure is written to ``../outputs/{model}-{dataset}/`` under the name
    ``{r + 1, zero-padded to 3 digits}-{model}-{name}.png`` and is closed
    afterwards, so nothing is rendered inline.

    Parameters
    ----------
    y_pred : array-like
        Predicted values (second line, labelled 'Predicted Volatility').
    y_test : array-like
        Observed values (first line, labelled 'True Volatility').
    model, dataset, name : str
        Used for the plot title and for the output path.
    r : int
        Zero-based position of the asset; ``r + 1`` prefixes the file name.
    """
    out_dir = f'../outputs/{model}-{dataset}'
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10,4))
    try:
        ax.plot(y_test)
        ax.plot(y_pred)
        ax.set_title(f'{model}-{dataset}-{name}', fontsize=15)
        ax.legend(['True Volatility', 'Predicted Volatility'], fontsize=9)
        ax.tick_params(axis='x', labelrotation=45)
        fig.savefig(f'{out_dir}/{str(r+1).zfill(3)}-{model}-{name}.png')
    finally:
        # Release the figure even when saving fails; this function is called
        # once per asset and open figures would otherwise accumulate.
        plt.close(fig)

# display(HTML("<style>.container { width:80% !important; }</style>"))
# NOTE(review): this suppresses every warning category for the rest of the
# session, including ones raised by pandas/statsmodels during model fitting.
warnings.filterwarnings('ignore')
# Allow up to 100 columns when rendering wide DataFrames.
pd.set_option('display.max_columns', 100)

In [2]:
import time
from set_params import func_train_test_split, count_train_test, series_to_supervised, train_test_split
from numpy import asarray, log1p, expm1

def ori_har_perf(df, df1):
    """
    Print the mean of the ``MSE^3`` column for two result frames.

    ``df`` is reported under the label 'm1 datasets:' and ``df1`` under
    'm3 datasets:'.
    """
    labelled_results = (('m1 datasets:', df), ('m3 datasets:', df1))
    for label, results in labelled_results:
        print(label, np.mean(results['MSE^3']))

# Data Processing

## Import Data and Split Train - Test

In [3]:
base_FTSE_df = pd.read_csv('../data/1.3-FTSE_Monthly_ESG_Volatility_Final_v2.csv')
# base_FTSE_df = base_FTSE_df.rename(columns={'Date_x':'date_key'})
# base_FTSE_df = base_FTSE_df.rename(columns={'Date_Key(Price)':'date_key'})

In [4]:
train_df, valid_df, test_df = func_train_test_split(validation = False, threshold = 24)

In [5]:
count_rows_df = count_train_test(train_df, test_df)

In [6]:
# Asset-id -> company-name lookup; PermID is cast to int and renamed to
# 'Asset' so it can be joined on the 'Asset' column of the price frames.
coverage_df = (
    pd.read_csv('../data/coverage_dataframe.csv')
    .astype({'PermID': int})
    .loc[:, ['PermID', 'Name']]
    .rename(columns={'PermID':'Asset'})
)

In [7]:
# Attach the company name once. The guard keeps this cell idempotent: running
# the merge a second time would yield Name_x / Name_y instead of a single
# 'Name' column, which the modelling functions rely on.
if 'Name' not in train_df.columns:
    train_df = pd.merge(train_df, coverage_df, how = 'left', on = 'Asset')
train_df.index = train_df.month_key

In [8]:
train_df.head(3)

Unnamed: 0_level_0,Unnamed: 0,date_key,Asset,Open,High,Low,Close,Return,V^CC,V^RS,V^YZ,month_key,buzz,ESG,ESGCombined,ESGControversies,EnvironmentalPillar,GovernancePillar,SocialPillar,Community,EnvironmentalInnovation,Management,ProductResponsibility,Shareholders,Workforce,vol_series_daily,vol_series_weekly,vol_series_monthly,Name
month_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2020-09-01,0,2020-10-30,5042941681,4.933999,4.935,4.830999,4.847998,-0.018027,0.01526,0.014832,0.016693,2020-09-01,4108.0,75.0,75.0,75.0,64.0,71.0,91.0,100.0,69.0,91.0,97.0,3.0,81.0,,,,B&M European Value Retail SA
2020-10-01,1,2020-11-30,5042941681,4.742,4.824998,4.737,4.778999,0.006105,0.028225,0.025393,0.027841,2020-10-01,4090.5,71.0,71.0,71.0,55.0,69.0,89.0,100.0,66.0,88.0,98.0,4.0,74.0,0.016693,,,B&M European Value Retail SA
2020-11-01,2,2020-12-31,5042941681,5.137998,5.21,5.135999,5.162,-0.009213,0.018095,0.022583,0.023722,2020-11-01,4291.5,72.0,69.0,66.0,52.0,73.0,92.0,100.0,75.0,94.0,98.0,5.0,82.0,0.027841,,,B&M European Value Retail SA


- Buzz score (1)
- ESG Overall score (1)
- ESG Combined score (1)
- ESG Controversy score (1)
- Pillar scores (3)
- Category scores (10)

In total there are 17 scores provided by MP ESG Core

In [9]:
cols = [
    'buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar'
                ,'CSRStrategy','Community','Emissions','EnvironmentalInnovation','HumanRights','Management','ProductResponsibility'
                ,'ResourceUse','Shareholders','Workforce', 'vol_series_daily','vol_series_weekly','vol_series_monthly', 'V^YZ']

In [None]:
# Some of the listed scores (e.g. CSRStrategy, Emissions) are not present in
# the loaded data, so preview only the columns that actually exist.
train_df[[col for col in cols if col in train_df.columns]].head()

# Modeling

## Version 1.1 (Promoted)

### Create Functions

In [10]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Fit one OLS regression per asset on log1p-transformed data and score it
    on that asset's test rows.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column plus the feature/target columns of the
        chosen ``dataset``. ``train_df`` must also carry a ``Name`` column.
    sample : bool
        If True only the two hard-coded sample assets are modelled, otherwise
        every asset found in ``train_df``.
    algo : str
        Label stored in the ``Model`` column and used for the plot title and
        output folder.
    dataset : str
        ``'m1'`` uses the three lagged volatility series, ``'m2'`` uses the
        columns returned by ``vif_check()``, any other value (e.g. ``'m3'``)
        uses the buzz/ESG score columns.
    viz : bool
        If True a true-vs-predicted plot (back on the original scale) is
        saved for every asset.
    cap : bool
        If True negative predictions are clipped to 0.

    Returns
    -------
    pandas.DataFrame
        One row per asset with ``Asset``, ``Name``, ``Model``, ``Test Size``
        and ``MSE^3``.

    Notes
    -----
    ``MSE^3`` is the mean squared error multiplied by 10**3 and is computed
    on the log1p scale, i.e. before predictions are transformed back.
    '''

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        cols = vif_check()
        print(cols)

    else:
        # "version 2" feature set: buzz and ESG scores only.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce', 'V^YZ']

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect one record per asset and build the result frame once at the end
    # instead of re-concatenating a growing DataFrame on every iteration.
    records = []

    for r, asset in enumerate(assets):

        # Look the name up by label rather than by column position.
        name = train_df.loc[train_df['Asset'] == asset, 'Name'].iloc[0]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()
        test_size = df_test.shape[0]

        df_train = log1p(df_train)
        df_test = log1p(df_test)

        X_train = sm.add_constant(df_train.drop(['V^YZ'], axis=1))
        X_test = df_test.drop(['V^YZ'], axis=1)
        X_test.loc[:, 'const'] = 1
        # Align the test design matrix with whatever columns the training
        # matrix ended up with (same set, same order).
        X_test = X_test[X_train.columns]

        y_train = df_train['V^YZ']
        y_test = df_test['V^YZ']

        # Fit the model
        model_fit = OLS(y_train, X_train).fit()

        y_pred = model_fit.predict(X_test)
        if cap:
            y_pred = y_pred.clip(lower = 0)

        # Scaled by 10**3 for readability; still on the log1p scale here.
        mse_scaled = mean_squared_error(y_test, y_pred)*10**3
        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse_scaled
        })

        if viz:
            # Plot on the original scale and label the output with `algo`.
            vis_line_plot_results(expm1(y_pred), expm1(y_test), model = algo, dataset=dataset, name=name, r = r)

    return pd.DataFrame(records)

In [10]:
mresults_m1_v11 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m1', viz=True, cap = True)

In [11]:
mresults_m1_v11.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
104,4295895343,Alliance & Leicester Ltd,HAR,8,6.097403
87,4295898951,EDF Energy Nuclear Generation Group Ltd,HAR,8,3.055775
65,5000683618,Currys PLC,HAR,12,2.793378
31,4295894092,Travis Perkins PLC,HAR,12,2.176187
101,4295895499,HBOS Plc,HAR,10,2.074167
...,...,...,...,...,...
20,4295874865,DCC PLC,HAR,25,0.007940
64,4295893850,G4S Ltd,HAR,31,0.005949
59,4295895691,Alliance Trust PLC,HAR,10,0.005820
63,4295894471,Amec Foster Wheeler Ltd,HAR,22,0.005765


In [12]:
np.mean(mresults_m1_v11['MSE^3'])

0.21483258430981284

---

In [11]:
mresults_m3_v11 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m3', viz=True, cap = True)

In [12]:
mresults_m3_v11.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
5,4295894669,JD Sports Fashion PLC,HAR,12,13.002178
56,4295897144,John Wood Group PLC,HAR,8,12.361643
101,4295895499,HBOS Plc,HAR,10,6.555752
65,5000683618,Currys PLC,HAR,12,3.655763
13,5037364885,NMC Health PLC,HAR,8,3.271018
...,...,...,...,...,...
24,4295869210,TUI AG,HAR,17,0.037912
129,4295894756,Reckitt Benckiser Group PLC,HAR,60,0.029652
99,4295895781,GSK plc,HAR,60,0.028447
25,4298449570,Direct Line Insurance Group PLC,HAR,17,0.019992


In [13]:
np.mean(mresults_m3_v11['MSE^3'])

0.5691182939086771

In [14]:
mresults_m3_v11[mresults_m3_v11.Name == 'Meggitt Ltd']

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
39,4295895717,Meggitt Ltd,HAR,20,0.357407


---

In [12]:
mresults_m2_v11 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m2', viz=True, cap = True)

['buzz', 'Community', 'EnvironmentalInnovation', 'ProductResponsibility', 'Workforce', 'V^YZ']


In [20]:
mresults_m2_v11.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
56,4295897144,John Wood Group PLC,HAR,8,23.351436
7,4295894930,Spirax-Sarco Engineering PLC,HAR,14,4.834375
65,5000683618,Currys PLC,HAR,12,3.747595
101,4295895499,HBOS Plc,HAR,10,3.648292
13,5037364885,NMC Health PLC,HAR,8,2.231081
...,...,...,...,...,...
20,4295874865,DCC PLC,HAR,25,0.021263
10,4295899077,Rightmove PLC,HAR,16,0.017555
136,4295894497,Rentokil Initial PLC,HAR,25,0.012566
43,4295894784,IMI PLC,HAR,14,0.009335


In [21]:
np.mean(mresults_m2_v11['MSE^3'])

0.45890887450859313

## Version 1.0

### Create Functions

In [10]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Fit one OLS regression per asset on the untransformed data and score it
    on that asset's test rows (version 1.0).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column plus the feature/target columns of the
        chosen ``dataset``. ``train_df`` must also carry a ``Name`` column.
    sample : bool
        If True only the two hard-coded sample assets are modelled, otherwise
        every asset found in ``train_df``.
    algo : str
        Label stored in the ``Model`` column and used for the plot title and
        output folder.
    dataset : str
        ``'m1'`` (lagged volatility series only) or ``'m3'`` (buzz/ESG scores
        plus lagged volatility series).
    viz : bool
        If True a true-vs-predicted plot is saved for every asset.
    cap : bool
        If True negative predictions are clipped to 0.

    Returns
    -------
    pandas.DataFrame
        One row per asset with ``Asset``, ``Name``, ``Model``, ``Test Size``
        and ``MSE^3`` (mean squared error multiplied by 10**3).

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``'m1'`` nor ``'m3'``.
    '''

    if dataset == 'm1':
        cols = ['V^YZ', 'vol_series_daily', 'vol_series_weekly', 'vol_series_monthly']

    elif dataset == 'm3':
        cols = [ 'buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar'
                ,'CSRStrategy','Community','Emissions','EnvironmentalInnovation','HumanRights','Management','ProductResponsibility'
                ,'ResourceUse','Shareholders','Workforce', 'vol_series_daily','vol_series_weekly','vol_series_monthly', 'V^YZ']

    else:
        # Fail with a clear message instead of an undefined `cols` later on.
        raise ValueError(f"Unknown dataset {dataset!r}; expected 'm1' or 'm3'.")

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect one record per asset and build the result frame once at the end.
    records = []

    for r, asset in enumerate(assets):

        # Look the name up by label rather than by column position.
        name = train_df.loc[train_df['Asset'] == asset, 'Name'].iloc[0]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()
        test_size = df_test.shape[0]

        # Append the intercept column to both design matrices.
        X_train = df_train.drop(['V^YZ'], axis=1).assign(const=1)
        X_test = df_test.drop(['V^YZ'], axis=1).assign(const=1)

        y_train = df_train['V^YZ']
        y_test = df_test['V^YZ']

        # Fit the model
        model_fit = OLS(y_train, X_train).fit()

        y_pred = model_fit.predict(X_test)
        if cap:
            y_pred = y_pred.clip(lower = 0)

        # Scaled by 10**3 for readability.
        mse_scaled = mean_squared_error(y_test, y_pred)*10**3
        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse_scaled
        })

        if viz:
            # Label the output with `algo` rather than a hard-coded model name.
            vis_line_plot_results(y_pred, y_test, model = algo, dataset=dataset, name=name, r = r)

    return pd.DataFrame(records)

### M1

In [12]:
mresults_m1 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m1', viz=False, cap = True)

In [13]:
mresults_m1.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
80,4295893846,Drax Group PLC,HAR,7,77.302356
115,4295895343,Alliance & Leicester Ltd,HAR,8,12.684197
70,5000683618,Currys PLC,HAR,12,3.245059
95,4295898951,EDF Energy Nuclear Generation Group Ltd,HAR,8,2.791600
112,4295895499,HBOS Plc,HAR,10,2.488762
...,...,...,...,...,...
23,4295874865,DCC PLC,HAR,25,0.008105
69,4295893850,G4S Ltd,HAR,31,0.006133
64,4295895691,Alliance Trust PLC,HAR,10,0.005973
68,4295894471,Amec Foster Wheeler Ltd,HAR,22,0.005971


In [14]:
np.mean(mresults_m1['MSE^3'])

0.7637676752645515

In [12]:
mresults_m1.to_csv('../Results/2-HAR-M1-24MONTH-LOWER0.csv', index=None)

### M3

In [15]:
mresults_m3 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m3', viz=False, cap = True)

In [20]:
mresults_m3.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
43,4295895717,Meggitt Ltd,HAR,20,5526.282685
142,4295894757,Cadbury Ltd,HAR,14,5435.397040
91,5000047647,Friends Life FPG Ltd,HAR,12,35.745709
9,4295894930,Spirax-Sarco Engineering PLC,HAR,14,25.615611
10,5001428097,Entain PLC,HAR,11,21.182973
...,...,...,...,...,...
92,5000039682,United Utilities Group PLC,HAR,60,0.027488
94,5000001291,BAE Systems PLC,HAR,60,0.026897
106,4295896447,Abi Sab Group Holding Ltd,HAR,38,0.024991
144,4295894753,Rexam Ltd,HAR,31,0.024385


In [17]:
np.mean(mresults_m3['MSE^3'])

69.87159748352417

## Version 2

- adding the log1p data transformation
- transforming the data into a supervised-learning format

In [39]:
def ols_forecast(train, testX, l1_ratio = 0.25):
    """
    Fit an OLS model on ``train`` and return a one-step prediction for ``testX``.

    ``train`` is converted to an array; its last column is the target and all
    preceding columns are the regressors, used exactly as given. ``l1_ratio``
    is accepted but not used.
    """
    history = asarray(train)
    regressors = history[:, :-1]
    target = history[:, -1]

    fitted = OLS(target, regressors).fit()

    # predict expects a 2-D input, so wrap the single row and unwrap the result.
    return fitted.predict([testX])[0]

def walk_forward_validation(data, n_test, verbose = True):
    """
    Walk-forward evaluation: refit on all rows seen so far, predict the next
    test row, then add that row's actual values to the history.

    ``data`` is split with ``train_test_split(data, n_test)`` and both parts
    are log1p-transformed; the last column is the target. ``verbose`` is
    accepted but not used.

    Returns ``(error, actuals, predictions)`` where ``error`` is the mean
    squared error on the log1p scale multiplied by 10**3.
    """
    train, test = train_test_split(data, n_test)
    train = log1p(train)
    test = log1p(test)

    # The history starts as the training rows and grows by one row per step.
    history = list(train)
    predictions = []

    for step in range(len(test)):
        features = test[step, :-1]
        predictions.append(ols_forecast(history, features))
        # Reveal the true observation before forecasting the next step.
        history.append(test[step])

    actuals = test[:, -1]
    error = mean_squared_error(actuals, predictions) * 10**3

    return error, test[:, -1], predictions

In [40]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run walk-forward OLS validation per asset on the lagged ("supervised")
    version of the combined train and test rows (version 2).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column plus the feature/target columns of the
        chosen ``dataset``. ``train_df`` must also carry a ``Name`` column.
    sample : bool
        If True only the two hard-coded sample assets are modelled, otherwise
        every asset found in ``train_df``.
    algo : str
        Label stored in the ``Model`` column and used for the plot.
    dataset : str
        ``'m1'`` (lagged volatility series only) or ``'m3'`` (buzz/ESG scores
        plus lagged volatility series).
    viz : bool
        If True a true-vs-predicted plot (back on the original scale) is
        saved for every asset.
    cap : bool
        Accepted for signature compatibility; predictions are NOT clipped in
        this version.

    Returns
    -------
    pandas.DataFrame
        One row per asset with ``Asset``, ``Name``, ``Model``, ``Test Size``
        and ``MSE^3`` (the error returned by ``walk_forward_validation``).

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``'m1'`` nor ``'m3'``.
    '''
    # Share of the supervised rows held out for walk-forward testing.
    test_perc = .3

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm3':
        cols = [ 'buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar'
                ,'CSRStrategy','Community','Emissions','EnvironmentalInnovation','HumanRights','Management','ProductResponsibility'
                ,'ResourceUse','Shareholders','Workforce', 'vol_series_daily','vol_series_weekly','vol_series_monthly', 'V^YZ']

    else:
        # Fail with a clear message instead of an undefined `cols` later on.
        raise ValueError(f"Unknown dataset {dataset!r}; expected 'm1' or 'm3'.")

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect one record per asset and build the result frame once at the end.
    records = []

    for r, asset in enumerate(assets):
        # Look the name up by label rather than by column position.
        name = train_df.loc[train_df['Asset'] == asset, 'Name'].iloc[0]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()

        # The original split is discarded: rows are recombined and re-split
        # by `test_perc` after the lag transformation.
        df_merge = pd.concat([df_train, df_test])
        df_merge = series_to_supervised(df_merge, n_in=3, target= ['V^YZ'])

        test_size = int(df_merge.shape[0] * test_perc)

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse
        })

        if viz:
            # Undo the log1p transform so the plot is on the original scale.
            vis_line_plot_results(expm1(yhat), expm1(y), model = algo, dataset=dataset, name=name, r = r)

    return pd.DataFrame(records)

In [43]:
mresults_m1[mresults_m1['Name'].isin(['Bunzl plc', 'Natwest Group PLC'])]

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
62,4295894970,Bunzl plc,HAR,51,0.066712
86,8589934212,Natwest Group PLC,HAR,60,0.057812


In [54]:
# ori_har_perf takes the two result frames explicitly; these are the
# Version 1.0 baselines whose means are printed below.
ori_har_perf(mresults_m1, mresults_m3)

m1 datasets: 0.7637676752645515
m3 datasets: 69.87159748352417


In [48]:
np.mean(mresults_m1_dev['MSE^3'])

13.75543135123275

In [50]:
np.mean(mresults_m3_dev['MSE^3'])

16.149816685769252

In [49]:
mresults_m3_dev = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m3', viz=False, cap = True)

Execute Training and Walk Forward Testing for (Pershing Square Holdings Ltd) for 3 times..
------------------------------ 0.001497507095336914 seconds | MSE: 0.015 ------------------------------
Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 0.0013010501861572266 seconds | MSE: 0.024 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 0.0012772083282470703 seconds | MSE: 0.498 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 0.0022428035736083984 seconds | MSE: 0.036 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 0.0029931068420410156 seconds | MSE: 0.360 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group 

In [47]:
mresults_m1_dev = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m1', viz=False, cap = True)

Execute Training and Walk Forward Testing for (Pershing Square Holdings Ltd) for 3 times..
------------------------------ 0.0012123584747314453 seconds | MSE: 0.041 ------------------------------
Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 0.0012063980102539062 seconds | MSE: 1.030 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 0.0010819435119628906 seconds | MSE: 4.144 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 0.0016472339630126953 seconds | MSE: 0.108 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 0.002547025680541992 seconds | MSE: 0.071 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group 

## Version 2.1

- adding a constant (intercept) column
- adding the log1p data transformation
- transforming the data into a supervised-learning format

In [55]:
def ols_forecast(train, testX, l1_ratio = 0.25):
    """
    Fit an OLS model on ``train`` and return a one-step prediction for ``testX``.

    ``train`` is converted to an array; its last column is the target and all
    preceding columns are the regressors, used exactly as given. ``l1_ratio``
    is accepted but not used.
    """
    history = asarray(train)
    regressors = history[:, :-1]
    target = history[:, -1]

    fitted = OLS(target, regressors).fit()

    # predict expects a 2-D input, so wrap the single row and unwrap the result.
    return fitted.predict([testX])[0]

def walk_forward_validation(data, n_test, verbose = True):
    """
    Walk-forward evaluation: refit on all rows seen so far, predict the next
    test row, then add that row's actual values to the history.

    ``data`` is split with ``train_test_split(data, n_test)`` and both parts
    are log1p-transformed; the last column is the target. ``verbose`` is
    accepted but not used.

    Returns ``(error, actuals, predictions)`` where ``error`` is the mean
    squared error on the log1p scale multiplied by 10**3.
    """
    train, test = train_test_split(data, n_test)
    train = log1p(train)
    test = log1p(test)

    # The history starts as the training rows and grows by one row per step.
    history = list(train)
    predictions = []

    for step in range(len(test)):
        features = test[step, :-1]
        predictions.append(ols_forecast(history, features))
        # Reveal the true observation before forecasting the next step.
        history.append(test[step])

    actuals = test[:, -1]
    error = mean_squared_error(actuals, predictions) * 10**3

    return error, test[:, -1], predictions

In [56]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run walk-forward OLS validation per asset on the lagged ("supervised")
    version of the combined train and test rows, with a constant column
    added (version 2.1).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column plus the feature/target columns of the
        chosen ``dataset``. ``train_df`` must also carry a ``Name`` column.
    sample : bool
        If True only the two hard-coded sample assets are modelled, otherwise
        every asset found in ``train_df``.
    algo : str
        Label stored in the ``Model`` column and used for the plot.
    dataset : str
        ``'m1'`` (lagged volatility series only) or ``'m3'`` (buzz/ESG scores
        plus lagged volatility series).
    viz : bool
        If True a true-vs-predicted plot (back on the original scale) is
        saved for every asset.
    cap : bool
        Accepted for signature compatibility; predictions are NOT clipped in
        this version.

    Returns
    -------
    pandas.DataFrame
        One row per asset with ``Asset``, ``Name``, ``Model``, ``Test Size``
        and ``MSE^3`` (the error returned by ``walk_forward_validation``).

    Raises
    ------
    ValueError
        If ``dataset`` is neither ``'m1'`` nor ``'m3'``.
    '''
    # Share of the supervised rows held out for walk-forward testing.
    test_perc = .3

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm3':
        cols = [ 'buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar'
                ,'CSRStrategy','Community','Emissions','EnvironmentalInnovation','HumanRights','Management','ProductResponsibility'
                ,'ResourceUse','Shareholders','Workforce', 'vol_series_daily','vol_series_weekly','vol_series_monthly', 'V^YZ']

    else:
        # Fail with a clear message instead of an undefined `cols` later on.
        raise ValueError(f"Unknown dataset {dataset!r}; expected 'm1' or 'm3'.")

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect one record per asset and build the result frame once at the end.
    records = []

    for r, asset in enumerate(assets):
        # Look the name up by label rather than by column position.
        name = train_df.loc[train_df['Asset'] == asset, 'Name'].iloc[0]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()

        # The original split is discarded: rows are recombined and re-split
        # by `test_perc` after the lag transformation.
        df_merge = pd.concat([df_train, df_test])
        df_merge = series_to_supervised(df_merge, n_in=3, target= ['V^YZ'])
        # Intercept column for the OLS fit inside walk_forward_validation.
        df_merge = sm.add_constant(df_merge)

        test_size = int(df_merge.shape[0] * test_perc)

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse
        })

        if viz:
            # Undo the log1p transform so the plot is on the original scale.
            vis_line_plot_results(expm1(yhat), expm1(y), model = algo, dataset=dataset, name=name, r = r)

    return pd.DataFrame(records)

In [57]:
mresults_m1_dev21 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m1', viz=False, cap = True)

Execute Training and Walk Forward Testing for (Pershing Square Holdings Ltd) for 3 times..
------------------------------ 0.0016787052154541016 seconds | MSE: 0.041 ------------------------------
Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 0.001173257827758789 seconds | MSE: 2.953 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 0.0007693767547607422 seconds | MSE: 0.362 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 0.0011408329010009766 seconds | MSE: 0.120 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 0.0013270378112792969 seconds | MSE: 0.025 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group 

In [58]:
mresults_m3_dev21 = compile_train_test(train_df, test_df, sample=False, algo='HAR', dataset='m3', viz=False, cap = True)

Execute Training and Walk Forward Testing for (Pershing Square Holdings Ltd) for 3 times..
------------------------------ 0.0017094612121582031 seconds | MSE: 0.015 ------------------------------
Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 0.000993967056274414 seconds | MSE: 0.024 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 0.0009794235229492188 seconds | MSE: 0.498 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 0.001547098159790039 seconds | MSE: 0.036 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 0.0021903514862060547 seconds | MSE: 0.360 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group L

In [60]:
np.mean(mresults_m1_dev21['MSE^3'])

12.725661997261444

In [61]:
np.mean(mresults_m3_dev21['MSE^3'])

18.252256758710207

In [62]:
# ori_har_perf takes the two result frames explicitly; these are the
# Version 1.0 baselines whose means are printed below.
ori_har_perf(mresults_m1, mresults_m3)

m1 datasets: 0.7637676752645515
m3 datasets: 69.87159748352417


In [63]:
ori_har_perf(mresults_m1_dev, mresults_m3_dev)

m1 datasets: 13.75543135123275
m3 datasets: 16.149816685769252


In [64]:
ori_har_perf(mresults_m1_dev21, mresults_m3_dev21)

m1 datasets: 12.725661997261444
m3 datasets: 18.252256758710207


---

In [26]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Fit one OLS regression per asset on log1p-transformed data, score it on
    that asset's test rows and also return the individual predictions.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column plus the feature/target columns of the
        chosen ``dataset``. ``train_df`` must also carry a ``Name`` column.
    sample : bool
        If True only the two hard-coded sample assets are modelled, otherwise
        every asset found in ``train_df``.
    algo : str
        Label stored in the ``Model`` column and used for the plot title and
        output folder.
    dataset : str
        ``'m1'`` uses the three lagged volatility series, ``'m2'`` uses the
        columns returned by ``vif_check()``, any other value (e.g. ``'m3'``)
        uses the buzz/ESG score columns.
    viz : bool
        If True a true-vs-predicted plot (back on the original scale) is
        saved for every asset.
    cap : bool
        If True negative predictions are clipped to 0.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``mresults`` has one row per asset with ``Asset``, ``Name``,
        ``Model``, ``Test Size`` and ``MSE^3``. ``predictions_df`` has one
        row per test observation with ``Asset``, ``Test Data`` and
        ``Predictions``, both back on the original scale.

    Notes
    -----
    ``MSE^3`` is the mean squared error multiplied by 10**3 and is computed
    on the log1p scale, i.e. before predictions are transformed back.
    '''

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        cols = vif_check()
        print(cols)

    else:
        # "version 2" feature set: buzz and ESG scores only.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce', 'V^YZ']

    if sample:
        # assets = [4295894970, 8589934212]
        assets = [8589934212, 8589934254] # 3rd and 4th most volatile stocks in FTSE 2006 to 2022
    else:
        assets = train_df.Asset.unique().tolist()

    # Accumulate per-asset pieces and assemble both frames once at the end
    # instead of re-concatenating growing DataFrames on every iteration.
    records = []
    prediction_frames = []

    for r, asset in enumerate(assets):

        # Look the name up by label rather than by column position.
        name = train_df.loc[train_df['Asset'] == asset, 'Name'].iloc[0]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()
        test_size = df_test.shape[0]

        df_train = log1p(df_train)
        df_test = log1p(df_test)

        X_train = sm.add_constant(df_train.drop(['V^YZ'], axis=1))
        X_test = df_test.drop(['V^YZ'], axis=1)
        X_test.loc[:, 'const'] = 1
        # Align the test design matrix with whatever columns the training
        # matrix ended up with (same set, same order).
        X_test = X_test[X_train.columns]

        y_train = df_train['V^YZ']
        y_test = df_test['V^YZ']

        # Fit the model
        model_fit = OLS(y_train, X_train).fit()

        y_pred = model_fit.predict(X_test)
        if cap:
            y_pred = y_pred.clip(lower = 0)

        # Scaled by 10**3 for readability; still on the log1p scale here.
        mse_scaled = mean_squared_error(y_test, y_pred)*10**3
        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse_scaled
        })

        # Undo the log1p transform before storing and plotting.
        y_test = expm1(y_test)
        y_pred = expm1(y_pred)

        prediction_frames.append(pd.DataFrame({
            'Asset':asset,
            'Test Data': y_test,
            'Predictions': y_pred
        }, index = y_test.index))

        if viz:
            # Label the output with `algo` rather than a hard-coded model name.
            vis_line_plot_results(y_pred, y_test, model = algo, dataset=dataset, name=name, r = r)

    mresults = pd.DataFrame(records)
    # pd.concat rejects an empty list, so fall back to an empty frame.
    predictions_df = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()

    return mresults, predictions_df

In [27]:
mresults_m3_v11, pred = compile_train_test(train_df, test_df, sample=True, algo='HAR', dataset='m3', viz=False, cap = True)

In [29]:
mresults_m3_v11

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
0,8589934212,Natwest Group PLC,HAR,60,0.215461
1,8589934254,Lloyds Banking Group PLC,HAR,60,0.21288


In [28]:
pred.to_csv('../data/9-HAR-SAMPLE-PREDICTION-RESULTS.csv')