# Import Packages

In [1]:
# Method #3 Regularisation Model
import random as rd
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from numpy import asarray
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from set_params import func_train_test_split, count_train_test, series_to_supervised, train_test_split, vif_check

def calculate_iqr(values):
    """Return the interquartile range of ``values``.

    The 25th and 75th percentiles are taken with ``np.percentile`` and the
    difference (75th minus 25th) is returned.
    """
    # Ask numpy for both quartiles in a single call.
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    return third_quartile - first_quartile

def detect_outliers_iqr(values, factor = 1.5):
    """Return the IQR-based outlier fences for ``values``.

    Despite its name, this function does not flag individual values; it
    returns the two bounds outside of which a value would count as an outlier.

    Parameters
    ----------
    values : array-like
        Numeric observations accepted by ``np.percentile``.
    factor : float, default 1.5
        Multiple of the interquartile range subtracted from Q1 / added to Q3.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` where
        ``lower_bound = Q1 - factor * IQR`` and
        ``upper_bound = Q3 + factor * IQR``.
    """
    # Compute both quartiles once; the IQR follows directly from them.
    Q1, Q3 = np.percentile(values, [25, 75])
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return lower_bound, upper_bound

def vis_line_plot_results(y_pred, y_test, model = 'HAR', name = 'BARCLAYS', r = 1, dataset = 'm1'):
    """Plot true vs. predicted values as two lines and save the figure as a PNG.

    Parameters
    ----------
    y_pred, y_test : array-like
        Predicted and observed series; ``y_test`` is drawn first, ``y_pred`` second.
    model, dataset, name : str
        Used in the plot title and in the output path.
    r : int
        Zero-based position of the asset; ``r + 1`` is zero-padded to three
        digits and used as the file-name prefix.

    The image is written to
    ``../outputs/{model}-{dataset}/{r+1:03}-{model}-{name}.png``. The target
    directory is created when it does not exist yet.
    """
    out_dir = Path(f'../outputs/{model}-{dataset}')
    # savefig does not create missing directories, so make sure it exists first.
    out_dir.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(10,4))
    try:
        plt.plot(y_test)
        plt.plot(y_pred)
        plt.title(f'{model}-{dataset}-{name}', fontsize=15)
        # Legend entries follow plotting order: y_test first, then y_pred.
        plt.legend(['True Volatility', 'Predicted Volatility'], fontsize=9)
        plt.xticks(rotation=45)
        plt.savefig(out_dir / f'{str(r+1).zfill(3)}-{model}-{name}.png')
    finally:
        # Always release the figure, even when saving fails, so figures do not
        # accumulate when this is called once per asset.
        plt.close(fig)

# display(HTML("<style>.container { width:80% !important; }</style>"))
# NOTE(review): this silences every warning category for the rest of the
# session, including any raised while fitting the models below — consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Data Processing

## Import Data and Split Train - Test

In [2]:
# Read the source CSV and rename its 'Date_x' column to 'date_key' in one chain.
base_FTSE_df = (
    pd.read_csv('../data/1.3-FTSE_Monthly_ESG_Volatility_Final_v2.csv')
    .rename(columns={'Date_x':'date_key'})
)

In [3]:
train_df, valid_df, test_df = func_train_test_split(validation = False, threshold = 24)

In [4]:
count_rows_df = count_train_test(train_df, test_df)

In [5]:
# Build the Asset -> Name lookup: cast PermID to int, keep the two columns
# needed for the merge, and expose PermID under the name 'Asset'.
coverage_df = (
    pd.read_csv('../data/coverage_dataframe.csv')
    .astype({'PermID': int})
    .loc[:, ['PermID', 'Name']]
    .rename(columns={'PermID':'Asset'})
)

In [6]:
# Attach the company name to every training row (left join on 'Asset'), then
# index the frame by 'month_key' while keeping 'month_key' as a column.
train_df = train_df.merge(coverage_df, how = 'left', on = 'Asset')
train_df = train_df.set_index('month_key', drop = False)

In [7]:
# fit an ElasticNet model and make a one-step prediction
def elasticnet_forecast(train, testX, l1_ratio = 0.25, alpha = 1.0):
    """Fit an ElasticNet on ``train`` and predict the target for one feature row.

    Parameters
    ----------
    train : sequence of rows / 2-D array-like
        Each row holds the input features followed by the target in the last
        position.
    testX : 1-D array-like
        Feature values of the single observation to predict.
    l1_ratio : float, default 0.25
        Forwarded to ``ElasticNet(l1_ratio=...)``.
    alpha : float, default 1.0
        Forwarded to ``ElasticNet(alpha=...)``. The default equals the value
        scikit-learn uses when ``alpha`` is not given, so existing calls are
        unaffected.
        NOTE(review): for small-magnitude targets this penalty may shrink all
        coefficients to zero — consider tuning it.

    Returns
    -------
    The first (and only) element of the model's prediction for ``testX``.
    """
    # transform list of rows into a 2-D array
    train = asarray(train)
    # split into input and output columns (target is the last column)
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(trainX, trainy)
    # predict expects a 2-D input, hence the single-row wrapper
    yhat = model.predict([testX])
    return yhat[0]

# walk-forward validation: refit on a growing history and forecast one step at a time
def walk_forward_validation(data, n_test, verbose = True):
    """Evaluate one-step-ahead forecasts over the test portion of ``data``.

    ``data`` is split with ``train_test_split(data, n_test)`` (imported from
    ``set_params``; presumably the last ``n_test`` rows form the test set —
    TODO confirm). For every test row the model is refit on all rows seen so
    far, the row's features (all but the last column) are forecast, and the
    full row is then appended to the history.

    ``verbose`` is accepted but not used.

    Returns
    -------
    tuple
        ``(error, actuals, predictions)`` where ``error`` is the mean squared
        error multiplied by ``10**3``, ``actuals`` is the last column of the
        test set and ``predictions`` is the list of forecasts.
    """
    train, test = train_test_split(data, n_test)

    # the history starts out as the training rows and grows by one row per step
    seen_rows = list(train)
    forecasts = []

    for step in range(len(test)):
        # features are every column except the last (the target)
        features = test[step, :-1]
        forecasts.append(elasticnet_forecast(seen_rows, features))
        # reveal the true observation before forecasting the next step
        seen_rows.append(test[step])

    actuals = test[:, -1]
    # MSE is rescaled by 10**3 before being reported
    scaled_mse = mean_squared_error(actuals, forecasts) * 10**3

    return scaled_mse, actuals, forecasts

In [8]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run the walk-forward evaluation asset by asset and collect one summary row per asset.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column and the feature columns selected by
        ``dataset``. The asset's display name is read from the LAST column of
        ``train_df``.
    sample : bool
        If True, only two hard-coded asset ids are evaluated; otherwise every
        unique value of ``train_df.Asset``.
    algo : str
        Label written to the ``Model`` column and forwarded to the plot helper.
        It does not select the estimator.
    dataset : str
        ``'m1'`` -> the three volatility series plus the target ``'V^YZ'``;
        ``'m2'`` -> the columns returned by ``vif_check()``;
        anything else -> ESG columns, the volatility series and the target.
    viz : bool
        If True, ``vis_line_plot_results`` is called for every evaluated asset.
    cap : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Asset``, ``Name``, ``Model``, ``Test Size`` and ``MSE^3``
        (the error value returned by ``walk_forward_validation``), indexed by
        the asset's position in the loop. Assets whose test window would be
        empty are skipped. An empty frame is returned when nothing was evaluated.
    '''
    test_perc = .3

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        cols = vif_check()

    else:
        # "version 2" column set. An earlier list that additionally contained
        # CSRStrategy, Emissions, HumanRights and ResourceUse was always
        # overwritten by this one, so only this list is kept.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce',
                'vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect plain records and build the frame once at the end instead of
    # concatenating inside the loop.
    records = []
    positions = []

    for r, asset in enumerate(assets):
        # Relies on the name being the last column of train_df.
        name = train_df[train_df['Asset'] == asset].iloc[0,-1]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()
        df_merge = pd.concat([df_train, df_test])
        df_merge = series_to_supervised(df_merge, n_in=3, target= ['V^YZ'])
        test_size = int(df_merge.shape[0] * test_perc)

        if test_size < 1:
            # Too few rows to hold out even one observation; nothing to evaluate.
            print(f'Skipping ({name}): not enough rows for a walk-forward test window.')
            continue

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        # The reported value is the scaled MSE returned by walk_forward_validation, not an MAE.
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE x 10^3: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse,
        })
        positions.append(r)

        if viz:
            vis_line_plot_results(yhat, y, model = algo, dataset=dataset, name=name, r = r)

    if not records:
        return pd.DataFrame()
    return pd.DataFrame(records, index=positions)

In [9]:
mresults_m1 = compile_train_test(train_df, test_df, sample=False, algo='EN', dataset='m1', viz=False, cap = True)

Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 0.001550912857055664 seconds | MAE: 0.019 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 0.0011260509490966797 seconds | MAE: 0.899 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 0.0016698837280273438 seconds | MAE: 0.011 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 0.0019681453704833984 seconds | MAE: 0.009 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd) for 8 times..
------------------------------ 0.0024073123931884766 seconds | MAE: 0.339 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC) fo

In [10]:
mresults_m3 = compile_train_test(train_df, test_df, sample=False, algo='EN', dataset='m3', viz=True, cap = True)

Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 0.001733541488647461 seconds | MAE: 0.017 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 0.0014863014221191406 seconds | MAE: 0.844 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 0.0019483566284179688 seconds | MAE: 0.014 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 0.002192974090576172 seconds | MAE: 0.009 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd) for 8 times..
------------------------------ 0.004134654998779297 seconds | MAE: 0.356 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC) for 

In [11]:
mresults_m2 = compile_train_test(train_df, test_df, sample=False, algo='EN', dataset='m2', viz=False, cap = True)

Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 7 times..
------------------------------ 0.0030968189239501953 seconds | MAE: 0.050 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 6 times..
------------------------------ 0.0023326873779296875 seconds | MAE: 0.436 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 8 times..
------------------------------ 0.002931833267211914 seconds | MAE: 0.019 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 10 times..
------------------------------ 0.004168510437011719 seconds | MAE: 0.113 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd) for 11 times..
------------------------------ 0.004671573638916016 seconds | MAE: 0.390 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC) fo

# Results Reporting

In [13]:
np.mean(mresults_m1['MSE^3'])

0.2151216313979742

In [10]:
np.mean(mresults_m2['MSE^3'])

0.21690611096488316

In [14]:
np.mean(mresults_m3['MSE^3'])

0.21696378629816265

In [12]:
np.mean(mresults_m1['MSE^3'])

0.2206788192810873

In [13]:
np.mean(mresults_m2['MSE^3'])

0.188549875778676

In [14]:
np.mean(mresults_m3['MSE^3'])

0.1885493550756858

---

# Results

In [13]:
mresults_m1.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
112,4295895499,HBOS Plc,EN,5,6.964650
70,5000683618,Currys PLC,EN,7,3.130503
59,4295896428,Autonomy Corp Ltd,EN,6,2.392032
35,4295894092,Travis Perkins PLC,EN,7,2.112107
15,5037364885,NMC Health PLC,EN,3,1.607201
...,...,...,...,...,...
63,4295894904,Cobham Ltd,EN,5,0.009929
23,4295874865,DCC PLC,EN,21,0.007946
69,4295893850,G4S Ltd,EN,27,0.006713
64,4295895691,Alliance Trust PLC,EN,6,0.003520


In [14]:
mresults_m3.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
112,4295895499,HBOS Plc,EN,5,7.051525
70,5000683618,Currys PLC,EN,7,3.514539
59,4295896428,Autonomy Corp Ltd,EN,6,2.111806
35,4295894092,Travis Perkins PLC,EN,7,2.069147
42,5036206981,Polymetal International PLC,EN,9,1.520628
...,...,...,...,...,...
63,4295894904,Cobham Ltd,EN,5,0.009929
23,4295874865,DCC PLC,EN,21,0.007946
69,4295893850,G4S Ltd,EN,27,0.004866
64,4295895691,Alliance Trust PLC,EN,6,0.003520


## Exporting the Results

In [None]:
np.mean(mresults_m1['MSE^3'])

0.1431645582698732

In [None]:
# The results in this notebook come from the ElasticNet runs (algo='EN'), so the
# export is labelled 'EN'; the previous 'RF' label would have written them under
# the random-forest file name.
MODEL = 'EN'
THRESHOLD = str(24)
CAP = 'm1'
# index=False: do not write the positional index as a column.
mresults_m1.to_excel(f'../results/1-{MODEL}-{THRESHOLD}MONTH-{CAP}.xlsx', index=False)

In [15]:
np.mean(mresults_m3['MSE^3'])

0.13529247091706614

In [14]:
mresults_m3.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
70,5000683618,Currys PLC,RF,12,3.093467
112,4295895499,HBOS Plc,RF,10,2.146841
59,4295896428,Autonomy Corp Ltd,RF,10,1.218058
35,4295894092,Travis Perkins PLC,RF,12,1.132370
42,5036206981,Polymetal International PLC,RF,14,1.030174
...,...,...,...,...,...
55,4295898751,Serco Group PLC,RF,16,0.012833
51,4295894819,Aggreko Ltd,RF,19,0.010403
68,4295894471,Amec Foster Wheeler Ltd,RF,22,0.009548
47,4295894784,IMI PLC,RF,14,0.006152


In [13]:
# The results in this notebook come from the ElasticNet runs (algo='EN'), so the
# export is labelled 'EN'; the previous 'RF' label would have written them under
# the random-forest file name.
MODEL = 'EN'
THRESHOLD = str(24)
CAP = 'm3'
# index=False: do not write the positional index as a column.
mresults_m3.to_excel(f'../results/1-{MODEL}-{THRESHOLD}MONTH-{CAP}.xlsx', index=False)

---

# Visualisation (sample-asset predictions)

In [43]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run the walk-forward evaluation asset by asset and return both the summary
    table and the individual predictions.

    NOTE: this definition replaces an earlier function of the same name in this
    notebook; unlike that one it returns a tuple.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with an ``Asset`` column and the feature columns selected by
        ``dataset``. The asset's display name is read from the LAST column of
        ``train_df``.
    sample : bool
        If True, only two hard-coded asset ids are evaluated; otherwise every
        unique value of ``train_df.Asset``.
    algo : str
        Label written to the ``Model`` column and forwarded to the plot helper.
        It does not select the estimator.
    dataset : str
        ``'m1'`` -> the three volatility series plus the target ``'V^YZ'``;
        ``'m2'`` -> the columns returned by ``vif_check()``;
        anything else -> ESG columns plus the target (no volatility series).
    viz : bool
        If True, ``vis_line_plot_results`` is called for every evaluated asset.
    cap : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    (mresults, predictions_df) : tuple of pandas.DataFrame
        ``mresults`` has columns ``Asset``, ``Name``, ``Model``, ``Test Size``
        and ``MSE^3`` (the error value returned by ``walk_forward_validation``),
        indexed by the asset's position in the loop.
        ``predictions_df`` has columns ``Asset``, ``Test Data`` and
        ``Predictions``, one row per forecast step.
        Assets whose test window would be empty are skipped; both frames are
        empty when nothing was evaluated.
    '''
    test_perc = .3

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        cols = vif_check()

    else:
        # "version 2" column set. An earlier, longer list (with CSRStrategy,
        # Emissions, HumanRights, ResourceUse and the volatility series) was
        # always overwritten by this one, so only this list is kept.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce', 'V^YZ']

    if sample:
        # assets = [4295894970, 8589934212]
        assets = [8589934212, 8589934254] # 3rd and 4th most volatile stocks in FTSE 2006 to 2022
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect results and build the frames once at the end instead of
    # concatenating inside the loop.
    records = []
    positions = []
    prediction_frames = []

    for r, asset in enumerate(assets):
        # Relies on the name being the last column of train_df.
        name = train_df[train_df['Asset'] == asset].iloc[0,-1]

        df_train = train_df[train_df.Asset == asset][cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()
        merged = pd.concat([df_train, df_test])
        df_merge = series_to_supervised(merged, n_in=3, target= ['V^YZ'])
        test_size = int(df_merge.shape[0] * test_perc)

        if test_size < 1:
            # Too few rows to hold out even one observation; nothing to evaluate.
            print(f'Skipping ({name}): not enough rows for a walk-forward test window.')
            continue

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        # The reported value is the scaled MSE returned by walk_forward_validation, not an MAE.
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE x 10^3: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse,
        })
        positions.append(r)

        # Label each forecast with the index of the rows that were actually
        # evaluated: the tail of the NaN-filtered train+test concatenation.
        # Using the unfiltered test_df index can misalign (rows dropped by
        # dropna) or fail (test window longer than the asset's test rows).
        # Assumes series_to_supervised only removes leading rows — TODO confirm.
        row_labels = merged.index[-len(yhat):]
        prediction_frames.append(pd.DataFrame({
            'Asset': asset,
            'Test Data': y,
            'Predictions': yhat,
        }, index = row_labels))

        if viz:
            vis_line_plot_results(yhat, y, model = algo, dataset=dataset, name=name, r = r)

    mresults = pd.DataFrame(records, index=positions) if records else pd.DataFrame()
    predictions_df = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()

    return mresults, predictions_df

In [44]:
viz_df = train_df[['Asset', 'V^YZ']]

In [45]:
viz_df[viz_df.Asset.isin([8589934212, 8589934254])].to_csv('../data/9-data_train.csv')

In [46]:
mresults_m3, pred = compile_train_test(train_df, test_df, sample=True, algo='EN', dataset='m3', viz=False, cap = True)

Execute Training and Walk Forward Testing for (Natwest Group PLC) for 60 times..
------------------------------ 0.06134963035583496 seconds | MAE: 0.182 ------------------------------
Execute Training and Walk Forward Testing for (Lloyds Banking Group PLC) for 60 times..
------------------------------ 0.03344988822937012 seconds | MAE: 0.110 ------------------------------


In [47]:
pred.to_csv('../data/9-EN-SAMPLE-PREDICTION-RESULTS.csv')