# Import Packages

In [1]:
from numpy import asarray
from IPython.display import display, HTML
from set_params import func_train_test_split, count_train_test, series_to_supervised, train_test_split, vif_check
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as rd
import time
import warnings

def calculate_iqr(values):
    """Return the interquartile range (Q3 - Q1) of ``values``.

    Both quartiles are obtained from ``np.percentile`` (25th and 75th
    percentile) using its default interpolation.
    """
    lower_quartile, upper_quartile = np.percentile(values, [25, 75])
    return upper_quartile - lower_quartile

def detect_outliers_iqr(values):
    """Return the 1.5 x IQR outlier bounds of ``values``.

    Despite the function name, no per-value outlier flags are produced: the
    result is the pair ``(lower_bound, upper_bound)`` where

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

    Callers that need a boolean mask can build it from the bounds, e.g.
    ``(values < lower_bound) | (values > upper_bound)``.
    """
    # Compute both quartiles once; the IQR follows directly from them.
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return lower_bound, upper_bound

def vis_line_plot_results(y_pred, y_test, model = 'HAR', name = 'BARCLAYS', r = 1, dataset = 'm1'):
    """Plot the true and predicted series on one figure and save it as a PNG.

    The file is written to ``../outputs/{model}-{dataset}/`` and named
    ``{r+1, zero-padded to 3 digits}-{model}-{name}.png``. Nothing is shown
    inline: the figure is closed after saving.

    Parameters
    ----------
    y_pred, y_test : sequences accepted by ``plt.plot``
        Predicted and true values; ``y_test`` is drawn first so it matches the
        first legend entry ('True Volatility').
    model, dataset, name : str
        Used for the plot title and for the output folder / file name.
    r : int
        Zero-based running number of the asset; ``r + 1`` prefixes the file name.
    """
    import os  # function-scope import: `os` is not imported at the top of this notebook

    out_dir = f'../outputs/{model}-{dataset}'
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    # A '/' inside the name would otherwise be interpreted as a sub-folder.
    safe_name = str(name).replace('/', '-')

    fig = plt.figure(figsize=(10,4))
    try:
        plt.plot(y_test)
        plt.plot(y_pred)
        plt.title(f'{model}-{dataset}-{name}', fontsize=15)
        plt.legend(['True Volatility', 'Predicted Volatility'], fontsize=9)
        plt.xticks(rotation=45)
        plt.savefig(f'{out_dir}/{str(r+1).zfill(3)}-{model}-{safe_name}.png')
    finally:
        # Always release the figure, even when saving fails, so that a loop
        # over many assets does not accumulate open figures.
        plt.close(fig)

# Notebook-wide settings.
# Optional: widen the notebook container (left disabled).
# display(HTML("<style>.container { width:80% !important; }</style>"))

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Let pandas render up to 100 columns before truncating wide frames.
pd.options.display.max_columns = 100

# Data Processing

## Import Data and Split Train - Test

In [2]:
# Load the monthly FTSE ESG / volatility table and expose its `Date_x`
# column under the name `date_key`.
base_FTSE_df = (
    pd.read_csv('../data/1.3-FTSE_Monthly_ESG_Volatility_Final.csv')
    .rename(columns={'Date_x':'date_key'})
)

In [3]:
# Obtain the train / validation / test frames from the project helper.
# NOTE(review): the meaning of `validation` and `threshold` is defined in
# set_params.func_train_test_split, which is not visible here - confirm there.
# `valid_df` is not referenced again in the visible cells.
train_df, valid_df, test_df = func_train_test_split(
    validation=False,
    threshold=24,
)

In [4]:
# Summarise the train / test frames with the project helper.
# NOTE(review): presumably row counts per split - see set_params.count_train_test.
# `count_rows_df` is not used again in the visible cells.
count_rows_df = count_train_test(
    train_df,
    test_df,
)

In [5]:
# Build the asset-id -> company-name lookup: PermID is cast to int and
# renamed to `Asset` so it can be joined on the `Asset` column later.
coverage_df = (
    pd.read_csv('../data/coverage_dataframe.csv')
    .astype({'PermID': int})
    .loc[:, ['PermID', 'Name']]
    .rename(columns={'PermID':'Asset'})
)

In [6]:
# Attach the company name to every training row and index the frame by month_key.
# The merge is guarded so the cell can be re-run safely: merging a second time
# would make pandas suffix the columns as Name_x / Name_y, and the later
# per-asset name lookup would no longer find a plain `Name` column.
# NOTE(review): this assumes train_df has no `Name` column before the merge - confirm.
if 'Name' not in train_df.columns:
    train_df = pd.merge(train_df, coverage_df, how = 'left', on = 'Asset')
train_df.index = train_df.month_key

In [7]:
def random_forest_forecast(train, testX, n_estimators=1000, random_state=42, n_jobs=None):
    """Fit a random forest on ``train`` and make a one-step prediction for ``testX``.

    Parameters
    ----------
    train : sequence of rows (or 2-D array)
        Each row holds the input features followed by the target in the last
        column. It is converted with ``asarray`` before use.
    testX : 1-D sequence
        Feature values of the single observation to predict.
    n_estimators : int, default 1000
        Number of trees (previously hard-coded to 1000).
    random_state : int or None, default 42
        Seed for the forest. The model used to be unseeded, so repeated runs of
        the notebook produced different errors; a fixed default makes the
        results repeatable. Pass ``None`` to get the old unseeded behaviour.
    n_jobs : int or None, default None
        Forwarded to ``RandomForestRegressor``; ``None`` keeps the library default.

    Returns
    -------
    The predicted target value (first element of the model's prediction array).
    """
    # transform list into array
    train = asarray(train)
    # split into input and output columns (target is the last column)
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    model.fit(trainX, trainy)
    # predict expects a 2-D input, hence the single-row wrapper
    yhat = model.predict([testX])
    return yhat[0]

def walk_forward_validation(data, n_test, verbose = True):
    """Walk-forward validation: refit on all past rows before every prediction.

    ``data`` is split by ``train_test_split(data, n_test)``. For each test row
    the model is fitted on the rows seen so far, the last column of the row is
    predicted from the remaining columns, and the actual row is then added to
    the history for the next step.

    ``verbose`` is accepted for interface compatibility but is not used.

    Returns
    -------
    tuple ``(error, actuals, predictions)`` where ``error`` is the mean squared
    error between actuals and predictions multiplied by ``10**3``, ``actuals``
    is the last column of the test split and ``predictions`` is a list.
    """
    train, test = train_test_split(data, n_test)

    # the history starts as the training rows and grows by one row per step
    history = list(train)
    predictions = []

    for row in test:
        # features are everything but the last column of the current row
        predictions.append(random_forest_forecast(history, row[:-1]))
        # reveal the true observation before moving on to the next step
        history.append(row)

    actuals = test[:, -1]
    # scaled MSE (x 10**3)
    error = mean_squared_error(actuals, predictions) *10**3

    return error, actuals, predictions

In [8]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run walk-forward validation asset by asset and collect one result row per asset.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames with an `Asset` column plus the feature / target columns of the
        chosen dataset. The company name is read from train_df's `Name` column
        (or, if that column is absent, from its last column).
    sample : bool
        True  -> only the two hard-coded asset ids are processed.
        False -> every unique `Asset` in train_df is processed.
    algo : str
        Label only: stored in the `Model` column and used for plot titles /
        paths. It does not select the estimator (walk_forward_validation is
        always used).
    dataset : str
        'm1' -> volatility columns, 'm2' -> columns from vif_check(),
        anything else -> the ESG column list below. 'V^YZ' is the target.
    viz : bool
        When True, a PNG per asset is written through vis_line_plot_results.
    cap : bool
        Accepted but not used.

    Returns
    -------
    pd.DataFrame with columns Asset, Name, Model, Test Size, MSE^3 (one row per
    asset, indexed 0..n-1). Despite its key, `MSE^3` holds the mean squared
    error multiplied by 10**3 as returned by walk_forward_validation.
    '''
    test_perc = .3  # share of each asset's supervised rows used for walk-forward testing

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        cols = vif_check()

    else:
        # "version 2" column list. An earlier, wider list (which also contained
        # the vol_series_* columns) used to be assigned right before this one
        # and was always overwritten, so only this list ever took effect.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce', 'V^YZ']

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    result_columns = ['Asset', 'Name', 'Model', 'Test Size', 'MSE^3']
    records = []  # one dict per asset; turned into a frame once at the end

    for r, asset in enumerate(assets):
        asset_rows = train_df[train_df['Asset'] == asset]
        # Prefer the labelled column; fall back to the former positional lookup
        # so frames without a `Name` column behave as before.
        if 'Name' in asset_rows.columns:
            name = asset_rows['Name'].iloc[0]
        else:
            name = asset_rows.iloc[0, -1]

        df_train = asset_rows[cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()

        df_merge = pd.concat([df_train, df_test])
        # NOTE(review): presumably builds lagged inputs (n_in=3) for target V^YZ -
        # see set_params.series_to_supervised.
        df_merge = series_to_supervised(df_merge, n_in=3, target= ['V^YZ'])
        test_size = int(df_merge.shape[0] * test_perc)

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        # The reported value is the scaled MSE, not an MAE.
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE x10^3: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse,
        })

        if viz:
            vis_line_plot_results(yhat, y, model = algo, dataset=dataset, name=name, r = r)

    # Explicit columns keep the result sortable by 'MSE^3' even when no asset ran.
    return pd.DataFrame(records, columns=result_columns)

In [9]:
# Full run on the ESG feature set: dataset='m3' takes the final `else` branch
# of compile_train_test, sample=False iterates over every asset in train_df,
# and viz=True additionally saves one PNG per asset.
mresults_m3 = compile_train_test(
    train_df,
    test_df,
    sample=False,
    algo='RF',
    dataset='m3',
    viz=True,
    cap=True,
)

Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 7 times..
------------------------------ 4.883482456207275 seconds | MAE: 0.046 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 6 times..
------------------------------ 4.217527866363525 seconds | MAE: 0.338 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 8 times..
------------------------------ 5.941842317581177 seconds | MAE: 0.019 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 10 times..
------------------------------ 7.865678787231445 seconds | MAE: 0.044 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd) for 11 times..
------------------------------ 8.690311431884766 seconds | MAE: 0.223 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC) for 11 times..
----

In [9]:
# Full run on the 'm2' feature set (columns supplied by vif_check()) over
# every asset in train_df; no plots are written.
mresults_m2 = compile_train_test(
    train_df,
    test_df,
    sample=False,
    algo='RF',
    dataset='m2',
    viz=False,
    cap=True,
)

Execute Training and Walk Forward Testing for (Pershing Square Holdings Ltd) for 7 times..
------------------------------ 4.430306434631348 seconds | MAE: 0.023 ------------------------------
Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 8 times..
------------------------------ 5.079566717147827 seconds | MAE: 0.032 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 7 times..
------------------------------ 4.345094919204712 seconds | MAE: 0.307 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 9 times..
------------------------------ 5.671299934387207 seconds | MAE: 0.026 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 11 times..
------------------------------ 7.384570837020874 seconds | MAE: 0.042 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd) for 12 times.

# Results

In [10]:
# Rank the m3 results from largest to smallest scaled error (MSE x 10**3).
mresults_m3.sort_values(by='MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
65,5000683618,Currys PLC,RF,11,3.376438
101,4295895499,HBOS Plc,RF,9,2.292086
31,4295894092,Travis Perkins PLC,RF,11,1.660446
54,4295896428,Autonomy Corp Ltd,RF,9,1.570479
38,5036206981,Polymetal International PLC,RF,13,1.069694
...,...,...,...,...,...
47,4295894819,Aggreko Ltd,RF,18,0.011693
63,4295894471,Amec Foster Wheeler Ltd,RF,21,0.010681
64,4295893850,G4S Ltd,RF,30,0.009397
20,4295874865,DCC PLC,RF,24,0.008813


In [14]:
# Same ranking as the previous cell, recorded from a different execution of
# the m3 run (the displayed numbers differ between the two).
mresults_m3.sort_values(by='MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
70,5000683618,Currys PLC,RF,12,3.093467
112,4295895499,HBOS Plc,RF,10,2.146841
59,4295896428,Autonomy Corp Ltd,RF,10,1.218058
35,4295894092,Travis Perkins PLC,RF,12,1.132370
42,5036206981,Polymetal International PLC,RF,14,1.030174
...,...,...,...,...,...
55,4295898751,Serco Group PLC,RF,16,0.012833
51,4295894819,Aggreko Ltd,RF,19,0.010403
68,4295894471,Amec Foster Wheeler Ltd,RF,22,0.009548
47,4295894784,IMI PLC,RF,14,0.006152


In [10]:
# Rank the m2 results from largest to smallest scaled error (MSE x 10**3).
mresults_m2.sort_values(by='MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
70,5000683618,Currys PLC,RF,12,3.142663
112,4295895499,HBOS Plc,RF,10,2.128162
59,4295896428,Autonomy Corp Ltd,RF,10,1.237753
35,4295894092,Travis Perkins PLC,RF,12,1.047205
42,5036206981,Polymetal International PLC,RF,14,1.027622
...,...,...,...,...,...
68,4295894471,Amec Foster Wheeler Ltd,RF,22,0.011986
48,4295895858,Weir Group PLC,RF,19,0.010993
51,4295894819,Aggreko Ltd,RF,19,0.010586
64,4295895691,Alliance Trust PLC,RF,10,0.006822


## Exporting the Results

In [11]:
# Average scaled error (MSE x 10**3) across all assets for the m3 run.
mresults_m3['MSE^3'].mean()

0.14950075507514893

In [15]:
# Same average as the previous cell, recorded from a different execution of
# the m3 run.
mresults_m3['MSE^3'].mean()

0.13529247091706614

In [14]:
# Average scaled error (MSE x 10**3) across all assets for the m2 run.
mresults_m2['MSE^3'].mean()

0.13798890987628293

In [13]:
# Export the m3 results to ../results/1-RF-24MONTH-m3.xlsx.
# MODEL / THRESHOLD / CAP only build the file name; CAP carries the dataset label here.
# index=None is falsy, so the frame's index is not written to the sheet.
# NOTE(review): the ../results folder must already exist.
MODEL = 'RF'
THRESHOLD = str(24)
CAP = 'm3'
mresults_m3.to_excel(f'../results/1-{MODEL}-{THRESHOLD}MONTH-{CAP}.xlsx', index=None)

In [12]:
# Export the m2 results to ../results/1-RF-24MONTH-m2.xlsx.
# MODEL / THRESHOLD / CAP only build the file name; CAP carries the dataset label here.
# index=None is falsy, so the frame's index is not written to the sheet.
# NOTE(review): the ../results folder must already exist.
MODEL = 'RF'
THRESHOLD = str(24)
CAP = 'm2'
mresults_m2.to_excel(f'../results/1-{MODEL}-{THRESHOLD}MONTH-{CAP}.xlsx', index=None)

---

# Visualization

In [18]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run walk-forward validation asset by asset and return both the per-asset
    error table and the individual predictions.

    NOTE: this definition replaces the earlier compile_train_test in this
    notebook and returns a tuple instead of a single frame. Cells written for
    the earlier version (`x = compile_train_test(...)`) receive a tuple if they
    are re-run after this cell.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Frames with an `Asset` column plus the feature / target columns of the
        chosen dataset. The company name is read from train_df's `Name` column
        (or, if that column is absent, from its last column).
    sample : bool
        True  -> only the two hard-coded asset ids are processed.
        False -> every unique `Asset` in train_df is processed.
    algo : str
        Label only: stored in the `Model` column and used for plot titles /
        paths. It does not select the estimator.
    dataset : str
        'm1' -> volatility columns, 'm2' -> columns from vif_check(),
        anything else -> the ESG column list below. 'V^YZ' is the target.
    viz : bool
        When True, a PNG per asset is written through vis_line_plot_results.
    cap : bool
        Accepted but not used.

    Returns
    -------
    (mresults, predictions_df)
        mresults: columns Asset, Name, Model, Test Size, MSE^3 (one row per
        asset, indexed 0..n-1). Despite its key, `MSE^3` holds the mean
        squared error multiplied by 10**3.
        predictions_df: columns Asset, Test Data, Predictions, one row per
        walk-forward step, labelled with the trailing index values of the
        asset's rows in test_df.
    '''
    test_perc = .3  # share of each asset's supervised rows used for walk-forward testing

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        cols = vif_check()

    else:
        # "version 2" column list. An earlier, wider list (which also contained
        # the vol_series_* columns) used to be assigned right before this one
        # and was always overwritten, so only this list ever took effect.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce', 'V^YZ']

    if sample:
        # assets = [4295894970, 8589934212]
        assets = [8589934212, 8589934254] # 3rd and 4th most volatile stocks in FTSE 2006 to 2022

    else:
        assets = train_df.Asset.unique().tolist()

    result_columns = ['Asset', 'Name', 'Model', 'Test Size', 'MSE^3']
    records = []            # one dict per asset; turned into a frame once at the end
    prediction_frames = []  # one frame per asset; concatenated once at the end

    for r, asset in enumerate(assets):
        asset_rows = train_df[train_df['Asset'] == asset]
        # Prefer the labelled column; fall back to the former positional lookup
        # so frames without a `Name` column behave as before.
        if 'Name' in asset_rows.columns:
            name = asset_rows['Name'].iloc[0]
        else:
            name = asset_rows.iloc[0, -1]

        df_train = asset_rows[cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()
        # Index labels of the asset's test rows, taken before dropna().
        indices = test_df[test_df.Asset == asset].index

        df_merge = pd.concat([df_train, df_test])
        # NOTE(review): presumably builds lagged inputs (n_in=3) for target V^YZ -
        # see set_params.series_to_supervised.
        df_merge = series_to_supervised(df_merge, n_in=3, target= ['V^YZ'])
        test_size = int(df_merge.shape[0] * test_perc)

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        # The reported value is the scaled MSE, not an MAE.
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE x10^3: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse,
        })

        # NOTE(review): the walk-forward test rows are the last `test_size` rows
        # of the merged train+test data, while the labels come from test_df
        # alone; they only line up if those rows all originate from test_df and
        # dropna() removed none of them - confirm.
        prediction_frames.append(pd.DataFrame({
            'Asset':asset,
            'Test Data': y,
            'Predictions': yhat
        }, index = indices[-len(yhat):]))

        if viz:
            vis_line_plot_results(yhat, y, model = algo, dataset=dataset, name=name, r = r)

    # Explicit columns keep the result sortable by 'MSE^3' even when no asset ran.
    mresults = pd.DataFrame(records, columns=result_columns)
    predictions_df = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()

    return mresults, predictions_df

In [19]:
# Sample run (two hard-coded assets) that also returns the per-step predictions.
# NOTE: this rebinds `mresults_m3`, replacing the full-run results computed
# earlier in the notebook with the two-asset sample.
mresults_m3, pred = compile_train_test(
    train_df,
    test_df,
    sample=True,
    algo='RF',
    dataset='m3',
    viz=False,
    cap=True,
)

Execute Training and Walk Forward Testing for (Natwest Group PLC) for 60 times..
------------------------------ 128.08844995498657 seconds | MAE: 0.099 ------------------------------
Execute Training and Walk Forward Testing for (Lloyds Banking Group PLC) for 60 times..
------------------------------ 135.58832836151123 seconds | MAE: 0.102 ------------------------------


In [20]:
# Persist the sample predictions (index included) as CSV.
# NOTE(review): the file is written into ../data, the same folder the input files are read from.
pred.to_csv('../data/9-RF-SAMPLE-PREDICTION-RESULTS.csv')