# Import Packages

In [1]:
import random as rd
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from numpy import asarray
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from set_params import func_train_test_split, count_train_test, series_to_supervised, train_test_split

def calculate_iqr(values):
    """Return the interquartile range (75th minus 25th percentile) of ``values``."""
    # One percentile call yields both quartiles.
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    return third_quartile - first_quartile

def detect_outliers_iqr(values, factor = 1.5):
    """Return the IQR-based outlier fences for ``values``.

    Parameters
    ----------
    values : array-like
        Numeric observations passed to ``np.percentile``.
    factor : float, default 1.5
        Multiple of the IQR subtracted from Q1 / added to Q3. The default
        reproduces the previously hard-coded 1.5.

    Returns
    -------
    (lower_bound, upper_bound) : tuple
        ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``. Note that the bounds
        are returned, not a boolean mask; callers flag outliers themselves,
        e.g. ``(values < lower_bound) | (values > upper_bound)``.
    """
    # Compute both quartiles once instead of once for the IQR and once more for the bounds.
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return lower_bound, upper_bound

def vis_line_plot_results(y_pred, y_test, model = 'HAR', name = 'BARCLAYS', r = 1, dataset = 'm1'):
    """Plot true vs. predicted values as two lines and save the figure as a PNG.

    Parameters
    ----------
    y_pred, y_test : sequence
        Predicted and true series; each is drawn with a single ``plot`` call.
    model, dataset : str
        Used in the title and in the output folder ``../outputs/{model}-{dataset}/``.
    name : str
        Asset name, used in the title and the file name.
    r : int
        Zero-based position of the asset; the file name is prefixed with
        ``r + 1`` zero-padded to three digits.

    The figure is written to disk and closed; nothing is returned or displayed.
    """
    # savefig does not create missing folders, so make sure the target exists first.
    out_dir = Path(f'../outputs/{model}-{dataset}')
    out_dir.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10,4))
    # Plot order matters: the legend labels below are matched to the lines in order.
    ax.plot(y_test)
    ax.plot(y_pred)
    ax.set_title(f'{model}-{dataset}-{name}', fontsize=15)
    ax.legend(['True Volatility', 'Predicted Volatility'], fontsize=9)
    ax.tick_params(axis='x', labelrotation=45)
    fig.savefig(out_dir / f'{str(r+1).zfill(3)}-{model}-{name}.png')
    # Close this specific figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

# Notebook-wide settings: silence every warning and let pandas show up to 100 columns.
# Optional wider-page tweak, currently disabled:
# display(HTML("<style>.container { width:80% !important; }</style>"))
warnings.simplefilter('ignore')
pd.options.display.max_columns = 100

# Data Processing

## Import Data and Split into Train and Test Sets

In [2]:
# Load the FTSE CSV (the file name suggests monthly ESG + volatility data) into a DataFrame.
# NOTE(review): base_FTSE_df is not referenced again in the visible cells - confirm it is still needed.
base_FTSE_df = pd.read_csv('../data/1.3-FTSE_Monthly_ESG_Volatility_Final_v2.csv')
# Disabled column rename, kept for reference:
# base_FTSE_df = base_FTSE_df.rename(columns={'Date_x':'date_key'})

In [3]:
# Build the train / validation / test frames with the project helper from set_params.
# NOTE(review): no data is passed in, so the helper presumably loads its own input - confirm.
# threshold = 24 is the same number used as THRESHOLD in the export file name at the end of the notebook.
train_df, valid_df, test_df = func_train_test_split(validation = False, threshold = 24)

In [4]:
# Presumably a per-split row-count summary (the helper lives in set_params) - confirm its output there.
# NOTE(review): count_rows_df is not used again in the visible cells.
count_rows_df = count_train_test(train_df, test_df)

In [5]:
# Lookup table with one row per coverage entry: integer asset id (renamed from PermID to Asset) and Name.
coverage_df = (
    pd.read_csv('../data/coverage_dataframe.csv')
    .astype({'PermID': int})
    .loc[:, ['PermID', 'Name']]
    .rename(columns={'PermID': 'Asset'})
)

In [6]:
# Attach the coverage columns (Name) to every training row, then index the frame by month_key
# while keeping month_key as a regular column.
# NOTE(review): this cell rebinds train_df from itself, so running it twice merges twice - run it once per kernel.
train_df = train_df.merge(coverage_df, how='left', on='Asset')
train_df = train_df.set_index('month_key', drop=False)

In [7]:
# fit a random forest model and make a one-step prediction
def random_forest_forecast(train, testX, n_estimators = 1000, random_state = 42, n_jobs = None):
    """Fit a RandomForestRegressor on ``train`` and predict a single observation.

    Parameters
    ----------
    train : sequence of rows / 2-D array-like
        Converted with ``asarray``; every column except the last is a feature,
        the last column is the target.
    testX : 1-D array-like
        Feature vector of the observation to predict.
    n_estimators : int, default 1000
        Number of trees (previously hard-coded).
    random_state : int or None, default 42
        Seed for the forest so repeated runs give the same forecasts.
        Pass ``None`` to get the earlier unseeded behaviour.
    n_jobs : int or None, default None
        Forwarded to the regressor; ``None`` keeps the library default.

    Returns
    -------
    The predicted target value for ``testX`` (first element of the prediction array).
    """
    # transform list into array
    train = asarray(train)
    # split into input and output columns
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
    model.fit(trainX, trainy)
    # predict expects a 2-D input, so wrap the single feature vector in a list
    yhat = model.predict([testX])
    return yhat[0]

# walk-forward validation: refit on all rows seen so far, predict the next test row
def walk_forward_validation(data, n_test, verbose = True):
    """Evaluate one-step-ahead forecasts over the test part of ``data``.

    ``data`` is split by ``train_test_split(data, n_test)``; the last column of
    each row is the target. For every test row a model is fitted on the
    accumulated history, the row is predicted, and the true row is then added
    to the history. ``verbose`` is accepted but currently unused.

    Returns
    -------
    (error, actuals, predictions)
        ``error`` is the mean squared error multiplied by 1000, ``actuals`` is
        the target column of the test split, ``predictions`` is a list of forecasts.
    """
    train, test = train_test_split(data, n_test)

    # The history starts as the training rows and grows by one true row per step.
    history = list(train)
    predictions = []
    for row in test:
        features = row[:-1]
        predictions.append(random_forest_forecast(history, features))
        history.append(row)

    actuals = test[:, -1]
    # Scaled by 1000, presumably to keep the small squared errors readable.
    error = mean_squared_error(actuals, predictions) * 1000

    return error, actuals, predictions

In [8]:
def compile_train_test(train_df, test_df, sample = True, algo = 'HAR', dataset = 'm1', viz = False, cap = True):
    '''
    Run walk-forward validation for each asset and collect one result row per asset.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Must contain an ``Asset`` column and the feature columns selected by
        ``dataset``. The asset's display name is read from ``train_df['Name']``
        when that column exists, otherwise from the last column of ``train_df``.
    sample : bool
        True -> only two hard-coded asset ids are processed;
        False -> every unique ``Asset`` in ``train_df``.
    algo : str
        Label stored in the ``Model`` column and used for plot titles/paths.
        It does not select the estimator.
    dataset : str
        'm1' -> volatility columns only; 'm2' -> columns returned by ``vif_check()``;
        anything else -> the ESG column set ("version 2").
    viz : bool
        If True, save a true-vs-predicted plot per asset via ``vis_line_plot_results``.
    cap : bool
        Accepted for backward compatibility; not used.

    Returns
    -------
    DataFrame with columns ``Asset``, ``Name``, ``Model``, ``Test Size`` and
    ``MSE^3``. ``MSE^3`` holds the error returned by ``walk_forward_validation``,
    i.e. the mean squared error multiplied by 10**3.
    '''
    # Share of the supervised rows held out for walk-forward testing.
    test_perc = .3

    if dataset == 'm1':
        cols = ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']

    elif dataset == 'm2':
        # NOTE(review): vif_check is not defined or imported in the visible part of this
        # notebook - confirm it exists before running with dataset='m2'.
        cols = vif_check()

    else:
        # "version 2" ESG feature set; an earlier, longer list used to be assigned here
        # and was immediately overwritten, so only the effective list is kept.
        cols = ['buzz','ESG','ESGCombined','ESGControversies','EnvironmentalPillar','GovernancePillar','SocialPillar','Community',
                'EnvironmentalInnovation','Management','ProductResponsibility','Shareholders','Workforce', 'V^YZ']

    if sample:
        assets = [4295894970, 8589934212]
    else:
        assets = train_df.Asset.unique().tolist()

    # Collect plain records and build the frame once; concatenating inside the loop is quadratic.
    records = []

    for r, asset in enumerate(assets):
        asset_train = train_df[train_df.Asset == asset]
        # Prefer the explicit column; fall back to the positional lookup used previously.
        if 'Name' in asset_train.columns:
            name = asset_train['Name'].iloc[0]
        else:
            name = asset_train.iloc[0, -1]

        df_train = asset_train[cols].dropna()
        df_test = test_df[test_df.Asset == asset][cols].dropna()

        # Train and test rows are stacked and re-split inside walk_forward_validation.
        df_merge = pd.concat([df_train, df_test])
        df_merge = series_to_supervised(df_merge, n_in=3, target= ['V^YZ'])
        test_size = int(df_merge.shape[0] * test_perc)

        print(f'Execute Training and Walk Forward Testing for ({name}) for {test_size} times..')
        start_time = time.time()
        mse, y, yhat = walk_forward_validation(df_merge, test_size)
        # The reported value is MSE x 10^3 (it was previously mislabelled as MAE).
        print("---"*10, "%s seconds |"%(time.time() - start_time), 'MSE x10^3: %.3f'%mse, "---"*10)

        records.append({
            'Asset': asset,
            'Name': name,
            'Model': algo,
            'Test Size': test_size,
            'MSE^3': mse,
        })

        if viz:
            vis_line_plot_results(yhat, y, model = algo, dataset=dataset, name=name, r = r)

    return pd.DataFrame(records)

In [15]:
# Run the walk-forward experiment for every asset in train_df (sample=False) on the 'm1' feature set.
# algo='RF' is only a label for the results table and plot names; viz=True saves one PNG per asset
# under ../outputs/RF-m1/. A forest is refitted at every test step, so this cell is slow.
mresults_m1 = compile_train_test(train_df, test_df, sample=False, algo='RF', dataset='m1', viz=True, cap = True)

Execute Training and Walk Forward Testing for (B&M European Value Retail SA) for 3 times..
------------------------------ 1.8439903259277344 seconds | MAE: 0.018 ------------------------------
Execute Training and Walk Forward Testing for (Avast Ltd) for 3 times..
------------------------------ 1.8188810348510742 seconds | MAE: 0.712 ------------------------------
Execute Training and Walk Forward Testing for (Intermediate Capital Group PLC) for 5 times..
------------------------------ 3.3809242248535156 seconds | MAE: 0.017 ------------------------------
Execute Training and Walk Forward Testing for (M&G PLC) for 6 times..
------------------------------ 3.8431448936462402 seconds | MAE: 0.012 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd) for 8 times..
------------------------------ 5.161790370941162 seconds | MAE: 0.226 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC) for 8 times..
---

---

# Results

In [11]:
# Display the per-asset walk-forward results table.
mresults_m1

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
0,5039731355,Pershing Square Holdings Ltd,RF,7,0.024810
1,5042941681,B&M European Value Retail SA,RF,8,0.041306
2,5063734194,Avast Ltd,RF,7,0.328285
3,4295895921,Intermediate Capital Group PLC,RF,9,0.023573
4,5021764927,M&G PLC,RF,11,0.039661
...,...,...,...,...,...
155,4295894191,SSE PLC,RF,60,0.058438
156,4295894186,International Power Ltd,RF,22,0.119963
157,4295894168,Capricorn Energy PLC,RF,18,0.130171
158,4295894068,Persimmon PLC,RF,42,0.132630


In [16]:
# Same table ordered from the largest to the smallest error (MSE^3 column), worst assets first.
mresults_m1.sort_values('MSE^3', ascending=False)

Unnamed: 0,Asset,Name,Model,Test Size,MSE^3
101,4295895499,HBOS Plc,RF,6,3.708522
65,5000683618,Currys PLC,RF,7,3.368355
54,4295896428,Autonomy Corp Ltd,RF,6,2.508058
31,4295894092,Travis Perkins PLC,RF,7,2.418915
38,5036206981,Polymetal International PLC,RF,9,1.607559
...,...,...,...,...,...
35,4295894667,Croda International PLC,RF,23,0.011861
6,5066589306,Phoenix Group Holdings PLC,RF,9,0.010450
18,5040053800,Mediclinic Group Ltd,RF,3,0.010225
43,4295894784,IMI PLC,RF,9,0.005857


## Exporting the Results

In [17]:
# Average of the MSE^3 column across all assets.
mresults_m1['MSE^3'].mean()

0.18574747490882546

In [None]:
# Write the results table to ../results/ as an Excel file without the index column.
# The file name encodes the model label, the month threshold and the dataset tag (held in CAP).
MODEL = 'RF'
THRESHOLD = '24'
CAP = 'm1'
mresults_m1.to_excel(f'../results/1-{MODEL}-{THRESHOLD}MONTH-{CAP}.xlsx', index=False)