<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [6]:
# adding a feature of daily data

from arch import arch_model
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import time

# Method #3 Regularisation Model
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor

from datetime import timedelta
from numpy import asarray, log1p, expm1
from numpy import number
from sklearn.metrics import mean_squared_error
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from math import ceil

import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import warnings
import numpy as np
import sys

# Notebook-wide configuration.
# NOTE(review): silencing *all* warnings also hides pandas deprecation and
# SettingWithCopy warnings raised by the code below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

def calculate_iqr(values):
    '''
    Return the interquartile range (75th percentile minus 25th percentile) of ``values``.
    '''
    # One percentile call yields both quartiles.
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    return third_quartile - first_quartile

def detect_outliers_iqr(values):
    '''
    Return the Tukey outlier fences for ``values``.

    Despite its name, this function does not flag individual values: it returns
    the two bounds and leaves the comparison to the caller.

    Parameters
    ----------
    values : array-like of numbers

    Returns
    -------
    (lower_bound, upper_bound) : tuple
        ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    '''
    # Compute both quartiles once; the IQR is their difference.
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return lower_bound, upper_bound

def series_to_supervised(data, n_in=1, n_out=1, target = 'y',dropnan=True):
    '''
    Transform a time series DataFrame into a supervised learning array.

    Every column is lagged by 1..``n_in`` steps (suffix ``_s<lag>``) and the
    unshifted / forward-shifted frame is appended ``n_out`` times. Columns of
    the non-lagged part that are not targets are then removed, so each row
    holds "all lagged features + target value(s)".

    Parameters
    ----------
    data : pandas.DataFrame
    n_in : int
        Number of lag steps used as inputs.
    n_out : int
        Number of forecast steps (0 .. n_out-1 shifts into the future).
    target : str or iterable of str
        Column name(s) kept from the non-lagged part.
    dropnan : bool
        Drop rows containing any NaN (done before the non-target columns are
        removed, so NaNs in those columns also discard the row).

    Returns
    -------
    numpy.ndarray
    '''
    # A bare string must be matched as a whole column name. The previous
    # `col not in target` did a substring test, so e.g. column 'y' was kept
    # whenever the target was 'yy'.
    targets = [target] if isinstance(target, str) else list(target)
    dropcols = [col for col in data.columns if col not in targets]

    # input sequence (t-n, ... t-1)
    frames = [data.shift(lag).add_suffix(f'_s{lag}') for lag in range(n_in, 0, -1)]

    # forecast sequence (t, t+1, ... t+n)
    frames.extend(data.shift(-step) for step in range(0, n_out))

    # put it all together
    agg = pd.concat(frames, axis=1)

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()

    return agg.drop(dropcols, axis=1).values

# split a 2-D supervised array into train/test sets
def spv_train_test_split(data, n_test):
    '''
    Split a 2-D array by position: the last ``n_test`` rows form the test set
    and everything before them the training set.

    Parameters
    ----------
    data : 2-D array (indexed as ``data[rows, cols]``)
    n_test : int
        Number of trailing rows to hold out; must be >= 0.

    Returns
    -------
    (train, test) : tuple of arrays

    Raises
    ------
    ValueError
        If ``n_test`` is negative.
    '''
    if n_test < 0:
        raise ValueError('n_test must be non-negative')
    # Use an explicit split position: `data[:-0]` is empty, so negative
    # slicing returned an empty train set when n_test was 0.
    split_at = max(data.shape[0] - n_test, 0)
    return data[:split_at, :], data[split_at:, :]

class Data_Processing():
    '''
    Build cleaned, per-asset train/validation/test frames from price and ESG CSVs.

    The pipeline (see ``cleansing_final1``) reads the price, ESG and index
    files, merges price rows with lagged ESG rows, removes sparse assets and
    columns, optionally fills missing values, adds lagged volatility features
    and splits every asset by row position.

    Parameters
    ----------
    mt_start, mt_end : str or datetime-like
        Inclusive month window used when collecting month-end trading dates.
    validation : bool
        When True a validation slice (20% of each asset's rows) is carved out
        between the train and test slices.
    threshold : int
        Minimum number of rows an asset needs after splitting (monthly mode).
    daily : bool
        Use every trading day instead of only each month's last trading day.
    train_perc : float
        Fraction of each asset's rows assigned to the training slice.
    index_path, price_path, esg_path : str
        CSV locations for the index dates, asset prices and ESG scores.
    '''

    # Minimum rows per asset in daily mode (roughly two years of data points);
    # replaces ``threshold`` when ``daily`` is True.
    DAILY_MIN_OBSERVATIONS = 360 * 2
    # Assets with a larger share of incomplete rows are excluded.
    MAX_MISSING_FRACTION = 0.5
    # Assets with this many rows or fewer are excluded before filling.
    MIN_TOTAL_ROWS = 24
    # ESG columns removed up front because they contribute most of the nulls.
    EXCLUDED_ESG_COLUMNS = ['ResourceUse', 'HumanRights', 'CSRStrategy', 'Emissions']

    def __init__(self, mt_start, mt_end, 
                 validation = False,
                 threshold = 24,
                 daily = False, 
                 train_perc = .7,
                 index_path = '../data/1.1-FTSE-IDX_VOL30-PRICES_2006-2023.csv',
                 price_path = '../data/1.1-FTSE_VOL30-PRICES_2006-2023.csv',
                 esg_path = '../data/1.2-FTSE_ESG_COR_2006-2023.csv'):

        self.mt_start = mt_start
        self.mt_end = mt_end
        self.train_perc = train_perc
        self.index_path = index_path
        self.price_path = price_path
        self.esg_path = esg_path
        self.validation = validation
        self.threshold = threshold
        self.daily = daily

    @staticmethod
    def _month_start(dates):
        '''Map a datetime Series to the first day of each value's month.'''
        return pd.to_datetime(dates.dt.strftime('%Y-%m-01'))

    @staticmethod
    def _rows_per_asset(frame):
        '''Return ``{asset: row count}``; empty when ``frame`` has no Asset column.'''
        if frame is None or 'Asset' not in frame.columns:
            return {}
        return frame['Asset'].value_counts().to_dict()

    def count_train_test(self, train_df, test_df, valid_df=None):
        '''
        Count rows per asset in each split.

        Parameters
        ----------
        train_df, test_df : pandas.DataFrame
            Split frames with an ``Asset`` column.
        valid_df : pandas.DataFrame, optional
            When given, a ``Valid Length`` column is added and included in
            ``Total Length``.

        Returns
        -------
        pandas.DataFrame
            One row per asset found in ``train_df`` with the columns
            ``Asset``, ``Train Length``, (``Valid Length``,) ``Test Length``
            and ``Total Length``.
        '''
        columns = ['Asset', 'Train Length', 'Test Length', 'Total Length']
        if valid_df is not None:
            columns.insert(2, 'Valid Length')

        if 'Asset' not in train_df.columns:
            return pd.DataFrame(columns=columns)

        train_counts = self._rows_per_asset(train_df)
        test_counts = self._rows_per_asset(test_df)
        valid_counts = self._rows_per_asset(valid_df)

        # Build all records first and create the frame once, instead of
        # growing a DataFrame cell by cell.
        records = []
        for asset in train_df['Asset'].unique():
            n_train = train_counts.get(asset, 0)
            n_test = test_counts.get(asset, 0)
            record = {'Asset': asset, 'Train Length': n_train, 'Test Length': n_test}
            total = n_train + n_test
            if valid_df is not None:
                n_valid = valid_counts.get(asset, 0)
                record['Valid Length'] = n_valid
                total += n_valid
            record['Total Length'] = total
            records.append(record)

        return pd.DataFrame(records, columns=columns)
    
    def min_data_threshold(self, df):
        '''
        Return the assets whose ``Total Length`` meets the minimum row count.

        In daily mode the minimum is ``DAILY_MIN_OBSERVATIONS`` and
        ``self.threshold`` is ignored.
        '''
        threshold = self.DAILY_MIN_OBSERVATIONS if self.daily else self.threshold
        return df[df['Total Length'] >= threshold]['Asset'].tolist()

    def clean_count_missing_rows_assets(self, df):
        '''
        Summarise, per asset, how many rows contain at least one missing value.

        Returns
        -------
        pandas.DataFrame
            Columns ``num_assets`` (the asset id), ``missing_rows``,
            ``perc_missing`` and ``total_rows``, one row per asset in order of
            first appearance.
        '''
        columns = ['num_assets', 'missing_rows', 'perc_missing', 'total_rows']
        records = []

        for asset, asset_df in df.groupby('Asset', sort=False):
            total_rows = asset_df.shape[0]
            missing_rows = total_rows - asset_df.dropna().shape[0]
            records.append({'num_assets': asset,
                            'missing_rows': missing_rows,
                            'perc_missing': missing_rows / total_rows,
                            'total_rows': total_rows})

        return pd.DataFrame(records, columns=columns)

    def clean_filna_assets_df(self, df):
        '''
        Version 1 of the null filling; the first performance results used this function.

        For every asset and every numeric column with gaps, a NaN is replaced
        by the mean of the values in a window that starts at that row and
        extends ``(number of NaNs in the column)`` rows further down; whatever
        is still missing is forward filled.

        NOTE(review): the window looks at rows *below* the gap, so if rows are
        in chronological order this uses future values — confirm this is
        acceptable for the downstream forecasting experiments.

        Returns
        -------
        pandas.DataFrame
            Filled rows of all assets; the index restarts at 0 for each asset.
        '''
        filled_parts = []

        for asset in df.Asset.unique():
            # Work on a copy so the assignments below never touch ``df``.
            temp_df = df[df['Asset'] == asset].copy()

            # Numeric columns that still contain missing values.
            numerical_columns = temp_df.select_dtypes(include=[number]).columns
            missing_numerical = [col for col in numerical_columns if temp_df[col].isna().any()]

            for col in missing_numerical:
                mrows = temp_df[col].isna().sum() + 1
                # Rolling over the reversed series gives a forward-looking mean.
                roll_mean = temp_df[col][::-1].rolling(window=mrows, min_periods=1).mean()
                temp_df[col] = temp_df[col].fillna(roll_mean)
                # `.ffill()` replaces the deprecated `fillna(method='ffill')`.
                temp_df[col] = temp_df[col].ffill()

            filled_parts.append(temp_df.reset_index(drop=True))

        if not filled_parts:
            return pd.DataFrame()
        return pd.concat(filled_parts)

    def monthly_last_trading_date(self):
        '''
        Store in ``self.date_list`` the last trading date of every month.

        Dates come from the ``Date`` column of the index CSV; only months
        between ``mt_start`` and ``mt_end`` (inclusive) are kept.
        '''
        dt_trades = pd.read_csv(self.index_path).loc[:, ['Date']]
        dt_trades['Date'] = pd.to_datetime(dt_trades['Date'])
        dt_trades['Month_Key'] = self._month_start(dt_trades['Date'])
        in_window = (dt_trades['Month_Key'] >= self.mt_start) & (dt_trades['Month_Key'] <= self.mt_end)
        dt_trades = dt_trades[in_window]

        # The latest date within a month is that month's last trading date.
        self.date_list = dt_trades.groupby('Month_Key', sort=False)['Date'].max().tolist()

    def data_preprocessing_price(self):
        '''
        Load the price CSV into ``self.price_df`` and add the ESG merge key.

        ``col_merge`` is the previous calendar day in daily mode, or the first
        day of the previous month in monthly mode (where rows are also
        restricted to the dates in ``self.date_list``).
        '''
        select_cols = ['month_key', 'Date', 'Asset', 'Open', 'High', 'Low', 'Close', 'Return', 'V^CC', 'V^RS', 'V^YZ']
        
        price_df = pd.read_csv(self.price_path).rename(columns={'Month':'month_key'})
        price_df['Date'] = pd.to_datetime(price_df['Date'])
        price_df['month_key'] = pd.to_datetime(price_df['month_key'])
        price_df['Asset'] = price_df['Asset'].astype(int)
        price_df = price_df[select_cols].dropna()

        if self.daily:
            price_df['col_merge'] = price_df['Date'] - pd.Timedelta(days=1)
        else:
            # Ten days before the month start always falls in the previous month.
            price_df['col_merge'] = self._month_start(price_df['month_key'] - pd.Timedelta(days=10))
            price_df = price_df[price_df['Date'].isin(self.date_list)].reset_index(drop=True)

        self.price_df = price_df


    def data_preprocessing_esg(self):
        '''
        Load the ESG CSV into ``self.esg_df`` and add a ``month_key`` column.

        In monthly mode only rows dated on a month's last trading date
        (``self.date_list``) are kept.
        '''
        esg_df = pd.read_csv(self.esg_path)

        # set-up df
        esg_df['Asset'] = esg_df['Asset'].astype(int)
        esg_df['Date'] = pd.to_datetime(esg_df['Date'])
        esg_df = esg_df.drop(['windowTimestamp'], axis=1)

        if not self.daily:
            esg_df = esg_df[esg_df['Date'].isin(self.date_list)].reset_index(drop=True)

        # set-up month_key column
        esg_df['month_key'] = self._month_start(esg_df['Date'])

        self.esg_df = esg_df

    def func_train_test_split(self):
        '''
        Add lagged volatility features and split every asset by row position.

        For each asset the first ``ceil(rows * train_perc)`` rows are training
        data. Without validation the remainder is the test slice; with
        validation the next ``int(rows * 0.2)`` rows are the validation slice
        and only the rows after it form the test slice. Assets whose total row
        count is below the minimum (see ``min_data_threshold``) are removed
        from every split.

        Side effect: ``self.clean_df`` is re-indexed by ``col_merge`` in place.

        Returns
        -------
        (train_df, valid_df, test_df) : tuple of pandas.DataFrame
            ``valid_df`` is empty when validation is disabled. Assets appear in
            reverse order of their first appearance in ``self.clean_df``.
        '''
        validation = self.validation
        df = self.clean_df
        
        lag_1, lag_2, lag_3 = 1, 3, 12
        if self.daily:
            lag_1, lag_2, lag_3 = 1, 7, 30
        
        df.index = df.col_merge

        train_parts, valid_parts, test_parts = [], [], []

        for asset in df.Asset.unique():
            # subset dataframe
            temp_df = df[df['Asset'] == asset].copy()

            # parameters
            rows = temp_df.shape[0]
            train_len = ceil(rows * self.train_perc)

            # Lagged volatility features, assigned by position so duplicate
            # index labels cannot multiply rows (an index merge would).
            volatility = temp_df['V^YZ']
            temp_df['vol_series_daily'] = volatility.shift(lag_1)
            temp_df['vol_series_weekly'] = volatility.rolling(lag_2).mean().shift(1)
            temp_df['vol_series_monthly'] = volatility.rolling(lag_3).mean().shift(1)

            train_parts.append(temp_df.iloc[:train_len])

            if validation:
                # validation takes 20% of the asset's rows, test gets the rest
                valid_end = train_len + int(rows * .2)
                valid_parts.append(temp_df.iloc[train_len:valid_end])
                test_parts.append(temp_df.iloc[valid_end:])
            else:
                test_parts.append(temp_df.iloc[train_len:])

        if not train_parts:
            return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

        # Parts are stacked newest-first to keep the row order produced by the
        # earlier "prepend each asset" implementation.
        train_df = pd.concat(train_parts[::-1])
        test_df = pd.concat(test_parts[::-1])
        valid_df = pd.concat(valid_parts[::-1]) if valid_parts else pd.DataFrame()

        # count the total rows of each asset, then keep assets with enough data points
        master_df = self.count_train_test(train_df, test_df, valid_df if validation else None)
        used_assets = self.min_data_threshold(master_df)

        train_df = train_df[train_df.Asset.isin(used_assets)]
        test_df = test_df[test_df.Asset.isin(used_assets)]
        if validation:
            valid_df = valid_df[valid_df.Asset.isin(used_assets)]

        return train_df, valid_df, test_df

    def merge_data(self):
        '''
        Load all inputs and left-join ESG rows onto price rows.

        Price rows are matched on (``col_merge``, ``Asset``) against the ESG
        ``Date`` in daily mode or the ESG ``month_key`` in monthly mode. The
        price date is kept as ``date_key``; the ESG date and both month keys
        are dropped. The result is stored in ``self.merge_df`` and returned.
        '''
        self.monthly_last_trading_date()
        self.data_preprocessing_price()
        self.data_preprocessing_esg()

        # Only the ESG-side join key differs between the two modes.
        esg_key = 'Date' if self.daily else 'month_key'

        merge_df = pd.merge(self.price_df, self.esg_df, how = 'left',
                            left_on = ['col_merge', 'Asset'],
                            right_on = [esg_key, 'Asset'])

        # output column arrangement
        merge_df = merge_df.drop(['month_key_x', 'month_key_y', 'Date_y'], axis = 1)
        merge_df = merge_df.rename(columns={'Date_x': 'date_key'})

        self.merge_df = merge_df

        return self.merge_df
    
    def cleansing_final1(self, fillna = None):
        '''
        Run the full pipeline: merge, filter, optionally fill, then split.

        The excluded columns (ResourceUse, HumanRights, CSRStrategy and
        Emissions) were selected because they contribute most of the nulls
        across the FTSE assets. Assets with more than 50% incomplete rows or
        with 24 rows or fewer are removed.

        Parameters
        ----------
        fillna : truthy, optional
            When truthy, remaining gaps are filled with ``clean_filna_assets_df``.

        Returns
        -------
        (clean_df, train_df, valid_df, test_df) : tuple of pandas.DataFrame
        '''
        self.merge_data()

        # filter exclude columns
        clean_df = self.merge_df.drop(self.EXCLUDED_ESG_COLUMNS, axis=1)

        # count missing values and total observations per asset, then exclude
        # assets that are too sparse or too short
        cnt_miss_rws = self.clean_count_missing_rows_assets(clean_df)
        too_sparse = cnt_miss_rws.perc_missing > self.MAX_MISSING_FRACTION
        too_short = cnt_miss_rws.total_rows <= self.MIN_TOTAL_ROWS
        exc = cnt_miss_rws[too_sparse | too_short].num_assets.tolist()
        clean_df = clean_df[~(clean_df.Asset.isin(exc))]

        # fill null value with Original filling method
        if fillna:
            clean_df = self.clean_filna_assets_df(clean_df)

        self.clean_df = clean_df

        train_df, valid_df, test_df = self.func_train_test_split()
        
        return clean_df, train_df, valid_df, test_df

In [7]:
# Monthly configuration: daily=False restricts price/ESG rows to each month's last trading date.
dataprocessing = Data_Processing('2006-01-01', '2022-12-01', daily= False)

In [3]:
# Run the pipeline (merge, asset/column filtering, NaN filling, per-asset positional split).
# valid_df stays empty because validation defaults to False.
clean_df, train_df, valid_df, test_df = dataprocessing.cleansing_final1(fillna = True)

In [4]:
# (rows, columns) of the cleaned monthly frame.
clean_df.shape

(17427, 24)

In [5]:
# Sanity check: remaining missing values per column after filling.
clean_df.isnull().sum()

date_key                   0
Asset                      0
Open                       0
High                       0
Low                        0
Close                      0
Return                     0
V^CC                       0
V^RS                       0
V^YZ                       0
col_merge                  0
buzz                       0
ESG                        0
ESGCombined                0
ESGControversies           0
EnvironmentalPillar        0
GovernancePillar           0
SocialPillar               0
Community                  0
EnvironmentalInnovation    0
Management                 0
ProductResponsibility      0
Shareholders               0
Workforce                  0
dtype: int64

---

In [8]:
# Daily configuration: every trading day is kept and the per-asset minimum becomes 360*2 rows.
# NOTE: this rebinds `dataprocessing`, replacing the monthly instance created above.
dataprocessing = Data_Processing('2006-01-01', '2022-12-01', daily= True)

In [9]:
# Re-run the pipeline on daily data; this overwrites the monthly clean_df/train_df/valid_df/test_df.
clean_df, train_df, valid_df, test_df = dataprocessing.cleansing_final1(fillna = True)

In [10]:
# (rows, columns) of the cleaned daily frame.
clean_df.shape

(380897, 24)

In [11]:
# Sanity check: remaining missing values per column after filling.
clean_df.isnull().sum()

date_key                   0
Asset                      0
Open                       0
High                       0
Low                        0
Close                      0
Return                     0
V^CC                       0
V^RS                       0
V^YZ                       0
col_merge                  0
buzz                       0
ESG                        0
ESGCombined                0
ESGControversies           0
EnvironmentalPillar        0
GovernancePillar           0
SocialPillar               0
Community                  0
EnvironmentalInnovation    0
Management                 0
ProductResponsibility      0
Shareholders               0
Workforce                  0
dtype: int64

In [12]:
# Number of distinct assets that survive cleaning.
len(clean_df.Asset.unique())

182

In [13]:
# Number of distinct assets left in the training split after the minimum-length filter.
len(train_df.Asset.unique())

133

---

In [14]:
# Make the project's ../python directory importable and pull in its helpers.
# NOTE(review): `sys` is already imported at the top of the notebook and `os` is not
# used in the visible cells. The wildcard import hides where names such as
# Run_Algorithms come from and can shadow the definitions above — prefer explicit
# names once the required set is confirmed.
import sys, os
sys.path.append('../python')
from util import *

In [17]:
# Walk-forward evaluation with algorithms='EN' and feature set 'm1' (sample=False, nothing exported).
# You can leave 'features' unset when running GARCH.
run_algorithms = Run_Algorithms(train_df, test_df, algorithms='EN', features='m1', sample= False, plot_export= False, res_export= False)

mresults_EN_m1 = run_algorithms.compile_train_test()

Execute Training and Walk Forward Testing for (M&G PLC-5021764927) for 247 times..
------------------------------ 0.08417701721191406 seconds | MAE: 0.084 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd-4295895969) for 252 times..
------------------------------ 0.2542426586151123 seconds | MAE: 0.217 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC-4295894669) for 277 times..
------------------------------ 0.25112485885620117 seconds | MAE: 0.043 ------------------------------
Execute Training and Walk Forward Testing for (Phoenix Group Holdings PLC-5066589306) for 296 times..
------------------------------ 0.7904052734375 seconds | MAE: 0.021 ------------------------------
Execute Training and Walk Forward Testing for (Spirax-Sarco Engineering PLC-4295894930) for 314 times..
------------------------------ 0.2811417579650879 seconds | MAE: 0.017 ------------------------------
Execute Traini

------------------------------ 1.2379069328308105 seconds | MAE: 0.067 ------------------------------
Execute Training and Walk Forward Testing for (Intertek Group PLC-4295896316) for 1044 times..
------------------------------ 1.2049610614776611 seconds | MAE: 0.079 ------------------------------
Execute Training and Walk Forward Testing for (Serco Group PLC-4295898751) for 340 times..
------------------------------ 0.1823430061340332 seconds | MAE: 0.017 ------------------------------
Execute Training and Walk Forward Testing for (Barrick Gold (Holdings) Ltd-4295896108) for 739 times..
------------------------------ 0.7018280029296875 seconds | MAE: 0.052 ------------------------------
Execute Training and Walk Forward Testing for (Fresnillo PLC-4298007715) for 1037 times..
------------------------------ 1.2236406803131104 seconds | MAE: 0.053 ------------------------------
Execute Training and Walk Forward Testing for (Inmarsat Group Holdings Ltd-4295897579) for 295 times..
--------

------------------------------ 1.963789701461792 seconds | MAE: 0.033 ------------------------------
Execute Training and Walk Forward Testing for (Anglo American PLC-4295896494) for 1294 times..
------------------------------ 1.7047269344329834 seconds | MAE: 0.104 ------------------------------
Execute Training and Walk Forward Testing for (Abi Sab Group Holding Ltd-4295896447) for 796 times..
------------------------------ 0.7659730911254883 seconds | MAE: 0.042 ------------------------------
Execute Training and Walk Forward Testing for (Kingfisher PLC-4295895853) for 1266 times..
------------------------------ 3.2445688247680664 seconds | MAE: 0.113 ------------------------------
Execute Training and Walk Forward Testing for (3i Group PLC-4295895807) for 1050 times..
------------------------------ 1.2981350421905518 seconds | MAE: 0.108 ------------------------------
Execute Training and Walk Forward Testing for (GSK plc-4295895781) for 1294 times..
------------------------------ 

------------------------------ 1.829042911529541 seconds | MAE: 0.053 ------------------------------
Execute Training and Walk Forward Testing for (SSE PLC-4295894191) for 1294 times..
------------------------------ 1.9699039459228516 seconds | MAE: 0.083 ------------------------------
Execute Training and Walk Forward Testing for (Capricorn Energy PLC-4295894168) for 381 times..
------------------------------ 0.34438514709472656 seconds | MAE: 0.104 ------------------------------
Execute Training and Walk Forward Testing for (Persimmon PLC-4295894068) for 906 times..
------------------------------ 0.9800660610198975 seconds | MAE: 0.111 ------------------------------
Execute Training and Walk Forward Testing for (Shell PLC-4295885039) for 1294 times..
------------------------------ 2.1541571617126465 seconds | MAE: 0.132 ------------------------------


In [19]:
# Average of the 'MSE^3' entries in the EN/m1 results (presumably one entry per asset — TODO confirm).
np.mean(mresults_EN_m1['MSE^3'])

0.16179244084370759

In [17]:
from pathlib import Path

# Export the current train/test splits to the user's Downloads folder.
# The folder is resolved from the home directory rather than by climbing a fixed
# number of '../' levels, which only worked from one specific working directory.
EXPORT_DIR = Path.home() / 'Downloads'
train_df.to_csv(EXPORT_DIR / 'train_df.csv')
test_df.to_csv(EXPORT_DIR / 'test_df.csv')

In [67]:
# NOTE(review): the stored output below matches the monthly run's shape, not the daily
# frame built earlier in this notebook — the cell was executed out of order; re-run
# after Restart & Run All.
clean_df.shape

(17427, 24)

In [21]:
# Same EN/m1 run with sample=True and plot export enabled.
# NOTE: this overwrites the mresults_EN_m1 produced by the sample=False run above.
# You can leave 'features' unset when running GARCH.
run_algorithms = Run_Algorithms(train_df, test_df, algorithms='EN', features='m1', sample= True, plot_export= True, res_export= False)

mresults_EN_m1 = run_algorithms.compile_train_test()

Execute Training and Walk Forward Testing for (Natwest Group PLC-8589934212) for 1294 times..
------------------------------ 3.6663548946380615 seconds | MAE: 0.141 ------------------------------
Execute Training and Walk Forward Testing for (Lloyds Banking Group PLC-8589934254) for 1294 times..
------------------------------ 2.1470208168029785 seconds | MAE: 0.136 ------------------------------


In [15]:
# Walk-forward evaluation with algorithms='RF' and feature set 'm1' (sample=True, plots exported).
# You can leave 'features' unset when running GARCH.
run_algorithms = Run_Algorithms(train_df, test_df, algorithms='RF', features='m1', sample= True, plot_export= True, res_export= False)

# Stored under an RF-specific name so the ElasticNet results in mresults_EN_m1
# are not silently overwritten by random-forest results.
mresults_RF_m1 = run_algorithms.compile_train_test()

Execute Training and Walk Forward Testing for (Natwest Group PLC-8589934212) for 1294 times..
------------------------------ 546.5475289821625 seconds | MAE: 0.001 ------------------------------
Execute Training and Walk Forward Testing for (Lloyds Banking Group PLC-8589934254) for 1294 times..
------------------------------ 562.8922989368439 seconds | MAE: 0.003 ------------------------------
