# Module Installation (If you refresh your notebook)

In [1]:
!pip install arch
!pip install mlxtend

Collecting arch
  Downloading arch-6.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (918 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m918.4/918.4 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
Collecting statsmodels>=0.12
  Downloading statsmodels-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
Collecting scipy>=1.5
  Downloading scipy-1.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.5/36.5 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting patsy>=0.5.2
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.8/233.8 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, pats

# Import Libraries

In [1]:
import sys, os
import pandas as pd
import numpy as np

# Add the sibling ../python directory to the import search path so the
# project modules imported below can be found.
sys.path.append('../python')
# Display up to 100 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)

# Project-local modules. DataProcessing and RunModels, used in later cells,
# presumably come from these star imports — TODO confirm.
from dataprocessing import *
from runmodels import *
from util import Run_Algorithms

In [4]:
# Configure the processing step. The two date strings presumably bound the
# sample period — TODO confirm against DataProcessing.
dataprocessing = DataProcessing(
    '2006-01-01',
    '2022-12-01',
    daily=True,
)

# clean_final hands back three objects, bound here as the cleaned frame and
# the train / test frames used by the rest of the notebook.
df_clean, train_df, test_df = dataprocessing.clean_final(fillna=True)

In [5]:
# Lookup table: one row per asset id in the training data, with the firm name
# and economic sector taken from the coverage file.
coverage_df = pd.read_csv('../data/coverage_dataframe.csv')[['PermID', 'Name', 'TRBCEconomicSector']]
coverage_df['PermID'] = coverage_df.PermID.astype(int)

unique_assets_df = (
    pd.DataFrame({'Assets': train_df.Asset.unique()})
    # Left join: every training asset is kept, even if the coverage file has
    # no row for it.
    .merge(coverage_df, how='left', left_on='Assets', right_on='PermID')
    # Drop the coverage-side key by name and keep 'Assets'. The previous
    # positional slice kept PermID instead, which is NaN for unmatched assets.
    .drop(columns='PermID')
    .rename(columns={'Name': 'Firm Name', 'TRBCEconomicSector': 'Economic Sector'})
)

In [6]:
# Display the asset / firm name / economic sector lookup table.
unique_assets_df

Unnamed: 0,Assets,Firm Name,Economic Sector
0,5021764927,M&G PLC,Financials
1,4295895969,Aveva Group Ltd,Technology
2,4295894669,JD Sports Fashion PLC,Consumer cyclicals
3,5066589306,Phoenix Group Holdings PLC,Financials
4,4295894930,Spirax-Sarco Engineering PLC,Industrials
...,...,...,...
128,4295894341,AstraZeneca PLC,Healthcare
129,4295894191,SSE PLC,Utilities
130,4295894168,Capricorn Energy PLC,Energy
131,4295894068,Persimmon PLC,Consumer cyclicals


In [7]:
# Number of (non-null) asset ids per economic sector.
unique_assets_df.groupby('Economic Sector')[['Assets']].count()

Unnamed: 0_level_0,Assets
Economic Sector,Unnamed: 1_level_1
Basic materials,16
Consumer cyclicals,24
Consumer non-cyclicals,16
Energy,7
Financials,24
Healthcare,5
Industrials,23
Real estate,3
Technology,10
Utilities,5


In [6]:
# Distinct asset identifiers present in the training frame.
list_assets = train_df['Asset'].unique()

In [7]:
def split_list(lst, chunks):
    n = len(lst)
    size = n // chunks
    leftovers= n % chunks
    chunk_start = 0
    for i in range(chunks):
        if i < leftovers:
            # take an extra element
            chunk_end = chunk_start + size + 1
        else:
            chunk_end = chunk_start + size
        yield lst[chunk_start:chunk_end] 
        chunk_start = chunk_end

In [12]:
# Split the asset ids into three chunks and save each one as
# ../data/asset_chunks_<n>.npy, numbered from 1.
for count, chunk in enumerate(split_list(list_assets, 3), start=1):
    np.save(f'../data/asset_chunks_{count}.npy', chunk)

In [9]:
    # Load .npy
# NOTE(review): no cell visible in this notebook writes 'array.npy'; the save
# cell writes '../data/asset_chunks_<n>.npy'. The recorded output below also
# shows several printed arrays, which this assignment alone would not produce.
# Confirm the intended path before relying on this cell.
array_from_npy = np.load('array.npy')

[5021764927 4295895969 4295894669 5066589306 4295894930 5001428097
 5000799571 4295899077 4295895428 4295894074 4295895352 4295874981
 4295897861 5044095778 4295874940 4295874865 4295893899 4295897408
 4295869210 4298449570 5040257207 4295898044 5036383104 4295898967
 4295898932 4295894092 4295895815 5047641169 4295894176 4295894667
 4295894349 5036227579 5036206981 4295895717 5034844193 4295894303
 5001229906 4295894784 4295895858 4295894743 4295895971 4295894819
 4295897734 4295896316 4295898751]
[4295896108 4298007715 4295897579 4295896428 4295894827 4295894970
 4295895691 4298007752 4295897802 4295897780 4295894471 4295893850
 5000683618 4295894926 4295894483 4295894951 4295894799 4295894507
 4295897744 4295895174 5052540916 4295895061 8589934340 8589934333
 8589934275 8589934254 8589934227 8589934212 5067937815 5037958512
 5033562382 5000039682 5000006235 5000001291 4295898763 4295898060
 4295897705 4295897683 4295897467 4295896661 4295896619 4295896494
 4295896447 4295895853]
[42

In [2]:
# Persist the frames with IPython's %store (run once, after they are built):
# %store train_df
# %store test_df

# Stored values survive a kernel restart, or closing and reopening Jupyter.

# After restarting the kernel, restore a stored variable with `%store -r`:
# %store -r train_df

%store -r train_df
%store -r test_df

# Run The Algorithms (GARCH)

In [5]:
# GARCH run over all assets; `features` can be left unset for this algorithm.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='GARCH',
    sample=False,
    plot_export=True,
    res_export=True,
)

mresults = run_algorithms.compile_train_test()

after feature selections ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp

KeyboardInterrupt: 

In [None]:
# Mean of the 'MSE^3' entries of the GARCH results.
# NOTE(review): the recorded GARCH run ended in KeyboardInterrupt, so
# `mresults` is only defined once that cell has run to completion.
np.mean(mresults['MSE^3'])

# Run The Algorithms (HAR)

In [4]:
# HAR run with the 'm1' feature set.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='HAR',
    features='m1',
    sample=False,
    plot_export=True,
    res_export=False,
)

mresults_HAR_m1 = run_algorithms.compile_train_test()

after feature selections ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']


In [5]:
# Mean of the 'MSE^3' entries of the HAR / m1 results.
np.mean(mresults_HAR_m1['MSE^3'])

0.004137271903696506

---

In [8]:
# HAR run with the 'm2' feature set.
# NOTE(review): this run uses sample=True while the HAR m1 and m3 runs use
# sample=False — confirm the difference is intended before comparing results.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='HAR',
    features='m2',
    sample=True,
    plot_export=False,
    res_export=False,
)

mresults_HAR_m2 = run_algorithms.compile_train_test()

after feature selections ['ESG', 'EnvironmentalPillar', 'SocialPillar', 'Management', 'ProductResponsibility', 'Shareholders', 'Workforce', 'V^YZ']


In [9]:
# Mean of the 'MSE^3' entries of the HAR / m2 results.
np.mean(mresults_HAR_m2['MSE^3'])

0.09256814215989825

---

In [10]:
# HAR run with the 'm3' feature set.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='HAR',
    features='m3',
    sample=False,
    plot_export=True,
    res_export=False,
)

mresults_HAR_m3 = run_algorithms.compile_train_test()

after feature selections ['buzz', 'ESG', 'ESGCombined', 'ESGControversies', 'EnvironmentalPillar', 'GovernancePillar', 'SocialPillar', 'Community', 'EnvironmentalInnovation', 'Management', 'ProductResponsibility', 'Shareholders', 'Workforce', 'V^YZ']


In [10]:
# Mean of the 'MSE^3' entries of the HAR / m3 results.
np.mean(mresults_HAR_m3['MSE^3'])

0.3377351659948493

---

# Run The Algorithms (EN)

In [1]:
import sys, os
import pandas as pd
import numpy as np

# Add the sibling ../python directory to the import search path so the
# project modules imported below can be found.
sys.path.append('../python')
# Display up to 100 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)

# Project-local modules. RunModels, used in the cells below, presumably comes
# from one of these star imports — TODO confirm.
from dataprocessing import *
from runmodels import *
from util import Run_Algorithms

# Restore the train / test frames previously persisted with %store.
%store -r train_df
%store -r test_df

In [2]:
# EN run with the 'm1' feature set.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='EN',
    features='m1',
    sample=False,
    plot_export=False,
    res_export=False,
)

mresults_EN_m1 = run_algorithms.compile_train_test()

after feature selections ['vol_series_daily', 'vol_series_weekly', 'vol_series_monthly', 'V^YZ']
Execute Training and Walk Forward Testing for (M&G PLC-5021764927) for 200 times..
------------------------------ 0.2128605842590332 seconds | MAE: 0.076 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd-4295895969) for 200 times..
------------------------------ 0.2545199394226074 seconds | MAE: 0.270 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC-4295894669) for 200 times..
------------------------------ 0.3336000442504883 seconds | MAE: 0.031 ------------------------------
Execute Training and Walk Forward Testing for (Phoenix Group Holdings PLC-5066589306) for 200 times..
------------------------------ 0.4247853755950928 seconds | MAE: 0.024 ------------------------------
Execute Training and Walk Forward Testing for (Spirax-Sarco Engineering PLC-4295894930) for 200 times..
------------------

In [3]:
# Mean of the 'MSE^3' entries of the EN / m1 results.
np.mean(mresults_EN_m1['MSE^3'])

0.13715586333068752

---

In [4]:
# EN run with the 'm2' feature set.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='EN',
    features='m2',
    sample=False,
    plot_export=False,
    res_export=False,
)

mresults_EN_m2 = run_algorithms.compile_train_test()

after feature selections ['ESG', 'EnvironmentalPillar', 'SocialPillar', 'Management', 'ProductResponsibility', 'Shareholders', 'Workforce', 'V^YZ']
Execute Training and Walk Forward Testing for (M&G PLC-5021764927) for 200 times..
------------------------------ 0.4136183261871338 seconds | MAE: 0.073 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd-4295895969) for 200 times..
------------------------------ 0.37251782417297363 seconds | MAE: 0.273 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC-4295894669) for 200 times..
------------------------------ 0.4612550735473633 seconds | MAE: 0.032 ------------------------------
Execute Training and Walk Forward Testing for (Phoenix Group Holdings PLC-5066589306) for 200 times..
------------------------------ 0.810300350189209 seconds | MAE: 0.023 ------------------------------
Execute Training and Walk Forward Testing for (Spirax-Sarco Engineering

In [5]:
# Mean of the 'MSE^3' entries of the EN / m2 results.
np.mean(mresults_EN_m2['MSE^3'])

0.13767549862474754

---

In [7]:
# EN run with the 'm3' feature set.
run_algorithms = RunModels(
    train_df,
    test_df,
    algorithms='EN',
    features='m3',
    sample=False,
    plot_export=False,
    res_export=False,
)

mresults_EN_m3 = run_algorithms.compile_train_test()

after feature selections ['buzz', 'ESG', 'ESGCombined', 'ESGControversies', 'EnvironmentalPillar', 'GovernancePillar', 'SocialPillar', 'Community', 'EnvironmentalInnovation', 'Management', 'ProductResponsibility', 'Shareholders', 'Workforce', 'V^YZ']
Execute Training and Walk Forward Testing for (M&G PLC-5021764927) for 200 times..
------------------------------ 0.37053704261779785 seconds | MAE: 0.073 ------------------------------
Execute Training and Walk Forward Testing for (Aveva Group Ltd-4295895969) for 200 times..
------------------------------ 0.3475489616394043 seconds | MAE: 0.273 ------------------------------
Execute Training and Walk Forward Testing for (JD Sports Fashion PLC-4295894669) for 200 times..
------------------------------ 0.7355279922485352 seconds | MAE: 0.032 ------------------------------
Execute Training and Walk Forward Testing for (Phoenix Group Holdings PLC-5066589306) for 200 times..
------------------------------ 0.49786972999572754 seconds | MAE: 0.0

In [8]:
# Mean of the 'MSE^3' entries of the EN / m3 results.
# NOTE(review): the recorded value is identical, to the last digit, to the one
# recorded for EN / m2 — confirm the m3 feature set actually changes the fit.
np.mean(mresults_EN_m3['MSE^3'])

0.13767549862474754

---

In [7]:
# RF run with the 'm1' feature set and sample=True.
# NOTE(review): this cell goes through util.Run_Algorithms, whereas the GARCH,
# HAR and EN cells use RunModels — confirm which entry point is intended.
run_algorithms = Run_Algorithms(
    train_df,
    test_df,
    algorithms='RF',
    features='m1',
    sample=True,
    plot_export=True,
    res_export=True,
)

mresults_RF_m1 = run_algorithms.compile_train_test()

Execute Training and Walk Forward Testing for (Natwest Group PLC-8589934212) for 1294 times..
------------------------------ 692.2084374427795 seconds | MAE: 0.001 ------------------------------
Execute Training and Walk Forward Testing for (Lloyds Banking Group PLC-8589934254) for 1294 times..
------------------------------ 697.6710958480835 seconds | MAE: 0.003 ------------------------------


In [None]:
# RF run with the 'm1' feature set and sample=False.
# NOTE(review): this cell goes through util.Run_Algorithms, whereas the GARCH,
# HAR and EN cells use RunModels — confirm which entry point is intended.
run_algorithms = Run_Algorithms(
    train_df,
    test_df,
    algorithms='RF',
    features='m1',
    sample=False,
    plot_export=True,
    res_export=True,
)

mresults_RF_m1 = run_algorithms.compile_train_test()

In [None]:
# RF run with the 'm3' feature set and sample=False.
# NOTE(review): this cell goes through util.Run_Algorithms, whereas the GARCH,
# HAR and EN cells use RunModels — confirm which entry point is intended.
run_algorithms = Run_Algorithms(
    train_df,
    test_df,
    algorithms='RF',
    features='m3',
    sample=False,
    plot_export=True,
    res_export=True,
)

mresults_RF_m3 = run_algorithms.compile_train_test()

---

## Preparation for M1

### Chunk 1

In [14]:
import sys, os
import pandas as pd
import numpy as np

sys.path.append('../python')
pd.set_option('display.max_columns', 100)

from dataprocessing import *
from runmodels import *

# Rebuild the cleaned / train / test frames in this kernel.
dataprocessing = DataProcessing(
    '2006-01-01',
    '2022-12-01',
    daily=True,
)
df_clean, train_df, test_df = dataprocessing.clean_final(fillna=True)

# Asset ids stored in the first chunk file.
asset_1 = np.load('../data/asset_chunks_1.npy')

# Keep only the rows whose Asset id belongs to this chunk.
train_df_1 = train_df.loc[train_df['Asset'].isin(asset_1)]
test_df_1 = test_df.loc[test_df['Asset'].isin(asset_1)]

# RF with the 'm1' feature set on the chunk-1 assets.
run_algorithms = RunModels(
    train_df_1,
    test_df_1,
    algorithms='RF',
    features='m1',
    sample=False,
    plot_export=False,
    res_export=True,
)
mresults_RF_m1 = run_algorithms.compile_train_test()

### Chunk 2

In [None]:
import sys, os
import pandas as pd
import numpy as np

sys.path.append('../python')
pd.set_option('display.max_columns', 100)

from dataprocessing import *
from runmodels import *

# Rebuild the cleaned / train / test frames in this kernel.
dataprocessing = DataProcessing(
    '2006-01-01',
    '2022-12-01',
    daily=True,
)
df_clean, train_df, test_df = dataprocessing.clean_final(fillna=True)

# Asset ids stored in the second chunk file. The variable names keep the
# `_1` suffix of the cell this was copied from, but hold chunk-2 data here.
asset_1 = np.load('../data/asset_chunks_2.npy')

# Keep only the rows whose Asset id belongs to this chunk.
train_df_1 = train_df.loc[train_df['Asset'].isin(asset_1)]
test_df_1 = test_df.loc[test_df['Asset'].isin(asset_1)]

# RF with the 'm1' feature set on the chunk-2 assets.
run_algorithms = RunModels(
    train_df_1,
    test_df_1,
    algorithms='RF',
    features='m1',
    sample=False,
    plot_export=False,
    res_export=True,
)
mresults_RF_m1 = run_algorithms.compile_train_test()

### Chunk 3

In [None]:
import sys, os
import pandas as pd
import numpy as np

sys.path.append('../python')
pd.set_option('display.max_columns', 100)

from dataprocessing import *
from runmodels import *

# Rebuild the cleaned / train / test frames in this kernel.
dataprocessing = DataProcessing('2006-01-01', '2022-12-01', daily=True)
df_clean, train_df, test_df = dataprocessing.clean_final(fillna = True)

# Asset ids stored in the third chunk file. This cell previously loaded
# asset_chunks_2.npy (copy-paste slip), which re-ran chunk 2 and never
# processed chunk 3. Variable names keep the `_1` suffix used by the
# sibling chunk cells.
asset_1 = np.load('../data/asset_chunks_3.npy')

# Keep only the rows whose Asset id belongs to this chunk.
train_df_1 = train_df[train_df.Asset.isin(asset_1)]
test_df_1 = test_df[test_df.Asset.isin(asset_1)]

# RF with the 'm1' feature set on the chunk-3 assets.
run_algorithms = RunModels(train_df_1, test_df_1, algorithms='RF', features='m1', sample= False, plot_export= False, res_export= True)
mresults_RF_m1 = run_algorithms.compile_train_test()