# Cross Validation

### Beatriz Loureiro 
### Giovanni Dalvi
### Richard Sousa Antunes

In [2]:
import sys
import importlib
sys.path.append('../')
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso
import math
from sklearn.model_selection import cross_val_score, StratifiedKFold


from src.finance_ml.data_preparation.data_preparation import DataLoader
from src.finance_ml.cross_validation.cross_validation import CrossValidation

In [3]:
# Configure the loader:
#   * time_index_col - name of the time-index column; it must be the same column in all inputs.
#   * keep_cols      - the columns that will remain in the loaded dataset.
# NOTE: 'HIGHT' is intentional - it matches the column name that appears in the loaded
# frame (AAPL_HIGHT), so do not "correct" it to 'HIGH'.
kept_columns = [
    'VOLUME',
    'OPEN',
    'HIGHT',
    'LOW',
    'CLOSE',
    'VW',
    'TRANSACTIONS',
]
dataloader = DataLoader(time_index_col='DATE', keep_cols=kept_columns)

In [4]:
# Number of records taken from the example dataset.
N = 30_000

# Example file; the loading cell prefixes this path with '../data/'.
fname_AAPL = 'equities/AAPL_2020-04-07_2022-04-06.parquet'

In [5]:
# Pick the dataset used in this simulation: the ticker label and its parquet file.
ticker, fname = 'AAPL', fname_AAPL

In [6]:
# Load the selected asset(s) into a single DataFrame, keyed by ticker,
# then keep only the first N rows.
asset_paths = {ticker: f'../data/{fname}'}
df = dataloader.load_dataset(asset_paths).iloc[:N]
display(df)

Unnamed: 0_level_0,AAPL_VOLUME,AAPL_OPEN,AAPL_HIGHT,AAPL_LOW,AAPL_CLOSE,AAPL_VW,AAPL_TRANSACTIONS
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-07 12:17:00,14256.0,67.3500,67.3625,67.3275,67.3375,67.3454,45
2020-04-07 12:18:00,16552.0,67.3501,67.3625,67.3250,67.3250,67.3407,57
2020-04-07 12:19:00,80172.0,67.3250,67.4475,67.3250,67.4225,67.3937,149
2020-04-07 12:20:00,183740.0,67.4475,67.4700,67.3125,67.3700,67.3671,159
2020-04-07 12:21:00,35372.0,67.3500,67.3850,67.2975,67.2975,67.3469,102
...,...,...,...,...,...,...,...
2020-06-05 15:42:00,282372.0,82.1317,82.1725,82.0925,82.1168,82.1329,769
2020-06-05 15:43:00,195168.0,82.1125,82.1250,82.0925,82.1100,82.1059,533
2020-06-05 15:44:00,290188.0,82.1075,82.2450,82.1050,82.2400,82.1737,756
2020-06-05 15:45:00,417900.0,82.2375,82.3400,82.2375,82.3220,82.2835,1108


In [7]:
# Target is the AAPL_VW column; the features are every other column of df.
target_col = 'AAPL_VW'
Y = df[target_col]
X = df.drop(columns=target_col)

In [11]:
import warnings

# Silence all warnings so the printed comparison below stays readable.
# NOTE(review): this filter is process-wide and applies to every warning category, so it
# would also hide any warning raised while the models are fitted - consider narrowing it.
warnings.filterwarnings('ignore')

# Project cross-validator under test.
# NOTE(review): the exact meaning of `overlap` and `embargo_rate` is defined in
# src.finance_ml.cross_validation - confirm there before changing these values.
CV = CrossValidation(overlap=0.05, embargo_rate=0.01)

# For each Lasso regularisation strength, print the per-fold scores and their mean from
# the project's CrossValidation, followed by the same from scikit-learn's cross_val_score
# (cv=5) as a baseline.
for k, alpha in enumerate((0.1, 0.2, 0.3)):
    scores = CV.cross_validation_score(X, Y, Lasso(alpha=alpha), 'mean_squared_error')
    print(
        '\nAlpha: ', alpha,
        '\nlista_scores: ', scores,
        '\nmédia score: ', np.mean(scores),
    )

    # sklearn's 'neg_mean_squared_error' scorer yields negated MSE; flip the sign so the
    # baseline is printed as a positive error like the scores above.
    scores_classico = -cross_val_score(
        Lasso(alpha=alpha), X, Y, cv=5, scoring='neg_mean_squared_error'
    )
    print(
        '\nCross-Validation normal do sklearn: ', scores_classico,
        '\nmédia score: ', np.mean(scores_classico),
    )


Alpha:  0.1 
lista_scores:  [0.005387538138882531, 0.004155479045668313, 0.0030086690576702846, 0.01783841414555814, 0.0034915446044032745] 
média score:  0.006776328998436508

Cross-Validation normal do sklearn:  [0.00448302 0.00406024 0.00301328 0.01762743 0.00284644] 
média score:  0.006406084767726789

Alpha:  0.2 
lista_scores:  [0.013899611078896113, 0.006687385262080873, 0.00321349137515545, 0.019774441556173337, 0.010534220132784895] 
média score:  0.010821829881018133

Cross-Validation normal do sklearn:  [0.00955868 0.00635083 0.00323903 0.01885634 0.00790306] 
média score:  0.00918158960043335

Alpha:  0.3 
lista_scores:  [0.028065596262554757, 0.01093180387977309, 0.0035420595708625033, 0.023090926019817718, 0.02221131389721332] 
média score:  0.017568339926044278

Cross-Validation normal do sklearn:  [0.01799717 0.0102126  0.0036034  0.02096862 0.01631125] 
média score:  0.013818608972508362
