<center><font size="6"><b>Model Building: Basics</b></font></center>

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes (edits to project code are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
# Execute the shared notebook configuration script located one directory up.
# NOTE(review): what the script actually configures is not visible from this
# notebook — confirm in nb_config.py.
%run ../nb_config.py

running notebook configuration


In [3]:
import numpy as np
import pandas as pd


In [4]:
from src.data_api import sources, sinks
from src import utils, tags

from src import mle 

In [5]:
# Load the project configuration through the project helper; it is indexed by
# key further down (e.g. conf['func_params']).
conf = utils.get_conf()

In [6]:
# Pull the run parameters out of the `func_params` section of the config.
fparams = conf['func_params']

# Overall sample window (kept exactly as stored in the config).
start_dt, end_dt = fparams['start_dt'], fparams['end_dt']

# Numeric settings are cast explicitly to float / int.
avol_top = float(fparams['avol_top'])
avol_window = int(fparams['avol_window'])
tau = int(fparams['target']['tau'])

# Dates stored under `split_dt` for the validation and test folds.
split_dts = fparams['split_dt']
val_dt, test_dt = split_dts['valid'], split_dts['test']

In [7]:
# Display the configured dates: sample start, validation split, test split, sample end.
start_dt, val_dt, test_dt, end_dt

('2015-01-01', '2016-12-31', '2017-06-30', '2017-12-31')

In [8]:
# Fixed seed handed to the estimators as `random_state` in the model-building section.
RND_SEED = 123

# Load Data

In [9]:
# Read the interim-layer feature/target table through the project data API,
# asking for `date` to be parsed as dates and rows to be indexed by (date, ticker).
read_kwargs = dict(
    layer='interim',
    parse_dates=['date'],
    index_col=['date', 'ticker'],
)
feats_and_targets = sources.read_data('feats_and_targets.csv', **read_kwargs)

# Summarize index size, column count, dtypes and memory footprint.
feats_and_targets.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 374331 entries, (Timestamp('2015-01-02 00:00:00'), 'A') to (Timestamp('2017-12-29 00:00:00'), 'ZTS')
Columns: 28 entries, sector to split_3f
dtypes: bool(1), float64(24), object(3)
memory usage: 79.0+ MB


In [10]:
# Preview both ends of the frame: the first two and the last two rows, stacked row-wise.
preview_rows = [feats_and_targets.head(2), feats_and_targets.tail(2)]
pd.concat(preview_rows, axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,sector,SP500,y_fwd_logrets_5d,x_intrad_upshadow,x_intrad_loshadow,x_returns_1d,x_returns_5d,x_zscore_5d,x_zscore_vol_5d,x_macd_5d_vs_20d,x_rsi,x_zscore_20d,x_zscore_vol_20d,x_zscore_60d,x_zscore_vol_60d,x_bb_60d,x_macd_50d_vs_252d,x_returns_1y,x_rets5d_vol_60d,x_rets5d_vol_120d,x_mkt_dispersion,x_wday_cos,x_wday_sin,x_is_eoq,x_mkt_volat_60d,x_mkt_volat_120d,split,split_3f
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2015-01-02,A,Capital Goods,-0.0147,0.0007,0.1264,0.1847,-0.0069,-0.0115,-1.4744,0.9398,0.2672,55.3864,-0.1928,-0.501,0.1058,-0.8054,0.0,0.4378,0.1129,0.0405,0.0336,0.0543,0.309,-0.9511,False,0.0236,0.0192,1.dev,1.train
2015-01-05,A,Capital Goods,-0.0341,0.0078,0.1361,0.0972,-0.0189,-0.0365,-1.5635,1.5077,0.0948,55.6667,-0.8393,-0.0693,-0.3988,-0.4243,0.0,0.454,0.1001,0.0406,0.0338,0.0628,1.0,0.0,False,0.0241,0.0194,1.dev,1.train
2017-12-28,ZTS,Health Care,0.0011,0.0133,0.11,0.25,-0.0008,-0.0044,0.5586,-1.2001,0.258,59.9842,0.6349,-1.4923,1.082,-1.6623,0.0,8.692,0.3552,0.0237,0.0218,0.0547,-0.809,-0.5878,False,0.0057,0.0068,2.test,3.test
2017-12-29,ZTS,Health Care,-0.0036,0.0301,0.21,0.0,-0.0048,-0.0033,-0.9547,1.3702,0.2225,65.2919,0.0366,-0.0578,0.9494,-0.533,0.0,8.7416,0.3557,0.0238,0.0218,0.0637,0.309,-0.9511,False,0.0056,0.0068,2.test,3.test


In [11]:
# List every column available in the loaded frame.
feats_and_targets.columns

Index(['sector', 'SP500', 'y_fwd_logrets_5d', 'x_intrad_upshadow',
       'x_intrad_loshadow', 'x_returns_1d', 'x_returns_5d', 'x_zscore_5d',
       'x_zscore_vol_5d', 'x_macd_5d_vs_20d', 'x_rsi', 'x_zscore_20d',
       'x_zscore_vol_20d', 'x_zscore_60d', 'x_zscore_vol_60d', 'x_bb_60d',
       'x_macd_50d_vs_252d', 'x_returns_1y', 'x_rets5d_vol_60d',
       'x_rets5d_vol_120d', 'x_mkt_dispersion', 'x_wday_cos', 'x_wday_sin',
       'x_is_eoq', 'x_mkt_volat_60d', 'x_mkt_volat_120d', 'split', 'split_3f'],
      dtype='object')

# Data Preparation

In [12]:
# Feature columns are identified by their `x_` prefix. The pattern is anchored
# with `^` because DataFrame.filter(regex=...) matches anywhere in the column
# name, so an unanchored 'x_' would also pick up any column that merely
# contains "x_" (e.g. a future `max_...` column).
features = feats_and_targets.filter(regex='^x_').columns
# Target column to predict (the only `y_` column in the frame).
label = 'y_fwd_logrets_5d'

In [13]:
# Inspect the selected feature columns.
features

Index(['x_intrad_upshadow', 'x_intrad_loshadow', 'x_returns_1d',
       'x_returns_5d', 'x_zscore_5d', 'x_zscore_vol_5d', 'x_macd_5d_vs_20d',
       'x_rsi', 'x_zscore_20d', 'x_zscore_vol_20d', 'x_zscore_60d',
       'x_zscore_vol_60d', 'x_bb_60d', 'x_macd_50d_vs_252d', 'x_returns_1y',
       'x_rets5d_vol_60d', 'x_rets5d_vol_120d', 'x_mkt_dispersion',
       'x_wday_cos', 'x_wday_sin', 'x_is_eoq', 'x_mkt_volat_60d',
       'x_mkt_volat_120d'],
      dtype='object')

In [14]:
# Map +/-inf to NaN first so that a single fillna(0.) zeroes out both the
# infinities and the missing values. This applies to the whole frame, i.e. to
# every column, the label included.
feats_and_targets = (
    feats_and_targets
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.)
)

# Splitting

Define a column to split data into 2 or 3 folds (depending on your model building strategy). In general:


Train/Val/Test:
* Use train to fit the model, valid to assess performance while probing different hyperparameters, and test to yield an honest (final) performance metric

Cross-Validation:
* Use dev/test folds; in the dev fold apply a cross-validation algorithm that will successively split it into train/valid
* Once again, use test to yield an honest (final) performance metric

In [15]:
# Carve the three folds out of the prepared frame using the `split_3f` tag,
# then separate each fold into its feature matrix (X_*) and target series (y_*).
fold_tag = feats_and_targets['split_3f']

# Train fold: used to fit the models.
df_train = feats_and_targets[fold_tag == '1.train']
X_train, y_train = df_train[features], df_train[label]

# Validation fold: used to compare hyper-parameter settings.
df_valid = feats_and_targets[fold_tag == '2.valid']
X_valid, y_valid = df_valid[features], df_valid[label]

# Test fold: held out for the final performance estimate.
df_test = feats_and_targets[fold_tag == '3.test']
X_test, y_test = df_test[features], df_test[label]

# Model Building

`ParameterGrid` allows iterating over a grid of hyper-parameters.

In [16]:
from sklearn.model_selection import ParameterGrid

# Candidate values per hyper-parameter; ParameterGrid expands them into every
# combination and yields one dict per combination.
hparam_space = dict(
    fit_intercept=[False, True],
    alpha=[0.001, 0.0001],
)
grid_hparams = ParameterGrid(hparam_space)

# Show each combination on its own line.
print(*grid_hparams, sep='\n')


{'alpha': 0.001, 'fit_intercept': False}
{'alpha': 0.001, 'fit_intercept': True}
{'alpha': 0.0001, 'fit_intercept': False}
{'alpha': 0.0001, 'fit_intercept': True}


In [17]:
# Number of hyper-parameter combinations in the grid.
len(grid_hparams)

4

In [18]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as skl_mse

# Manual grid search on the train/valid split: fit one Lasso per
# hyper-parameter combination on the train fold and score it by RMSE on the
# validation fold. `res[i]` is the validation RMSE of the i-th combination in
# the iteration order of `grid_hparams`.
res = []
for hparams in grid_hparams:
    # instantiate a fresh model for this combination
    model = Lasso(random_state=RND_SEED, **hparams)
    # fit model on train
    model.fit(X_train, y_train)
    # predict and assess on valid (the validation fold is never used for fitting)
    p_valid = model.predict(X_valid)
    rmse = np.sqrt(skl_mse(y_valid, p_valid))
    # store results
    res.append(rmse)


In [19]:
# Validation RMSE per grid entry, in the iteration order of grid_hparams.
res

[0.0720259437778695,
 0.07203199982892511,
 0.07207907724733775,
 0.07214050962309944]

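It can be seen that hparams configuration `idx=0` (`alpha=0.001`, `fit_intercept=False`) yields the lowest validation RMSE (≈0.07203), although the four configurations differ by barely 0.0001.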