<a href="https://colab.research.google.com/github/tobytoyin/sec-10q-msc-report/blob/main/4_BoW_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install dask distributed
# !pip install dask[complete] distributed --upgrade
# !pip install dask-ml

# Paths and file names shared by the cells below.
# Root folder (on the mounted Drive path) holding the JSON maps and saved models.
train_dir = '/content/drive/MyDrive/Aston/Term 3/ML-training'
# Key into the dir-map JSON selecting which base train/test split to load.
which_base = 'base_sel'
# JSON file inside train_dir that maps dataset keys to file paths and column lists.
dir_json = 'dir_map.json'
# JSON file inside train_dir used as the registry of saved models.
model_json = 'model_paths.json'
# Sub-folder of '{train_dir}/models' where model pickles are written and read.
model_dir = 'featureSelection'

In [None]:
from dask.diagnostics import ProgressBar

## Helper Functions 
def train_test_split_year(dataframe, year_col, year_for_test):
  """Split ``dataframe`` into train and test copies by calendar year.

  Rows whose ``year_col`` value falls in one of the years listed in
  ``year_for_test`` form the test set; every other row goes to the training
  set. ``year_col`` is read through the ``.dt`` accessor, so it must be a
  datetime-like column.

  Returns:
    ``(train_set, test_set)`` as independent copies of the selected rows.
  """
  in_test_years = dataframe[year_col].dt.year.isin(year_for_test)
  held_out = dataframe.loc[in_test_years].copy()
  remaining = dataframe.loc[~in_test_years].copy()
  return remaining, held_out

def exponential(base, start_exp, end_exp, step=1) -> list:
  """Return ``base`` raised to every exponent in ``[start_exp, end_exp)``.

  Exponents are generated with ``np.arange`` as floats, spaced by ``step``;
  ``end_exp`` itself is excluded.
  """
  exponents = np.arange(start_exp, end_exp, step=step, dtype=float)
  powers = []
  for exponent in exponents:
    powers.append(base**exponent)
  return powers

def direction(y, return_value=0.01):
  """Map a return ``y`` to a direction label.

  Returns:
    1 when ``y`` is strictly above ``return_value``,
    2 when ``y`` is strictly below ``-return_value``,
    0 otherwise (inside the band, bounds included).
  """
  if y > return_value:
    label = 1
  elif y < -return_value:
    label = 2
  else:
    label = 0
  return label

def save_model(model, name, features, cat=None, num=None, best_cv=None, pred=None, dir=model_dir):
  """Pickle ``model`` and record its metadata in the model registry JSON.

  Args:
    model: fitted object to dump with joblib.
    name: registry key; also the pickle file name (without ``.pkl``).
    features: feature names the model was trained on.
    cat: categorical column names (stored as-is).
    num: numeric column names (stored as-is).
    best_cv: cross-validation score(s) to store with the entry.
    pred: optional iterable of predictions; stored as a list, or ``None``
      when not supplied.
    dir: sub-folder of ``{train_dir}/models`` that receives the pickle.

  Raises:
    AssertionError: if the registry has no ``'baseline'`` entry.
  """
  registry_path = f'{train_dir}/{model_json}'
  with open(registry_path, 'r') as f:
    modelpaths = json.load(f)

  # Sanity check: the registry is expected to already hold a 'baseline' entry.
  assert 'baseline' in modelpaths.keys()

  save_path = f'{train_dir}/models/{dir}/{name}.pkl'
  joblib.dump(model, save_path)

  modelpaths[name] = {
      'path': save_path,
      'features': features,
      'cat': cat,
      'num': num,
      'best_cv': best_cv,
      # pred defaults to None, and list(None) would raise TypeError.
      'pred': None if pred is None else list(pred),
  }

  with open(registry_path, 'w') as f:
    json.dump(modelpaths, f)

def train_wrapper(estimator, param, trainX, trainy, cat_columns, num_columns, 
                  scoring='neg_mean_squared_error'):
  """Fit ``estimator`` behind the shared preprocessing pipeline (no grid search).

  ``cat_columns`` are one-hot encoded (unknown categories ignored),
  ``num_columns`` are min-max scaled, and all remaining columns are passed
  through unchanged. ``param`` and ``scoring`` are accepted but not used by
  this variant, which fits the pipeline directly.

  Returns:
    The fitted ``Pipeline`` with steps ``'features'`` and ``'model'``.
  """
  encoder = ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_columns)
  scaler = ('scaler', MinMaxScaler(), num_columns)
  preprocess = ColumnTransformer([encoder, scaler], remainder='passthrough')

  fitted = Pipeline([('features', preprocess), ('model', estimator)])

  with ProgressBar():
    fitted.fit(trainX, trainy)

  return fitted

def retrieve_best_test_scores(grid):
  """Return the per-split CV test scores of the best-ranked candidate.

  Args:
    grid: normally a fitted grid search exposing ``cv_results_``. An object
      without ``cv_results_`` is treated as a fitted stacking model (or a
      pipeline whose ``'model'`` step is one).

  Returns:
    A list with one test score per CV split for the best candidate, or, for
    the stacking fallback, the ``best_score_`` of its ``final_estimator_``.
  """
  try:
    results = grid.cv_results_
  except AttributeError:
    # Not a grid search: read the score from the object that was passed in
    # (unwrapping a pipeline's 'model' step) rather than from a global.
    stack = getattr(grid, 'named_steps', {}).get('model', grid)
    return stack.final_estimator_.best_score_

  ranks = results['rank_test_score']
  # rank_test_score[i] is the rank of candidate i (1 = best), so the best
  # candidate is the position holding the smallest rank.
  best_model_id = min(range(len(ranks)), key=lambda i: ranks[i])

  # Collect however many splits were recorded instead of assuming five.
  scores = []
  split = 0
  while f'split{split}_test_score' in results:
    scores.append(results[f'split{split}_test_score'][best_model_id])
    split += 1
  return scores
  
def check_with_valid(grid, X, y):
  """Print MSE and direction macro-F1 of the model against the SMA60 baseline.

  Args:
    grid: a fitted grid search (its ``best_estimator_`` is used) or any
      fitted object with ``predict``.
    X: feature frame; its ``'sma60'`` column is used as the baseline
      prediction.
    y: true target values.
  """
  # Grid searches expose the refit winner; anything else predicts directly.
  model = getattr(grid, 'best_estimator_', grid)

  # Predict once and reuse the result for both metrics.
  baseline_pred = X['sma60']
  model_pred = model.predict(X)

  print("\n===== Training Result =====")
  print("SMA60 MSE  : ", mean_squared_error(y, baseline_pred))
  print("MODEL MSE  : ", mean_squared_error(y, model_pred))

  # Direction labels use a +/- rate band around zero (see direction()).
  rate = 0.01
  pred_baseline_y_dir = [direction(value, rate) for value in baseline_pred]
  pred_y_dir = [direction(value, rate) for value in model_pred]
  true_y_dir = [direction(value, rate) for value in y]

  print(f"SMA60 Direction ({rate})  : ", f1_score(true_y_dir, pred_baseline_y_dir, average='macro'))
  print(f"MODEL Direction ({rate}) : ", f1_score(true_y_dir, pred_y_dir, average='macro'))
  print('\n')

def plot_learning_curve(model, trainX, trainy, scoring='neg_mean_squared_error'):
  """Plot mean train and cross-validation scores against training-set size.

  Scores returned by ``learning_curve`` are negated before plotting, so a
  ``neg_*`` scorer is drawn as a positive error. The y-axis label is derived
  from ``scoring`` by replacing underscores with spaces and dropping 'neg'.
  """
  sizes, train_scores, cv_scores = learning_curve(model, trainX, trainy,
                                                  scoring=scoring)

  mean_train = -train_scores.mean(1)
  mean_cv = -cv_scores.mean(1)

  sns.lineplot(x=sizes, y=mean_train, label='train')
  axis = sns.lineplot(x=sizes, y=mean_cv, label='cv')

  metric_label = scoring.replace('_', ' ').replace('neg', '')
  axis.set(ylabel=metric_label, xlabel='training size')
  plt.show()

In [None]:
%cd '/content/drive/MyDrive/Aston/Term 3/ML-training'

import sklearn
import seaborn as sns
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import json 
import joblib
import random

from dask_ml.model_selection import GridSearchCV
from math import ceil
from sklearn.dummy import DummyRegressor
from scipy.stats.mstats import winsorize
from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from IPython.display import display
from sklearn.ensemble import StackingRegressor

# Seed used for Python's RNG here and passed as random_state in the parameter grids.
SEED = 13579
# NOTE(review): this seeds only the stdlib `random` module; numpy's global RNG
# is not seeded in this cell.
random.seed(SEED)




# Dataset registry: maps dataset keys to train/test CSV paths and column lists.
with open(f'{train_dir}/{dir_json}', 'r') as f:
  datapaths = json.load(f)

# Model registry: maps model names to their saved path, features and CV scores.
with open(f'{train_dir}/{model_json}', 'r') as f:
  modelpaths = json.load(f)

# The notebook is pinned to this scikit-learn release (presumably so that
# previously pickled models load cleanly — confirm).
assert sklearn.__version__ == '0.24.2', "wrong version"

/content/drive/MyDrive/Aston/Term 3/ML-training


# Create Datasets

In [None]:
# Load the base train/test frames; the first CSV column becomes the index.
train_set = pd.read_csv(datapaths[which_base]['train'], index_col=0)
test_set = pd.read_csv(datapaths[which_base]['test'], index_col=0)

# Column groups stored alongside the base dataset in the dir map.
cat_columns = datapaths[which_base]['cat_columns']  # fed to the one-hot encoder
fin_columns = datapaths[which_base]['fin_columns']  # fed to the MinMaxScaler
all_columns = datapaths[which_base]['all_columns']  # base features; variant features are appended to these

# Notebook display: number of base feature columns.
len(all_columns)

60

In [None]:
def feature_merge(feature_keys):
  """Join the feature tables named by ``feature_keys`` onto the global sets.

  For every key, the entry in ``datapaths`` supplies the train/test CSV paths
  and the list of feature names. Each table is merged on the index into a
  copy of the global ``train_set`` / ``test_set``.

  Returns:
    dict with ``'train'`` and ``'test'`` (the merged copies) and
    ``'features'`` (all feature names, in key order).

  Raises:
    AssertionError: if a feature table's row count differs from the
      corresponding global set.
  """
  merged_train = train_set.copy()
  merged_test = test_set.copy()
  collected_names = []

  for key in feature_keys:
    entry = datapaths[key]
    collected_names.extend(entry['feature_names'])

    extra_train = pd.read_csv(entry['train'], index_col=0)
    extra_test = pd.read_csv(entry['test'], index_col=0)

    assert len(train_set) == len(extra_train), "Number of Train Examples are different"
    assert len(test_set) == len(extra_test), "Number of Test Examples are different"

    # Index-aligned join using pandas' default merge behaviour.
    merged_train = merged_train.merge(extra_train, left_index=True, right_index=True)
    merged_test = merged_test.merge(extra_test, left_index=True, right_index=True)

  return {
      'train': merged_train,
      'test': merged_test,
      'features': collected_names,
  }

def train_helper(variant, model, params, save_name):
  """Train ``model`` on one dataset variant, predict its test set, save and report.

  Args:
    variant: key into the global ``datasets`` dict.
    model: estimator handed to ``train_wrapper``.
    params: parameter grid forwarded to ``train_wrapper``.
    save_name: registry key / pickle name passed to ``save_model``.
  """
  # get variant and datasets
  variant_data = datasets[variant]
  full_features = all_columns + variant_data['features']

  trainX = variant_data['train'][full_features]
  trainy = variant_data['train']['y_return30_nom']
  testX = variant_data['test'][full_features]

  print("Traning Shape:", trainX.shape)

  # train model
  grid_result = train_wrapper(model, params, trainX, trainy,
                              cat_columns=cat_columns, num_columns=fin_columns)

  # predict: a grid search exposes the refit winner as best_estimator_, while
  # a plain fitted pipeline predicts directly. The previous fallback used an
  # undefined name (textX) and called the stacker's final_estimator_ on raw
  # features, which it is not fitted on.
  predictor = getattr(grid_result, 'best_estimator_', grid_result)
  pred = predictor.predict(testX)

  # save model
  best_scores = retrieve_best_test_scores(grid_result)
  save_model(grid_result, save_name, full_features, cat=cat_columns, num=fin_columns,
             best_cv=best_scores, pred=pred)
  check_with_valid(grid_result, trainX, trainy)

In [None]:
# display(list(datapaths.keys()))
# Variant name -> list of datapaths keys whose feature tables are merged onto
# the base dataset for that variant.
# NOTE(review): key casing is inconsistent ('bigram_selectedLda30' vs. the
# other '...LDA30' keys); these names are also used to build model registry
# keys, so they must match what is stored there — confirm.
lingustics_features = {
    'sentiment_percent': ['sentiment_percent_lg'], 
    'unigram_lsa100': ['unigram_lsa100_lg'],
    'unigram_lsa200': ['unigram_lsa200_lg'], 
    'unigram_lsa100_sent': ['unigram_lsa100_lg', 'sentiment_percent_lg'], 
    'unigram_lsa200_sent': ['unigram_lsa200_lg', 'sentiment_percent_lg'], 
    'bigram_lsa100': ['bigram_lsa100_lg'], 
    'bigram_lsa200': ['bigram_lsa200_lg'], 
    'bigram_lsa100_sent': ['bigram_lsa100_lg', 'sentiment_percent_lg'], 
    'bigram_lsa200_sent': ['bigram_lsa200_lg', 'sentiment_percent_lg'], 
    'unigram_selectedLDA30': ['unigram_selectedLda30_lg'], 
    'unigram_selectedLDA30_sent': ['unigram_selectedLda30_lg', 'sentiment_percent_lg'], 
    'bigram_selectedLda30': ['bigram_selectedLda30_lg'], 
    'bigram_selectedLDA30_sent': ['bigram_selectedLda30_lg', 'sentiment_percent_lg'], 
    'np_lsa200': ['np_lsa200_lg'], 
    'np_lsa200_sent': ['np_lsa200_lg', 'sentiment_percent_lg'], 
}
# Infix used when composing saved-model names: '{variant}_{kind}_{model}'.
kind = 'featureSelected'


# create different datasets 
# datasets[variant] -> {'train': DataFrame, 'test': DataFrame, 'features': [names]}
datasets = {}

for key, item in lingustics_features.items():
  ## data dataframe to collections 
  datasets[key] = feature_merge(item)

In [None]:
from sklearn.pipeline import make_pipeline

# select data features 
def pick_columns(X, features):
  """Return the part of ``X`` selected by indexing it with ``features``."""
  selected = X[features]
  return selected

# Transformer that keeps only the base columns plus the 'bigram_lsa100'
# variant's feature columns, wrapped in a one-step pipeline.
# NOTE(review): this module-level `pipeline` is not referenced by the later
# visible cells — it looks exploratory; confirm before relying on it.
features_getter = FunctionTransformer(lambda x: pick_columns(x, all_columns + datasets['bigram_lsa100']['features']))
pipeline = make_pipeline(features_getter)


# Load the Best Models for Each Feature Type

In [None]:
def get_model(name):
  """Load the saved model ``name`` from ``{train_dir}/models/{model_dir}``.

  NOTE: ``joblib.load`` unpickles arbitrary objects, so only load files from
  a trusted location.
  """
  pkl_path = f'{train_dir}/models/{model_dir}/{name}.pkl'
  return joblib.load(pkl_path)

def get_params(param):
  """Return a copy of ``param`` with ``'model__'`` removed from every key.

  This turns pipeline-qualified parameter names back into names the bare
  estimator accepts; values are left untouched.
  """
  return {name.replace('model__',  ''): value for name, value in param.items()}

def build_ensemble_models(variant, models=('svr', 'rf')):
  """Collect the tuned base estimators of ``variant`` for stacking.

  For each model suffix, the saved search object named
  ``'{variant}_{kind}_{model}'`` is loaded, and the ``'model'`` step of its
  best estimator is re-parameterised with the best parameters found.

  Args:
    variant: dataset variant name used in the saved-model names.
    models: iterable of model suffixes to load (immutable default).

  Returns:
    list of ``(model_suffix, estimator)`` tuples.
  """
  estimators = []
  for model in models:
    pkl_model = get_model(f'{variant}_{kind}_{model}')
    # best_params_ keys carry the pipeline prefix; strip it for the bare estimator.
    best_param = get_params(pkl_model.best_params_)
    best_model = pkl_model.best_estimator_['model'].set_params(**best_param)

    estimators.append((model, best_model))

  return estimators

In [None]:
# find the best cv models 
# For every linguistic variant, keep the model family (svr / rf / gb) with the
# lowest mean CV error.
mean_cvs = []
for variant in lingustics_features.keys():
  # Despite their names, local_max* track the smallest error seen so far;
  # 10000 is just a large starting sentinel.
  local_max = 10000
  local_max_ss = None
  
  for model in ['_featureSelected_svr', '_featureSelected_rf', '_featureSelected_gb']:
    # Stored scores are negated before comparison (presumably they come from
    # a neg_* scorer, so the negation yields a positive error — confirm).
    mean_cv = -np.mean(modelpaths[f'{variant}{model}']['best_cv'])
    if mean_cv < local_max:
      local_max = mean_cv 
      local_max_ss = (f'{variant}', f'{variant}{model}', mean_cv)

  # (variant, registry key, mean CV error) of this variant's winner
  mean_cvs.append(local_max_ss)

In [None]:
# Keep the three variants with the lowest mean CV error (third tuple element).
top_3_models = sorted(mean_cvs, key=lambda x: x[2])[0:3]
top_3_models

[('unigram_selectedLDA30_sent',
  'unigram_selectedLDA30_sent_featureSelected_rf',
  0.013691809846414138),
 ('unigram_selectedLDA30',
  'unigram_selectedLDA30_featureSelected_rf',
  0.013691992823818383),
 ('bigram_selectedLDA30_sent',
  'bigram_selectedLDA30_sent_featureSelected_rf',
  0.013706195990455135)]

In [None]:
def build_ensemble_models(models, prefix):
  """Build one ``(name, pipeline)`` stacking estimator per selected model.

  Args:
    models: iterable of ``(feature_name, model_name, score)`` tuples; the
      score is ignored.
    prefix: iterable of column-name prefixes, aligned with ``models``, that
      is prepended (with ``'_'``) to each model's topic feature names.

  Returns:
    list of ``(model_name, pipeline)`` tuples, where each pipeline selects
    its own columns, applies one-hot / min-max preprocessing and ends with
    the re-parameterised saved estimator.
  """
  estimators = []
  for (feature_name, model_name, _), prefix_ in zip(models, prefix):
    pkl_model = get_model(model_name)
    best_param = get_params(pkl_model.best_params_)
    best_model = pkl_model.best_estimator_['model'].set_params(**best_param)

    # append feature pipeline
    # NOTE(review): only feature names containing 'topic' are kept, so any
    # other variant features (e.g. sentiment columns) are dropped — confirm.
    features = [f'{prefix_}_{x}' for x in datasets[feature_name]['features'] if 'topic' in x]
    selected_columns = all_columns + features
    # Bind the column list as a default argument. A plain closure would look
    # up the loop variable only when the transformer runs, so every pipeline
    # would end up selecting the last iteration's columns.
    features_getter = FunctionTransformer(
        lambda x, cols=selected_columns: pick_columns(x, cols))
    features_scaler = ColumnTransformer([
      ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_columns),
      ('scaler', MinMaxScaler(), fin_columns)
    ], remainder='passthrough')
    pipeline = make_pipeline(features_getter, features_scaler, best_model)

    estimators.append((model_name, pipeline))

  return estimators

# Build stacking estimators for the selected top-3 models; the prefix list is
# hard-coded and must line up with the order of top_3_models.
ensemble_models = build_ensemble_models(top_3_models, ['unigram', 'unigram', 'bigram'])

In [None]:
# create dataset 
# display(list(datapaths.keys()))
# Unigram LDA features: prefix every column with 'unigram_' so the names
# cannot collide with the bigram LDA columns merged below.
unigram_lda_train = pd.read_csv(datapaths['unigram_selectedLda30_lg']['train'], index_col=0)
unigram_lda_test = pd.read_csv(datapaths['unigram_selectedLda30_lg']['test'], index_col=0)
unigram_lda_train.columns = 'unigram_' + unigram_lda_train.columns 
unigram_lda_test.columns = 'unigram_' + unigram_lda_test.columns 

# Bigram LDA features, prefixed with 'bigram_' in the same way.
bigram_lda_train = pd.read_csv(datapaths['bigram_selectedLda30_lg']['train'], index_col=0)
bigram_lda_test = pd.read_csv(datapaths['bigram_selectedLda30_lg']['test'], index_col=0)
bigram_lda_train.columns = 'bigram_' + bigram_lda_train.columns 
bigram_lda_test.columns = 'bigram_' + bigram_lda_test.columns 

# Sentiment features keep their original column names.
sentiment_train = pd.read_csv(datapaths['sentiment_percent_lg']['train'], index_col=0)
sentiment_test = pd.read_csv(datapaths['sentiment_percent_lg']['test'], index_col=0)

# Index-aligned merges of all three feature tables onto the base train set.
new_train = train_set.merge(unigram_lda_train, left_index=True, right_index=True)
new_train = new_train.merge(bigram_lda_train, left_index=True, right_index=True)
new_train = new_train.merge(sentiment_train, left_index=True, right_index=True)

# Same merges for the base test set.
new_test = test_set.merge(unigram_lda_test, left_index=True, right_index=True)
new_test = new_test.merge(bigram_lda_test, left_index=True, right_index=True)
new_test = new_test.merge(sentiment_test, left_index=True, right_index=True)

# Register the combined frames as the 'ensemble' variant, using the same
# 'train' / 'test' / 'features' layout as the other datasets entries.
datasets['ensemble'] = {}
datasets['ensemble']['train'] = new_train
datasets['ensemble']['test'] = new_test
datasets['ensemble']['features'] = list(unigram_lda_train.columns) + list(bigram_lda_train.columns) + list(sentiment_train.columns)

In [None]:
# train model 
# Stack the top-3 per-variant pipelines and fit them with 5-fold CV.
variant = 'ensemble'
estimators = build_ensemble_models(top_3_models, ['unigram', 'unigram', 'bigram'])
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
# The whole merged frame is passed as X; each base pipeline's first step
# selects the columns it uses.
reg.fit(new_train, new_train['y_return30_nom'])



StackingRegressor(cv=5,
                  estimators=[('unigram_selectedLDA30_sent_featureSelected_rf',
                               Pipeline(steps=[('functiontransformer',
                                                FunctionTransformer(func=<function build_ensemble_models.<locals>.<lambda> at 0x7feab641c830>)),
                                               ('columntransformer',
                                                ColumnTransformer(remainder='passthrough',
                                                                  transformers=[('onehot',
                                                                                 OneHotEncoder(handle_unknown='ignore'),
                                                                                 ['period...
                                                                                  'BP_pct1',
                                                                                  'DP_pct1',
                             

In [None]:
# Predict the merged test frame with the fitted stacking ensemble.
pred = reg.predict(new_test)

In [None]:
# Record the stacked ensemble in the in-memory model registry.
modelpaths['ensemble_featureSelected_stacks'] = {}
# No pickle is written for this model in this cell, so no path is stored.
modelpaths['ensemble_featureSelected_stacks']['path'] = None
modelpaths['ensemble_featureSelected_stacks']['features'] = datasets['ensemble']['features']
# Score reported by the fitted final (meta) estimator of the stack.
modelpaths['ensemble_featureSelected_stacks']['best_cv'] = reg.final_estimator_.best_score_
modelpaths['ensemble_featureSelected_stacks']['pred'] = list(pred)

In [None]:
# Persist the updated model registry, overwriting the JSON file on disk.
with open(f'{train_dir}/{model_json}', 'w') as f:
  json.dump(modelpaths, f)

In [None]:
# Stack the saved base models of the 'unigram_lsa200' variant, then train,
# save (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): the single-argument call matches the
# build_ensemble_models(variant, models=...) definition only; it fails if the
# two-argument (models, prefix) redefinition is the active one.
variant = 'unigram_lsa200'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

In [None]:
# Stack the saved base models of the 'unigram_lsa200_sent' variant, then
# train, save (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'unigram_lsa200_sent'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

In [None]:
# Stack the saved base models of the 'unigram_lsa300' variant.
# NOTE(review): 'unigram_lsa300' is not a key of lingustics_features as
# defined in this notebook, so datasets[variant] inside train_helper raises
# KeyError unless that variant is added first.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'unigram_lsa300'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

In [None]:
# Stack the saved base models of the 'unigram_lsa300_sent' variant.
# NOTE(review): 'unigram_lsa300_sent' is not a key of lingustics_features as
# defined in this notebook, so datasets[variant] inside train_helper raises
# KeyError unless that variant is added first.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'unigram_lsa300_sent'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

In [None]:
# Stack the saved base models of the 'bigram_lsa100_sent' variant, then
# train, save (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'bigram_lsa100_sent'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

Traning Shape: (8744, 167)

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.009544841522554828
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.518288544636545




In [None]:
# Stack the saved base models of the 'bigram_lsa200_sent' variant, then
# train, save (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'bigram_lsa200_sent'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

Traning Shape: (8744, 267)

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.009852790637244322
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.4870407288801339




In [None]:
# Stack the saved base models of the 'bigram_selectedLda30' variant (note the
# lower-case 'da', matching the lingustics_features key), then train, save
# (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'bigram_selectedLda30'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

Traning Shape: (8744, 74)

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.008617451952346651
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.5904980976644075




In [None]:
# Stack the saved base models of the 'bigram_selectedLDA30_sent' variant,
# then train, save (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'bigram_selectedLDA30_sent'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

Traning Shape: (8744, 81)

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.008364976862067634
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.5803738581437614




In [None]:
from time import sleep

# Retry until the base models for 'np_lsa200' can be loaded, then stack them.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
okay = False
while not okay:
  try: 
    variant = 'np_lsa200'
    estimators = build_ensemble_models(variant)
    okay = True

  except FileNotFoundError:
    # A required file is missing (presumably a base-model pickle that is still
    # being trained elsewhere): wait one hour and try again. There is no
    # retry limit, so this loops until the load succeeds.
    sleep(60*60)  
  
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

Traning Shape: (8744, 260)

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.010245755533315594
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.48618920779444225




In [None]:
# Stack the saved base models of the 'np_lsa200_sent' variant, then train,
# save (as '{variant}_{kind}_stack') and report via train_helper.
# NOTE(review): requires the build_ensemble_models(variant, models=...)
# definition to be the active one.
variant = 'np_lsa200_sent'
estimators = build_ensemble_models(variant)
reg = StackingRegressor(estimators=estimators, n_jobs=-1, cv=5, verbose=1)
train_helper(variant, reg, {}, f'{variant}_{kind}_stack')

Traning Shape: (8744, 267)

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.01002745547290499
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.483855622238426




# Non-Ensemble Training

In [None]:
from dask.diagnostics import ProgressBar

## Helper Functions 
def train_test_split_year(dataframe, year_col, year_for_test):
  """Partition ``dataframe`` by the calendar year found in ``year_col``.

  Rows whose year is listed in ``year_for_test`` become the test set and the
  rest become the training set. ``year_col`` is accessed through ``.dt``, so
  it has to be datetime-like.

  Returns:
    ``(train_set, test_set)``, each an independent copy.
  """
  is_test_row = dataframe[year_col].dt.year.isin(year_for_test)
  test_part = dataframe.loc[is_test_row].copy()
  train_part = dataframe.loc[~is_test_row].copy()
  return train_part, test_part

def exponential(base, start_exp, end_exp, step=1) -> list:
  """Return the powers of ``base`` for float exponents in ``[start_exp, end_exp)``.

  The exponents come from ``np.arange`` with the given ``step``; the end
  value is not included.
  """
  grid = np.arange(start_exp, end_exp, step=step, dtype=float)
  values = []
  for exp in grid:
    values.append(base**exp)
  return values

def direction(y, return_value=0.01):
  """Classify ``y`` relative to the band ``[-return_value, return_value]``.

  Returns:
    1 if ``y`` lies above the band, 2 if it lies below it, and 0 if it is
    inside the band (bounds included).
  """
  above = y > return_value
  if above:
    return 1
  below = y < -return_value
  return 2 if below else 0

def save_model(model, name, features, cat=None, num=None, best_cv=None, dir='featureSelection'):
  """Dump ``model`` to disk and add its metadata to the model registry JSON.

  The pickle is written to ``{train_dir}/models/{dir}/{name}.pkl`` and the
  registry entry ``name`` stores that path together with ``features``,
  ``cat``, ``num`` and ``best_cv``.

  Raises:
    AssertionError: if the registry has no ``'baseline'`` entry.
  """
  registry_file = f'{train_dir}/{model_json}'
  with open(registry_file, 'r') as f:
    modelpaths = json.load(f)

  # The registry is expected to already contain the baseline entry.
  assert 'baseline' in modelpaths

  save_path = f'{train_dir}/models/{dir}/{name}.pkl'
  joblib.dump(model, save_path)

  entry = {
      'path': save_path,
      'features': features,
      'cat': cat,
      'num': num,
      'best_cv': best_cv,
  }
  modelpaths[name] = entry

  with open(registry_file, 'w') as f:
    json.dump(modelpaths, f)

def train_wrapper(estimator, param, trainX, trainy, cat_columns, num_columns, 
                  scoring='neg_mean_squared_error'):
  """Grid-search ``estimator`` behind the shared preprocessing pipeline.

  ``cat_columns`` are one-hot encoded (unknown categories ignored),
  ``num_columns`` are min-max scaled, other columns pass through. ``param``
  is searched with 5-fold CV under ``scoring``, and the best score is
  printed.

  Returns:
    The fitted ``GridSearchCV`` object.
  """
  encoder = ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_columns)
  scaler = ('scaler', MinMaxScaler(), num_columns)
  preprocess = ColumnTransformer([encoder, scaler], remainder='passthrough')

  pipe = Pipeline([('features', preprocess), ('model', estimator)])

  search = GridSearchCV(estimator=pipe, param_grid=param, scoring=scoring,
                        return_train_score=True, cv=5, n_jobs=-1)

  with ProgressBar():
    search.fit(trainX, trainy)
  print("[BEST] : ", search.best_score_)

  return search

def retrieve_best_test_scores(grid):
  """Return the per-split CV test scores of the best-ranked candidate.

  Args:
    grid: fitted grid search exposing ``cv_results_``.

  Returns:
    A list with one test score per recorded CV split for the candidate whose
    ``rank_test_score`` is smallest.
  """
  results = grid.cv_results_
  ranks = results['rank_test_score']
  # rank_test_score[i] is the rank of candidate i (1 = best), so the best
  # candidate is the position holding the smallest rank — not
  # "rank of candidate 0, minus one".
  best_model_id = min(range(len(ranks)), key=lambda i: ranks[i])

  # Collect however many splits were recorded instead of assuming five.
  scores = []
  split = 0
  while f'split{split}_test_score' in results:
    scores.append(results[f'split{split}_test_score'][best_model_id])
    split += 1
  return scores
  
def check_with_valid(grid, X, y):
  """Print MSE and direction macro-F1 of the best model against the SMA60 baseline.

  Args:
    grid: fitted grid search; its ``best_estimator_`` makes the predictions.
    X: feature frame; its ``'sma60'`` column is used as the baseline
      prediction.
    y: true target values.
  """
  # Predict once and reuse the result for both metrics.
  baseline_pred = X['sma60']
  model_pred = grid.best_estimator_.predict(X)

  print("\n===== Training Result =====")
  print("SMA60 MSE  : ", mean_squared_error(y, baseline_pred))
  print("MODEL MSE  : ", mean_squared_error(y, model_pred))

  # Direction labels use a +/- rate band around zero (see direction()).
  rate = 0.01
  pred_baseline_y_dir = [direction(value, rate) for value in baseline_pred]
  pred_y_dir = [direction(value, rate) for value in model_pred]
  true_y_dir = [direction(value, rate) for value in y]

  print(f"SMA60 Direction ({rate})  : ", f1_score(true_y_dir, pred_baseline_y_dir, average='macro'))
  print(f"MODEL Direction ({rate}) : ", f1_score(true_y_dir, pred_y_dir, average='macro'))
  print('\n')

def plot_learning_curve(model, trainX, trainy, scoring='neg_mean_squared_error'):
  """Draw the learning curve of ``model``: mean train and CV score by sample size.

  Both score arrays from ``learning_curve`` are averaged across folds and
  negated before plotting. The y-axis label is ``scoring`` with underscores
  turned into spaces and 'neg' removed.
  """
  curve = learning_curve(model, trainX, trainy, scoring=scoring)
  n_samples, fold_train, fold_cv = curve

  sns.lineplot(x=n_samples, y=-fold_train.mean(1), label='train')
  ax = sns.lineplot(x=n_samples, y=-fold_cv.mean(1), label='cv')

  y_label = scoring.replace('_', ' ').replace('neg', '')
  ax.set(ylabel=y_label, xlabel='training size')
  plt.show()

def train_helper(variant, model, params, save_name):
  """Grid-search ``model`` on one dataset variant, then save and report it.

  Args:
    variant: key into the global ``datasets`` dict.
    model: estimator handed to ``train_wrapper``.
    params: parameter grid forwarded to ``train_wrapper``.
    save_name: registry key / pickle name passed to ``save_model``.
  """
  # Base columns followed by the variant's own feature columns.
  variant_data = datasets[variant]
  full_features = all_columns + variant_data['features']

  train_frame = variant_data['train']
  trainX = train_frame[full_features]
  trainy = train_frame['y_return30_nom']

  print("Traning Shape:", trainX.shape)

  # Fit the search, store it with its per-split scores, then print metrics.
  grid_result = train_wrapper(model, params, trainX, trainy,
                              cat_columns=cat_columns, num_columns=fin_columns)
  best_scores = retrieve_best_test_scores(grid_result)
  save_model(grid_result, save_name, full_features,
             cat=cat_columns, num=fin_columns, best_cv=best_scores)
  check_with_valid(grid_result, trainX, trainy)

In [None]:
# Random-forest grid search on the 'np_lsa200' variant.
variant = 'np_lsa200'
variant_features = datasets[variant]['features']
full_features = all_columns + variant_features
# n is the number of feature columns (not the number of rows).
n = len(full_features)
params = {
    'model__n_estimators': [500, 1000], 
    'model__min_samples_leaf': [5, 10],
    # single candidate: one third of the feature count
    'model__min_samples_split': [int(n / 3)],
    'model__random_state': [SEED]
}

# Saved under '{variant}_{kind}_rf'.
train_helper(variant, RandomForestRegressor(), params, f'{variant}_{kind}_rf')

Traning Shape: (8744, 260)
[########################################] | 100% Completed |  5hr 55min 47.0s
[                                        ] | 0% Completed | 38min 17.6s

In [None]:
# Gradient-boosting grid search on the 'np_lsa200' variant.
variant = 'np_lsa200'
variant_features = datasets[variant]['features']
full_features = all_columns + variant_features
# n is the number of feature columns (not the number of rows).
n = len(full_features)
params = {
  # fixed settings (single-value lists)
  'model__subsample': [0.8], 
  'model__learning_rate': [0.05], 
  'model__max_depth': [5], 
  # searched settings
  'model__n_estimators': [500, 1000], 
  'model__min_samples_leaf': [1, 5],
  'model__min_samples_split': [2, int(n/3)],
  'model__random_state': [SEED]
}

# Saved under '{variant}_{kind}_gb'.
train_helper(variant, GradientBoostingRegressor(), params, f'{variant}_{kind}_gb')

Traning Shape: (8744, 260)
[########################################] | 100% Completed |  5hr 26min 49.5s
[########################################] | 100% Completed |  9min 42.8s
[BEST] :  -0.014605824689989475

===== Training Result =====
SMA60 MSE  :  0.014276989108699727
MODEL MSE  :  0.004048681497023178
SMA60 Direction (0.01)  :  0.05459854966213367
MODEL Direction (0.01) :  0.6106172762533602




In [None]:
# Gradient-boosting grid search on the 'np_lsa200_sent' variant.
variant = 'np_lsa200_sent'
variant_features = datasets[variant]['features']
full_features = all_columns + variant_features
# n is the number of feature columns (not the number of rows).
n = len(full_features)
params = {
  # fixed settings (single-value lists)
  'model__subsample': [0.8], 
  'model__learning_rate': [0.05], 
  'model__max_depth': [5], 
  # searched settings
  'model__n_estimators': [500, 1000], 
  'model__min_samples_leaf': [1, 5],
  'model__min_samples_split': [2, int(n/3)],
  'model__random_state': [SEED]
}

# Saved under '{variant}_{kind}_gb'.
train_helper(variant, GradientBoostingRegressor(), params, f'{variant}_{kind}_gb')

In [None]:
# Random-forest grid search on the 'np_lsa200_sent' variant.
variant = 'np_lsa200_sent'
variant_features = datasets[variant]['features']
full_features = all_columns + variant_features
# n is the number of feature columns (not the number of rows).
n = len(full_features)
params = {
    'model__n_estimators': [500, 1000], 
    'model__min_samples_leaf': [5, 10],
    # single candidate: one third of the feature count
    'model__min_samples_split': [int(n / 3)],
    'model__random_state': [SEED]
}

# Saved under '{variant}_{kind}_rf'.
train_helper(variant, RandomForestRegressor(), params, f'{variant}_{kind}_rf')

Traning Shape: (8744, 267)
[                                        ] | 0% Completed |  1hr 35min 52.3s


KeyboardInterrupt: ignored