In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle

from feature_engine.imputation import MeanMedianImputer, ArbitraryNumberImputer
from feature_engine.outliers import Winsorizer
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.discretisation.arbitrary import ArbitraryDiscretiser
from feature_engine.selection import DropFeatures
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.selection import RecursiveFeatureAddition
from feature_engine.encoding import MeanEncoder

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler , MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.pipeline import Pipeline 

s_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(s_path)

pd.set_option('display.max_columns',100)
pd.set_option('precision', 3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import tm_teoriaMvto_base_prep as base_prep
import tm_teoriaMvto_label as tm_label
import tm_teoriaMvto_train as tm_train
import tm_teoriaMvto_ft_eng as ft_eng
# import tm_teoriaMvto_ft_sel as ft_sel

# models to test
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier


In C:\Users\Desktop\Anaconda3\envs\ngym36\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Desktop\Anaconda3\envs\ngym36\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Desktop\Anaconda3\envs\ngym36\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\Desktop\Anaconda3\envs\ngym36\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Desktop\Anaconda3\envs\n

In [2]:
path_files = os.path.join(s_path, 'historical_files')

# base prep control
EXPORT_X = False
EXPORT_Y = False

# params labeling
s_prefix = 'ft_'
s_lbl_type = 'c_binary'
f_th = 0.2
b_dist = True
b_percent = True
b_custom = True

# params loading
s_regime = 'mi_up'



In [3]:
if EXPORT_X:
  df_data = base_prep.import_sampling(path_files= os.path.join(path_files, 'FeaturesFiles'), prefix_files= s_prefix)

  # create new features before extracting samples - using full data needed by lag features
  df_data = ft_eng.BasicFeatures().transform(X=df_data)

  df_X = base_prep.ft_export(df_data, path_files= os.path.join(path_files, 'TrainFiles'), prefix= s_prefix)
  y_sc = base_prep.y_export(df_data, path_files= os.path.join(path_files, 'TrainFiles'), prefix= s_prefix)



In [4]:
y_config = 'prefix_'+ s_prefix +'_type_'+ s_lbl_type +'_th_'+ str(f_th) +'_dist_'+ str(b_dist) +'_percent_'+str(b_percent)+'_custom_'+str(b_custom)

if EXPORT_Y:
  df_label = None 
  if EXPORT_X: df_label = y_sc

  # specify df_data = None (default) to load pickle from s_path + 'y_prep_data.pkl'
  label = tm_label.Labeling(df_data = df_label, label_type= s_lbl_type, b_dist_to_high= b_dist, 
                            s_path = os.path.join(path_files, 'TrainFiles'), prefix_files = s_prefix, th_label_y1=f_th, 
                            b_percent_freq=b_percent, b_custom_dir=b_custom)

  df_y = label.apply_label(s_model_return = s_regime)  # it will export 4 pickle files to label_obj.s_path
  # y_config = label.s_name

y_outfile = os.path.join(os.path.join(path_files, 'TrainFiles'), 'y_' + y_config +'.pkl')

if os.path.exists(y_outfile):
  print('y outfile ready for configuration: {}'.format(y_config))
else:
  print('WARNING: y outfile not found for configuration. Define EXPORT_Y = True and try again {}'.format(y_config))



y outfile ready for configuration: prefix_ft__type_c_binary_th_0.2_dist_True_percent_True_custom_True


In [5]:
l_prop_08 = ['escora_bid_2.5_0.8', 'escora_ask_2.5_0.8', 'escora_bid_3.5_0.8', 
            'escora_ask_3.5_0.8', 'escora_bid_4.5_0.8', 'escora_ask_4.5_0.8', ]

l_prop_12 = ['escora_bid_2.5_1.2', 'escora_ask_2.5_1.2', 'escora_bid_3.5_1.2',
            'escora_ask_3.5_1.2', 'escora_ask_4.5_1.2', 'escora_bid_4.5_1.2', ]

l_prop_2 = [ 'escora_bid_2.5_2', 'escora_ask_2.5_2', 'escora_bid_3.5_2',
            'escora_ask_3.5_2', 'escora_bid_4.5_2', 'escora_ask_4.5_2', ]

l_col_log = ['agg_net_d', 'aggbig_net_d', 'vol_trd', 'vol_big', 'big_v', 'vol_trd_aux', 
   'vol_big_aux', 'big_v_aux', 'loc_agg_net_d', 'big_c', 'big_c_aux',
   'loc_aggbig_net_d', 'agg_net_m', 'agg_net_m_aux', 'abagg', 'abagg_aux',
   'aggbig_net_m', 'aggbig_net_m_aux', 'loc_agg_net_m', 'loc_aggbig_net_m',
   'loc_agg_net_m_aux', 'loc_aggbig_net_m_aux', 'loc_aggbig_c_m', 'loc_aggbig_v_m', 
   'loc_aggbig_c_m_aux', 'loc_aggbig_v_m_aux', 'abs_v', 'abs_c', 'aggpior_v', 'aggpior_v_aux', 
   'aggpior_c', 'aggpior_c_aux', 'agg_net_10', 'agg_net_40', 'agg_net_80', 'loc_agg_net_10',
   'aggbig_net_10', 'aggpior_DIF', 'aggpior_DIF_30', 'abs_DIF', 'abs_DIF_30',
   'abagg_10', 'aggpior_aux_DIF', ]

l_side_drop = ['big_c','big_v','aggpior_c','aggpior_v','loc_aggbig_c_m','loc_aggbig_v_m','pagg_c_best',
  'pagg_c_best_0.5','pagg_c_best_0.7','pagg_c_best_0.9','pagg_v_best','pagg_v_best_0.5','pagg_v_best_0.7',
  'pagg_v_best_0.9','abs_c','abs_v','int_c','int_c_0.6','int_c_0.7','int_c_0.8','int_c_0.9','int_dif_c',
  'int_v','int_v_0.6','int_v_0.7','int_v_0.8','int_v_0.9','int_dif_v','imp_c','imp_c_0.6','imp_c_0.7',
  'imp_c_0.8','imp_c_0.9','imp_v','imp_v_0.6','imp_v_0.7','imp_v_0.8','imp_v_0.9','escora_bid_2.5_1.2',
  'escora_bid_2.5_2','escora_ask_2.5_1.2','escora_ask_2.5_2','escora_bid_3.5_1.2','escora_bid_3.5_2',
  'escora_ask_3.5_1.2','escora_ask_3.5_2','escora_bid_4.5_0.8','escora_bid_4.5_2','escora_ask_4.5_0.8',
  'escora_ask_4.5_2','movesc_bid_2.5','movesc_ask_2.5','movesc_bid_2.5_0.5','movesc_ask_2.5_0.5',
  'movesc_bid_2.5_0.7','movesc_ask_2.5_0.7','movesc_bid_3.5','movesc_ask_3.5','movesc_bid_3.5_0.7',
  'movesc_ask_3.5_0.7','movesc_bid_3.5_0.9','movesc_ask_3.5_0.9','movesc_bid_4.5','movesc_ask_4.5',
  'movesc_bid_4.5_0.5','movesc_ask_4.5_0.5','movesc_bid_4.5_0.7','movesc_ask_4.5_0.7','depth_bid7','depth_ask7'
  ]

l_side_drop_aux = ['big_c_aux','big_v_aux','aggpior_c_aux','aggpior_v_aux','loc_aggbig_c_m_aux',
  'loc_aggbig_v_m_aux','pagg_c_best_aux','pagg_c_best_0.5_aux','pagg_c_best_0.7_aux',
  'pagg_c_best_0.9_aux','pagg_v_best_aux','pagg_v_best_0.5_aux', 'pagg_v_best_0.7_aux',
  'pagg_v_best_0.9_aux'
  ]

l_ft_aux = [
  'vol_trd_aux', 'n_trd_aux','vol_big_aux','n_big_aux','vol_big_ratio_aux','big_c_aux','big_v_aux',
  'aggpior_c_aux','aggpior_v_aux','aggimb_aux','aggimb_big_aux','n_aggimb_aux','agg_net_m_aux',
  'aggbig_net_m_aux','loc_aggbig_c_m_aux','loc_aggbig_v_m_aux','loc_agg_net_m_aux','loc_aggbig_net_m_aux',
  'loc_agg_imb_m_aux','loc_aggbig_imb_m_aux','pagg_c_best_aux','pagg_c_best_0.5_aux','pagg_c_best_0.7_aux',
  'pagg_c_best_0.9_aux','pagg_v_best_aux','pagg_v_best_0.5_aux','pagg_v_best_0.7_aux','pagg_v_best_0.9_aux',
  'abagg_aux','n_p_aux','aggpior_aux_DIF','pagg_aux_DIF'
  ]



In [6]:
test_size = 0.2

if not EXPORT_X:
  df_X = tm_train.load_models('X_samples_'+s_prefix, os.path.join(path_files, 'TrainFiles'))  # search for path_files/s_regime.pkl

X = df_X.loc[df_X['model'] == s_regime].drop(columns='model')

if not EXPORT_Y:
  df_y = tm_train.load_models('y_' + y_config, os.path.join(path_files, 'TrainFiles'))
  y = df_y.loc[df_y['model'] == s_regime].drop(columns='model')
else:
  y = df_y.drop(columns='model')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, shuffle = False)

print('load_split: X, y data load and split complete!')



load_split: X, y data load and split complete!


In [7]:
# DEBUG: prop_features contaning np.NaN = (pd.isna(X_train).sum()>0).sort_values(ascending=False).head(50)
X_train = ft_eng.PropImputer(0.8, l_prop_08).transform(X_train)
X_train = ft_eng.PropImputer(1.2, l_prop_12).transform(X_train)
X_train = ft_eng.PropImputer(2, l_prop_2).transform(X_train)

median_inputer = MeanMedianImputer(variables=['PA_down',])
X_train = median_inputer.fit_transform(X_train)

nan_imputer = ArbitraryNumberImputer(0.0, variables=['ohlc_10','ohlc_50'])
X_train = nan_imputer.fit_transform(X_train)

X_train = ft_eng.DifAll().transform(X_train)
X_train = ft_eng.LogVolume(l_col_log).transform(X_train)

if s_regime[:2] == 'mw':
  # for now, removing all side columns
  l_cols_drop = l_side_drop + l_ft_aux + ['smart_price', 'sspread']
else:
  l_cols_drop = l_side_drop + l_side_drop_aux + ['s_run', 'n_p_aux', 'smart_price', 'sspread']

# TODO: INCREMENTAR L COLS DROP TAMBEM QUANDO LABEL NAO FOR DIST-TO-HIGH, RETIRANDO AS COLUNAS DE VARIACAO DE PRECO!

X_train.drop(columns=l_cols_drop, inplace=True)


In [8]:
X_test = ft_eng.PropImputer(0.8, l_prop_08).transform(X_test)
X_test = ft_eng.PropImputer(1.2, l_prop_12).transform(X_test)
X_test = ft_eng.PropImputer(2, l_prop_2).transform(X_test)

X_test = median_inputer.transform(X_test)

X_test = nan_imputer.transform(X_test)

X_test = ft_eng.DifAll().transform(X_test)
X_test = ft_eng.LogVolume(l_col_log).transform(X_test)

X_test.drop(columns=l_cols_drop, inplace=True)



In [9]:

# TODO: move function to ft_selection.py
def list_sub(lst1, lst2):
  final_list = list(set(lst1) - set(lst2))
  return final_list


def list_union(*l_lists):
  l_union = l_lists[0]
  if len(l_lists) > 1:
    for i in range(len(l_lists)-1):
      l_union = set(l_union) | set(l_lists[i+1])
  return list(l_union)

l_duplicate = ['loc_agg_net_m',]  # _2 ft comes from ft_eng.duplicate()

l_cap_1 = ['vewma_c_v', 'vewmag_dif', 'book_imb', 'book_imb_dif', ]

l_cap_5 = ['PA_up', 'PA_down', ]

l_cap_10 = ['n_big_aux', ]

l_cap_default = list_sub(X_train.columns.to_list(), list_union(l_cap_1, l_cap_5, l_cap_10))

l_bins_q2 = ['loc_agg_net_m_2',]  # _2 ft comes from ft_eng.duplicate()

l_bins_q4 = ['ohlc_10','ohlc_50',]

l_bins_q5 = ['loc_aggbig_net_m','vol_trd_aux','aggbig_net_m_aux','book_imb','rng_ewma_dif',
            'rng_ewma_dif_40','rng_ewma_dif_80','vewma_10','vewma_g_p_10','aggbig_net_10',
]

l_bins_q6 = ['agg_net_d', 'aggbig_net_d', 'loc_agg_imb_m_aux',]

l_bins_q8 = ['rng_ewma','vewma_g_p','vewmag_dif','n_trd_aux','abagg_aux','escora_bid_2.5_0.8','escora_ask_2.5_0.8',
            'escora_bid_3.5_0.8','escora_ask_3.5_0.8','escora_bid_4.5_1.2','escora_ask_4.5_1.2','movesc_bid_2.5_0.9',
            'movesc_ask_2.5_0.9','movesc_bid_3.5_0.5','movesc_ask_3.5_0.5','movesc_bid_4.5_0.9','movesc_ask_4.5_0.9',
            'msg_imb','rng_smart_10','imp_DIF_10','imp_DIF_50','agg_net_80','imp_FCAST_40','aggpior_DIF',
            'book_imb_mean_dif_cp','msg_imb_mean_10','escora_4.5_0.8_DIF',
]

l_bins_q10 = ['vol_big_ratio','aggimb','n_aggimb','agg_net_m','loc_aggbig_imb_m','abagg','n_p','vewma',
            'vewma_c_v', 'aggimb_aux','aggimb_big_aux','agg_net_m_aux','loc_aggbig_net_m_aux','smart_price_dif',
            'smart_price_50','rng_smart_50','agg_net_10','agg_net_40','loc_agg_net_10','int_DIF_10','abagg_10',
            'book_imb_mean_10','book_imb_mean_dif_lp','msg_imb_mean_40','msg_imb_mean_dif_lp','msg_imb_mean_dif_cp',
            'sspread_mean','movesc_2.5_0.7_DIF','msg_imb_mean_40_ABS','loc_agg_net_m',
] 

d_bins_arbitrary = {
            'n_trd':  [-0.01, 70, 180, 280, 380, 5000000],
            'vol_trd':  [0, 7.237, 7.55, 10000],
            'aggimb_big':		[-1.1, -0.4, 0.4, 1.1],
            'aggbig_net_m':		[-10000, -5, +10000],
            'chgfreq':		[-0.01, 0.167, 0.280, 1.01],
            'last_d_s':		[-1.01, -0.5, 0.5, 1.01],
            'loc_agg_net_d':		[-10000, -6.4, 5.2, 5.7, 10000],
            'loc_aggbig_net_d':		[-10000, -5.4, -3.8, 10000],
            'n_big_aux':	[-0.01, 3.1, 1000],
            'vol_big_ratio_aux':		[-0.01, 0.275, 0.520, 1.01],
            'loc_agg_net_m_aux':		[-10000, -4.451, -3.592, -2.435, 3.607, 10000] ,
            'loc_aggbig_imb_m_aux':		[-1.1, -0.99, -0.391, -0.0118, 0.4, 0.99, 1.1],
            'aggpior_DIF_30':		[-10000, -4.454, -4.111, -3.829, -3.26, -3.05, -3.04, -0.01, 0.01, 3.714, 10000],
            'abs_DIF':		[-10000, -4.796, -4.19, -0.1, 0.1, 2.398, 4.564, 10000],
            'pagg_DIF':	[-1.1, -0.95, -0.6, -0.4, -0.001, 0.001, 0.4, 0.6, 0.95, 1.1],
            'book_imb_dif':	[-1000, -27, -14, -6.8, -2.5, 0.1, 4.29, 11.2, 22, 1000],
            'book_imb_mean_40': [-10000, -30, -20, -16, 10000],
            'aggpior_aux_DIF':		[-10000, -4.331, -3.584, -3.045, -0.01, 0.01, 3.045, 10000] ,
            'pagg_aux_DIF':		[-1.1, -0.95, -0.5, -0.001, 0.001, 0.5, 0.95, 1.1],
            'escora_3.5_1.2_DIF':		[-1.1, -0.171, -0.0482, 0, 0.0482, 0.0927, 1.1],
            'movesc_3.5_0.7_DIF':		[-10000, -4.451, -1.391, -0.146, 0.0, 0.312, 10000],
            'book_imb_mean_dif_cp_ABS':		[-0.01, 0.119, 10000],
            'msg_imb_mean_dif_lp_ABS':		[-0.01, 0.281, 1.1],
            'loc_agg_imb_m':  [-1.01, -0.491, -0.366, -0.288, -0.223, -0.167, 1.01],
            'imp_FCAST_10': [-10000, -3.012, -1.674, -0.653, 10000],
}

l_bins_nulls = ['vol_big','n_big','PA_up','PA_down','vol_big_aux','n_aggimb_aux','imp_DIF','int_DIF',
                'int_DIF_50','abs_DIF_30','msg_imb_dif','depth_DIF',
                'depth_DIF_10','book_imb_mean_us_5','book_imb_mean_us_20','sspread_mean_us_5',
                'escora_2.5_2_DIF','movesc_4.5_0.7_DIF', 
                # TODO: 's_run', 'n_p_aux' treat for MW
]

# ft classification based on mean encoding 
l_ft_alta = ['pagg_DIF','loc_agg_net_m_aux','aggpior_aux_DIF','loc_agg_imb_m','vol_big_ratio','aggimb',
             'n_aggimb','agg_net_m','loc_aggbig_imb_m','n_p','vewma','agg_net_m_aux','smart_price_dif',
             'smart_price_50','agg_net_10','msg_imb_mean_40_ABS','ohlc_10','ohlc_50','loc_aggbig_net_m',
             'vol_trd_aux','aggbig_net_m_aux','rng_ewma_dif_80','vewma_10','rng_ewma','vewma_g_p','vewmag_dif',
             'n_trd_aux','escora_bid_2.5_0.8','escora_ask_2.5_0.8','escora_bid_3.5_0.8','escora_ask_3.5_0.8'
             ,'escora_bid_4.5_1.2','escora_ask_4.5_1.2','movesc_bid_3.5_0.5','movesc_ask_3.5_0.5','msg_imb','rng_smart_10',
]

l_ft_media = ['last_d_s','aggimb_big','loc_aggbig_imb_m_aux','pagg_aux_DIF','movesc_3.5_0.7_DIF','abs_DIF',
              'book_imb_mean_dif_cp_ABS','book_imb_dif','abagg','vewma_c_v','aggimb_aux','loc_aggbig_net_m_aux',
              'rng_smart_50','agg_net_40','loc_agg_net_10','int_DIF_10','abagg_10','msg_imb_mean_dif_cp',
              'sspread_mean','movesc_2.5_0.7_DIF','loc_agg_net_m','book_imb','rng_ewma_dif','rng_ewma_dif_40',
              'vewma_g_p_10','aggbig_net_10','vol_trd','loc_agg_imb_m_aux','movesc_bid_2.5_0.9','movesc_ask_2.5_0.9',
              'movesc_bid_4.5_0.9','movesc_ask_4.5_0.9','imp_DIF_10','imp_FCAST_40','aggpior_DIF','book_imb_mean_dif_cp',
              'msg_imb_mean_10',
]

# best features from RecursiveFeatureAdd/RandomForest, threshold= 0.002
l_best_ft = ['ohlc_50', 'vewma', 'vewmag_dif', 'rng_smart_10', 'ohlc_10',
            'loc_agg_imb_m', 'aggpior_DIF', 'agg_net_10', 'smart_price_dif',
            'escora_bid_3.5_0.8', 'msg_imb_mean_40_ABS']

l_ft_imp_me_scale = ['movesc_ask_2.5_0.9', 'smart_price_dif', 'ohlc_50', 'ohlc_10',
                    'movesc_bid_2.5_0.9', 'loc_agg_imb_m', 'loc_agg_net_m_aux', 'rng_smart_10',
]




In [10]:
pipe_cap_outliers = Pipeline([
                ('cap1', Winsorizer(variables=l_cap_1, capping_method='quantiles', fold = 0.01, tail = 'both')),
                ('cap5', Winsorizer(variables=l_cap_5, capping_method='quantiles', fold = 0.05, tail = 'both')),
                ('cap10', Winsorizer(variables=l_cap_10, capping_method='quantiles', fold = 0.10, tail = 'both')),
                ('cap001', Winsorizer(variables=l_cap_default, capping_method='quantiles', fold = 0.001, tail = 'both')),
])

pipe_norm_scale = Pipeline([
                ('scaler', SklearnTransformerWrapper(transformer=RobustScaler(quantile_range=(0.10, 0.90)))),
                ('minmax', SklearnTransformerWrapper(transformer=MinMaxScaler())),
                # ('pca', PCA(n_components=30, svd_solver='auto')), 
                # ('isomap', Isomap(n_components=13, n_neighbors=50, n_jobs=-1)),   #  expensive
])

pipe_discrete = Pipeline([('drop', DropFeatures(l_bins_nulls)),
                        ('duplicate', ft_eng.Duplicate(l_duplicate)),
                        ('outliers_all', Winsorizer(capping_method='quantiles', fold = 0.001, tail = 'both')),
                        ('bins_manual', ArbitraryDiscretiser(d_bins_arbitrary)),
                        ('bins_q10', EqualFrequencyDiscretiser(return_object=False, q=10, variables=l_bins_q10)),
                        ('bins_q8', EqualFrequencyDiscretiser(return_object=False, q=8, variables=l_bins_q8)),
                        ('bins_q6', EqualFrequencyDiscretiser(return_object=False, q=6, variables=l_bins_q6)),
                        ('bins_q5', EqualFrequencyDiscretiser(return_object=False, q=5, variables=l_bins_q5)),
                        ('bins_q4', EqualFrequencyDiscretiser(return_object=False, q=4, variables=l_bins_q4)),
                        ('bins_q2', EqualFrequencyDiscretiser(return_object=False, q=2, variables=l_bins_q2)),
])

pipe_mean_encoding = Pipeline([('drop', DropFeatures(l_bins_nulls)),
                        ('duplicate', ft_eng.Duplicate(l_duplicate)),
                        ('outliers_all', Winsorizer(capping_method='quantiles', fold = 0.001, tail = 'both')),
                        ('bins_manual', ArbitraryDiscretiser(d_bins_arbitrary)),
                        ('bins_q10', EqualFrequencyDiscretiser(return_object=True, q=10, variables=l_bins_q10)),
                        ('bins_q8', EqualFrequencyDiscretiser(return_object=True, q=8, variables=l_bins_q8)),
                        ('bins_q6', EqualFrequencyDiscretiser(return_object=True, q=6, variables=l_bins_q6)),
                        ('bins_q5', EqualFrequencyDiscretiser(return_object=True, q=5, variables=l_bins_q5)),
                        ('bins_q4', EqualFrequencyDiscretiser(return_object=True, q=4, variables=l_bins_q4)),
                        ('bins_q2', EqualFrequencyDiscretiser(return_object=True, q=2, variables=l_bins_q2)),
                        ('mean_enc', MeanEncoder())
])

# X_train_transf = pipe_mean_encoding.fit_transform(X_train, y_train)
# X_test_pre = pipe_cap_outliers.transform(X_test)

# DEBUG: must return all zeroes 
# pd.isna(X_train_transf).sum().sort_values(ascending=False)


In [11]:
# TODO: FOREST MINIMIZE 
pipe_svm_opt = Pipeline([('cap_outliers', pipe_cap_outliers),
                        ('norm_scale', pipe_norm_scale),
                        ('ft_sel_importance', ft_eng.SelectFt(l_ft_imp_me_scale))
])
pipe_svm_opt.fit(X_train, y_train) 
X_opt = pipe_svm_opt.transform(X_train)
X_test_opt = pipe_svm_opt.transform(X_test)

pipe_svm_opt_me = Pipeline([('mean_encoding', pipe_mean_encoding),
                            ('ft_sel_importance', ft_eng.SelectFt(l_ft_imp_me_scale))
])
pipe_svm_opt_me.fit(X_train, y_train) 
X_opt_me = pipe_svm_opt_me.transform(X_train)
X_test_opt_me = pipe_svm_opt_me.transform(X_test)

# union features scaled and mean encoding for "important" features
X_scaled_me = X_opt.join(X_opt_me, how='outer', rsuffix='_me', sort=False)
X_test_scaled_me = X_test_opt.join(X_test_opt_me, how='outer', rsuffix='_me', sort=False)

svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced',C=1.26, gamma=0.01)

svm_opt.fit(X_scaled_me, y_train)
l_results = tm_train.report_results(X_scaled_me, X_test_scaled_me, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

# train_obj = tm_train.ModelTraining()
# train_obj.optimized_training(X_scaled_me, y_train, 'svm', svm_opt, pipe = False)
# tm_train.save_models(train_obj,'train_obj_svm_opt_scaled_me', s_path)


  return f(*args, **kwargs)
train_auc: 0.6001, test_auc: 0.5600 (overfit: 0.0401)
train_f1:  0.5536, test_f1:  0.5004 (overfit: 0.0532)
train_f*:  0.5116, test_f*:  0.4763 (overfit: 0.0354)
--------
cm_train: 
[[3649 3382]
 [2013 3345]]
cm_test: 
[[973 807]
 [609 709]]
threshold: 0.450


In [12]:
train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_scaled_me, y_train, 'svm', svm_opt, pipe = False)


Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 113.8022
Function value obtained: 0.4047
Current minimum: 0.4047
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 117.1350
Function value obtained: 0.4314
Current minimum: 0.4047
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 115.8240
Function value obtained: 0.4318
Current minimum: 0.4047
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 116.1251
Function value obtained: 0.4344
Current minimum: 0.4047
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 112.3816
Function value obtained: 0.4226
Current minimum: 0.4047
Iteration No: 6 started. 

In [14]:
l_alta = ['pagg_DIF','loc_agg_net_m_aux','aggpior_aux_DIF','loc_agg_imb_m','vol_big_ratio','aggimb',
             'n_aggimb','agg_net_m','loc_aggbig_imb_m','n_p','vewma','agg_net_m_aux','smart_price_dif',
             'smart_price_50','agg_net_10','msg_imb_mean_40_ABS','ohlc_10','ohlc_50','loc_aggbig_net_m',
             'vol_trd_aux','aggbig_net_m_aux','rng_ewma_dif_80','vewma_10','rng_ewma','vewma_g_p','vewmag_dif',
             'n_trd_aux','escora_bid_2.5_0.8','escora_ask_2.5_0.8','escora_bid_3.5_0.8','escora_ask_3.5_0.8'
             ,'escora_bid_4.5_1.2','escora_ask_4.5_1.2','movesc_bid_3.5_0.5','movesc_ask_3.5_0.5','msg_imb','rng_smart_10',
]

l_media = ['last_d_s','aggimb_big','loc_aggbig_imb_m_aux','pagg_aux_DIF','movesc_3.5_0.7_DIF','abs_DIF',
              'book_imb_mean_dif_cp_ABS','book_imb_dif','abagg','vewma_c_v','aggimb_aux','loc_aggbig_net_m_aux',
              'rng_smart_50','agg_net_40','loc_agg_net_10','int_DIF_10','abagg_10','msg_imb_mean_dif_cp',
              'sspread_mean','movesc_2.5_0.7_DIF','loc_agg_net_m','book_imb','rng_ewma_dif','rng_ewma_dif_40',
              'vewma_g_p_10','aggbig_net_10','vol_trd','loc_agg_imb_m_aux','movesc_bid_2.5_0.9','movesc_ask_2.5_0.9',
              'movesc_bid_4.5_0.9','movesc_ask_4.5_0.9','imp_DIF_10','imp_FCAST_40','aggpior_DIF','book_imb_mean_dif_cp',
              'msg_imb_mean_10',
]
# best features from RecursiveFeatureAdd/RandomForest, threshold= 0.002

l_best = ['ohlc_50', 'vewma', 'vewmag_dif', 'rng_smart_10', 'ohlc_10',
          'loc_agg_imb_m', 'aggpior_DIF', 'agg_net_10', 'smart_price_dif',
          'escora_bid_3.5_0.8', 'msg_imb_mean_40_ABS'
]

l_ambas = ['movesc_ask_2.5_0.9', 'smart_price_dif', 'ohlc_50', 'ohlc_10',
           'movesc_bid_2.5_0.9', 'loc_agg_imb_m', 'loc_agg_net_m_aux', 'rng_smart_10',
]

l_tm = ['chgfreq','msg_imb_mean_dif_cp','pagg_aux_DIF','msg_imb','imp_FCAST_10','imp_FCAST_40',]
l_linear = ['pagg_DIF','vol_trd','vewmag_dif','msg_imb_mean_40_ABS','n_p','rng_ewma_dif',]
l_tree=['agg_net_10', 'vewma', 'vewma_c_v', 'agg_net_40', 'imp_DIF_10', 'book_imb_mean_dif_cp',]


In [15]:
N_OPT = 1
RUN_MODEL = 'svm'
RUN_BASE = 'scale_me'  # {'scale', 'discrete', 'me', 'scale_me', 'scale_discrete', }
FT_BINS = l_ambas  # {l_ambas, l_tree, l_linear, l_tm, l_alta, l_media}
FT_SCALE = l_ambas
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))


-------------------------------------------
OPTIMIZATION: 1


In [17]:
print('\noptimize_train():')
print('Best Metric: %.3f' % (1.0 - train_obj.result_opt.fun))
for i, val in enumerate(train_obj.result_opt.x):
    print('{}: {}'.format(train_obj.param_name[i],val))
print('execution time: 100.68')


optimize_train():
Best Metric: 0.598
svc__C: 24.12294736213961
svc__gamma: 0.0019248847839615224
execution time: 100.68


In [20]:

N_OPT = 2
RUN_MODEL = 'svm'
RUN_BASE = 'scale_me'  # {'scale', 'discrete', 'me', 'scale_me', 'scale_discrete', }
FT_BINS = list_union(l_ambas, l_linear, l_tm, l_alta)   # {l_ambas, l_tree, l_linear, l_tm, l_alta, l_media}
FT_SCALE = l_ambas

# TODO: code discrete
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))

if isinstance(FT_SCALE,list):
  pipe_svm_opt_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_svm_opt_s.fit(X_train, y_train) 
  X_scale = pipe_svm_opt_s.transform(X_train)
  X_test_scale = pipe_svm_opt_s.transform(X_test)

if isinstance(FT_BINS,list):
  pipe_svm_opt_bins = Pipeline([('mean_encoding', pipe_mean_encoding),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
  ])  # test with discrete instead of ME
  pipe_svm_opt_bins.fit(X_train, y_train) 
  X_me = pipe_svm_opt_bins.transform(X_train)
  X_test_me = pipe_svm_opt_bins.transform(X_test)

if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale

svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced',C=1.26, gamma=0.01)

svm_opt.fit(X_opt, y_train)
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe = False)
# tm_train.save_models(train_obj,'train_obj_svm_opt_scaled_me', s_path)




-------------------------------------------
OPTIMIZATION: 2
  return f(*args, **kwargs)
train_auc: 0.6303, test_auc: 0.5835 (overfit: 0.0468)
train_f1:  0.5686, test_f1:  0.5141 (overfit: 0.0545)
train_f*:  0.5346, test_f*:  0.4901 (overfit: 0.0445)
--------
cm_train: 
[[3981 3050]
 [2018 3340]]
cm_test: 
[[997 783]
 [591 727]]
threshold: 0.450
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 179.5487
Function value obtained: 0.4327
Current minimum: 0.4327
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 171.3881
Function value obtained: 0.4626
Current minimum: 0.4327
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 234.5315
Function value obtained: 0.5187
Current minimum: 0.4327
Iteration No: 4 started. Evaluating function at random point.
Iterat

In [24]:
l_svm_opt_results = []  # [n_opt, param_c, param_gamma, best_metric]


In [25]:
l_svm_opt_results.append([1, 24.12294736213961, 0.0019248847839615224, 0.598])  


In [26]:
l_svm_opt_results

[[1, 24.12294736213961, 0.0019248847839615224, 0.598]]

In [27]:
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, train_obj.result_opt.fun])  # columns: [n_opt, param_c, param_gamma, best_metric]
svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)
print('Model results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)
print('Final time: {}'.format(epoch2str(time.time())))


  return f(*args, **kwargs)
Model results for optimum C: 81.4109892709494 and gamma: 0.0005792284414399651: 
train_auc: 0.6332, test_auc: 0.5926 (overfit: 0.0406)
train_f1:  0.5639, test_f1:  0.5182 (overfit: 0.0457)
train_f*:  0.5360, test_f*:  0.5039 (overfit: 0.0321)
--------
cm_train: 
[[4099 2932]
 [2103 3255]]
cm_test: 
[[1065  715]
 [ 607  711]]
threshold: 0.450
Final time: [2021-05-27 16:12:56.436]


In [28]:
l_svm_opt_results

[[1, 24.12294736213961, 0.0019248847839615224, 0.598],
 [2, 81.4109892709494, 0.0005792284414399651, 0.3854155197496225]]

In [31]:

N_OPT = 4
RUN_MODEL = 'svm'
RUN_BASE = 'me' 
FT_BINS = list_union(l_ambas + l_linear + l_tm, l_alta, l_media)
FT_SCALE = 'NA'

# TODO: code discrete
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))
print('Initial time: {}'.format(epoch2str(time.time())))

if isinstance(FT_SCALE,list):
  pipe_svm_opt_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_svm_opt_s.fit(X_train, y_train) 
  X_scale = pipe_svm_opt_s.transform(X_train)
  X_test_scale = pipe_svm_opt_s.transform(X_test)

if isinstance(FT_BINS,list):
  pipe_svm_opt_bins = Pipeline([('mean_encoding', pipe_mean_encoding),
                              # ('discrete', pipe_discrete),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
  ])  
  pipe_svm_opt_bins.fit(X_train, y_train) 
  X_me = pipe_svm_opt_bins.transform(X_train)
  X_test_me = pipe_svm_opt_bins.transform(X_test)


if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale
else:
  print('\nOptimization Warning: Please, specify a list for FT_BINS and/or FT_SCALE!')


svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced')   # ,C=1.26, gamma=0.01

# svm_opt.fit(X_opt, y_train)
# l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe = False)
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, train_obj.result_opt.fun]) 

svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)

print('\nModel results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

print('\nFinal time: {}'.format(epoch2str(time.time())))




-------------------------------------------
OPTIMIZATION: 4
Initial time: [2021-05-27 16:42:15.501]
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 217.6673
Function value obtained: 0.4340
Current minimum: 0.4340
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 234.9012
Function value obtained: 0.6587
Current minimum: 0.4340
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 242.7697
Function value obtained: 0.4399
Current minimum: 0.4340
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 214.3503
Function value obtained: 0.4018
Current minimum: 0.4018
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Ti

In [32]:
l_svm_opt_results

[[1, 24.12294736213961, 0.0019248847839615224, 0.598],
 [2, 81.4109892709494, 0.0005792284414399651, 0.3854155197496225],
 [4, 66.96994043468159, 0.002034897777407982, 0.39606239440065616]]

In [33]:
l_svm_opt_results[1][3]

0.3854155197496225

In [34]:
l_svm_opt_results[1][3] = 1-l_svm_opt_results[1][3]

In [35]:
l_svm_opt_results[2][3] = 1-l_svm_opt_results[2][3]

In [36]:
l_svm_opt_results

[[1, 24.12294736213961, 0.0019248847839615224, 0.598],
 [2, 81.4109892709494, 0.0005792284414399651, 0.6145844802503775],
 [4, 66.96994043468159, 0.002034897777407982, 0.6039376055993438]]

In [37]:

N_OPT = 3
RUN_MODEL = 'svm'
RUN_BASE = 'scale_discrete' 
FT_BINS = list_union(l_ambas + l_linear + l_tm, l_alta)
FT_SCALE = l_ambas

# TODO: code discrete and choose pipe based on RUN_BASE instead of FT
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))
print('Initial time: {}'.format(epoch2str(time.time())))

if isinstance(FT_SCALE,list):
  pipe_svm_opt_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_svm_opt_s.fit(X_train, y_train) 
  X_scale = pipe_svm_opt_s.transform(X_train)
  X_test_scale = pipe_svm_opt_s.transform(X_test)

if isinstance(FT_BINS,list):
  pipe_svm_opt_bins = Pipeline([# ('mean_encoding', pipe_mean_encoding),
                              ('discrete', pipe_discrete),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
  ])  
  pipe_svm_opt_bins.fit(X_train, y_train) 
  X_me = pipe_svm_opt_bins.transform(X_train)
  X_test_me = pipe_svm_opt_bins.transform(X_test)


if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale
else:
  print('\nOptimization Warning: Please, specify a list() for FT_BINS and/or FT_SCALE!')


svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced')   # ,C=1.26, gamma=0.01 

# svm_opt.fit(X_opt, y_train)
# l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe = False)
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, 1-train_obj.result_opt.fun]) 

svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)

print('\nModel results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

print('\nFinal time: {}'.format(epoch2str(time.time())))




-------------------------------------------
OPTIMIZATION: 3
Initial time: [2021-05-27 19:40:33.867]
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 179.1548
Function value obtained: 0.4080
Current minimum: 0.4080
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 191.0142
Function value obtained: 0.5633
Current minimum: 0.4080
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 274.4609
Function value obtained: 0.6666
Current minimum: 0.4080
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 303.3357
Function value obtained: 0.6667
Current minimum: 0.4080
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Ti

In [38]:

N_OPT = 5
RUN_MODEL ='svm'
RUN_BASE = 'scale_me'
FT_BINS = l_alta
FT_SCALE = l_ambas + l_linear + l_tm

# TODO: code discrete and choose pipe based on RUN_BASE instead of FT
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))
print('Initial time: {}'.format(epoch2str(time.time())))

if isinstance(FT_SCALE,list):
  pipe_svm_opt_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_svm_opt_s.fit(X_train, y_train) 
  X_scale = pipe_svm_opt_s.transform(X_train)
  X_test_scale = pipe_svm_opt_s.transform(X_test)

if isinstance(FT_BINS,list):
  pipe_svm_opt_bins = Pipeline([# ('mean_encoding', pipe_mean_encoding),
                              ('discrete', pipe_discrete),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
  ])  
  pipe_svm_opt_bins.fit(X_train, y_train) 
  X_me = pipe_svm_opt_bins.transform(X_train)
  X_test_me = pipe_svm_opt_bins.transform(X_test)


if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale
else:
  print('\nOptimization Warning: Please, specify a list() for FT_BINS and/or FT_SCALE!')


svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced')   # ,C=1.26, gamma=0.01 

# svm_opt.fit(X_opt, y_train)
# l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe=False)
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, 1-train_obj.result_opt.fun]) 

svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)

print('\nModel results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

print('\nFinal time: {}'.format(epoch2str(time.time())))


# export l_results
pd.DataFrame(l_svm_opt_results, columns=['N_OPT', 'C_opt', 'gamma_opt', 'result']).to_csv(os.path.join(s_path,'l_svm_opt_results.csv'))


-------------------------------------------
OPTIMIZATION: 5
Initial time: [2021-05-27 20:25:07.300]
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 179.0418
Function value obtained: 0.6371
Current minimum: 0.6371
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 180.7073
Function value obtained: 0.4082
Current minimum: 0.4082
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 179.5385
Function value obtained: 0.5595
Current minimum: 0.4082
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 264.4960
Function value obtained: 0.4283
Current minimum: 0.4082
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Ti

In [39]:
l_svm_opt_results

[[1, 24.12294736213961, 0.0019248847839615224, 0.598],
 [2, 81.4109892709494, 0.0005792284414399651, 0.6145844802503775],
 [4, 66.96994043468159, 0.002034897777407982, 0.6039376055993438],
 [3, 1.3429450785281131e-06, 0.00036415083918894586, 0.5939489402261352],
 [5, 0.00031468149842993917, 0.0004161508044879455, 0.5918470622897115]]

In [43]:

N_OPT = 7
RUN_MODEL = 'svm'
RUN_BASE = 'scale_me'  
FT_BINS = l_ambas + l_linear + l_tm
FT_SCALE = l_ambas

# TODO: code discrete and choose pipe based on RUN_BASE instead of FT
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))
print('Initial time: {}'.format(epoch2str(time.time())))

if isinstance(FT_SCALE,list):
  pipe_svm_opt_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_svm_opt_s.fit(X_train, y_train) 
  X_scale = pipe_svm_opt_s.transform(X_train)
  X_test_scale = pipe_svm_opt_s.transform(X_test)

if isinstance(FT_BINS,list):
  if RUN_BASE in ['scale_me','me']:
    pipe_svm_opt_bins = Pipeline([('mean_encoding', pipe_mean_encoding),
                                ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
    ])  
  else:
    pipe_svm_opt_bins = Pipeline([('discrete', pipe_discrete),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
    ])  
  pipe_svm_opt_bins.fit(X_train, y_train) 
  X_me = pipe_svm_opt_bins.transform(X_train)
  X_test_me = pipe_svm_opt_bins.transform(X_test)


if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale
else:
  print('\nOptimization Warning: Please, specify a list() for FT_BINS and/or FT_SCALE!')


svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced', C=24.12, gamma=0.002) 

# svm_opt.fit(X_opt, y_train)
# l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe=False)
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, 1-train_obj.result_opt.fun]) 

svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)

print('\nModel results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

print('\nFinal time: {}'.format(epoch2str(time.time())))

# export l_results
pd.DataFrame(l_svm_opt_results, columns=['N_OPT', 'C_opt', 'gamma_opt', 'result']).to_csv(os.path.join(s_path,'l_svm_opt_results.csv'))


-------------------------------------------
OPTIMIZATION: 7
Initial time: [2021-05-28 11:13:57.081]
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 125.2855
Function value obtained: 0.3928
Current minimum: 0.3928
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 131.1696
Function value obtained: 0.4227
Current minimum: 0.3928
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 132.5559
Function value obtained: 0.5426
Current minimum: 0.3928
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 126.5605
Function value obtained: 0.4225
Current minimum: 0.3928
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Ti

In [44]:

N_OPT = 9
RUN_MODEL = 'svm'
RUN_BASE = 'scale_me'  
FT_BINS = l_ambas + l_linear 
FT_SCALE = l_ambas + l_linear 

# TODO: code discrete and choose pipe based on RUN_BASE instead of FT
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))
print('Initial time: {}'.format(epoch2str(time.time())))

if isinstance(FT_SCALE,list):
  pipe_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_s.fit(X_train, y_train) 
  X_scale = pipe_s.transform(X_train)
  X_test_scale = pipe_s.transform(X_test)

if isinstance(FT_BINS,list):
  if RUN_BASE in ['scale_me','me']:
    pipe_bins = Pipeline([('mean_encoding', pipe_mean_encoding),
                                ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
    ])  
  else:
    pipe_bins = Pipeline([('discrete', pipe_discrete),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
    ])  
  pipe_bins.fit(X_train, y_train) 
  X_me = pipe_bins.transform(X_train)
  X_test_me = pipe_bins.transform(X_test)


if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale
else:
  print('\nOptimization Warning: Please, specify a list() for FT_BINS and/or FT_SCALE!')


svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced', C=24.12, gamma=0.002) 

# svm_opt.fit(X_opt, y_train)
# l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe=False)
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, 1-train_obj.result_opt.fun]) 

svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)

print('\nModel results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

print('\nFinal time: {}'.format(epoch2str(time.time())))

# export l_results
pd.DataFrame(l_svm_opt_results, columns=['N_OPT', 'C_opt', 'gamma_opt', 'result']).to_csv(os.path.join(s_path,'l_svm_opt_results.csv'))


-------------------------------------------
OPTIMIZATION: 9
Initial time: [2021-05-28 14:35:05.658]
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 145.0171
Function value obtained: 0.4539
Current minimum: 0.4539
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 124.2284
Function value obtained: 0.4007
Current minimum: 0.4007
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 128.6944
Function value obtained: 0.5583
Current minimum: 0.4007
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 131.1725
Function value obtained: 0.5962
Current minimum: 0.4007
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Ti

In [45]:

N_OPT = 8
RUN_MODEL = 'svm'
RUN_BASE = 'scale_me'
FT_BINS = l_ambas + l_tm 
FT_SCALE = l_ambas + l_tm 


# TODO: code discrete and choose pipe based on RUN_BASE instead of FT
print('-------------------------------------------')
print('OPTIMIZATION: {}'.format(N_OPT))
print('Initial time: {}'.format(epoch2str(time.time())))

if isinstance(FT_SCALE,list):
  pipe_s = Pipeline([('cap_outliers', pipe_cap_outliers),
                          ('norm_scale', pipe_norm_scale),
                          ('ft_sel_importance', ft_eng.SelectFt(FT_SCALE))
  ])
  pipe_s.fit(X_train, y_train) 
  X_scale = pipe_s.transform(X_train)
  X_test_scale = pipe_s.transform(X_test)

if isinstance(FT_BINS,list):
  if RUN_BASE in ['scale_me','me']:
    pipe_bins = Pipeline([('mean_encoding', pipe_mean_encoding),
                                ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
    ])  
  else:
    pipe_bins = Pipeline([('discrete', pipe_discrete),
                              ('ft_sel_importance', ft_eng.SelectFt(FT_BINS))
    ])  
  pipe_bins.fit(X_train, y_train) 
  X_me = pipe_bins.transform(X_train)
  X_test_me = pipe_bins.transform(X_test)


if isinstance(FT_BINS,list) & isinstance(FT_SCALE,list):
  # union features scaled and bins 
  X_opt = X_scale.join(X_me, how='outer', rsuffix='_me', sort=False)
  X_test_opt = X_test_scale.join(X_test_me, how='outer', rsuffix='_me', sort=False)
elif isinstance(FT_BINS,list):
  X_opt = X_me
  X_test_opt = X_test_me
elif isinstance(FT_SCALE,list):
  X_opt = X_scale
  X_test_opt = X_test_scale
else:
  print('\nOptimization Warning: Please, specify a list() for FT_BINS and/or FT_SCALE!')


svm_opt = SVC(probability=True, cache_size=1000, verbose=0,
              class_weight= 'balanced', C=24.12, gamma=0.002) 

# svm_opt.fit(X_opt, y_train)
# l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

train_obj = tm_train.ModelTraining()
train_obj.optimized_training(X_opt, y_train, 'svm', svm_opt, pipe=False)
C_opt = train_obj.result_opt.x[0]
gamma_opt = train_obj.result_opt.x[1]
l_svm_opt_results.append([N_OPT, C_opt, gamma_opt, 1-train_obj.result_opt.fun]) 

svm_opt.set_params(C=C_opt, gamma=gamma_opt)
svm_opt.fit(X_opt, y_train)

print('\nModel results for optimum C: {} and gamma: {}: '.format(C_opt, gamma_opt))
l_results = tm_train.report_results(X_opt, X_test_opt, y_train, y_test, fitted_model=svm_opt, ready_probs=False, th=0)

print('\nFinal time: {}'.format(epoch2str(time.time())))

# export l_results
pd.DataFrame(l_svm_opt_results, columns=['N_OPT', 'C_opt', 'gamma_opt', 'result']).to_csv(os.path.join(s_path,'l_svm_opt_results.csv'))


-------------------------------------------
OPTIMIZATION: 8
Initial time: [2021-05-28 15:25:20.876]
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 138.7605
Function value obtained: 0.5756
Current minimum: 0.5756
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 127.3393
Function value obtained: 0.4120
Current minimum: 0.4120
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 206.9300
Function value obtained: 0.4539
Current minimum: 0.4120
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 134.2765
Function value obtained: 0.4339
Current minimum: 0.4120
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Ti