In [1]:
import os

import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import sgpp, sgml, sgutil

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
expr_dic = {}
for i in ['sunshine', 'cloud', 'dewpoint', 'chp', 'cos_wd', 'sin_wd']:
    for j in range(1, 7):
        expr_dic['{}_{}'.format(i, j)] =  pl.col(i).shift(j).fill_null(strategy = 'backward')
for i in ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp']:
    expr_dic['{}_ma24'.format(i)] = pl.mean(i).rolling(index_column = 'id', period = '24i', closed = 'left').fill_null(strategy = 'backward')

In [3]:
p1 = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64}),
    sgpp.ExprProcessor({
        'winddirection': pl.col('winddirection').fill_null(strategy = 'forward'),
        'windspeed': pl.col('windspeed').fill_null(strategy = 'forward')
    }),
    sgpp.ExprProcessor({
        'cos_wd': (pl.col('winddirection') / 180 * np.pi).cos() * pl.col('windspeed'),
        'sin_wd': (pl.col('winddirection') / 180 * np.pi).sin() * pl.col('windspeed'),
        'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
        'expected_day': (pl.col('id') % 365) + 1,
        'sin_ed': ((pl.col('id') % 365) / 365 * np.pi).sin(),
        'year': pl.col('id') // 365, 
    })
)
df_train = p1.fit_transform(['data/train.csv'])
df_test = p1.transform(['data/test.csv'])
p2 = make_pipeline(
    sgpp.ExprProcessor(expr_dic),
    sgpp.PandasCoverter(index_col = 'id')
)
df_all = p2.fit_transform(
    pl.concat([df_train, df_test], how = 'align')
)

p3 = make_pipeline(
    sgpp.ApplyWrapper(
        StandardScaler().set_output(transform='pandas'), 
        ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed'] + ['cos_wd', 'sin_wd', 'chp'] + list(expr_dic.keys()),
    ),
    sgpp.ApplyWrapper(
        MinMaxScaler().set_output(transform='pandas'), ['year', 'expected_day', 'winddirection']
    )
)
df_all = p3.fit_transform(df_all)

df_train = df_all.loc[df_all['rainfall'].notna()]
df_test = df_all.loc[df_all['rainfall'].isna()].drop(columns = ['rainfall'])

In [4]:
df_org = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64, 'day': pl.Int16, 'rainfall': pl.String}),
    sgpp.ColumnNameCleaner(),
    sgpp.ExprProcessor({
        'winddirection': pl.col('winddirection').fill_null(strategy = 'forward'),
        'windspeed': pl.col('windspeed').fill_null(strategy = 'forward')
    }),
    sgpp.ExprProcessor({
        'id': -pl.arange(pl.col('day').len(), 0, -1),
        'expected_day': pl.arange(1, pl.col('day').len() + 1),
        'sin_ed': (pl.arange(1, pl.col('day').len() + 1) / 365 * np.pi).sin(),
        'cos_wd': (pl.col('winddirection') / 180 * np.pi).cos() * pl.col('windspeed'),
        'sin_wd': (pl.col('winddirection') / 180 * np.pi).sin() * pl.col('windspeed'),
        'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
        'rainfall': pl.col('rainfall').replace({'yes': 1, 'no': 0}).cast(pl.Int8),
        'year': -1
    }),
    sgpp.ExprProcessor(expr_dic),
    sgpp.PandasCoverter(index_col = 'id'), 
).fit_transform(['data/Rainfall.csv'])
df_org = p3.transform(df_org)

In [5]:
target = 'rainfall'
sc = sgutil.SGCache('img', 'result', 'model')
_ = sc.cache_result(
    'target', lambda : df_train[target].sort_index()
)

In [6]:
len(df_test.columns)

65

In [37]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

def include_org(df, include_org = False):
    return pd.concat([df, df_org]) if include_org else df

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index = df.index),
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'train_data_proc': include_org,
    'y': target,
}

lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)
knn_adapter = sgml.SklearnAdapter(KNeighborsClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

skf = StratifiedKFold(5, random_state = 123, shuffle = True)
ss = StratifiedShuffleSplit(1, random_state = 123)

In [38]:
from sklearn.feature_selection import SequentialFeatureSelector
X_all = [i for i in df_test.columns.tolist() if i not in ['day']]

# Forward Feature Selection wrapping Logistic Regression

In [39]:
sfs = sc.cache_result(
    'ff_sfs_lr',
    lambda : SequentialFeatureSelector(
        estimator = LogisticRegression(), direction = 'forward', scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'sunshine', 'windspeed', 'chp', 'sunshine_1',
       'sunshine_2', 'sunshine_3', 'sunshine_4', 'cloud_3', 'cloud_4',
       'cloud_5', 'cloud_6', 'dewpoint_1', 'dewpoint_5', 'chp_1', 'chp_3',
       'chp_5', 'cos_wd_3', 'cos_wd_4', 'cos_wd_5', 'sin_wd_2',
       'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24',
       'humidity_ma24', 'sunshine_ma24', 'year'], dtype='<U16')

In [40]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8972053872053873, 0.012120648342876662)

In [41]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
dewpoint_ma24,-0.020234,0.113573,5.612868
maxtemp_ma24,-0.024915,0.081302,3.263118
cos_wd_3,-0.014448,0.043408,3.004357
sunshine_3,-0.018969,0.033022,1.740821
sin_wd_2,-0.01501,0.022889,1.524927
dewpoint_1,0.052131,0.071972,1.38061
chp_1,-0.039389,0.049762,1.26336
maxtemp,0.127891,0.153483,1.200112
temparature_ma24,0.365757,0.205129,0.560833
dewpoint_5,-0.063698,0.031295,0.491304


In [42]:
sc.cv_result('lr_sfs', hparams, lr_adapter, result['valid_prd'])

In [43]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[8:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8983557800224468, 0.011976255278421333)

In [44]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
dewpoint_5,-0.060115,0.045998,0.765164
temparature,-0.14863,0.109471,0.736534
temparature_ma24,0.339612,0.173672,0.511386
cos_wd_5,0.098969,0.045272,0.45744
chp_3,-0.25801,0.096879,0.375485
cloud_6,0.065133,0.023313,0.357921
mintemp,-0.333428,0.118167,0.354401
cloud_5,0.185984,0.061321,0.32971
sunshine_2,0.115192,0.037786,0.328024
cloud_4,0.149698,0.045243,0.302226


In [45]:
sc.cv_result('lr_sfs2', hparams, lr_adapter, result['valid_prd'])

In [46]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[4:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8985185185185184, 0.01312478134285176)

In [47]:
sc.cv_result('lr_sfs3', hparams, lr_adapter, result['valid_prd'])

In [48]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
cloud_6,0.069157,0.025011,0.361659
sunshine_2,0.108589,0.038656,0.355989
chp_3,-0.28322,0.08874,0.313325
cloud_4,0.156865,0.04529,0.288716
pressure,-0.134968,0.037416,0.277223
sin_ed,-0.948355,0.226483,0.238817
sunshine_4,0.21675,0.051374,0.237018
cloud_5,0.23214,0.053793,0.231728
year,0.554469,0.127472,0.2299
chp_5,-0.200099,0.045552,0.227648


# Forward Feature Selection wrapping Logistic Regression Aumenting Org

In [49]:
sfs = sc.cache_result(
    'ff_sfs_lr2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = LogisticRegression(), direction = 'forward', scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd',
       'chp', 'sunshine_1', 'sunshine_2', 'sunshine_3', 'sunshine_4',
       'sunshine_5', 'cloud_2', 'cloud_3', 'cloud_4', 'dewpoint_2',
       'dewpoint_6', 'chp_2', 'chp_4', 'chp_6', 'cos_wd_3', 'sin_wd_2',
       'maxtemp_ma24', 'temparature_ma24', 'mintemp_ma24', 'year',
       'winddirection'], dtype='<U16')

In [50]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_a', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8947811447811448, 0.012302086080667738)

In [51]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
maxtemp,-0.021097,0.14982,7.101507
cos_wd_3,-0.010459,0.045744,4.373715
mintemp_ma24,0.040715,0.160245,3.935769
sunshine_5,0.019164,0.033826,1.765062
chp_6,-0.022549,0.038773,1.71953
sin_wd_2,-0.022583,0.030652,1.357293
maxtemp_ma24,0.119041,0.137336,1.153678
chp_4,0.042263,0.034726,0.821672
chp,0.069188,0.052726,0.762075
temparature,-0.166954,0.126749,0.759183


In [52]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[6:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_a2', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8962906846240178, 0.01146601290644487)

In [53]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
chp_4,0.034988,0.042251,1.207573
chp,0.054333,0.046949,0.864086
sunshine_3,0.048751,0.035849,0.735355
dewpoint_2,-0.100342,0.063914,0.636965
temparature,-0.184064,0.098829,0.536927
maxtemp_ma24,0.118299,0.062029,0.524339
chp_2,-0.134452,0.053239,0.395968
year,0.290991,0.111759,0.384062
cos_wd,-0.257909,0.063183,0.244983
dewpoint_6,0.340524,0.075403,0.221431


In [54]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[7:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_a3', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8971043771043771, 0.011150089212136121)

# Backward Feature Selection wrapping Logistic Regression

In [55]:
sfs = sc.cache_result(
    'bf_sfs_lr',
    lambda : SequentialFeatureSelector(
        estimator = LogisticRegression(), direction = 'backward', scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = False
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp',
       'sunshine_1', 'sunshine_2', 'sunshine_4', 'cloud_3', 'cloud_4',
       'cloud_5', 'dewpoint_5', 'chp_3', 'chp_4', 'chp_5', 'cos_wd_3',
       'cos_wd_4', 'sin_wd_3', 'temparature_ma24', 'mintemp_ma24',
       'cloud_ma24', 'sunshine_ma24', 'cos_wd_ma24', 'sin_wd_ma24',
       'chp_ma24', 'year'], dtype='<U16')

In [56]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_bfs', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8971717171717172, 0.011681897853436847)

# Backward Feature Selection wrapping Logistic Regression Augumenting Org

In [57]:
sfs = sc.cache_result(
    'bf_sfs_lr2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = LogisticRegression(max_iter = 500), direction = 'backward', scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'mintemp', 'dewpoint', 'humidity', 'cloud',
       'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp', 'sunshine_1',
       'sunshine_3', 'sunshine_4', 'sunshine_6', 'cloud_4', 'cloud_5',
       'dewpoint_2', 'dewpoint_6', 'chp_3', 'chp_5', 'chp_6', 'cos_wd_3',
       'sin_wd_1', 'sin_wd_3', 'sin_wd_5', 'temparature_ma24',
       'mintemp_ma24', 'cos_wd_ma24', 'sin_wd_ma24', 'year',
       'winddirection'], dtype='<U16')

In [58]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_bfs_a', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8929517396184063, 0.009206929636066645)

# Forward Feature Selection wrapping LGBM

In [59]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.889421997755331, 0.9399807098765433)

In [60]:
sfs = sc.cache_result(
    'ff_sfs_lgb',
    lambda : SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'sunshine', 'windspeed', 'sin_wd', 'chp', 'cloud_1', 'cloud_2',
       'cloud_3', 'cloud_5', 'cloud_6', 'dewpoint_1', 'dewpoint_2',
       'dewpoint_4', 'chp_3', 'chp_5', 'cos_wd_3', 'cos_wd_4', 'cos_wd_5',
       'sin_wd_2', 'sin_wd_6', 'maxtemp_ma24', 'temparature_ma24',
       'mintemp_ma24', 'dewpoint_ma24', 'humidity_ma24', 'sunshine_ma24',
       'year', 'expected_day'], dtype='<U16')

In [61]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8978815937149272, 0.9357921226150394)

In [62]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
cos_wd_5,9.0,8.455767,0.93953
mintemp_ma24,5.6,5.176872,0.924441
cloud_5,4.6,3.209361,0.697687
sin_wd_6,9.0,5.91608,0.657342
expected_day,8.6,5.549775,0.645323
sin_ed,6.4,4.09878,0.640434
dewpoint_4,13.2,8.348653,0.632474
dewpoint_1,4.8,3.03315,0.631906
chp_5,4.8,3.03315,0.631906
year,2.8,1.643168,0.586846


In [63]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': df_imp.iloc[3:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs2', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8974635241301907, 0.9356849747474747)

In [64]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
dewpoint_1,6.2,5.449771,0.878995
cloud_2,2.8,2.167948,0.774267
sin_wd_2,5.2,3.701351,0.711798
maxtemp_ma24,6.4,4.393177,0.686434
temparature_ma24,8.4,5.59464,0.666029
sunshine_ma24,23.4,13.557286,0.579371
sin_wd_6,8.4,4.827007,0.574644
cos_wd_3,10.4,5.85662,0.563137
cloud_6,6.0,3.316625,0.552771
dewpoint_4,12.0,6.442049,0.536837


# Forward Feature Selection wrapping LGBM Augumenting Org

In [65]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_lgb2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'chp',
       'sunshine_1', 'sunshine_5', 'cloud_2', 'cloud_3', 'cloud_6',
       'dewpoint_1', 'dewpoint_2', 'chp_2', 'chp_3', 'chp_4', 'chp_6',
       'cos_wd_1', 'sin_wd_5', 'maxtemp_ma24', 'temparature_ma24',
       'dewpoint_ma24', 'humidity_ma24', 'cloud_ma24', 'windspeed_ma24',
       'chp_ma24', 'year'], dtype='<U16')

In [66]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs_a', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.894753086419753, 0.9293044541265711)

In [67]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
year,0.4,0.547723,1.369306
sin_wd_5,6.6,6.465292,0.97959
chp_6,3.6,3.130495,0.869582
dewpoint_1,4.4,3.507136,0.797076
windspeed_ma24,6.2,3.768289,0.607789
maxtemp,17.6,10.285913,0.584427
chp_4,2.6,1.516575,0.583298
chp_3,4.6,2.607681,0.566887
humidity_ma24,4.0,2.236068,0.559017
temparature,7.0,3.535534,0.505076


In [68]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': df_imp.iloc[6:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs_a2', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8944360269360269, 0.9285391714003444)

# Backward Feature Selection wrapping LGBM

In [69]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfs_lgb',
    lambda : SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), direction = 'backward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['temparature', 'mintemp', 'sunshine', 'windspeed', 'cos_wd', 'chp',
       'sunshine_1', 'sunshine_6', 'cloud_1', 'cloud_5', 'cloud_6',
       'dewpoint_3', 'chp_1', 'chp_2', 'chp_3', 'chp_5', 'cos_wd_3',
       'cos_wd_4', 'sin_wd_3', 'sin_wd_4', 'sin_wd_6', 'pressure_ma24',
       'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24',
       'humidity_ma24', 'cloud_ma24', 'sunshine_ma24', 'sin_wd_ma24',
       'chp_ma24', 'year', 'winddirection'], dtype='<U16')

In [70]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_bfs', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.894851290684624, 0.9360164141414142)

# Forward Feature Selection wrapping XGB

In [71]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.890830527497194, 0.9449452861952862)

In [72]:
sfs = sc.cache_result(
    'ff_sfs_xgb',
    lambda : SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'sunshine', 'windspeed', 'chp', 'sunshine_3', 'sunshine_6',
       'cloud_1', 'cloud_6', 'dewpoint_2', 'dewpoint_5', 'chp_1', 'chp_3',
       'chp_4', 'chp_6', 'cos_wd_1', 'cos_wd_4', 'cos_wd_5', 'sin_wd_2',
       'sin_wd_5', 'sin_wd_6', 'maxtemp_ma24', 'temparature_ma24',
       'dewpoint_ma24', 'humidity_ma24', 'sunshine_ma24',
       'windspeed_ma24', 'sin_wd_ma24', 'year'], dtype='<U16')

In [73]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8974074074074074, 0.939300119248036)

In [74]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
sin_wd_2,0.015824,0.010957,0.692473
temparature,0.045126,0.029284,0.648934
mintemp,0.026033,0.013939,0.535451
sin_wd_ma24,0.028133,0.01494,0.531041
chp_4,0.019704,0.010135,0.514338
dewpoint_2,0.029755,0.010634,0.357371
cloud_6,0.012646,0.004439,0.351047
maxtemp,0.044567,0.014776,0.331533
temparature_ma24,0.021675,0.007158,0.330252
cos_wd_5,0.019397,0.00594,0.306239


In [75]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': df_imp.iloc[2:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs2', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8980808080808081, 0.9389749929854097)

# Forward Feature Selection wrapping LGBM Augumenting Org

In [76]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_xgb2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = xgb.XGBClassifier(**hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd',
       'sin_wd', 'chp', 'sunshine_1', 'sunshine_3', 'sunshine_6',
       'cloud_3', 'cloud_4', 'cloud_5', 'dewpoint_3', 'chp_2', 'chp_3',
       'cos_wd_1', 'cos_wd_2', 'cos_wd_6', 'sin_wd_2', 'temparature_ma24',
       'mintemp_ma24', 'cloud_ma24', 'sunshine_ma24', 'windspeed_ma24',
       'year'], dtype='<U16')

In [77]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs_a', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.892730078563412, 0.9328386625662745)

In [78]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
year,0.004691,0.00451,0.961465
temparature,0.044348,0.022436,0.505911
dewpoint_3,0.021473,0.010637,0.495364
cloud_4,0.01886,0.00884,0.468726
maxtemp,0.034693,0.01121,0.323127
cos_wd_1,0.015215,0.004814,0.31637
mintemp,0.046001,0.013188,0.286681
cos_wd_6,0.018253,0.004961,0.271796
windspeed_ma24,0.01901,0.005144,0.270591
cloud_5,0.011589,0.003097,0.267268


In [79]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': df_imp.iloc[6:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.892564534231201, 0.9311259477513435)

# Backward Feature Selection wrapping XGB

In [80]:
sfs = sc.cache_result(
    'bf_sfs_xgb',
    lambda : SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), direction = 'backward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['pressure', 'dewpoint', 'sunshine', 'windspeed', 'cos_wd', 'chp',
       'sunshine_1', 'sunshine_2', 'sunshine_3', 'cloud_1', 'cloud_3',
       'cloud_4', 'cloud_5', 'cloud_6', 'dewpoint_2', 'dewpoint_3',
       'chp_3', 'chp_4', 'chp_6', 'cos_wd_1', 'cos_wd_2', 'cos_wd_4',
       'sin_wd_2', 'sin_wd_6', 'pressure_ma24', 'maxtemp_ma24',
       'temparature_ma24', 'mintemp_ma24', 'humidity_ma24',
       'sunshine_ma24', 'windspeed_ma24', 'chp_ma24'], dtype='<U16')

In [81]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_bfs', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8956060606060607, 0.9394081439393938)

# Stepwise Feature Selections wrapping Logistic Regression

In [82]:
from mlxtend import feature_selection as mfs

In [83]:
sfs = sc.cache_result(
    'ff_sfsf_lr',
    lambda : mfs.SequentialFeatureSelector(
        estimator = LogisticRegression(), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'pressure', 'mintemp', 'dewpoint', 'sunshine',
       'windspeed', 'chp', 'sunshine_1', 'sunshine_2', 'sunshine_4',
       'cloud_3', 'cloud_4', 'cloud_5', 'dewpoint_5', 'chp_5', 'cos_wd_4',
       'maxtemp_ma24', 'temparature_ma24', 'humidity_ma24', 'cos_wd_ma24',
       'sin_wd_ma24', 'year'], dtype='<U16')

In [84]:
hparams = {
    'model_params': {},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_f', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8990460157126824, 0.01061797500616197)

In [85]:
sfs = sc.cache_result(
    'ff_sfsf_lr2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: mfs.SequentialFeatureSelector(
            estimator = LogisticRegression(), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'pressure', 'mintemp', 'dewpoint', 'humidity', 'cloud',
       'sunshine', 'windspeed', 'cos_wd', 'sunshine_1', 'cloud_3',
       'dewpoint_2', 'dewpoint_6', 'maxtemp_ma24', 'temparature_ma24',
       'mintemp_ma24', 'winddirection'], dtype='<U16')

In [86]:
hparams = {
    'model_params': {},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_f2', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8960437710437711, 0.011426817196481703)

In [87]:
sfs = sc.cache_result(
    'bf_sfsf_lr',
    lambda : mfs.SequentialFeatureSelector(
        estimator = LogisticRegression(), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'mintemp', 'dewpoint', 'cloud', 'sunshine', 'windspeed',
       'chp', 'sunshine_1', 'sunshine_4', 'cloud_3', 'cloud_4',
       'cos_wd_4', 'temparature_ma24', 'humidity_ma24', 'cos_wd_ma24',
       'sin_wd_ma24', 'year'], dtype='<U16')

In [88]:
hparams = {
    'model_params': {},
    'X_num': list(sfs.k_feature_names_),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_bfs_f', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8954320987654322, 0.012407271619931246)

In [89]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_lgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'mintemp', 'sunshine', 'sin_wd', 'chp', 'cloud_1',
       'cloud_5', 'cloud_6', 'chp_3', 'sin_wd_6', 'maxtemp_ma24',
       'temparature_ma24', 'sunshine_ma24', 'chp_ma24'], dtype='<U16')

In [90]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs_f', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.897732884399551, 0.9294440937149272)

In [91]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfs_lgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'temparature', 'mintemp', 'dewpoint', 'sunshine',
       'windspeed', 'chp', 'cloud_3', 'chp_3', 'chp_4', 'cos_wd_4',
       'maxtemp_ma24', 'temparature_ma24', 'humidity_ma24', 'cos_wd_ma24'],
      dtype='<U16')

In [92]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_bfs_f', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8989450056116723, 0.9331223695286195)

In [93]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_xgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'sunshine', 'windspeed', 'chp', 'sunshine_3', 'cloud_1', 'cloud_6',
       'dewpoint_2', 'cos_wd_4', 'cos_wd_5', 'sin_wd_2', 'sin_wd_6',
       'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24',
       'humidity_ma24'], dtype='<U16')

In [94]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs_f', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8985437710437709, 0.9357281144781145)

In [95]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfs_xgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'mintemp', 'dewpoint', 'sunshine', 'windspeed', 'chp',
       'sunshine_3', 'cloud_1', 'dewpoint_2', 'cos_wd_4', 'sin_wd_6',
       'dewpoint_ma24', 'humidity_ma24', 'sin_wd_ma24', 'year'],
      dtype='<U13')

In [96]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_bfs_f', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.899405162738496, 0.9343618476430977)

In [97]:
hparams = {
    'model_params': {'C': 0.01, 'probability': True, 'kernel': 'linear'},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8915151515151514, 0.9011123386644219)

In [98]:
sfs = sc.cache_result(
    'ff_sfsf_lsvc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'pressure', 'temparature', 'mintemp', 'dewpoint',
       'sunshine', 'windspeed', 'chp', 'sunshine_1', 'sunshine_2',
       'cloud_3', 'dewpoint_2', 'dewpoint_5', 'dewpoint_6', 'chp_5',
       'cos_wd_5', 'sin_wd_2', 'sin_wd_4', 'maxtemp_ma24',
       'temparature_ma24', 'dewpoint_ma24', 'humidity_ma24',
       'cos_wd_ma24', 'year'], dtype='<U16')

In [99]:
hparams = {
    'model_params': {'C': 0.01, 'probability': True, 'kernel': 'linear'},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('lsvc_sfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8986139169472503, 0.8995172208193042)

In [100]:
sfs = sc.cache_result(
    'bf_sfsf_lsvc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'pressure', 'temparature', 'mintemp', 'dewpoint',
       'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd',
       'sunshine_1', 'cloud_3', 'cos_wd_4', 'cos_wd_5', 'sin_wd_4',
       'temparature_ma24', 'mintemp_ma24', 'humidity_ma24', 'cos_wd_ma24',
       'year'], dtype='<U16')

In [101]:
hparams = {
    'model_params': {'C': 0.01, 'probability': True, 'kernel': 'linear'},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('lsvc_bfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8990712682379349, 0.8999386223344559)

In [102]:
hparams = {
    'model_params': {'C': 0.1, 'probability': True, 'kernel': 'poly', 'degree': 2, 'coef0': 1},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8931032547699214, 0.9092634680134679)

In [103]:
sfs = sc.cache_result(
    'ff_sfsf_p2svc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

array(['mintemp', 'dewpoint', 'sunshine', 'windspeed', 'chp', 'cloud_1',
       'cloud_3', 'cloud_6', 'dewpoint_2', 'cos_wd_1', 'cos_wd_3',
       'cos_wd_4', 'dewpoint_ma24', 'sunshine_ma24', 'cos_wd_ma24',
       'sin_wd_ma24', 'year'], dtype='<U13')

In [104]:
hparams = {
    'model_params': {'C': 0.1, 'probability': True, 'kernel': 'poly', 'degree': 2, 'coef0': 1},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('p2svc_sfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.9008810325476994, 0.9073690025252527)

In [122]:
sfs = sc.cache_result(
    'bf_sfsf_p2svc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = 0
)
np.array(sfs.k_feature_names_)

array(['mintemp', 'dewpoint', 'sunshine', 'windspeed', 'chp', 'cloud_1',
       'cloud_3', 'cloud_6', 'dewpoint_2', 'cos_wd_1', 'cos_wd_3',
       'cos_wd_4', 'dewpoint_ma24', 'sunshine_ma24', 'cos_wd_ma24',
       'sin_wd_ma24', 'year'], dtype='<U13')

In [123]:
hparams = {
    'model_params': {'C': 0.1, 'probability': True, 'kernel': 'poly', 'degree': 2, 'coef0': 1},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('p2svc_bfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.9009034792368127, 0.9073684764309764)

In [124]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8661475869809203, 0.8825776865881032)

In [125]:
sfs = sc.cache_result(
    'ff_sfsf_knn70',
    lambda : mfs.SequentialFeatureSelector(
        estimator = KNeighborsClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = 0
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'dewpoint', 'cloud', 'sunshine', 'chp', 'dewpoint_2',
       'chp_1', 'cos_wd_4', 'cos_wd_6', 'sin_wd_3', 'sunshine_ma24',
       'chp_ma24', 'year'], dtype='<U13')

In [126]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
sc.cv_result('knn70_sfs_f', hparams, knn_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8974354657687991, 0.9070700056116723)

In [127]:
sfs = sc.cache_result(
    'bf_sfsf_knn70',
    lambda : mfs.SequentialFeatureSelector(
        estimator = KNeighborsClassifier(**hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = 0
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'mintemp', 'humidity', 'cloud', 'sunshine', 'windspeed',
       'chp', 'sunshine_3', 'cloud_6', 'dewpoint_4', 'dewpoint_6',
       'chp_2', 'cos_wd_3', 'cos_wd_4', 'cos_wd_6', 'sin_wd_3',
       'sin_wd_4', 'mintemp_ma24', 'dewpoint_ma24', 'chp_ma24'],
      dtype='<U13')

In [128]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
sc.cv_result('knn70_bfs_f', hparams, knn_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8981228956228955, 0.904110725308642)

In [129]:
sfs = sc.cache_result(
    'ff_sfsf_knn702',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: mfs.SequentialFeatureSelector(
            estimator = KNeighborsClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'temparature', 'mintemp', 'dewpoint', 'humidity',
       'cloud', 'sunshine', 'windspeed', 'sunshine_6', 'dewpoint_1',
       'dewpoint_6', 'sin_wd_6', 'pressure_ma24', 'maxtemp_ma24',
       'temparature_ma24', 'mintemp_ma24', 'dewpoint_ma24', 'cloud_ma24',
       'sunshine_ma24', 'expected_day', 'winddirection'], dtype='<U16')

In [130]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': list(sfs.k_feature_names_),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
sc.cv_result('knn70_sfs_f2', hparams, knn_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8916189674523007, 0.8991573995711537)

# Feature Selection Analysis

In [131]:
result_sk = ['ff_sfs_lr', 'ff_sfs_lr2', 'bf_sfs_lr', 'bf_sfs_lr2', 'ff_sfs_lgb', 'ff_sfs_lgb2', 'bf_sfs_lgb', 'ff_sfs_xgb', 'ff_sfs_xgb2', 'bf_sfs_xgb']
selected_sk = [
    np.array(X_all)[sc.read_result(i).get_support()].tolist()
    for i in result_sk
]

In [132]:
pd.Series(selected_sk, index = result_sk).explode().value_counts().iloc[:10]

temparature_ma24    10
sunshine            10
windspeed           10
chp                 10
chp_3                9
year                 9
mintemp              9
dewpoint             9
sunshine_1           8
sin_ed               8
Name: count, dtype: int64

In [133]:
result_mfs = [
    'ff_sfsf_lr', 'ff_sfsf_lr2', 'bf_sfsf_lr', 'ff_sfs_lgb_f', 'bf_sfs_lgb_f', 'ff_sfs_xgb_f', 'bf_sfs_xgb_f', 'ff_sfsf_lsvc', 'bf_sfsf_lsvc',
    'ff_sfsf_p2svc', 'bf_sfsf_p2svc', 'ff_sfsf_knn70', 'bf_sfsf_knn70', 'ff_sfsf_knn702'
]
selected_mfs = [
    list(sc.read_result(i).k_feature_names_)
    for i in result_mfs
]

In [134]:
pd.Series(selected_mfs, index = result_mfs).explode().value_counts().iloc[:10]

sunshine            14
mintemp             13
sin_ed              12
dewpoint            12
windspeed           12
chp                 11
cos_wd_4            10
temparature_ma24     9
year                 8
cloud_3              8
Name: count, dtype: int64

In [135]:
pd.concat([
    pd.Series(selected_mfs, index = result_mfs).explode(),
    pd.Series(selected_mfs, index = result_mfs).explode()
]).value_counts().iloc[:30]

sunshine            28
mintemp             26
sin_ed              24
dewpoint            24
windspeed           24
chp                 22
cos_wd_4            20
temparature_ma24    18
year                16
cloud_3             16
dewpoint_ma24       14
dewpoint_2          14
cos_wd_ma24         14
maxtemp_ma24        14
humidity_ma24       14
cloud               12
sunshine_1          10
sunshine_ma24       10
temparature         10
sin_wd_ma24         10
cloud_6             10
cloud_1             10
sin_wd_6             8
humidity             8
dewpoint_6           8
mintemp_ma24         8
pressure             8
cos_wd_5             6
sunshine_3           6
sin_wd_4             6
Name: count, dtype: int64