In [1]:
import os

import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import sgpp, sgml, sgutil

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
expr_dic = {}
for i in ['sunshine', 'cloud', 'dewpoint', 'chp', 'cos_wd', 'sin_wd']:
    for j in range(1, 7):
        expr_dic['{}_{}'.format(i, j)] =  pl.col(i).shift(j).fill_null(strategy = 'backward')
for i in ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp']:
    expr_dic['{}_ma24'.format(i)] = pl.mean(i).rolling(index_column = 'id', period = '24i', closed = 'left').fill_null(strategy = 'backward')

In [3]:
p1 = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64}),
    sgpp.ExprProcessor({
        'winddirection': pl.col('winddirection').fill_null(strategy = 'forward'),
        'windspeed': pl.col('windspeed').fill_null(strategy = 'forward')
    }),
    sgpp.ExprProcessor({
        'cos_wd': (pl.col('winddirection') / 180 * np.pi).cos() * pl.col('windspeed'),
        'sin_wd': (pl.col('winddirection') / 180 * np.pi).sin() * pl.col('windspeed'),
        'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
        'expected_day': (pl.col('id') % 365) + 1,
        'sin_ed': ((pl.col('id') % 365) / 365 * np.pi).sin(),
        'year': pl.col('id') // 365, 
    })
)
df_train = p1.fit_transform(['data/train.csv'])
df_test = p1.transform(['data/test.csv'])
p2 = make_pipeline(
    sgpp.ExprProcessor(expr_dic),
    sgpp.PandasCoverter(index_col = 'id')
)
df_all = p2.fit_transform(
    pl.concat([df_train, df_test], how = 'align')
)

p3 = make_pipeline(
    sgpp.ApplyWrapper(
        StandardScaler().set_output(transform='pandas'), 
        ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed'] + ['cos_wd', 'sin_wd', 'chp'] + list(expr_dic.keys()),
    ),
    sgpp.ApplyWrapper(
        MinMaxScaler().set_output(transform='pandas'), ['year', 'expected_day', 'winddirection']
    )
)
df_all = p3.fit_transform(df_all)

df_train = df_all.loc[df_all['rainfall'].notna()]
df_test = df_all.loc[df_all['rainfall'].isna()].drop(columns = ['rainfall'])

In [4]:
df_org = make_pipeline(
    sgpp.PolarsProcessor(predefined_types = {'id': pl.Int64, 'day': pl.Int16, 'rainfall': pl.String}),
).fit_transform(['data/Rainfall.csv']).rename(
    lambda x: x.strip()
)
df_org = make_pipeline(
    sgpp.ExprProcessor({
        'winddirection': pl.col('winddirection').fill_null(strategy = 'forward'),
        'windspeed': pl.col('windspeed').fill_null(strategy = 'forward')
    }),
    sgpp.ExprProcessor({
        'id': pl.arange(1, pl.col('day').len() + 1),
        'sin_ed': (pl.arange(1, pl.col('day').len() + 1) / 365 * np.pi).sin(),
        'cos_wd': (pl.col('winddirection') / 180 * np.pi).cos() * pl.col('windspeed'),
        'sin_wd': (pl.col('winddirection') / 180 * np.pi).sin() * pl.col('windspeed'),
        'chp': pl.col('cloud') * pl.col('humidity') / pl.col('pressure'),
        'rainfall': pl.col('rainfall').replace({'yes': 1, 'no': 0}).cast(pl.Int8),
    }),
    sgpp.ExprProcessor(expr_dic),
    sgpp.PandasCoverter(), 
).fit_transform(df_org).assign(
    expected_day = lambda x: x.index + 1,
    year = -1
).pipe(
    lambda x: x.set_index(-(len(x) - x.index))
)
df_org = p3.transform(df_org)

In [5]:
target = 'rainfall'
sc = sgutil.SGCache('img', 'result')
_ = sc.cache_result(
    'target', lambda : df_train[target].sort_index()
)

In [6]:
len(df_test.columns)

65

In [7]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

def include_org(df, include_org = False):
    return pd.concat([df, df_org]) if include_org else df

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index = df.index),
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'train_data_proc': include_org,
    'y': target,
}

lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)
knn_adapter = sgml.SklearnAdapter(KNeighborsClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

skf = StratifiedKFold(5, random_state = 123, shuffle = True)
ss = StratifiedShuffleSplit(1, random_state = 123)

In [8]:
from sklearn.feature_selection import SequentialFeatureSelector
X_all = [i for i in df_test.columns.tolist() if i not in ['day']]

# Forward Feature Selection wrapping Logistic Regression

In [9]:
sfs = sc.cache_result(
    'ff_sfs_lr',
    lambda : SequentialFeatureSelector(
        estimator = LogisticRegression(), direction = 'forward', scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'sunshine', 'windspeed', 'chp', 'sunshine_1',
       'sunshine_2', 'sunshine_3', 'sunshine_4', 'cloud_3', 'cloud_4',
       'cloud_5', 'cloud_6', 'dewpoint_1', 'dewpoint_5', 'chp_1', 'chp_3',
       'chp_5', 'cos_wd_3', 'cos_wd_4', 'cos_wd_5', 'sin_wd_2',
       'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24',
       'humidity_ma24', 'sunshine_ma24', 'year'], dtype='<U16')

In [10]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8972053872053873, 0.012120648342876662)

In [11]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
dewpoint_ma24,-0.020234,0.113573,5.612868
maxtemp_ma24,-0.024915,0.081302,3.263118
cos_wd_3,-0.014448,0.043408,3.004357
sunshine_3,-0.018969,0.033022,1.740821
sin_wd_2,-0.01501,0.022889,1.524927
dewpoint_1,0.052131,0.071972,1.38061
chp_1,-0.039389,0.049762,1.26336
maxtemp,0.127891,0.153483,1.200112
temparature_ma24,0.365757,0.205129,0.560833
dewpoint_5,-0.063698,0.031295,0.491304


In [12]:
sc.cv_result('lr_sfs', hparams, lr_adapter, result['valid_prd'])

In [13]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[8:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8983557800224468, 0.011976255278421333)

In [14]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
dewpoint_5,-0.060115,0.045998,0.765164
temparature,-0.14863,0.109471,0.736534
temparature_ma24,0.339612,0.173672,0.511386
cos_wd_5,0.098969,0.045272,0.45744
chp_3,-0.25801,0.096879,0.375485
cloud_6,0.065133,0.023313,0.357921
mintemp,-0.333428,0.118167,0.354401
cloud_5,0.185984,0.061321,0.32971
sunshine_2,0.115192,0.037786,0.328024
cloud_4,0.149698,0.045243,0.302226


In [15]:
sc.cv_result('lr_sfs2', hparams, lr_adapter, result['valid_prd'])

In [16]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[4:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8985185185185184, 0.01312478134285176)

In [17]:
sc.cv_result('lr_sfs3', hparams, lr_adapter, result['valid_prd'])

In [18]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
cloud_6,0.069157,0.025011,0.361659
sunshine_2,0.108589,0.038656,0.355989
chp_3,-0.28322,0.08874,0.313325
cloud_4,0.156865,0.04529,0.288716
pressure,-0.134968,0.037416,0.277223
sin_ed,-0.948355,0.226483,0.238817
sunshine_4,0.21675,0.051374,0.237018
cloud_5,0.23214,0.053793,0.231728
year,0.554469,0.127472,0.2299
chp_5,-0.200099,0.045552,0.227648


# Forward Feature Selection wrapping Logistic Regression Aumenting Org

In [19]:
sfs = sc.cache_result(
    'ff_sfs_lr2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = LogisticRegression(), direction = 'forward', scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd',
       'chp', 'sunshine_1', 'sunshine_2', 'sunshine_3', 'sunshine_4',
       'sunshine_5', 'cloud_2', 'cloud_3', 'cloud_4', 'dewpoint_2',
       'dewpoint_6', 'chp_2', 'chp_4', 'chp_6', 'cos_wd_3', 'sin_wd_2',
       'maxtemp_ma24', 'temparature_ma24', 'mintemp_ma24', 'year',
       'winddirection'], dtype='<U16')

In [20]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_a', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8947811447811448, 0.012302086080667738)

In [21]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
maxtemp,-0.021097,0.14982,7.101555
cos_wd_3,-0.010459,0.045744,4.373801
mintemp_ma24,0.040715,0.160246,3.935837
sunshine_5,0.019164,0.033826,1.765061
chp_6,-0.022548,0.038773,1.719538
sin_wd_2,-0.022583,0.030652,1.357292
maxtemp_ma24,0.119042,0.137336,1.153678
chp_4,0.042262,0.034726,0.821677
chp,0.069188,0.052726,0.762076
temparature,-0.166954,0.126749,0.759183


In [22]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[6:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_a2', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8962906846240178, 0.01146601290644487)

In [23]:
df_coef = pd.concat([i['coef'] for i in result['model_result']], axis=1).agg(['mean', 'std'], axis = 1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.iloc[:10]

Unnamed: 0,mean,std,CV
chp_4,0.034988,0.042251,1.207573
chp,0.054333,0.046949,0.864086
sunshine_3,0.048751,0.035849,0.735355
dewpoint_2,-0.100342,0.063914,0.636965
temparature,-0.184064,0.098829,0.536927
maxtemp_ma24,0.118299,0.062029,0.524339
chp_2,-0.134452,0.053239,0.395968
year,0.290991,0.111759,0.384062
cos_wd,-0.257909,0.063183,0.244983
dewpoint_6,0.340524,0.075403,0.221431


In [24]:
hparams = {
    'model_params': {},
    'X_num': df_coef.iloc[7:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_a3', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8971043771043771, 0.011150089212136121)

# Backward Feature Selection wrapping Logistic Regression

In [25]:
sfs = sc.cache_result(
    'bf_sfs_lr',
    lambda : SequentialFeatureSelector(
        estimator = LogisticRegression(), direction = 'backward', scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = False
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp',
       'sunshine_1', 'sunshine_2', 'sunshine_4', 'cloud_3', 'cloud_4',
       'cloud_5', 'dewpoint_5', 'chp_3', 'chp_4', 'chp_5', 'cos_wd_3',
       'cos_wd_4', 'sin_wd_3', 'temparature_ma24', 'mintemp_ma24',
       'cloud_ma24', 'sunshine_ma24', 'cos_wd_ma24', 'sin_wd_ma24',
       'chp_ma24', 'year'], dtype='<U16')

In [26]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_bfs', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8971717171717172, 0.011681897853436847)

# Backward Feature Selection wrapping Logistic Regression Augumenting Org

In [27]:
sfs = sc.cache_result(
    'bf_sfs_lr2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = LogisticRegression(max_iter = 500), direction = 'backward', scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'mintemp', 'dewpoint', 'humidity', 'cloud',
       'sunshine', 'windspeed', 'cos_wd', 'sin_wd', 'chp', 'sunshine_1',
       'sunshine_3', 'sunshine_4', 'sunshine_6', 'cloud_4', 'cloud_5',
       'dewpoint_2', 'dewpoint_6', 'chp_3', 'chp_5', 'chp_6', 'cos_wd_3',
       'sin_wd_1', 'sin_wd_3', 'sin_wd_5', 'temparature_ma24',
       'mintemp_ma24', 'cos_wd_ma24', 'sin_wd_ma24', 'year',
       'winddirection'], dtype='<U16')

In [28]:
hparams = {
    'model_params': {},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_bfs_a', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8929517396184063, 0.009206929636066645)

# Forward Feature Selection wrapping LGBM

In [29]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8895819304152637, 0.9398553240740741)

In [None]:
sfs = sc.cache_result(
    'ff_sfs_lgb',
    lambda : SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

In [31]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8969472502805835, 0.9357702020202019)

In [32]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
cos_wd_5,8.8,6.610598,0.751204
sin_wd_6,9.0,6.123724,0.680414
sin_ed,5.6,3.714835,0.663363
mintemp_ma24,6.8,4.438468,0.652716
temparature,2.4,1.516575,0.631906
dewpoint_1,6.4,3.974921,0.621081
cloud_5,4.6,2.701851,0.587359
dewpoint_ma24,14.2,7.79102,0.548663
cos_wd_3,12.4,6.580274,0.530667
expected_day,10.0,5.0,0.5


In [33]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': df_imp.iloc[3:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs2', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8956874298540966, 0.9342052469135801)

In [34]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
cloud_2,2.2,2.04939,0.931541
cloud_5,5.6,5.029911,0.898198
sin_wd_2,5.6,4.615192,0.824141
cos_wd_3,8.2,6.723095,0.81989
year,2.4,1.949359,0.812233
cloud_6,5.4,3.781534,0.700284
dewpoint_4,13.8,9.628084,0.697687
dewpoint_ma24,13.0,8.860023,0.68154
mintemp_ma24,6.4,4.159327,0.649895
expected_day,9.8,6.340347,0.646974


# Forward Feature Selection wrapping LGBM Augumenting Org

In [96]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_lgb2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'chp',
       'sunshine_1', 'sunshine_5', 'cloud_2', 'cloud_3', 'cloud_6',
       'dewpoint_1', 'dewpoint_2', 'chp_2', 'chp_3', 'chp_4', 'chp_6',
       'cos_wd_1', 'sin_wd_5', 'maxtemp_ma24', 'temparature_ma24',
       'dewpoint_ma24', 'humidity_ma24', 'cloud_ma24', 'windspeed_ma24',
       'chp_ma24', 'year'], dtype='<U16')

In [97]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs_a', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8941694725028059, 0.9291066322568062)

In [98]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
chp_2,4.6,4.449719,0.96733
year,1.6,1.516575,0.947859
sin_wd_5,7.2,6.457554,0.896882
chp_4,2.2,1.788854,0.813116
temparature,4.8,3.63318,0.756913
dewpoint_1,6.2,4.494441,0.72491
cloud_2,6.0,3.741657,0.62361
chp_ma24,4.0,2.345208,0.586302
sunshine_1,8.6,4.929503,0.573198
cos_wd_1,5.6,3.130495,0.559017


In [99]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': df_imp.iloc[6:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs_a2', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8952160493827159, 0.9286553801395666)

# Backward Feature Selection wrapping LGBM

In [39]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfs_lgb',
    lambda : SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), direction = 'backward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['temparature', 'mintemp', 'sunshine', 'windspeed', 'cos_wd', 'chp',
       'sunshine_1', 'sunshine_6', 'cloud_1', 'cloud_5', 'cloud_6',
       'dewpoint_3', 'chp_1', 'chp_2', 'chp_3', 'chp_5', 'cos_wd_3',
       'cos_wd_4', 'sin_wd_3', 'sin_wd_4', 'sin_wd_6', 'pressure_ma24',
       'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24',
       'humidity_ma24', 'cloud_ma24', 'sunshine_ma24', 'sin_wd_ma24',
       'chp_ma24', 'year', 'winddirection'], dtype='<U16')

In [40]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_bfs', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8956874298540966, 0.9361584595959597)

# Forward Feature Selection wrapping XGB

In [41]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8913524130190797, 0.9442971380471381)

In [42]:
sfs = sc.cache_result(
    'ff_sfs_xgb',
    lambda : SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'sunshine', 'windspeed', 'chp', 'sunshine_3', 'sunshine_6',
       'cloud_1', 'cloud_6', 'dewpoint_2', 'dewpoint_5', 'chp_1', 'chp_3',
       'chp_4', 'chp_6', 'cos_wd_1', 'cos_wd_4', 'cos_wd_5', 'sin_wd_2',
       'sin_wd_5', 'sin_wd_6', 'maxtemp_ma24', 'temparature_ma24',
       'dewpoint_ma24', 'humidity_ma24', 'sunshine_ma24',
       'windspeed_ma24', 'sin_wd_ma24', 'year'], dtype='<U16')

In [43]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8972895622895625, 0.938904496352413)

In [44]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
mintemp,0.039779,0.021842,0.549098
cloud_6,0.011195,0.005909,0.527865
temparature,0.030662,0.015242,0.497081
sin_wd_ma24,0.016736,0.00765,0.457095
chp_1,0.024434,0.009823,0.402003
maxtemp,0.043826,0.016326,0.372526
maxtemp_ma24,0.016735,0.005493,0.328238
cos_wd_1,0.017793,0.00471,0.264693
windspeed_ma24,0.024015,0.006354,0.26457
dewpoint_2,0.027764,0.006754,0.243267


In [45]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': df_imp.iloc[2:].index.tolist()
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs2', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.897368125701459, 0.9388632856341189)

# Forward Feature Selection wrapping LGBM Augumenting Org

In [46]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_xgb2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: SequentialFeatureSelector(
            estimator = xgb.XGBClassifier(**hparams['model_params']), direction = 'forward', scoring = 'roc_auc', cv = skf
        ).fit(x[X_all], x[target])
    )
)
np.array(X_all)[sfs.get_support()]

array(['sin_ed', 'pressure', 'maxtemp', 'temparature', 'mintemp',
       'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'cos_wd',
       'sin_wd', 'chp', 'sunshine_1', 'sunshine_3', 'sunshine_6',
       'cloud_3', 'cloud_4', 'cloud_5', 'dewpoint_3', 'chp_2', 'chp_3',
       'cos_wd_1', 'cos_wd_2', 'cos_wd_6', 'sin_wd_2', 'temparature_ma24',
       'mintemp_ma24', 'cloud_ma24', 'sunshine_ma24', 'windspeed_ma24',
       'year'], dtype='<U16')

In [47]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': np.array(X_all)[sfs.get_support()].tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs_a', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8932379349046016, 0.932458459148739)

In [48]:
df_imp = pd.concat(
    [i['feature_importance'] for i in result['model_result']], axis=1
).agg(['mean', 'std'], axis=1).assign(
    CV = lambda x: x['std'] / x['mean'].abs()
).sort_values('CV', ascending = False)
df_imp.iloc[:10]

Unnamed: 0,mean,std,CV
year,0.005378,0.005091,0.946545
cloud_4,0.018069,0.009273,0.513186
dewpoint_3,0.01746,0.008724,0.499647
cos_wd_6,0.015537,0.006964,0.448243
chp_2,0.014191,0.006312,0.444766
cos_wd_2,0.017929,0.007543,0.42069
sin_wd_2,0.017178,0.006387,0.371787
mintemp,0.045677,0.015609,0.341731
sunshine_1,0.015253,0.004352,0.285315
maxtemp,0.03594,0.009467,0.263405


In [49]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': df_imp.iloc[6:].index.tolist(),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8944949494949496, 0.9314452025294265)

# Backward Feature Selection wrapping XGB

In [50]:
sfs = sc.cache_result(
    'bf_sfs_xgb',
    lambda : SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), direction = 'backward', scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])
)
np.array(X_all)[sfs.get_support()]

array(['pressure', 'dewpoint', 'sunshine', 'windspeed', 'cos_wd', 'chp',
       'sunshine_1', 'sunshine_2', 'sunshine_3', 'cloud_1', 'cloud_3',
       'cloud_4', 'cloud_5', 'cloud_6', 'dewpoint_2', 'dewpoint_3',
       'chp_3', 'chp_4', 'chp_6', 'cos_wd_1', 'cos_wd_2', 'cos_wd_4',
       'sin_wd_2', 'sin_wd_6', 'pressure_ma24', 'maxtemp_ma24',
       'temparature_ma24', 'mintemp_ma24', 'humidity_ma24',
       'sunshine_ma24', 'windspeed_ma24', 'chp_ma24'], dtype='<U16')

In [51]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': np.array(X_all)[sfs.get_support()].tolist()
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_bfs', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8943265993265992, 0.9391373807519642)

# Stepwise Feature Selections wrapping Logistic Regression

In [52]:
from mlxtend import feature_selection as mfs

In [None]:
sfs = sc.cache_result(
    'ff_sfsf_lr',
    lambda : mfs.SequentialFeatureSelector(
        estimator = LogisticRegression(), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

In [54]:
hparams = {
    'model_params': {},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_f', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8990460157126824, 0.01061797500616197)

In [None]:
sfs = sc.cache_result(
    'ff_sfsf_lr2',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: mfs.SequentialFeatureSelector(
            estimator = LogisticRegression(), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(sfs.k_feature_names_)

In [56]:
hparams = {
    'model_params': {},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_sfs_f2', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8960437710437711, 0.011426817196481703)

In [100]:
sfs = sc.cache_result(
    'bf_sfsf_lr',
    lambda : mfs.SequentialFeatureSelector(
        estimator = LogisticRegression(), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'mintemp', 'dewpoint', 'cloud', 'sunshine', 'windspeed',
       'chp', 'sunshine_1', 'sunshine_4', 'cloud_3', 'cloud_4',
       'cos_wd_4', 'temparature_ma24', 'humidity_ma24', 'cos_wd_ma24',
       'sin_wd_ma24', 'year'], dtype='<U16')

In [101]:
hparams = {
    'model_params': {},
    'X_num': list(sfs.k_feature_names_),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, lr_adapter, result_proc = [sgml.lr_learning_result])
sc.cv_result('lr_bfs_f', hparams, lr_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8954320987654322, 0.012407271619931246)

In [None]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_lgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

In [60]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_sfs_f', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8975448933782267, 0.9289444795173962)

In [None]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfs_lgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

In [62]:
hparams = {
    'model_params': {'num_leaves': 7, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
sc.cv_result('lgb_bfs_f', hparams, lgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.897368125701459, 0.9331044823232324)

In [None]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfs_xgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

In [64]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_sfs_f', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8985101010101012, 0.9348846099887765)

In [None]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfs_xgb_f',
    lambda : mfs.SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
np.array(sfs.k_feature_names_)

In [66]:
hparams = {
    'model_params': {'max_depth': 3, 'n_estimators': 100, 'colsample_bytree': 0.75, 'learning_rate': 0.04},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
sc.cv_result('xgb_bfs_f', hparams, xgb_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8983894500561167, 0.9343153759820426)

In [67]:
hparams = {
    'model_params': {'C': 0.01, 'probability': True, 'kernel': 'linear'},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8915151515151514, 0.9011121632996633)

In [None]:
sfs = sc.cache_result(
    'ff_sfsf_lsvc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

In [69]:
hparams = {
    'model_params': {'C': 0.01, 'probability': True, 'kernel': 'linear'},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('lsvc_sfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8986223344556679, 0.899520202020202)

In [None]:
sfs = sc.cache_result(
    'bf_sfsf_lsvc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

In [71]:
hparams = {
    'model_params': {'C': 0.01, 'probability': True, 'kernel': 'linear'},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('lsvc_bfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.899074074074074, 0.8999391484287319)

In [72]:
hparams = {
    'model_params': {'C': 0.1, 'probability': True, 'kernel': 'poly', 'degree': 2, 'coef0': 1},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8930920314253648, 0.9092631172839505)

In [None]:
sfs = sc.cache_result(
    'ff_sfsf_p2svc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

In [74]:
hparams = {
    'model_params': {'C': 0.1, 'probability': True, 'kernel': 'poly', 'degree': 2, 'coef0': 1},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('p2svc_sfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.90087822671156, 0.907370405443322)

In [None]:
sfs = sc.cache_result(
    'bf_sfsf_p2svc',
    lambda : mfs.SequentialFeatureSelector(
        estimator = SVC(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target])
)
np.array(sfs.k_feature_names_)

In [76]:
hparams = {
    'model_params': {'C': 0.1, 'probability': True, 'kernel': 'poly', 'degree': 2, 'coef0': 1},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, svc_adapter)
sc.cv_result('p2svc_bfs_f', hparams, svc_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.9008726150392817, 0.9073697039842873)

In [87]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': X_all
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8661475869809203, 0.8825776865881032)

In [91]:
sfs = sc.cache_result(
    'ff_sfsf_knn70',
    lambda : mfs.SequentialFeatureSelector(
        estimator = KNeighborsClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = 0
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'dewpoint', 'cloud', 'sunshine', 'chp', 'dewpoint_2',
       'chp_1', 'cos_wd_4', 'cos_wd_6', 'sin_wd_3', 'sunshine_ma24',
       'chp_ma24', 'year'], dtype='<U13')

In [92]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
sc.cv_result('knn70_sfs_f', hparams, knn_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8974354657687991, 0.9070700056116723)

In [103]:
sfs = sc.cache_result(
    'bf_sfsf_knn70',
    lambda : mfs.SequentialFeatureSelector(
        estimator = KNeighborsClassifier(**hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
    ).fit(df_train[X_all], df_train[target]), rerun = 0
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'mintemp', 'humidity', 'cloud', 'sunshine', 'windspeed',
       'chp', 'sunshine_3', 'cloud_6', 'dewpoint_4', 'dewpoint_6',
       'chp_2', 'cos_wd_3', 'cos_wd_4', 'cos_wd_6', 'sin_wd_3',
       'sin_wd_4', 'mintemp_ma24', 'dewpoint_ma24', 'chp_ma24'],
      dtype='<U13')

In [104]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
sc.cv_result('knn70_bfs_f', hparams, knn_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8981228956228955, 0.904110725308642)

In [105]:
sfs = sc.cache_result(
    'ff_sfsf_knn702',
    lambda : pd.concat([df_train, df_org]).pipe(
        lambda x: mfs.SequentialFeatureSelector(
            estimator = KNeighborsClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf, n_jobs = -1
        ).fit(x[X_all], x[target])
    )
)
np.array(sfs.k_feature_names_)

array(['sin_ed', 'temparature', 'mintemp', 'dewpoint', 'humidity',
       'cloud', 'sunshine', 'windspeed', 'sunshine_6', 'dewpoint_1',
       'dewpoint_6', 'sin_wd_6', 'pressure_ma24', 'maxtemp_ma24',
       'temparature_ma24', 'mintemp_ma24', 'dewpoint_ma24', 'cloud_ma24',
       'sunshine_ma24', 'expected_day', 'winddirection'], dtype='<U16')

In [107]:
hparams = {
    'model_params': {'n_neighbors': 70},
    'X_num': list(sfs.k_feature_names_),
    'train_data_proc_param': {'include_org': True}
}
result = sgml.cv(df_train, skf, hparams, config, knn_adapter)
sc.cv_result('knn70_sfs_f2', hparams, knn_adapter, result['valid_prd'])
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8916189674523007, 0.8991573995711537)

# Feature Selection Analysis

In [77]:
result_sk = ['ff_sfs_lr', 'ff_sfs_lr2', 'bf_sfs_lr', 'bf_sfs_lr2', 'ff_sfs_lgb', 'ff_sfs_lgb2', 'bf_sfs_lgb', 'ff_sfs_xgb', 'ff_sfs_xgb2', 'bf_sfs_xgb']
selected_sk = [
    np.array(X_all)[sc.read_result(i).get_support()].tolist()
    for i in result_sk
]

In [78]:
pd.Series(selected_sk, index = result_sk).explode().value_counts().iloc[:10]

temparature_ma24    10
sunshine            10
windspeed           10
chp                 10
chp_3                9
year                 9
mintemp              9
dewpoint             9
sunshine_1           8
sin_ed               8
Name: count, dtype: int64

In [109]:
result_mfs = [
    'ff_sfsf_lr', 'ff_sfsf_lr2', 'bf_sfsf_lr', 'ff_sfs_lgb_f', 'bf_sfs_lgb_f', 'ff_sfs_xgb_f', 'bf_sfs_xgb_f', 'ff_sfsf_lsvc', 'bf_sfsf_lsvc',
    'ff_sfsf_p2svc', 'bf_sfsf_p2svc', 'ff_sfsf_knn70', 'bf_sfsf_knn70', 'ff_sfsf_knn702'
]
selected_mfs = [
    list(sc.read_result(i).k_feature_names_)
    for i in result_mfs
]

In [110]:
pd.Series(selected_mfs, index = result_mfs).explode().value_counts().iloc[:10]

sunshine            14
mintemp             13
sin_ed              12
dewpoint            12
windspeed           12
chp                 11
cos_wd_4            10
temparature_ma24     9
year                 8
cloud_3              8
Name: count, dtype: int64

In [114]:
pd.concat([
    pd.Series(selected_mfs, index = result_mfs).explode(),
    pd.Series(selected_mfs, index = result_mfs).explode()
]).value_counts().iloc[:30]

sunshine            28
mintemp             26
sin_ed              24
dewpoint            24
windspeed           24
chp                 22
cos_wd_4            20
temparature_ma24    18
year                16
cloud_3             16
dewpoint_ma24       14
dewpoint_2          14
cos_wd_ma24         14
maxtemp_ma24        14
humidity_ma24       14
cloud               12
sunshine_1          10
sunshine_ma24       10
temparature         10
sin_wd_ma24         10
cloud_6             10
cloud_1             10
sin_wd_6             8
humidity             8
dewpoint_6           8
mintemp_ma24         8
pressure             8
cos_wd_5             6
sunshine_3           6
sin_wd_4             6
Name: count, dtype: int64