In [2]:
import pandas as pd
import numpy as np
import polars as pl
import sgml, sgutil, sgpp
import joblib
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.pipeline import make_pipeline
sc = sgutil.SGCache('img', 'result', 'model')
data_processor = joblib.load('model/data_processor.joblib')
data_processor_org = joblib.load('model/data_processor_org.joblib')
df_train = data_processor.transform(['data/train.csv'])
df_org = data_processor_org.transform(['data/Rainfall.csv'])
target = 'rainfall'

In [3]:
X_all = [i for i in df_train.columns if i not in [target, 'day']]
selected = [
    np.array(X_all)[sc.read_result(i).get_support()].tolist()
    for i in ['ff_sfs_lgb2', 'bf_sfs_lgb', 'ff_sfs_xgb2', 'bf_sfs_xgb']
] + [
    list(sc.read_result(i).k_feature_names_)
    for i in ['ff_sfs_lgb_f', 'bf_sfs_lgb_f', 'ff_sfs_xgb_f', 'bf_sfs_xgb_f']
]

In [4]:
s_val = pd.Series(selected).explode().value_counts()
len(s_val)

58

In [5]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

def include_org(df, include_org = False):
    return pd.concat([df, df_org]) if include_org else df

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index = df.index),
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'train_data_proc': include_org,
    'y': target,
}

lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)
knn_adapter = sgml.SklearnAdapter(KNeighborsClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

skf = StratifiedKFold(5, random_state = 123, shuffle = True)
ss = StratifiedShuffleSplit(1, random_state = 123)

In [6]:
import mlxtend.feature_selection as mfs

In [7]:
hparams = {
    'model_params': {'num_leaves': 3, 'n_estimators': 250, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfsf2_lgb',
    lambda : mfs.SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
list(sfs.k_feature_names_)

['sin_ed',
 'pressure',
 'maxtemp',
 'mintemp',
 'sunshine',
 'windspeed',
 'chp',
 'dewpoint_2',
 'chp_3',
 'cos_wd_1',
 'cos_wd_4',
 'sin_wd_2',
 'dewpoint_ma24',
 'humidity_ma24']

In [8]:
hparams = {
    'model_params' : {'num_leaves': 3, 'n_estimators': 250, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8991638608305275, 0.9204527918069585)

In [9]:
sc.cv_result('lgb2_sfs', hparams, lgb_adapter, result['valid_prd'])

In [10]:
hparams = {
    'model_params': {'num_leaves': 3, 'n_estimators': 250, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'bf_sfsf2_lgb',
    lambda : mfs.SequentialFeatureSelector(
        estimator = lgb.LGBMClassifier(verbose = 0, **hparams['model_params']), k_features = 'best', forward = False, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
list(sfs.k_feature_names_)

['sin_ed',
 'pressure',
 'maxtemp',
 'sunshine',
 'windspeed',
 'chp',
 'cloud_5',
 'dewpoint_2',
 'dewpoint_5',
 'chp_3',
 'chp_6',
 'cos_wd_1',
 'cos_wd_4',
 'sin_wd_2',
 'dewpoint_ma24',
 'humidity_ma24']

In [11]:
hparams = {
    'model_params' : {'num_leaves': 3, 'n_estimators': 250, 'colsample_bytree': 0.75, 'learning_rate': 0.03},
    'X_num': list(sfs.k_feature_names_)
}
result = sgml.cv(df_train, skf, hparams, config, lgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8990375982042649, 0.9203475729517396)

In [12]:
sc.cv_result('lgb2_bfs', hparams, lgb_adapter, result['valid_prd'])

In [13]:
hparams = {
    'model_params': {'max_depth': 2, 'n_estimators': 150, 'colsample_bytree': 0.5, 'learning_rate': 0.02},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfsf2_xgb',
    lambda : mfs.SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target])

)
list(sfs.k_feature_names_)

['sin_ed',
 'pressure',
 'temparature',
 'mintemp',
 'cloud',
 'sunshine',
 'windspeed',
 'sin_wd',
 'chp',
 'sunshine_5',
 'cloud_3',
 'cloud_5',
 'cloud_6',
 'dewpoint_1',
 'dewpoint_2',
 'dewpoint_5',
 'chp_2',
 'chp_3',
 'cos_wd_1',
 'cos_wd_2',
 'cos_wd_4',
 'sin_wd_1',
 'sin_wd_2',
 'sin_wd_6',
 'temparature_ma24',
 'mintemp_ma24',
 'dewpoint_ma24',
 'cloud_ma24',
 'sunshine_ma24',
 'sin_wd_ma24',
 'chp_ma24',
 'winddirection']

In [14]:
hparams = {
    'model_params': {'max_depth': 2, 'n_estimators': 200, 'colsample_bytree': 0.5, 'learning_rate': 0.03},
    'X_num': X_all
}
sfs = sc.cache_result(
    'ff_sfsf2_xgb',
    lambda : mfs.SequentialFeatureSelector(
        estimator = xgb.XGBClassifier(**hparams['model_params']), k_features = 'best', forward = True, floating = True, scoring = 'roc_auc', cv = skf
    ).fit(df_train[X_all], df_train[target]), rerun = 0
)
list(sfs.k_feature_names_)

['sin_ed',
 'pressure',
 'temparature',
 'mintemp',
 'cloud',
 'sunshine',
 'windspeed',
 'sin_wd',
 'chp',
 'sunshine_5',
 'cloud_3',
 'cloud_5',
 'cloud_6',
 'dewpoint_1',
 'dewpoint_2',
 'dewpoint_5',
 'chp_2',
 'chp_3',
 'cos_wd_1',
 'cos_wd_2',
 'cos_wd_4',
 'sin_wd_1',
 'sin_wd_2',
 'sin_wd_6',
 'temparature_ma24',
 'mintemp_ma24',
 'dewpoint_ma24',
 'cloud_ma24',
 'sunshine_ma24',
 'sin_wd_ma24',
 'chp_ma24',
 'winddirection']

In [18]:
hparams = {
    'model_params' : {'max_depth': 2, 'n_estimators': 200, 'colsample_bytree': 0.5, 'learning_rate': 0.03},
    'X_num': ['sin_ed', 'pressure', 'mintemp', 'sunshine', 'windspeed', 'chp', 'sunshine_5', 'cloud_3',
              'dewpoint_2', 'chp_2', 'chp_3', 'chp_4', 'chp_6', 'cos_wd_1', 'cos_wd_4', 'dewpoint_ma24', 'humidity_ma24', 'year']
}
result = sgml.cv(df_train, skf, hparams, config, xgb_adapter)
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.8976711560044894, 0.9219355008417509)

In [None]:
sc.cv_result('xgb2_sfs', hparams, xgb_adapter, result['valid_prd'], overwrite = False)