In [1]:
import os

import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import sgpp, sgml, sgutil
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Cache helper from the project-local sgutil module. The three arguments look like
# directory names for images, results and models — TODO confirm against sgutil.SGCache.
sc = sgutil.SGCache('img', 'result', 'model')
# Load the persisted data processors for the competition data and for the original dataset.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files produced by this project.
data_processor = joblib.load('model/data_processor2.joblib')
data_processor_org = joblib.load('model/data_processor_org2.joblib')
# Run each processor over its CSV file; later cells treat both results as pandas
# DataFrames (they are passed to pd.concat and indexed by column lists).
df_train = data_processor.transform(['data/train_lb.csv'])
df_org = data_processor_org.transform(['data/Rainfall.csv'])
# Name of the target column used for scoring and model fitting in later cells.
target = 'rainfall'

In [2]:
# Stack the competition training rows and the original dataset rows into one frame
# (row-wise concatenation is pd.concat's default axis).
df_train_org = pd.concat([df_train, df_org])

In [3]:
# Candidate feature pool for sequential feature selection.
# Base columns first ...
X_sfs = [
    'pressure', 'temparature', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed',
    'sin_wd', 'cos_wd', 'year', 'chp', 'shp',
]
# ... then the '<column>_1' .. '<column>_3' variants of four columns ...
X_sfs.extend(
    '{}_{}'.format(base, step)
    for base in ['sunshine', 'cloud', 'chp', 'shp']
    for step in range(1, 4)
)
# ... and finally the '<column>_ma24' variants.
X_sfs.extend(
    '{}_ma24'.format(base)
    for base in ['maxtemp', 'temparature', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'chp', 'shp']
)

In [4]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from mlxtend.feature_selection import SequentialFeatureSelector

def get_validation_splitter(validation_fraction, random_state = None):
    """Build a callable that splits a dataset into train and validation parts.

    Parameters
    ----------
    validation_fraction
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Optional seed forwarded to ``train_test_split``. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    callable
        ``split(x)`` returning whatever ``train_test_split(x, ...)`` returns.
    """
    def split(x):
        # train_test_split is not imported in the notebook's import cells, so the
        # original lambda raised NameError when called; import it where it is used.
        from sklearn.model_selection import train_test_split
        return train_test_split(x, test_size = validation_fraction, random_state = random_state)

    return split

def include_org(df, include_org = False):
    """Return ``df``, optionally with the module-level ``df_org`` appended.

    When ``include_org`` is falsy the input frame is returned unchanged (same
    object); otherwise ``df`` and ``df_org`` are concatenated with ``pd.concat``.
    """
    if not include_org:
        return df
    return pd.concat([df, df_org])

# Shared settings passed to sc.cv_result for every model in this notebook.
config = {
    # Second column of predict_proba (class index 1), aligned to the frame's index.
    'predict_func': lambda m, df, X: pd.Series(
        m.predict_proba(df[X])[:, 1], index = df.index
    ),
    # ROC AUC of the predictions against the target column.
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': True,
    # include_org can append df_org to the training frame.
    'train_data_proc': include_org,
    'y': target,
}

# One sgml adapter per estimator family.
lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)
knn_adapter = sgml.SklearnAdapter(KNeighborsClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

# Seeded splitters: 5-fold stratified CV and a single stratified shuffle split.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
ss = StratifiedShuffleSplit(n_splits = 1, random_state = 123)

In [5]:
# Estimators to run feature selection with.
# Each entry: (short name, estimator class, sgml adapter, constructor hyper-parameters).
wrappers = [
    ('lr', LogisticRegression, lr_adapter, {'solver': 'liblinear'}),
    (
        'lgb', lgb.LGBMClassifier, lgb_adapter,
        {
            'num_leaves': 7,
            'n_estimators': 100,
            'colsample_bytree': 0.75,
            'learning_rate': 0.03,
            'verbose': 0,
        },
    ),
    (
        'xgb', xgb.XGBClassifier, xgb_adapter,
        {
            'max_depth': 3,
            'n_estimators': 100,
            'colsample_bytree': 0.75,
            'learning_rate': 0.04,
        },
    ),
    ('knn', KNeighborsClassifier, knn_adapter, {'n_neighbors': 70}),
    (
        'svc', SVC, svc_adapter,
        {
            'C': 0.1,
            'probability': True,
            'kernel': 'poly',
            'degree': 2,
            'coef0': 1,
        },
    ),
]
# Search direction: (label used in the model name, value for the selector's `forward` flag).
forward = [('fwd', True), ('bwd', False)]
# Whether the original dataset is added to the training data: (label, flag).
inc_org = [('no', False), ('yes', True)]

In [6]:
# For every estimator x search direction x data variant:
#   1. run (or load from cache) a floating sequential feature selection,
#   2. cross-validate the estimator on the selected features,
#   3. record the selected features and both scores.
results = []
for m, wrapper, adapter, hparams in wrappers:
    for f_name, f in forward:
        for i_name, inc in inc_org:
            model_name = 'dp2_{}_{}_{}'.format(m, f_name, i_name)
            # Feature selection is fitted on the augmented frame when the original data is included.
            if inc:
                df = df_train_org
            else:
                df = df_train
            sfs = sc.cache_result(
                model_name,
                lambda : SequentialFeatureSelector(
                    wrapper(**hparams),
                    k_features = 'best',
                    forward = f,
                    floating = True,
                    scoring = 'roc_auc',
                    cv = skf,
                ).fit(df[X_sfs], df[target]),
                rerun = False,
            )
            hparams_cv = {
                'model_params': hparams,
                'X_num': list(sfs.k_feature_names_),
                'train_data_proc_param': {'include_org': inc},
            }
            # Cross-validation always runs on df_train; the flag above is forwarded
            # as the parameter for config['train_data_proc'].
            valid_scores = sc.cv_result(model_name, df_train, skf, hparams_cv, config, adapter)
            results.append((
                model_name,
                sfs.k_feature_names_,
                sfs.k_score_,
                np.mean(valid_scores),
                np.std(valid_scores),
            ))
            print(results[-1])

('dp2_lr_fwd_no', ('temparature', 'dewpoint', 'cloud', 'sunshine', 'windspeed', 'chp', 'sunshine_1', 'sunshine_2', 'sunshine_3', 'cloud_3', 'chp_3', 'shp_3', 'temparature_ma24', 'cloud_ma24', 'shp_ma24'), 0.8950583687072656, 0.8950583687072656, 0.006448044455650506)
('dp2_lr_fwd_yes', ('pressure', 'temparature', 'dewpoint', 'humidity', 'cloud', 'windspeed', 'cos_wd', 'shp', 'chp_2', 'chp_3', 'shp_1', 'maxtemp_ma24', 'temparature_ma24', 'cloud_ma24', 'shp_ma24'), 0.8953128055554214, 0.8935524528185506, 0.007775189327468324)
('dp2_lr_bwd_no', ('temparature', 'dewpoint', 'cloud', 'sunshine', 'windspeed', 'chp', 'sunshine_1', 'sunshine_2', 'cloud_3', 'chp_3', 'dewpoint_ma24', 'cloud_ma24', 'sunshine_ma24', 'shp_ma24'), 0.8954590156922162, 0.8954590156922162, 0.006433819913575707)
('dp2_lr_bwd_yes', ('temparature', 'dewpoint', 'humidity', 'cloud', 'windspeed', 'shp', 'cloud_3', 'shp_1', 'temparature_ma24', 'dewpoint_ma24', 'cloud_ma24', 'shp_ma24'), 0.8948968610395, 0.8941136735399058, 0.00

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_xgb_bwd_no', ('dewpoint', 'humidity', 'cloud', 'sunshine', 'cos_wd', 'year', 'chp', 'cloud_1', 'cloud_3', 'chp_1', 'shp_1', 'maxtemp_ma24', 'humidity_ma24'), 0.89444299340767, 0.8931039062731736, 0.006095772049457402)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_xgb_bwd_yes', ('pressure', 'temparature', 'humidity', 'cloud', 'sin_wd', 'cos_wd', 'year', 'chp', 'shp', 'sunshine_2', 'chp_2', 'maxtemp_ma24', 'cloud_ma24', 'chp_ma24', 'shp_ma24'), 0.8949212965273965, 0.8920217312237249, 0.005096930848130386)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_knn_fwd_no', ('dewpoint', 'cos_wd', 'year', 'chp', 'shp', 'cloud_1', 'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24', 'sunshine_ma24', 'chp_ma24'), 0.8917624541238943, 0.8917624541238943, 0.007379280513942331)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_knn_fwd_yes', ('dewpoint', 'humidity', 'cloud', 'cos_wd', 'shp', 'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24', 'cloud_ma24', 'sunshine_ma24', 'windspeed_ma24'), 0.8886649170445826, 0.8873120265673309, 0.005673901312227002)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_knn_bwd_no', ('dewpoint', 'sunshine', 'year', 'chp'), 0.8920137065270921, 0.8920137065270921, 0.0054132938847248485)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_knn_bwd_yes', ('dewpoint', 'humidity', 'cloud', 'sunshine', 'cos_wd', 'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24', 'cloud_ma24', 'sunshine_ma24', 'shp_ma24'), 0.8888580303073406, 0.888366198235851, 0.005165340147978148)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_svc_fwd_no', ('pressure', 'temparature', 'dewpoint', 'sunshine', 'windspeed', 'chp', 'sunshine_1', 'sunshine_3', 'cloud_2', 'cloud_3', 'chp_3', 'shp_1', 'shp_2', 'shp_3', 'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24'), 0.8912819318125342, 0.8912819318125342, 0.007704754375711072)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_svc_fwd_yes', ('pressure', 'temparature', 'dewpoint', 'cloud', 'windspeed', 'chp', 'shp', 'sunshine_2', 'cloud_1', 'chp_1', 'chp_2', 'chp_3', 'shp_1', 'maxtemp_ma24', 'cloud_ma24', 'sunshine_ma24', 'windspeed_ma24', 'chp_ma24', 'shp_ma24'), 0.8937486749014664, 0.8911534823998604, 0.006126517923065173)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_svc_bwd_no', ('pressure', 'temparature', 'dewpoint', 'sunshine', 'windspeed', 'sin_wd', 'chp', 'sunshine_2', 'shp_1', 'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24', 'humidity_ma24', 'cloud_ma24', 'sunshine_ma24', 'shp_ma24'), 0.8923372553560839, 0.8923421960675464, 0.005885474690354851)


Fold:   0%|          | 0/5 [00:00<?, ?it/s]

('dp2_svc_bwd_yes', ('pressure', 'temparature', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed', 'chp', 'sunshine_1', 'sunshine_2', 'cloud_1', 'chp_1', 'chp_2', 'chp_3', 'shp_1', 'maxtemp_ma24', 'temparature_ma24', 'dewpoint_ma24', 'cloud_ma24', 'sunshine_ma24', 'windspeed_ma24', 'chp_ma24', 'shp_ma24'), 0.893710288418375, 0.8905582585289826, 0.0061987199738983015)


In [17]:
# Tabulate the feature-selection runs, best mean validation score first.
df_fs_result = pd.DataFrame(
    results, columns = ['model', 'features', 'k_score', 'm', 'std']
).sort_values('m', ascending = False)
# Replace the 'model' column with its '_'-separated parts as integer-named columns
# (for names like 'dp2_lr_fwd_no': 0 = prefix, 1 = estimator, 2 = direction, 3 = original-data flag).
df_fs_result = df_fs_result.join(
    df_fs_result.pop('model').str.split('_', expand = True)
)

In [23]:
# Count how often each feature was selected across the xgb / lgb feature-selection runs
# (column 1 holds the estimator part of the model name).
gb_feat_cnt = (
    df_fs_result
    .loc[df_fs_result[1].isin(['xgb', 'lgb']), 'features']
    .explode()
    .value_counts()
)
# Keep the features selected in at least 4 of those runs.
X_gb = gb_feat_cnt[gb_feat_cnt >= 4].index.tolist()
gb_feat_cnt[gb_feat_cnt >= 4]

features
cloud               8
cloud_ma24          7
cos_wd              7
pressure            6
chp                 6
chp_ma24            6
humidity            6
sunshine            5
chp_1               5
shp_ma24            4
sin_wd              4
temparature_ma24    4
dewpoint            4
maxtemp_ma24        4
year                4
Name: count, dtype: int64

In [42]:
# CatBoost cross-validated on the X_gb feature subset
# (no 'train_data_proc_param' is passed for this run).
hparams = {
    'model_params': {
        'max_depth': 4,
        'n_estimators': 200,
        'learning_rate': 0.04,
    },
    'X_num': X_gb,
}
valid_scores = sc.cv_result(
    'dp2_cb_fg4_no',
    df_train,
    skf,
    hparams,
    config,
    cb_adapter,
    rerun = False,
)
np.mean(valid_scores)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.8911669125113224

In [71]:
# LightGBM cross-validated on the X_gb feature subset
# (no 'train_data_proc_param' is passed for this run).
hparams = {
    'model_params': {
        'num_leaves': 3,
        'n_estimators': 1000,
        'colsample_bytree': 0.25,
        'subsample': 0.25,
        'subsample_freq': 1,
        'learning_rate': 0.01,
    },
    'X_num': X_gb,
}
valid_scores = sc.cv_result(
    'dp2_lgb_fg4_no',
    df_train,
    skf,
    hparams,
    config,
    lgb_adapter,
    rerun = False,
)
np.mean(valid_scores)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.8931480685626662

In [86]:
# XGBoost cross-validated on the X_gb feature subset
# (no 'train_data_proc_param' is passed for this run).
hparams = {
    'model_params': {
        'max_depth': 3,
        'n_estimators': 1000,
        'colsample_bytree': 0.25,
        'subsample': 0.25,
        'learning_rate': 0.01,
    },
    'X_num': X_gb,
}
# NOTE(review): rerun = True differs from the other CV cells (rerun = False); it presumably
# forces recomputation instead of reusing the cached result — confirm this is intended.
valid_scores = sc.cv_result(
    'dp2_xgb_fg4_no',
    df_train,
    skf,
    hparams,
    config,
    xgb_adapter,
    rerun = True,
)
np.mean(valid_scores)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.8931438539425409