In [1]:
import os

import pandas as pd
import polars as pl
import numpy as np
import seaborn as sns
import sgpp, sgml, sgutil
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Cache helper from the project's sgutil module. The three arguments look like
# directory names for images, results and models -- TODO confirm against sgutil.
sc = sgutil.SGCache('img', 'result', 'model')
# NOTE(review): joblib.load unpickles arbitrary objects, so these files must come
# from a trusted source.
data_processor = joblib.load('model/data_processor2.joblib')
data_processor_org = joblib.load('model/data_processor_org2.joblib')
# Each CSV goes through its own persisted processor.
# presumably both calls return pandas DataFrames (they are fed to pd.concat) -- verify.
df_train = data_processor.transform(['data/train_lb.csv'])
df_org = data_processor_org.transform(['data/Rainfall.csv'])
# Name of the label column.
target = 'rainfall'

In [2]:
# Rows of df_train followed by rows of df_org, stacked along the row axis.
# ignore_index is not set, so each source keeps its own index labels.
df_train_org = pd.concat((df_train, df_org), axis = 'index')

In [3]:
# Candidate feature pool. Column names are reproduced exactly as spelled in the
# original list (including 'temparature').
X_sfs = [
    'pressure', 'temparature', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'windspeed',
    'sin_wd', 'cos_wd', 'year', 'chp', 'shp',
]
# "<column>_1" .. "<column>_3" variants (presumably lagged values -- TODO confirm).
X_sfs += [
    '{}_{}'.format(col, step)
    for col in ('sunshine', 'cloud', 'chp', 'shp')
    for step in (1, 2, 3)
]
# "<column>_ma24" variants (presumably 24-step moving averages -- TODO confirm).
X_sfs += [
    '{}_ma24'.format(col)
    for col in (
        'maxtemp', 'temparature', 'dewpoint', 'humidity', 'cloud',
        'sunshine', 'windspeed', 'chp', 'shp',
    )
]

In [4]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from mlxtend.feature_selection import SequentialFeatureSelector

def get_validation_splitter(validation_fraction):
    """Build a one-argument splitter that holds out a fraction of its input.

    Args:
        validation_fraction: forwarded as ``test_size`` to scikit-learn's
            ``train_test_split``.

    Returns:
        A callable ``f(x)`` that returns
        ``train_test_split(x, test_size=validation_fraction)``.
    """
    # Imported locally: the notebook's import cells do not bring train_test_split
    # into scope, so without this the returned splitter fails with NameError.
    from sklearn.model_selection import train_test_split

    return lambda x: train_test_split(x, test_size = validation_fraction)

def include_org(df, include_org = False):
    """Optionally append the notebook-level ``df_org`` frame to ``df``.

    Args:
        df: the frame to pass through or extend.
        include_org: when truthy, return ``pd.concat([df, df_org])``;
            otherwise return ``df`` itself, unchanged.

    Returns:
        ``df`` as given, or the concatenation of ``df`` and ``df_org``.
    """
    if not include_org:
        return df
    return pd.concat([df, df_org])

# Shared settings passed to sc.cv_result. How each key is interpreted is decided by
# the project's sgml/sgutil code, so the notes below describe only the values set here.
config = {
    # Column 1 of predict_proba, returned as a Series aligned to df's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index = df.index),
    # ROC AUC of the predictions against the target column of df.
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    # The factory itself is stored here: it takes a fraction and returns the split function.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    # presumably asks the CV routine to report training-fold scores too -- TODO confirm in sgml.
    'return_train_scores': True,
    # Function that can append df_org to the frame it receives, controlled by its include_org flag.
    'train_data_proc': include_org,
    # Label column name.
    'y': target,
}

# One sgml adapter per estimator class. What the adapters do is defined in the
# project's sgml module and is not visible from this notebook.
lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)
knn_adapter = sgml.SklearnAdapter(KNeighborsClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
# NOTE(review): cb_adapter is not referenced in the cells visible here.
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

# 5-fold stratified, shuffled CV with a fixed seed.
skf = StratifiedKFold(5, random_state = 123, shuffle = True)
# Single stratified shuffle split with a fixed seed.
# NOTE(review): ss is not referenced in the cells visible here.
ss = StratifiedShuffleSplit(1, random_state = 123)

In [5]:
# Each entry: (short name, estimator class, sgml adapter, estimator hyper-parameters).
wrappers = [
    ('lr', LogisticRegression, lr_adapter, dict()),
    (
        'lgb', lgb.LGBMClassifier, lgb_adapter,
        dict(num_leaves = 7, n_estimators = 100, colsample_bytree = 0.75, learning_rate = 0.03, verbose = 0),
    ),
    (
        'xgb', xgb.XGBClassifier, xgb_adapter,
        dict(max_depth = 3, n_estimators = 100, colsample_bytree = 0.75, learning_rate = 0.04),
    ),
    ('knn', KNeighborsClassifier, knn_adapter, dict(n_neighbors = 70)),
    (
        'svc', SVC, svc_adapter,
        dict(C = 0.1, probability = True, kernel = 'poly', degree = 2, coef0 = 1),
    ),
]
# Each entry: (tag used in the model name, value for the selector's `forward` flag).
forward = [('fwd', True), ('bwd', False)]
# Each entry: (tag used in the model name, whether the original dataset is included).
inc_org = [('no', False), ('yes', True)]

In [None]:
# For every estimator x search direction x data variant: obtain a floating sequential
# feature search through sc.cache_result, then cross-validate the model on the
# selected features and record the scores.
results = []
for (m, wrapper, adapter, hparams), (f_name, f), (i_name, inc) in (
    (w, d, o) for w in wrappers for d in forward for o in inc_org
):
    model_name = 'dp2_' + '_'.join((m, f_name, i_name))
    # The feature search sees the original-dataset rows only for the 'yes' variant.
    df = df_train_org if inc else df_train
    # Estimator and selector are created inside the callable, so they are only
    # constructed if sc.cache_result actually invokes it.
    sfs = sc.cache_result(
        model_name,
        lambda: SequentialFeatureSelector(
            wrapper(**hparams),
            k_features = 'best',
            forward = f,
            floating = True,
            cv = skf,
            scoring = 'roc_auc',
        ).fit(df[X_sfs], df[target]),
        rerun = False,
    )
    hparams_cv = dict(
        model_params = hparams,
        X_num = list(sfs.k_feature_names_),
        train_data_proc_param = dict(include_org = inc),
    )
    # cv_result is always given df_train; the include_org flag above is what
    # carries the data-variant choice into the CV run.
    valid_scores = sc.cv_result(model_name, df_train, skf, hparams_cv, config, adapter)
    results.append(
        (model_name, sfs.k_feature_names_, sfs.k_score_, np.mean(valid_scores), np.std(valid_scores))
    )
    print(results[-1])