In [7]:
import pandas as pd
import numpy as np
import sgml, sgutil
import joblib
from itertools import product
import seaborn as sns
import matplotlib.pyplot as plt

In [54]:
sc = sgutil.SGCache('img', 'result', 'model')
data_processor = joblib.load('model/data_processor2.joblib')
data_processor_org = joblib.load('model/data_processor_org2.joblib')
df_train = data_processor.transform(['data/train_lb.csv'])
df_test = data_processor.transform(['data/test.csv'])
df_org = data_processor_org.transform(['data/Rainfall.csv'])
target = 'rainfall'

In [59]:
df_lbp = pd.read_csv('data/lb_probing.csv', index_col = 'id').iloc[:146].astype(int)

In [5]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from mlxtend.feature_selection import SequentialFeatureSelector

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size = validation_fraction)

def include_org(df, include_org = False):
    return pd.concat([df, df_org]) if include_org else df

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index = df.index),
    'score_func': lambda df, prds: roc_auc_score(df[target], prds),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(), 
    'return_train_scores': True,
    'train_data_proc': include_org,
    'y': target,
}

lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)
knn_adapter = sgml.SklearnAdapter(KNeighborsClassifier)
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(xgb.XGBClassifier)
cb_adapter = sgml.CBAdapter(cb.CatBoostClassifier)

skf = StratifiedKFold(5, random_state = 123, shuffle = True)
ss = StratifiedShuffleSplit(1, random_state = 123)

s_target = sc.read_result('target2')

In [8]:
models = [
    'dp2_{}_{}_{}'.format(m, f_name, i_name) 
    for m, f_name, i_name in product(['lr', 'lgb', 'xgb', 'knn', 'svc'], ['fwd', 'bwd'], ['no', 'yes'])
]
df_stk = sc.read_prds(models, s_target.index)

In [25]:
df_scores = pd.DataFrame({k : i['valid_scores'] for k, i in sc.read_cvs(models).items()})
df_scores = df_scores.iloc[:, np.argsort(-df_scores.mean())]

In [39]:
s_scores = pd.Series([
    roc_auc_score(s_target, df_stk[df_scores.columns[:i]].mean(axis=1))
    for i in range(1, len(df_scores.columns) + 1)
])
s_scores

0     0.894349
1     0.894206
2     0.894297
3     0.893968
4     0.894840
5     0.895447
6     0.895757
7     0.895602
8     0.895590
9     0.895466
10    0.896113
11    0.896840
12    0.896721
13    0.896594
14    0.896503
15    0.896336
16    0.896092
17    0.895946
18    0.896044
19    0.896056
dtype: float64

# Submission 8

In [44]:
from mlxtend.feature_selection import SequentialFeatureSelector
sfs = SequentialFeatureSelector(
    LogisticRegression(solver ='liblinear'), 'best', forward = True, floating = True, cv = skf, scoring = 'roc_auc'
).fit(df_stk, s_target)
sfs.k_feature_names_, sfs.k_score_

(('dp2_lr_fwd_no',
  'dp2_lr_bwd_no',
  'dp2_lgb_fwd_yes',
  'dp2_knn_fwd_no',
  'dp2_knn_bwd_no'),
 0.8986813776160159)

In [45]:
sfs = SequentialFeatureSelector(
    LogisticRegression(solver ='liblinear'), 'best', forward = False, floating = True, cv = skf, scoring = 'roc_auc'
).fit(df_stk, s_target)
sfs.k_feature_names_, sfs.k_score_

(('dp2_lr_fwd_no',
  'dp2_lr_bwd_no',
  'dp2_lgb_fwd_yes',
  'dp2_knn_fwd_no',
  'dp2_knn_bwd_no'),
 0.8986813776160159)

In [48]:
sfs = SequentialFeatureSelector(
    LogisticRegression(solver ='liblinear'), 'best', forward = False, floating = True, cv = skf, scoring = 'roc_auc'
).fit(df_stk.join(sc.read_prd('dp2_cb_fg4_no', s_target.index)), s_target)
sfs.k_feature_names_, sfs.k_score_

(('dp2_lr_fwd_no',
  'dp2_lr_bwd_no',
  'dp2_lgb_fwd_yes',
  'dp2_knn_fwd_no',
  'dp2_knn_bwd_no'),
 0.8986813776160159)

In [50]:
X_enb = list(sfs.k_feature_names_)
clf_enb = LogisticRegression(solver = 'liblinear')
clf_enb.fit(df_stk[X_enb], s_target)

In [49]:
for i in sfs.k_feature_names_:
    sc.train_cv(i, df_train, config)

In [63]:
s_prd = pd.Series(
    clf_enb.predict_proba(
        pd.concat(
            [sc.get_predictor_cv(i, config)(df_test).rename(i) for i in sfs.k_feature_names_], axis = 1
        )
    )[:, 1], index = df_test.index, name = target
)
s_prd.iloc[:146] = df_lbp[target]

In [65]:
s_prd.to_csv('result/submission8.csv')

In [67]:
# 1.0
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission8.csv -m "8"

100%|██████████████████████████████████████| 15.0k/15.0k [00:01<00:00, 14.4kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 9

In [70]:
sfs = SequentialFeatureSelector(
    LogisticRegression(solver ='liblinear'), 'best', forward = True, floating = True, cv = skf, scoring = 'roc_auc'
).fit(df_stk.join(sc.read_prds(['dp2_cb_fg4_no', 'dp2_lgb_fg4_no', 'dp2_xgb_fg4_no'], s_target.index)), s_target)
sfs.k_feature_names_, sfs.k_score_

(('dp2_lr_bwd_no',
  'dp2_lr_bwd_yes',
  'dp2_knn_fwd_no',
  'dp2_knn_bwd_no',
  'dp2_svc_bwd_no',
  'dp2_xgb_fg4_no'),
 0.8986752472148247)

In [72]:
df_stk2 = df_stk.join(sc.read_prds(['dp2_cb_fg4_no', 'dp2_lgb_fg4_no', 'dp2_xgb_fg4_no'], s_target.index))

In [73]:
X_enb = list(sfs.k_feature_names_)
clf_enb = LogisticRegression(solver = 'liblinear')
clf_enb.fit(df_stk2[X_enb], s_target)
for i in sfs.k_feature_names_:
    sc.train_cv(i, df_train, config)

In [76]:
s_prd = pd.Series(
    clf_enb.predict_proba(
        pd.concat(
            [sc.get_predictor_cv(i, config)(df_test).rename(i) for i in sfs.k_feature_names_], axis = 1
        )
    )[:, 1], index = df_test.index, name = target
)
s_prd.iloc[:146] = df_lbp[target]
s_prd.to_csv('result/submission9.csv')

In [77]:
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission9.csv -m "9"

100%|██████████████████████████████████████| 15.0k/15.0k [00:01<00:00, 14.7kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 10

In [93]:
import optuna
from sklearn.model_selection import cross_val_score
optuna.logging.set_verbosity(optuna.logging.WARNING)
def objective(trial):
    # 어떤 feature를 사용할지 선택
    selected_features = []
    for feature in df_stk2.columns:
        if trial.suggest_categorical(f"use_{feature}", [True, False]):
            selected_features.append(feature)

    # 아무 feature도 선택하지 않으면 성능 0 반환
    if len(selected_features) == 0:
        return 0.0
    scores = cross_val_score(LogisticRegression(solver = 'liblinear'), df_stk2[selected_features], s_target, cv=skf, scoring="roc_auc")
    return scores.mean()

In [96]:
study.optimize(objective, n_trials=300)

# 결과 출력
print("Best trial:")
print("  AUC:", study.best_trial.value)
print("  Features:")
for key, value in study.best_trial.params.items():
    if value and key.startswith("use_"):
        print(f"    {key[4:]}")

Best trial:
  AUC: 0.8984844885085549
  Features:
    dp2_lr_fwd_no
    dp2_lr_bwd_no
    dp2_xgb_bwd_no
    dp2_knn_fwd_no
    dp2_knn_bwd_no
    dp2_svc_fwd_yes
    dp2_svc_bwd_no
