In [1]:
import pandas as pd
import numpy as np
import sgml, sgutil

# Cache handle; the cells below use it to read the stored target (`read_result`),
# list cached CV runs (`get_cv_list`) and load their predictions (`read_prd`).
# NOTE(review): 'img' and 'result' look like storage locations for figures and
# results — confirm against sgutil.SGCache.
sc = sgutil.SGCache('img', 'result')

In [26]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction):
    """Build a one-argument splitter that holds out a fraction of its input.

    Parameters
    ----------
    validation_fraction : float or int
        Forwarded unchanged to ``train_test_split`` as ``test_size``.

    Returns
    -------
    callable
        ``splitter(x)`` returning
        ``train_test_split(x, test_size=validation_fraction)``.
    """
    # The notebook's import cell never brings in train_test_split, so the
    # returned splitter used to raise NameError on its first call. Importing
    # here keeps this definition self-contained.
    from sklearn.model_selection import train_test_split

    return lambda x: train_test_split(x, test_size = validation_fraction)

# Label column name; the scoring callable below reads this global at call time.
target = 'rainfall'
# Label series loaded from the cache; later cells also use its index.
s_target = sc.read_result('target')

# Positive-class probability (second predict_proba column) as a Series that
# shares the input frame's index.
predict_positive_proba = lambda m, df, X: pd.Series(
    m.predict_proba(df[X])[:, 1], index = df.index
)
# ROC AUC of the predictions against the label column of the same frame.
score_auc = lambda df, prds: roc_auc_score(df[target], prds)

# Shared settings dictionary.
# NOTE(review): presumably consumed by sgml routines in cells outside this
# view — confirm. 'validation_splitter' holds the factory itself, not a
# splitter built from it.
config = {
    'predict_func': predict_positive_proba,
    'score_func': score_auc,
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': True,
    'y': target,
}

# sgml adapters around the sklearn estimator classes.
lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)

# Seeded splitters: 5-fold stratified CV and a single stratified shuffle split.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
ss = StratifiedShuffleSplit(n_splits = 1, random_state = 123)

In [5]:
# Load the stored prediction of every cached CV run, passing the target's
# index to the reader.
cv_names = sc.get_cv_list()
model_results = [sc.read_prd(cv_name, s_target.index) for cv_name in cv_names]

# Stack the per-model predictions side by side: one column per base model.
# NOTE(review): in the displayed head, lr_sfs2, lr_sfs3 and lr_sfs_3 carry
# identical values — possibly duplicated predictions; confirm.
df_stk = pd.concat(model_results, axis = 'columns')
df_stk.head()

Unnamed: 0_level_0,lgb_sfs,lgb_sfs2,lgb_sfs_a,lgb_sfs_a2,lr_bfs,lr_bfs_a,lr_sfs,lr_sfs2,lr_sfs3,lr_sfs_2,lr_sfs_3,lr_sfs_a,lr_sfs_a2,lr_sfs_a3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.957987,0.960491,0.965182,0.964534,0.968675,0.960399,0.971519,0.973572,0.973572,0.973319,0.973572,0.963775,0.963454,0.967448
1,0.958735,0.957716,0.957015,0.959917,0.98646,0.979461,0.986961,0.98477,0.98477,0.987867,0.98477,0.981633,0.981487,0.984377
2,0.324463,0.29505,0.213575,0.215169,0.148145,0.121243,0.134054,0.154791,0.154791,0.142596,0.154791,0.114761,0.119809,0.151486
3,0.956526,0.954448,0.95013,0.956735,0.985884,0.98393,0.990519,0.993977,0.993977,0.991563,0.993977,0.986827,0.986581,0.987076
4,0.116064,0.111079,0.098587,0.11724,0.132335,0.085233,0.151153,0.218779,0.218779,0.164982,0.218779,0.100179,0.106565,0.113072


In [6]:
# Baseline blend: ROC AUC of the unweighted row-wise mean of all base-model predictions.
s_mean_prd = df_stk.mean(axis = 'columns')
roc_auc_score(s_target, s_mean_prd)

0.899395061728395

In [18]:
from sklearn.model_selection import cross_validate

# Stratified CV (splitter `skf`) of a default logistic-regression meta-learner
# on all stacked predictions, scored by ROC AUC.
cv_result = cross_validate(
    estimator=LogisticRegression(),
    X=df_stk,
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
test_auc = cv_result['test_score']
test_auc.mean(), test_auc.std()

(0.9002861952861952, 0.012099543649190754)

In [40]:
# Fit the logistic-regression meta-learner on every row of the stacked frame;
# `fit` returns the estimator itself, so `clf_lr` is the fitted model.
meta_lr = LogisticRegression()
clf_lr = meta_lr.fit(df_stk, s_target)

In [45]:
# Label the first (here: only displayed) row of the fitted coefficients with
# the base-model column names so each weight can be read per model.
first_row_coef = clf_lr.coef_[0]
s_coef = pd.Series(data=first_row_coef, index=df_stk.columns)
s_coef

lgb_sfs       0.852705
lgb_sfs2      1.150141
lgb_sfs_a     0.059445
lgb_sfs_a2    0.755379
lr_bfs       -0.009457
lr_bfs_a     -0.371034
lr_sfs        0.055362
lr_sfs2       0.718166
lr_sfs3       0.718166
lr_sfs_2      0.450560
lr_sfs_3      0.718166
lr_sfs_a     -0.150857
lr_sfs_a2     0.501039
lr_sfs_a3     0.644671
dtype: float64

In [47]:
# Restrict the stack to base models whose meta-learner coefficient is positive.
# NOTE(review): s_coef comes from a model fitted on all rows, so this column
# selection has already seen every validation fold — the CV score below may be
# slightly optimistic.
positive_models = s_coef[s_coef > 0].index
cv_result = cross_validate(
    estimator=LogisticRegression(),
    X=df_stk[positive_models],
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
test_auc = cv_result['test_score']
test_auc.mean(), test_auc.std()

(0.9004489337822672, 0.011826351358647003)

In [38]:
# Same stratified CV as for logistic regression, but with a linear SVM (C=10)
# as the meta-learner over all stacked predictions.
cv_result = cross_validate(
    estimator=LinearSVC(C=10),
    X=df_stk,
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
test_auc = cv_result['test_score']
test_auc.mean(), test_auc.std()

(0.9015095398428732, 0.012057737949290713)

In [48]:
# Linear SVM (C=10) on the subset of base models with a positive
# logistic-regression coefficient.
# NOTE(review): the subset is chosen from coefficients fitted on all rows, so
# the selection has seen the validation folds — treat the score with care.
positive_models = s_coef[s_coef > 0].index
cv_result = cross_validate(
    estimator=LinearSVC(C=10),
    X=df_stk[positive_models],
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
test_auc = cv_result['test_score']
test_auc.mean(), test_auc.std()

(0.9012121212121211, 0.011656976501973178)