In [1]:
import pandas as pd
import numpy as np
import sgml, sgutil

# Shared SGCache handle; later cells read the target (`read_result`) and the
# stored model predictions (`get_cv_list` / `read_prds`) through it.
# NOTE(review): 'img' and 'result' look like directory names — confirm against sgutil.SGCache.
sc = sgutil.SGCache('img', 'result')

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

def get_validation_splitter(validation_fraction, random_state = None):
    """Build a one-argument splitter that holds out a fraction of its input.

    Parameters
    ----------
    validation_fraction : float or int
        Forwarded unchanged as ``test_size`` to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded as ``random_state`` to ``train_test_split``. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make
        the split repeatable across runs.

    Returns
    -------
    callable
        ``split(x)``, which returns whatever ``train_test_split`` returns
        for ``x`` with the settings captured above.
    """
    def split(x):
        # train_test_split is resolved at call time from the module-level import.
        return train_test_split(
            x, test_size = validation_fraction, random_state = random_state
        )
    return split

# Name of the label column, and the target Series read through the shared cache handle.
target = 'rainfall'
s_target = sc.read_result('target')


def _predict_positive_proba(m, df, X):
    """Return column 1 of ``m.predict_proba(df[X])`` as a Series indexed like ``df``."""
    proba = m.predict_proba(df[X])
    return pd.Series(proba[:, 1], index = df.index)


def _score_roc_auc(df, prds):
    """ROC AUC of ``prds`` against the ``target`` column of ``df``."""
    return roc_auc_score(df[target], prds)


# Settings bundle for the sgml helpers.
# NOTE(review): 'validation_splitter' receives the factory itself, not a built
# splitter — presumably sgml calls it with the validation fraction; confirm.
config = {
    'predict_func': _predict_positive_proba,
    'score_func': _score_roc_auc,
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': True,
    'y': target,
}

# sgml adapters around the scikit-learn estimator classes.
lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)

# Seeded splitters: 5 shuffled stratified folds, and a single stratified shuffle split.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
ss = StratifiedShuffleSplit(n_splits = 1, random_state = 123)

In [38]:
# Build the stacking frame from the predictions stored for every entry in the
# cache's CV list; the target's index is passed to read_prds.
# NOTE(review): presumably rows are aligned to s_target.index, one column per stored model — confirm in sgutil.
df_stk = sc.read_prds(sc.get_cv_list(), s_target.index)
df_stk.head()

Unnamed: 0_level_0,knn70_bfs_f,knn70_sfs_f,knn70_sfs_f2,lgb_bfs,lgb_bfs_f,lgb_sfs,lgb_sfs2,lgb_sfs_a,lgb_sfs_a2,lgb_sfs_f,...,lsvc_bfs_f,lsvc_sfs_f,p2svc_bfs_f,p2svc_sfs_f,xgb_bfs,xgb_bfs_f,xgb_sfs,xgb_sfs2,xgb_sfs_a,xgb_sfs_f
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.942857,0.942857,0.957143,0.961729,0.96341,0.959396,0.960603,0.954278,0.95853,0.960227,...,0.946687,0.953316,0.939305,0.939116,0.960705,0.951509,0.96447,0.961581,0.971312,0.953746
1,0.985714,0.971429,1.0,0.957777,0.957388,0.958262,0.95973,0.957464,0.960577,0.955016,...,0.964884,0.980961,0.96051,0.960704,0.970343,0.964462,0.963227,0.961197,0.975665,0.962158
2,0.314286,0.128571,0.185714,0.306126,0.214272,0.350243,0.292654,0.226779,0.233048,0.222275,...,0.107595,0.131004,0.111948,0.115182,0.298123,0.285813,0.285703,0.253963,0.208304,0.225557
3,0.985714,0.957143,0.985714,0.951085,0.957388,0.954954,0.954629,0.958466,0.960463,0.957285,...,0.970843,0.982288,0.961817,0.962007,0.969265,0.96299,0.955591,0.961683,0.974714,0.965428
4,0.314286,0.157143,0.285714,0.125042,0.117842,0.121943,0.102464,0.167561,0.154888,0.113593,...,0.078792,0.139608,0.136672,0.140219,0.080916,0.063153,0.071863,0.072423,0.203416,0.050437


In [39]:
# ROC AUC of each stacking-frame column against the target, best first.
(
    df_stk
    .apply(lambda col: roc_auc_score(s_target, col), axis = 0)
    .sort_values(ascending = False)
)

p2svc_sfs_f     0.899905
p2svc_bfs_f     0.899890
lr_sfs_f        0.898687
lsvc_bfs_f      0.898419
lr_sfs3         0.898169
lr_sfs2         0.898044
lsvc_sfs_f      0.897831
knn70_bfs_f     0.897017
lr_sfs          0.896853
lr_sfs_a3       0.896734
lr_bfs          0.896706
knn70_sfs_f     0.896539
xgb_bfs_f       0.896468
lr_sfs_a2       0.896047
xgb_sfs_f       0.895691
lr_sfs_f2       0.895504
lr_bfs_f        0.895328
xgb_sfs         0.895090
xgb_sfs2        0.894864
lgb_sfs_f       0.894483
lgb_bfs_f       0.894456
lr_sfs_a        0.894342
lgb_sfs         0.894180
lgb_bfs         0.893356
lr_bfs_a        0.892708
lgb_sfs2        0.892393
xgb_bfs         0.892093
lgb_sfs_a2      0.891894
xgb_sfs_a       0.891749
knn70_sfs_f2    0.890999
lgb_sfs_a       0.890458
dtype: float64

In [40]:
# Baseline blend: ROC AUC of the unweighted row-wise mean over all columns.
roc_auc_score(y_true = s_target, y_score = df_stk.mean(axis = 'columns'))

0.9007845117845119

In [41]:
from sklearn.model_selection import cross_validate
# Logistic-regression stacker (C=0.1) over every column, scored by ROC AUC on the
# 5 seeded stratified folds in `skf`.
cv_result = cross_validate(
    estimator = LogisticRegression(C = 0.1),
    X = df_stk,
    y = s_target,
    scoring = 'roc_auc',
    cv = skf,
)
# Mean and standard deviation of the per-fold test AUC.
(cv_result['test_score'].mean(), cv_result['test_score'].std())

(0.9018911335578002, 0.012146510989015206)

In [42]:
# Fit a C=0.1 logistic regression on the whole stacking frame (no hold-out);
# the next cell reads its coefficients per column.
clf_lr = LogisticRegression(C=0.1).fit(df_stk, s_target)

In [43]:
# First row of the fitted coefficient matrix, labelled with the stacking-frame column names.
s_coef = pd.Series(data = clf_lr.coef_[0], index = df_stk.columns)
# Largest weight first.
s_coef.sort_values(ascending = False)

lgb_sfs_a2      0.373527
lgb_sfs_a       0.314613
lr_sfs3         0.295846
knn70_sfs_f     0.275183
xgb_sfs_a       0.271136
lr_sfs2         0.258798
knn70_sfs_f2    0.251917
xgb_sfs_f       0.248518
lgb_sfs         0.233409
lgb_bfs         0.222004
lgb_bfs_f       0.218033
lr_sfs          0.216092
lgb_sfs_f       0.215340
lr_sfs_f        0.212659
xgb_bfs_f       0.210295
lr_sfs_a3       0.204883
lgb_sfs2        0.204608
lr_sfs_a2       0.193866
xgb_sfs         0.191407
lr_bfs          0.188418
xgb_sfs2        0.179209
lr_bfs_f        0.163221
knn70_bfs_f     0.162781
xgb_bfs         0.132673
lsvc_sfs_f      0.130629
lr_sfs_f2       0.129065
lr_sfs_a        0.124218
p2svc_bfs_f     0.098607
p2svc_sfs_f     0.096970
lr_bfs_a        0.088405
lsvc_bfs_f      0.037879
dtype: float64

In [44]:
# ROC AUC of the row-wise mean over the columns whose stacker coefficient is positive.
# NOTE(review): in the recorded run every coefficient is positive, so this keeps
# all columns and reproduces the plain-mean score.
roc_auc_score(
    y_true = s_target,
    y_score = df_stk.loc[:, s_coef[s_coef > 0].index].mean(axis = 'columns'),
)

0.9007845117845119

In [45]:
# Repeat the logistic-regression CV using only the positive-coefficient columns.
# NOTE(review): the column filter comes from a model fitted on all rows, so the
# folds are not independent of the selection — treat the score as optimistic.
cv_result = cross_validate(
    estimator = LogisticRegression(C = 0.1),
    X = df_stk.loc[:, s_coef[s_coef > 0].index],
    y = s_target,
    scoring = 'roc_auc',
    cv = skf,
)
# Mean and standard deviation of the per-fold test AUC.
(cv_result['test_score'].mean(), cv_result['test_score'].std())

(0.9018911335578002, 0.012146510989015206)

In [46]:
# Same `skf` cross-validation with a linear SVM (C=0.01, dual='auto') as the stacker.
cv_result = cross_validate(
    estimator = LinearSVC(C = 0.01, dual = 'auto'),
    X = df_stk,
    y = s_target,
    scoring = 'roc_auc',
    cv = skf,
)
# Mean and standard deviation of the per-fold test AUC.
(cv_result['test_score'].mean(), cv_result['test_score'].std())

(0.9007744107744108, 0.012695899748597007)