In [1]:
import pandas as pd
import numpy as np
import sgml, sgutil

# Shared cache handle: later cells call sc.read_result, sc.get_cv_list and
# sc.read_prds on it to load the stored target and model predictions.
# NOTE(review): 'img' and 'result' are presumably directory names -- confirm
# against sgutil.SGCache.
sc = sgutil.SGCache('img', 'result')

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction):
    """Build a one-argument splitter that holds out a fraction of its input.

    Parameters
    ----------
    validation_fraction
        Forwarded unchanged as ``test_size`` to ``train_test_split``.

    Returns
    -------
    callable
        ``splitter(x)``, which returns
        ``train_test_split(x, test_size=validation_fraction)``.
    """
    # Imported here because the visible import cells never bring
    # train_test_split into scope; without it the returned splitter raises
    # NameError the first time it is called.
    from sklearn.model_selection import train_test_split

    def splitter(x):
        return train_test_split(x, test_size=validation_fraction)

    return splitter

# Label column name and the label series loaded from the cache.
target = 'rainfall'
s_target = sc.read_result('target')

# Settings dictionary.
# NOTE(review): not referenced in the visible cells; presumably handed to sgml
# helpers elsewhere -- confirm.
config = dict(
    # Column 1 of predict_proba, returned as a Series indexed like df.
    predict_func=lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index=df.index),
    # ROC AUC of the predictions against df's label column.
    score_func=lambda df, prds: roc_auc_score(df[target], prds),
    # The factory itself is stored here, not a splitter built from it.
    validation_splitter=get_validation_splitter,
    progress_callback=sgml.ProgressCallBack(),
    return_train_scores=True,
    y=target,
)

# sgml adapters wrapping the scikit-learn estimator classes.
lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)

# Seeded splitters: a shuffled 5-fold stratified CV and a single stratified
# shuffle split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
ss = StratifiedShuffleSplit(n_splits=1, random_state=123)

In [124]:
# Load the predictions for every entry in the cache's CV list.
# s_target.index is passed along -- presumably to align rows; confirm in sgutil.
cv_names = sc.get_cv_list()
df_stk = sc.read_prds(cv_names, s_target.index)
df_stk.head()

Unnamed: 0_level_0,lgb_bfs,lgb_bfs_f,lgb_sfs,lgb_sfs2,lgb_sfs_a,lgb_sfs_a2,lgb_sfs_f,lr_bfs,lr_bfs_a,lr_bfs_f,...,lr_sfs_f2,lsvc_bfs_f,lsvc_sfs_f,p2svc_sfs_f,xgb_bfs,xgb_bfs_f,xgb_sfs,xgb_sfs2,xgb_sfs_a,xgb_sfs_f
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.963096,0.960541,0.957347,0.961141,0.966031,0.96306,0.959939,0.968675,0.960399,0.962606,...,0.970137,0.947012,0.954585,0.938617,0.964936,0.959594,0.961516,0.964605,0.967816,0.961254
1,0.959187,0.958526,0.957021,0.958315,0.960232,0.959045,0.952884,0.98646,0.979461,0.981487,...,0.984648,0.964604,0.980839,0.961252,0.968355,0.960039,0.966392,0.962522,0.975247,0.970552
2,0.281292,0.188287,0.297268,0.34576,0.271184,0.241745,0.250209,0.148145,0.121243,0.096252,...,0.135756,0.10781,0.123279,0.110698,0.340696,0.280284,0.259725,0.231905,0.201359,0.274945
3,0.952688,0.958418,0.955427,0.957935,0.952604,0.950954,0.95224,0.985884,0.98393,0.979312,...,0.988272,0.9706,0.982172,0.962549,0.967486,0.963505,0.962823,0.95994,0.975169,0.967548
4,0.134081,0.114338,0.1189,0.110785,0.107781,0.113644,0.134351,0.132335,0.085233,0.08259,...,0.09596,0.078978,0.131572,0.135218,0.062901,0.064365,0.080142,0.077906,0.191981,0.06613


In [125]:
# Baseline: unweighted row-wise mean of all prediction columns, scored with ROC AUC.
s_blend_mean = df_stk.mean(axis=1)
roc_auc_score(s_target, s_blend_mean)

0.8999618406285073

In [126]:
from sklearn.model_selection import cross_validate
# Cross-validated ROC AUC (using the skf splitter) of a logistic-regression
# stacker trained on all prediction columns.
cv_result = cross_validate(
    estimator=LogisticRegression(C=0.1),
    X=df_stk,
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
fold_auc = cv_result['test_score']
fold_auc.mean(), fold_auc.std()

(0.9013524130190798, 0.012336229931549527)

In [127]:
# Fit the same stacker on every row so its coefficients can be inspected.
clf_lr = LogisticRegression(C=0.1)
clf_lr = clf_lr.fit(df_stk, s_target)

In [128]:
# First row of the fitted stacker's coefficients, labelled by prediction
# column and shown largest first.
s_coef = pd.Series(data=clf_lr.coef_[0], index=df_stk.columns)
s_coef.sort_values(ascending=False)

xgb_sfs_a      0.344361
xgb_sfs_f      0.340658
lr_sfs3        0.317614
lr_sfs2        0.281497
lgb_sfs        0.273732
xgb_sfs2       0.255695
xgb_bfs_f      0.245898
lgb_bfs        0.239870
lr_sfs         0.238800
lr_sfs_a3      0.238518
lr_sfs_f       0.234079
lgb_sfs2       0.233644
lgb_bfs_f      0.233532
lgb_sfs_f      0.228026
lr_sfs_a2      0.226884
xgb_sfs        0.225739
lgb_sfs_a      0.222320
lr_bfs         0.210029
lr_bfs_f       0.205904
xgb_bfs        0.182251
lgb_sfs_a2     0.166265
lsvc_sfs_f     0.165908
lr_sfs_f2      0.163403
lr_sfs_a       0.157520
p2svc_sfs_f    0.153823
lr_bfs_a       0.120682
lsvc_bfs_f     0.084123
dtype: float64

In [129]:
# Mean blend restricted to columns whose stacker coefficient is positive.
# In the recorded run every coefficient is positive, so nothing is dropped and
# the score equals the plain mean over all columns.
positive_cols = s_coef.loc[s_coef > 0].index.values
roc_auc_score(s_target, df_stk[positive_cols].mean(axis=1))

0.8999618406285073

In [130]:
# Same logistic-regression CV, restricted to columns with a positive stacker
# coefficient. In the recorded run every coefficient is positive, so this
# reproduces the all-column result.
positive_cols = s_coef.loc[s_coef > 0].index.values
cv_result = cross_validate(
    estimator=LogisticRegression(C=0.1),
    X=df_stk[positive_cols],
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
fold_auc = cv_result['test_score']
fold_auc.mean(), fold_auc.std()

(0.9013524130190798, 0.012336229931549527)

In [142]:
# Cross-validated ROC AUC (using the skf splitter) of a linear SVM stacker
# trained on all prediction columns.
cv_result = cross_validate(
    estimator=LinearSVC(C=0.01, dual='auto'),
    X=df_stk,
    y=s_target,
    scoring='roc_auc',
    cv=skf,
)
fold_auc = cv_result['test_score']
fold_auc.mean(), fold_auc.std()

(0.9011560044893379, 0.011940544176577033)