In [28]:
import pandas as pd
import numpy as np
import sgml, sgutil
import joblib

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Shared cache handle used below to read stored results/predictions and to
# persist new ones; the three arguments are directory names.
sc = sgutil.SGCache(
    'img',
    'result',
    'model',
)
# Load the persisted data processor. joblib.load unpickles the file, so it
# must only ever be pointed at trusted artefacts.
data_processor = joblib.load(
    'model/data_processor.joblib'
)

In [3]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedShuffleSplit

def get_validation_splitter(validation_fraction, random_state = None):
    """Build a one-argument splitter that holds out a fraction of its input.

    Parameters
    ----------
    validation_fraction
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state
        Forwarded to ``train_test_split`` as ``random_state``. The default
        ``None`` keeps the previous unseeded behaviour; pass an int to make
        the hold-out split reproducible between runs.

    Returns
    -------
    callable
        ``splitter(x)`` returning whatever ``train_test_split(x, ...)`` returns.
    """
    def splitter(x):
        return train_test_split(
            x, test_size = validation_fraction, random_state = random_state
        )

    return splitter

# Label column name; used by the score function and passed as config['y'].
target = 'rainfall'
# Target series read back from the cache under the key 'target'.
s_target = sc.read_result('target')


def _predict_positive_proba(m, df, X):
    """Return column 1 of ``m.predict_proba(df[X])`` as a Series indexed like ``df``."""
    proba = m.predict_proba(df[X])
    return pd.Series(proba[:, 1], index = df.index)


def _roc_auc(df, prds):
    """Score the predictions ``prds`` against ``df[target]`` with ROC AUC."""
    return roc_auc_score(df[target], prds)


# Settings shared by every sgml.cv / sgml.train / predictor call below.
config = dict(
    predict_func = _predict_positive_proba,
    score_func = _roc_auc,
    validation_splitter = get_validation_splitter,
    progress_callback = sgml.ProgressCallBack(),
    return_train_scores = True,
    y = target,
)

# Adapters wrapping the scikit-learn estimator classes used as stackers.
lr_adapter = sgml.SklearnAdapter(LogisticRegression)
svc_adapter = sgml.SklearnAdapter(SVC)

# Seeded splitters: 5-fold stratified CV and a single stratified shuffle split.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
ss = StratifiedShuffleSplit(n_splits = 1, random_state = 123)

In [4]:
# One column of cached predictions per name returned by the cache's CV list,
# requested for the same index as the target series.
cv_names = sc.get_cv_list()
df_stk = sc.read_prds(cv_names, s_target.index)
df_stk.head()

Unnamed: 0_level_0,knn70_bfs_f,knn70_sfs_f,knn70_sfs_f2,lgb2_bfs,lgb2_sfs,lgb_bfs,lgb_bfs_f,lgb_sfs,lgb_sfs2,lgb_sfs_a,...,lsvc_sfs_f,p2svc_bfs_f,p2svc_sfs_f,xgb2_sfs,xgb_bfs,xgb_bfs_f,xgb_sfs,xgb_sfs2,xgb_sfs_a,xgb_sfs_f
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.942857,0.942857,0.957143,0.974977,0.973798,0.961417,0.963167,0.961687,0.960689,0.956218,...,0.953288,0.937191,0.938471,0.96437,0.963953,0.955173,0.961235,0.963688,0.970289,0.961985
1,0.985714,0.971429,1.0,0.974478,0.970909,0.957803,0.958344,0.957796,0.957746,0.958206,...,0.98018,0.959864,0.960714,0.964913,0.969485,0.9639,0.960262,0.961113,0.976316,0.967845
2,0.314286,0.128571,0.185714,0.334941,0.340818,0.287065,0.197811,0.379307,0.30518,0.220279,...,0.122947,0.116077,0.115836,0.347704,0.273288,0.289338,0.273836,0.267013,0.180858,0.252246
3,0.985714,0.957143,0.985714,0.977193,0.975646,0.952438,0.958344,0.956876,0.955035,0.958135,...,0.981545,0.961186,0.962021,0.9663,0.967861,0.967488,0.954288,0.9593,0.97592,0.968815
4,0.314286,0.157143,0.285714,0.075772,0.066668,0.110976,0.113101,0.113991,0.106087,0.152921,...,0.131292,0.14129,0.141041,0.09643,0.066765,0.070751,0.079994,0.072386,0.218235,0.063515


In [5]:
# ROC AUC of each prediction column against the target, best first.
auc_by_model = df_stk.apply(lambda column: roc_auc_score(s_target, column))
auc_by_model.sort_values(ascending = False)

p2svc_sfs_f     0.899900
p2svc_bfs_f     0.899774
lr_sfs_f        0.898687
lsvc_bfs_f      0.898386
lr_sfs3         0.898169
lr_sfs2         0.898044
xgb_bfs_f       0.897747
lsvc_sfs_f      0.897714
knn70_bfs_f     0.897017
lr_sfs          0.896853
lgb2_bfs        0.896796
lr_sfs_a3       0.896734
lgb2_sfs        0.896718
lr_bfs          0.896706
xgb_sfs_f       0.896549
knn70_sfs_f     0.896539
lr_sfs_a2       0.896047
lgb_bfs_f       0.895994
xgb2_sfs        0.895639
xgb_sfs2        0.895558
lr_sfs_f2       0.895504
lr_bfs_f        0.895328
xgb_sfs         0.894973
lgb_sfs         0.894939
lgb_sfs_f       0.894705
lr_sfs_a        0.894342
lgb_sfs2        0.894047
xgb_bfs         0.893626
lr_bfs_a        0.892708
lgb_bfs         0.892308
xgb_sfs_a       0.891333
lgb_sfs_a2      0.891235
lgb_sfs_a       0.891017
knn70_sfs_f2    0.890999
dtype: float64

In [6]:
# ROC AUC of the plain row-wise mean over every column except 'xgb2_sfs'.
kept_columns = [name for name in df_stk.columns if name != 'xgb2_sfs']
roc_auc_score(s_target, df_stk[kept_columns].mean(axis=1))

0.9010145903479236

In [7]:
# Logistic-regression stacker (C = 0.1) cross-validated on every prediction column.
hparams = dict(
    model_params = dict(C = 0.1),
    X_num = list(df_stk.columns),
)
result = sgml.cv(
    df_stk.assign(rainfall = s_target),
    skf,
    hparams,
    config,
    lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Mean and standard deviation of the per-fold validation scores.
valid_scores = result['valid_scores']
np.mean(valid_scores), np.std(valid_scores)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.9019472502805836, 0.012426092566867313)

In [8]:
# Put each fold's 'coef' entry side by side, then summarise every row by its
# mean and standard deviation across folds.
fold_coefs = pd.concat([fold['coef'] for fold in result['model_result']], axis=1)
coef_stats = fold_coefs.agg(['mean', 'std'], axis = 1)
# CV = coefficient of variation (std / |mean|); least stable rows come first.
df_coef = coef_stats.assign(
    CV = coef_stats['std'] / coef_stats['mean'].abs()
).sort_values('CV', ascending = False)
df_coef.head(10)

Unnamed: 0,mean,std,CV
lsvc_bfs_f,0.04666,0.044418,0.951958
p2svc_bfs_f,0.097073,0.046657,0.480632
p2svc_sfs_f,0.10062,0.04547,0.451896
knn70_bfs_f,0.138438,0.039124,0.282613
xgb_bfs,0.123726,0.026737,0.216098
lr_bfs_a,0.092126,0.018626,0.202183
knn70_sfs_f2,0.213367,0.04065,0.190516
lr_sfs3,0.258689,0.045506,0.175911
lr_sfs_a,0.120263,0.021106,0.175501
lr_bfs_f,0.156056,0.025926,0.166132


In [9]:
# Same logistic-regression stacker, restricted to the rows of df_coef from
# position 3 onward (i.e. skipping its first three entries).
hparams = dict(
    model_params = dict(C = 0.1),
    X_num = df_coef.index[3:].tolist(),
)
result = sgml.cv(
    df_stk.assign(rainfall = s_target),
    skf,
    hparams,
    config,
    lr_adapter,
    result_proc = [sgml.lr_learning_result],
)
# Mean and standard deviation of the per-fold validation scores.
valid_scores = result['valid_scores']
np.mean(valid_scores), np.std(valid_scores)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.9020594837261504, 0.012224427528991378)

In [10]:
# Keep the validation predictions of the most recent sgml.cv run under a
# stable name (presumably out-of-fold predictions - confirm against sgml.cv).
lr_valid_prd = result['valid_prd']
s_stk_lr = lr_valid_prd.rename('stk_lr')

In [11]:
# ROC AUC of the row-wise mean over the df_coef rows from position 1 onward.
# NOTE(review): the reduced logistic-regression run slices from position 3,
# this one from position 1 - confirm the difference is intended.
blend_columns = df_coef.index[1:].tolist()
roc_auc_score(s_target, df_stk[blend_columns].mean(axis=1))

0.9009921436588104

In [12]:
# Linear-kernel SVC stacker cross-validated on every prediction column.
# With `probability = True`, scikit-learn's SVC fits its probability estimates
# on internally shuffled data, so repeated runs can differ unless the seed is
# pinned; `random_state` makes the scores below repeatable.
# (assumes sgml forwards 'model_params' to the SVC constructor - TODO confirm)
hparams = {
    'model_params': {
        'C': 0.01,
        'probability': True,
        'kernel': 'linear',
        'random_state': 123,
    },
    'X_num': df_stk.columns.tolist()
}
result = sgml.cv(df_stk.assign(rainfall = s_target), skf, hparams, config, svc_adapter)
# Mean and standard deviation of the per-fold validation scores.
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.9020482603815937, 0.011932188906025045)

In [13]:
# Keep the validation predictions of the most recent sgml.cv run under a
# stable name (presumably out-of-fold predictions - confirm against sgml.cv).
svc_valid_prd = result['valid_prd']
s_stk_svc = svc_valid_prd.rename('stk_svc')

In [26]:
# Equal-weight blend of three members: the LR stacker predictions, the SVC
# stacker predictions and the plain row-wise mean of all base predictions.
ensemble_members = [s_stk_lr, s_stk_svc, df_stk.mean(axis=1)]
s_avg = pd.concat(ensemble_members, axis=1).mean(axis=1)
roc_auc_score(s_target, s_avg)

0.9005858585858585

In [33]:
def _ensemble_prediction():
    """Producer handed to the cache: returns the blended prediction series."""
    return s_avg


# Store the blend under the key 'e_prd'; the call's return value is displayed.
sc.cache_result('e_prd', _ensemble_prediction)

id
0       0.946956
1       0.954458
2       0.152210
3       0.954709
4       0.100463
          ...   
2185    0.953396
2186    0.954695
2187    0.801946
2188    0.951806
2189    0.954069
Length: 2190, dtype: float64

In [16]:
# One predictor callable per cached CV entry, keyed by its name.
predictors = {
    cv_name: sc.get_predictor_cv(cv_name, config)
    for cv_name in sc.get_cv_list()
}

# Run the test file through the same data processor that was loaded above.
df_test = data_processor.transform(['data/test.csv'])

# Apply every predictor to the test frame and line the results up as columns
# named after the predictor that produced them.
test_predictions = [
    predict(df_test).rename(cv_name)
    for cv_name, predict in predictors.items()
]
df_stk_test = pd.concat(test_predictions, axis=1)

# Submission 1

In [14]:
# Submission 1: logistic-regression stacker (C = 0.1) trained on the df_coef
# rows from position 3 onward, then applied to the stacked test predictions.
hparams = dict(
    model_params = dict(C = 0.1),
    X_num = df_coef.index[3:].tolist(),
)
objs, spec = sgml.train(
    df_stk.assign(rainfall=s_target), hparams, config, lr_adapter
)
lr_stk = sgml.assemble_predictor(
    objs['model'], objs.get('preprocessor'), spec, config
)
submission1 = lr_stk(df_stk_test).rename(target)
submission1.to_csv('result/submission1.csv')

In [42]:
# 0.84258
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission1.csv -m "1"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 15.9kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 2

In [15]:
# Submission 2: linear-kernel SVC stacker trained on every prediction column
# and applied to the stacked test predictions.
# With `probability = True`, scikit-learn's SVC fits its probability estimates
# on internally shuffled data, so the written file can differ between runs
# unless the seed is pinned via `random_state`.
# (assumes sgml forwards 'model_params' to the SVC constructor - TODO confirm)
hparams = {
    'model_params': {
        'C': 0.01,
        'probability': True,
        'kernel': 'linear',
        'random_state': 123,
    },
    'X_num': df_stk.columns.tolist()
}
objs, spec = sgml.train(df_stk.assign(rainfall=s_target), hparams, config, svc_adapter)
svc_stk = sgml.assemble_predictor(objs['model'], objs.get('preprocessor'), spec, config)
svc_stk(df_stk_test).rename(target).to_csv('result/submission2.csv')

In [45]:
# 0.84768
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission2.csv -m "2"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 15.9kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 3

In [46]:
# Submission 3: unweighted row-wise mean of every test prediction column.
submission3 = df_stk_test.mean(axis=1).rename(target)
submission3.to_csv('result/submission3.csv')

In [47]:
# 0.84714
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission3.csv -m "3"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 15.9kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

In [53]:
# Run the training file through the loaded data processor.
train_files = ['data/train.csv']
df_train = data_processor.transform(train_files)

# Submission 4

In [17]:
# Submission 4: logistic-regression stacker on the df_coef rows from position
# 3 onward, applied to the stacked test predictions.
# NOTE(review): C = 0.01 here, whereas the cross-validated logistic-regression
# runs in this notebook use C = 0.1 - confirm this value was validated.
hparams = dict(
    model_params = dict(C = 0.01),
    X_num = df_coef.index[3:].tolist(),
)
objs, spec = sgml.train(
    df_stk.assign(rainfall=s_target), hparams, config, lr_adapter
)
lr_stk = sgml.assemble_predictor(
    objs['model'], objs.get('preprocessor'), spec, config
)
submission4 = lr_stk(df_stk_test).rename(target)
submission4.to_csv('result/submission4.csv')

In [18]:
# 0.84499
#!kaggle competitions submit -c playground-series-s5e3 -f result/submission4.csv -m "4"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 15.8kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 5

In [19]:
# Submission 5: linear-kernel SVC stacker trained on every prediction column
# and applied to the stacked test predictions.
# With `probability = True`, scikit-learn's SVC fits its probability estimates
# on internally shuffled data, so the written file can differ between runs
# unless the seed is pinned via `random_state`.
# (assumes sgml forwards 'model_params' to the SVC constructor - TODO confirm)
hparams = {
    'model_params': {
        'C': 0.01,
        'probability': True,
        'kernel': 'linear',
        'random_state': 123,
    },
    'X_num': df_stk.columns.tolist()
}
objs, spec = sgml.train(df_stk.assign(rainfall=s_target), hparams, config, svc_adapter)
svc_stk = sgml.assemble_predictor(objs['model'], objs.get('preprocessor'), spec, config)
svc_stk(df_stk_test).rename(target).to_csv('result/submission5.csv')

In [20]:
# 0.84714
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission5.csv -m "5"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 17.0kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 6

In [21]:
# Submission 6: unweighted row-wise mean of every test prediction column.
submission6 = df_stk_test.mean(axis=1).rename(target)
submission6.to_csv('result/submission6.csv')

In [22]:
# 0.84553
# !kaggle competitions submit -c playground-series-s5e3 -f result/submission6.csv -m "6"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 16.3kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset

# Submission 7

In [24]:
# Submission 7: equal-weight blend of the LR stacker, the SVC stacker and the
# plain row-wise mean of the test prediction columns.
blend_members = [
    lr_stk(df_stk_test),
    svc_stk(df_stk_test),
    df_stk_test.mean(axis=1),
]
submission7 = pd.concat(blend_members, axis=1).mean(axis=1).rename(target)
submission7.to_csv('result/submission7.csv')

In [25]:
# 0.84607
#!kaggle competitions submit -c playground-series-s5e3 -f result/submission7.csv -m "7"

100%|██████████████████████████████████████| 17.1k/17.1k [00:01<00:00, 15.3kB/s]
Successfully submitted to Binary Prediction with a Rainfall Dataset