In [1]:
import os, sys, re
from functools import partial

import scipy
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import polars as pl

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import dproc, sgml, sgutil, sgnn

print(sys.version)
for i in [scipy, sklearn, mpl, sns, np, pd, pl, cb, lgb, xgb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

2024-12-29 22:49:30.942668: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-29 22:49:30.968093: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.3 (main, May  1 2024, 17:33:23) [GCC 11.4.0]
scipy 1.13.0
sklearn 1.4.2
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
pandas 2.2.2
polars 1.12.0
catboost 1.2.5
lightgbm 4.3.0
xgboost 2.1.2


In [2]:
data_path = 'data'

df_train = pd.read_feather(os.path.join(data_path, 'train.feather'))
df_test = pd.read_feather(os.path.join(data_path, 'test.feather'))
pd_vars= dproc.PD_Vars.load(os.path.join('data', 'vars'))
target = 'Premium Amount_l'

X_bool = ['Gender', 'Smoking Status']
X_num = ['Age', 'Annual Income', 'Credit Score', 'Previous Claims_fm1', 'Vehicle Age', 'Policy Days', 'Number of Dependents_fz']
X_ord = ['Education Level', 'Occupation_funk', 'Location', 'Policy Type', 'Exercise Frequency', 'Weekday', 'Year', 
         'Insurance Duration_fm', 'Property Type']
X_menc = ['CA_C', 'CA_A']
X_imp = ['Health Score_50', 'Annual Income_f', 'Credit Score_f', 'Previous Claims_f']
X_imp2 = ['Health Score_502', 'Annual Income_f2', 'Credit Score_f2', 'Previous Claims_f2']
X_comb = ['Marital_Feedback']
X_freq = ['Annual Income_Freq','Credit Score_Freq', 'Health Score_Freq', 'Annual Income_Freq_d', 'Health Score_Freq_d']

sc = sgutil.SGCache('img', 'result')

In [None]:
c_type = pd.CategoricalDtype(
    ['Unk'] + df_train['Health Score_fz'].astype('str').value_counts().pipe(
        lambda x: x.loc [x > 1]
    ).index.tolist()
)
df_train['Health Score_c'] = dproc.rearrange_cat(
    df_train['Health Score_fz'].astype('str').astype('category'), c_type, lambda c, v: 0 if v not in c else v, use_set = True
)

df_test['Health Score_c'] = dproc.rearrange_cat(
    df_test['Health Score_fz'].astype('str').astype('category'), c_type, lambda c, v: 0 if v not in c else v, use_set = True
)

c_type = pd.CategoricalDtype(
    ['Unk'] + df_train['Health Score_fz'].astype('str').value_counts().pipe(
        lambda x: x.loc [x > 2]
    ).index.tolist()
)
df_train['Health Score_c2'] = dproc.rearrange_cat(
    df_train['Health Score_fz'].astype('str').astype('category'), c_type, lambda c, v: 0 if v not in c else v, use_set = True
)

df_test['Health Score_c2'] = dproc.rearrange_cat(
    df_test['Health Score_fz'].astype('str').astype('category'), c_type, lambda c, v: 0 if v not in c else v, use_set = True
)

In [3]:
from sklearn.model_selection import train_test_split, KFold, ShuffleSplit, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, mean_squared_error

ss = ShuffleSplit(1, random_state=123)
skf = StratifiedKFold(5, random_state = 123, shuffle = True)
kf = KFold(5, random_state = 123, shuffle = True)

df_train['target_2'] =  pd.cut(df_train[target], [-np.inf, 6, np.inf], labels = [0, 1])
df_train['target_3'] =  pd.cut(df_train[target], [-np.inf, 6, 7.6, np.inf], labels = [0, 1, 2])
df_train['target_5'] =  pd.cut(df_train[target], [-np.inf, 4, 5, 6, 7.6, np.inf], labels = [0, 1, 2, 3, 4])

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size=validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index=df.index),
    'score_func': lambda df, prds: -(mean_squared_error(df[target].sort_index(), prds.clip(3.044523, 8.517193).sort_index()) ** 0.5),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': target,
}

config_2 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    'score_func': lambda df, prds: accuracy_score(df['target_3'].sort_index(), prds.idxmax(axis=1).sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': 'target_3',
}

config_3 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    'score_func': lambda df, prds: accuracy_score(df['target_5'].sort_index(), prds.idxmax(axis=1).sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': 'target_5',
}

config_4 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    'score_func': lambda df, prds: roc_auc_score(df['target_2'].sort_index(), prds.idxmax(axis=1).sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': 'target_2',
}

lr_adapter = sgml.SklearnAdapter(LinearRegression)
cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lgb_adapter =  sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)
nn_adapter = sgnn.NNAdapter(model=sgnn.NNRegressor)

cb_adapter_c = sgml.CBAdapter(cb.CatBoostClassifier)
lgb_adapter_c =  sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter_c = sgml.XGBAdapter(xgb.XGBClassifier)

# Classification

## 3 Class-model

In [4]:
lgb_c1 = sgml.CVModel.load_or_create('result', 'lgb_c1', kf, config_2, lgb_adapter_c)

In [5]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_menc + X_imp + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income'],
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb_c1.adhoc(df_train, ss, hparams)
result = lgb_c1.cv(df_train, hparams)
result['valid_scores']

[0.7532,
 0.7530833333333333,
 0.7528833333333333,
 0.7539833333333333,
 0.7532833333333333]

In [6]:
lgb_c1.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb_c1.cv_best_['prd'].rename(columns = lambda x: 't{}'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't_test',
        lambda : lgb_c1.get_predictor()(df_test).rename(columns = lambda x: 't{}'.format(x)), rerun = 0
    )
)

## 5 Class-model

In [7]:
lgb_c2 = sgml.CVModel.load_or_create('result', 'lgb_c2', skf, config_3, lgb_adapter_c)

In [8]:
hparams = {
    'model_params': {'n_estimators': 700, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_menc + X_imp + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income'],
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb_c2.adhoc(df_train, ss, hparams)
result = lgb_c2.cv(df_train, hparams)
result['valid_scores']

[0.7357208333333334,
 0.7352333333333333,
 0.7357,
 0.7356208333333333,
 0.7355958333333333]

In [9]:
lgb_c2.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb_c2.cv_best_['prd'].rename(columns = lambda x: 't5_{}'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't5_test',
        lambda : lgb_c2.get_predictor()(df_test).rename(columns = lambda x: 't5_{}'.format(x)), rerun = 0
    )
)

## 2 Class-model

In [10]:
lgb_c3 = sgml.CVModel.load_or_create('result', 'lgb_c3', skf, config_4, lgb_adapter_c)

In [11]:
hparams = {
    'model_params': {'n_estimators': 1200, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_menc + X_imp + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income'],
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb_c3.adhoc(df_train, ss, hparams)
result = lgb_c3.cv(df_train, hparams)
result['valid_scores']

[0.6773070323730878,
 0.6784434943766143,
 0.6758764798800874,
 0.6788507140193604,
 0.6775366488321382]

In [12]:
lgb_c3.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb_c3.cv_best_['prd'].iloc[:, :1].rename(columns = lambda x: 't2_0')
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't2_test',
        lambda : lgb_c3.get_predictor()(df_test).iloc[:, :1].rename(columns = lambda x: 't2_0'), rerun = 0
    )
)

## Classification 2

In [13]:
lgb2_c = sgml.CVModel.load_or_create('result', 'lgb2_c', kf, config_2, lgb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_imp2 + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'],
    'random_state': 123,
}
result = lgb2_c.cv(df_train, hparams)
lgb2_c.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb2_c.cv_best_['prd'].rename(columns = lambda x: 't{}_2'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't_test2',
        lambda : lgb2_c.get_predictor()(df_test).rename(columns = lambda x: 't{}_2'.format(x)), rerun = 0
    )
)

In [14]:
lgb2_c2 = sgml.CVModel.load_or_create('result', 'lgb2_c2', skf, config_3, lgb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 700, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_imp2 + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'],
    'random_state': 123,
}
result = lgb2_c2.cv(df_train, hparams)
lgb2_c2.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb2_c2.cv_best_['prd'].rename(columns = lambda x: 't5_{}_2'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't5_test2',
        lambda : lgb2_c2.get_predictor()(df_test).rename(columns = lambda x: 't5_{}_2'.format(x)), rerun = 0
    )
)

In [15]:
lgb2_c3 = sgml.CVModel.load_or_create('result', 'lgb2_c3', skf, config_4, lgb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 1200, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num  + X_imp2 + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'],
    'random_state': 123,
}
result = lgb2_c3.cv(df_train, hparams)
lgb2_c3.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb2_c3.cv_best_['prd'].iloc[:, :1].rename(columns = lambda x: 't2_0_2')
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't2_test2',
        lambda : lgb2_c3.get_predictor()(df_test).iloc[:, :1].rename(columns = lambda x: 't2_0_2'), rerun = 0
    )
)

In [41]:
cb_c3 = sgml.CVModel.load_or_create('result', 'cb_c3', skf, config_4, cb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Annual Income', 'Credit Score'],
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb + ['Health Score_c2'], 
    'random_state': 123,
}
result = cb_c3.cv(df_train, hparams, task_type='GPU')
cb_c3.train(df_train, task_type='GPU')
df_train = dproc.join_and_assign(
    df_train,
    cb_c3.cv_best_['prd'].iloc[:, :1].rename(columns = lambda x: 't2_0_2c')
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't2_test2c',
        lambda : cb_c3.get_predictor()(df_test).iloc[:, :1].rename(columns = lambda x: 't2_0_2c'), rerun = 0
    )
)

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

# Main Model

In [16]:
X_clf = ['t0', 't1', 't2', 't5_0', 't5_1', 't5_2', 't5_3', 't5_4', 't2_0']

In [17]:
X_clf2 = ['{}_2'.format(i) for i in X_clf]

In [42]:
X_clf2c = ['t2_0_2c']

## LR

In [18]:
lr = sgml.CVModel.load_or_create('result', 'lr', kf, config, lr_adapter)

In [47]:
hparams = {
    'X_mm': ['Previous Claims_f', 'Credit Score_f', 'Annual Income_fl', 'Annual Income_Freqz'],
    'X_num': ['Year_2019', 'Annual Income_isna', 'Credit Score_isna', 'Health Score_isna', 'Health Score_50'] + X_menc + ['t0', 't1', 't5_0', 't5_1', 't5_2', 't5_3', 't2_0', 't2_0_2c'],
    'X_ohe': ['Marital_Feedback'], 'ohe': {'drop': 'first'},
}
result = lr.cv(df_train, hparams)
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-0.9980430105512881,
 -1.0011878619164016,
 -1.0040095531014983,
 -1.0039241956933571,
 -1.004249621690207]

## LR2

In [45]:
lr2 = sgml.CVModel.load_or_create('result', 'lr2', kf, config, lr_adapter)
hparams = {
    'X_mm': ['Previous Claims_f2', 'Credit Score_f2', 'Annual Income_fl2', 'Annual Income_Freqz'],
    'X_num': ['Year_2019', 'Annual Income_isna', 'Credit Score_isna', 'Health Score_isna', 'Health Score_502'] + X_menc + ['t0_2', 't1_2', 't5_0_2', 't5_1_2', 't5_2_2', 't5_3_2', 't2_0_2', 't2_0_2c'],
    'X_ohe': ['Marital_Feedback'], 'ohe': {'drop': 'first'},
}
result = lr2.cv(df_train, hparams)
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-0.9977664309969787,
 -1.001070138224725,
 -1.0039060157476771,
 -1.0038153290545833,
 -1.0039445810014593]

## LGB1

In [21]:
lgb1 = sgml.CVModel.load_or_create('result', 'lgb1', kf, config, lgb_adapter)

In [48]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 127, 'learning_rate': 0.007},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2 + ['t2_0_2c'], 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

[-0.9893291156678362,
 -0.9929592979108764,
 -0.9955115710039993,
 -0.9957358475740496,
 -0.9957158632371793]

## LGB2

In [23]:
lgb2 = sgml.CVModel.load_or_create('result', 'lgb2', kf, config, lgb_adapter)

In [50]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 63, 'learning_rate': 0.007},
    'X_num': X_num + X_freq + X_imp2 + X_clf2 + ['t2_0_2c'],
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb2.adhoc(df_train, ss, hparams)
result = lgb2.cv(df_train, hparams)
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

[-0.9897231334445228,
 -0.9932241561750519,
 -0.995700492194899,
 -0.9961208750294792,
 -0.9960026454821133]

## CB1

In [27]:
cb1 = sgml.CVModel.load_or_create('result', 'cb1', kf, config, cb_adapter)

In [28]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.04},
    'X_tgt': ['Health Score', 'Annual Income'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_menc + X_imp + X_clf, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb1.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb1.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

[-1.0196199959691037,
 -1.0232915358709402,
 -1.0253532999410357,
 -1.0264667961515663,
 -1.025935222828204]

## CB2

In [29]:
cb2 = sgml.CVModel.load_or_create('result', 'cb2', kf, config, cb_adapter)

In [30]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Health Score', 'Annual Income'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_menc + X_imp + X_clf, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb2.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb2.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

[-1.0195165759443854,
 -1.0235181016085886,
 -1.0257297706100799,
 -1.0267506522703393,
 -1.0259914801276941]

## CB3

In [31]:
cb3 = sgml.CVModel.load_or_create('result', 'cb3', kf, config, cb_adapter)

In [62]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2 + ['t2_0_2c'], 
    'X_cat': X_ord + X_bool + X_comb + ['Health Score_c'], 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb3.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb3.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-0.9818935340167837,
 -0.9861427115353197,
 -0.988926020374132,
 -0.9883701883295035,
 -0.9890652461301852]

## CB4

In [37]:
cb4 = sgml.CVModel.load_or_create('result', 'cb4', kf, config, cb_adapter)

In [63]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2 + ['t2_0_2c'], 
    'X_cat': X_ord + X_bool + X_comb + ['Health Score_c2'], 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb4.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb4.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-0.981812193496832,
 -0.9849812149882682,
 -0.9871678804972944,
 -0.9875514786880626,
 -0.9865437819867661]

## XGB1

In [54]:
xgb1 = sgml.CVModel.load_or_create('result', 'xgb1', kf, config, xgb_adapter)

In [55]:
hparams = {
    'model_params': {'n_estimators': 1100, 'max_depth': 5, 'learning_rate': 0.01},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2 + ['t2_0_2c'], 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = xgb1.adhoc(df_train, ss, hparams, device='cuda')
result = xgb1.cv(df_train, hparams, device='cuda')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

[-0.9903399770433169,
 -0.9938605912768593,
 -0.9962033678960935,
 -0.9967098350383355,
 -0.9965209045054976]

## XGB2

In [56]:
xgb2 = sgml.CVModel.load_or_create('result', 'xgb2', kf, config, xgb_adapter)

In [57]:
hparams = {
    'model_params': {'n_estimators': 1300, 'max_depth': 4, 'learning_rate': 0.02},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2 + ['t2_0_2c'], 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = xgb2.adhoc(df_train, ss, hparams, device='cuda')
result = xgb2.cv(df_train, hparams, device='cuda')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

[-0.9904394294862963,
 -0.9938176497938996,
 -0.9960850135569694,
 -0.9965821209636014,
 -0.9966335554321478]

## NN1

In [33]:
nn1 = sgml.CVModel.load_or_create('result', 'nn1', kf, config, nn_adapter)

In [34]:
def nn_cat_param(df, name, size):
    return name, len(df[name].cat.categories), size

X_cat = ['Marital_Feedback']
X_nn_emb = [
    ('Marital_Feedback', 3)
]

nn_params = {
    'embedding':[(1, c, es, 0, 0) for _, c, es in [nn_cat_param(df_train, v, n) for v, n in X_nn_emb]], 
    'config':  [
        {'unit': 16, 'activation': 'relu', 'batch_norm': False},
    ]
}

hparams = {
    'model_params': {
        'model_params': nn_params,
        'epochs': 30,
        'optimizer': ('Adam', {'learning_rate': 0.001}),
        'batch_size': 2048, 'shuffle_size': 102400,
        'early_stopping': None, 'reduce_lr_on_plateau': None, 'lr_scheduler': None
    }, 
    'X_cat': [i for i, _ in X_nn_emb],
    'X_tgt': ['Health Score', 'Annual Income'], 'tgt': {'target_type': 'continuous'},
    'X_mm': ['Previous Claims_f', 'Credit Score_f', 'Annual Income_fl', 'Annual Income_Freqz'],
    'X_num': X_bool + X_menc + ['t0', 't1', 't5_0', 't5_1', 't5_2', 't5_3', 't2_0'],
    'X_ohe': X_ord, 
    'ohe': {'drop': 'first', 'min_frequency': 50, 'sparse_output': False},
    #'validation_fraction': 0.1
}

##nn1.adhoc(df_train, ss, hparams)
result = nn1.cv(df_train, hparams)
result['valid_scores']

[-1.0210798056831674,
 -1.024425291976256,
 -1.0266213433574347,
 -1.029317316362635,
 -1.0274583295739663]

# Ensemble

## Stk1

In [89]:
models = [lgb1, xgb1, xgb2, lgb2, cb3, cb4]
pd.Series({i.name: i.cv_best_['score'] for i in models}).rename('score').to_frame().T.sort_values('score', axis=1)

Unnamed: 0,xgb1,xgb2,lgb2,lgb1,cb3,cb4
score,-0.994727,-0.994712,-0.994154,-0.99385,-0.98688,-0.985611


In [90]:
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.corr()

Unnamed: 0,lgb1,xgb1,xgb2,lgb2,cb3,cb4,Premium Amount_l
lgb1,1.0,0.994759,0.993381,0.99768,0.977165,0.97258,0.421304
xgb1,0.994759,1.0,0.997724,0.99641,0.978251,0.973465,0.419563
xgb2,0.993381,0.997724,1.0,0.994932,0.978183,0.973353,0.41957
lgb2,0.99768,0.99641,0.994932,1.0,0.978117,0.973497,0.420733
cb3,0.977165,0.978251,0.978183,0.978117,1.0,0.991574,0.434881
cb4,0.97258,0.973465,0.973353,0.973497,0.991574,1.0,0.437276
Premium Amount_l,0.421304,0.419563,0.41957,0.420733,0.434881,0.437276,1.0


In [91]:
lr_stk = sgml.CVModel.load_or_create('result', 'lr_stk', kf, config, lr_adapter)

In [92]:
result = lr_stk.cv(df_stk, {
    'model_params': {},
    'X_num': df_stk.columns[:-1].tolist()
})
np.mean(result['valid_scores'])

-0.9845649844910975

In [93]:
for i in models:
    if i.name.startswith('cb'):
        i.train(df_train, task_type='GPU')
    elif i.name.startswith('xgb'):
        i.train(df_train, device='cuda')
    else:
        i.train(df_train)
lr_stk.train(df_stk)

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

{'variables': ['cb4', 'xgb2', 'lgb2', 'xgb1', 'cb3', 'lgb1'],
 'train_shape': (1200000, 6),
 'target': 'Premium Amount_l',
 'target_func': None}

In [94]:
df_vt = pd.concat([
    i.get_predictor()(df_test).rename(i.name)
    for i in models
], axis=1)

In [95]:
s_prd = (np.exp(
    lr_stk.get_predictor()(df_vt)
) - 1).clip(20, 5000).rename('Premium Amount')

In [96]:
s_prd.to_frame().to_csv(os.path.join('result', 'submission6.csv'))

In [97]:
!head result/submission6.csv

id,Premium Amount
1200000,924.5759893103473
1200001,939.0543631244581
1200002,882.9565646339853
1200003,777.687407369077
1200004,837.2410585618129
1200005,931.6490615198575
1200006,837.7268376155982
1200007,706.5031624083746
1200008,245.25572361601002


In [105]:
!kaggle competitions submit -c playground-series-s4e12 -f result/submission6.csv -m "6"

100%|██████████████████████████████████████| 19.9M/19.9M [00:07<00:00, 2.89MB/s]
400 - Bad Request - Submission not allowed:  Your team has used its daily Submission allowance (5) today, please try again tomorrow UTC (10 hours from now).
