In [1]:
import os, sys, re
from functools import partial

import scipy
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import polars as pl

import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import dproc, sgml, sgutil, sgnn

print(sys.version)
for i in [scipy, sklearn, mpl, sns, np, pd, pl, cb, lgb, xgb]:
    try:
        print(i.__name__, i.__version__)
    except:
        pass

2024-12-30 07:12:33.057621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735542753.069288   53768 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735542753.072984   53768 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-30 07:12:33.085414: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
scipy 1.12.0
sklearn 1.5.2
matplotlib 3.8.4
seaborn 0.13.2
numpy 1.26.4
pandas 2.2.3
polars 1.12.0
catboost 1.2.5
xgboost 2.1.2


In [2]:
data_path = 'data'

df_train = pd.read_feather(os.path.join(data_path, 'train.feather'))
df_test = pd.read_feather(os.path.join(data_path, 'test.feather'))
pd_vars= dproc.PD_Vars.load(os.path.join('data', 'vars'))
target = 'Premium Amount_l'

X_bool = ['Gender', 'Smoking Status']
X_num = ['Age', 'Annual Income', 'Credit Score', 'Previous Claims_fm1', 'Vehicle Age', 'Policy Days', 'Number of Dependents_fz']
X_ord = ['Education Level', 'Occupation_funk', 'Location', 'Policy Type', 'Exercise Frequency', 'Weekday', 'Year', 
         'Insurance Duration_fm', 'Property Type']
X_menc = ['CA_C', 'CA_A']
X_imp = ['Health Score_50', 'Annual Income_f', 'Credit Score_f', 'Previous Claims_f']
X_imp2 = ['Health Score_502', 'Annual Income_f2', 'Credit Score_f2', 'Previous Claims_f2']
X_comb = ['Marital_Feedback']
X_freq = ['Annual Income_Freq','Credit Score_Freq', 'Health Score_Freq', 'Annual Income_Freq_d', 'Health Score_Freq_d']

sc = sgutil.SGCache('img', 'result')

In [60]:
from sklearn.model_selection import train_test_split, KFold, ShuffleSplit, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, mean_squared_error

ss = ShuffleSplit(1, random_state=123)
ss2 = ShuffleSplit(1, train_size = 0.6, random_state=123)
skf = StratifiedKFold(5, random_state = 123, shuffle = True)
kf = KFold(5, random_state = 123, shuffle = True)

df_train['target_2'] =  pd.cut(df_train[target], [-np.inf, 6, np.inf], labels = [0, 1])
df_train['target_3'] =  pd.cut(df_train[target], [-np.inf, 6, 7.6, np.inf], labels = [0, 1, 2])
df_train['target_5'] =  pd.cut(df_train[target], [-np.inf, 4, 5, 6, 7.6, np.inf], labels = [0, 1, 2, 3, 4])

def get_validation_splitter(validation_fraction):
    return lambda x: train_test_split(x, test_size=validation_fraction)

config = {
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index=df.index),
    'score_func': lambda df, prds: -(mean_squared_error(df[target].sort_index(), prds.clip(3.044523, 8.517193).sort_index()) ** 0.5),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': target,
}

config_2 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    'score_func': lambda df, prds: accuracy_score(df['target_3'].sort_index(), prds.idxmax(axis=1).sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': 'target_3',
}

config_3 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    'score_func': lambda df, prds: accuracy_score(df['target_5'].sort_index(), prds.idxmax(axis=1).sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': 'target_5',
}

config_4 = {
    'predict_func': lambda m, df, X: pd.DataFrame(m.predict_proba(df[X]), index=df.index),
    'score_func': lambda df, prds: roc_auc_score(df['target_2'].sort_index(), prds.idxmax(axis=1).sort_index()),
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': False,
    'y': 'target_2',
}

lr_adapter = sgml.SklearnAdapter(LinearRegression)
cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)
lgb_adapter =  sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)
nn_adapter = sgnn.NNAdapter(model=sgnn.NNRegressor)

cb_adapter_c = sgml.CBAdapter(cb.CatBoostClassifier)
lgb_adapter_c =  sgml.LGBMAdapter(lgb.LGBMClassifier)
xgb_adapter_c = sgml.XGBAdapter(xgb.XGBClassifier)

In [4]:
cat_list = []
for train_idx, _ in kf.split(df_train):
    s_vcnt = df_train['Health Score_fz'].astype('str').value_counts()
    cat_list.append(s_vcnt.loc[s_vcnt >= 3])


In [5]:
df_train['Health Score_c'] = df_train['Health Score_fz'].astype('str').astype('category')
df_test['Health Score_c'] = df_test['Health Score_fz'].astype('str').astype('category')

# Classification

## 3 Class-model

In [7]:
lgb_c1 = sgml.CVModel.load_or_create('result', 'lgb_c1', kf, config_2, lgb_adapter_c)

In [8]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_menc + X_imp + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income'],
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb_c1.adhoc(df_train, ss, hparams)
result = lgb_c1.cv(df_train, hparams)
result['valid_scores']

[0.7534333333333333,
 0.7531291666666666,
 0.7526583333333333,
 0.7540916666666667,
 0.75335]

In [9]:
lgb_c1.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb_c1.cv_best_['prd'].rename(columns = lambda x: 't{}'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't_test',
        lambda : lgb_c1.get_predictor()(df_test).rename(columns = lambda x: 't{}'.format(x)), rerun = 0
    )
)

## 5 Class-model

In [10]:
lgb_c2 = sgml.CVModel.load_or_create('result', 'lgb_c2', skf, config_3, lgb_adapter_c)

In [11]:
hparams = {
    'model_params': {'n_estimators': 700, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_menc + X_imp + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income'],
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb_c2.adhoc(df_train, ss, hparams)
result = lgb_c2.cv(df_train, hparams)
result['valid_scores']

[0.7358041666666667,
 0.7348916666666667,
 0.7354583333333333,
 0.7355083333333333,
 0.7355791666666667]

In [12]:
lgb_c2.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb_c2.cv_best_['prd'].rename(columns = lambda x: 't5_{}'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't5_test',
        lambda : lgb_c2.get_predictor()(df_test).rename(columns = lambda x: 't5_{}'.format(x)), rerun = 0
    )
)

## 2 Class-model

In [13]:
lgb_c3 = sgml.CVModel.load_or_create('result', 'lgb_c3', skf, config_4, lgb_adapter_c)

In [14]:
hparams = {
    'model_params': {'n_estimators': 1200, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_menc + X_imp + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income'],
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb_c3.adhoc(df_train, ss, hparams)
result = lgb_c3.cv(df_train, hparams)
result['valid_scores']

[0.6775183458512728,
 0.6786694499279009,
 0.6759428766088227,
 0.6784740707117934,
 0.6772269691294307]

In [15]:
lgb_c3.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb_c3.cv_best_['prd'].iloc[:, :1].rename(columns = lambda x: 't2_0')
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't2_test',
        lambda : lgb_c3.get_predictor()(df_test).iloc[:, :1].rename(columns = lambda x: 't2_0'), rerun = 0
    )
)

## Classification 2

In [16]:
lgb2_c = sgml.CVModel.load_or_create('result', 'lgb2_c', kf, config_2, lgb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_imp2 + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'],
    'random_state': 123,
}
result = lgb2_c.cv(df_train, hparams)
lgb2_c.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb2_c.cv_best_['prd'].rename(columns = lambda x: 't{}_2'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't_test2',
        lambda : lgb2_c.get_predictor()(df_test).rename(columns = lambda x: 't{}_2'.format(x)), rerun = 0
    )
)

In [17]:
lgb2_c2 = sgml.CVModel.load_or_create('result', 'lgb2_c2', skf, config_3, lgb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 700, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num + X_imp2 + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'],
    'random_state': 123,
}
result = lgb2_c2.cv(df_train, hparams)
lgb2_c2.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb2_c2.cv_best_['prd'].rename(columns = lambda x: 't5_{}_2'.format(x))
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't5_test2',
        lambda : lgb2_c2.get_predictor()(df_test).rename(columns = lambda x: 't5_{}_2'.format(x)), rerun = 0
    )
)

In [18]:
lgb2_c3 = sgml.CVModel.load_or_create('result', 'lgb2_c3', skf, config_4, lgb_adapter_c)
hparams = {
    'model_params': {'n_estimators': 1200, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': X_num  + X_imp2 + X_freq, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'],
    'random_state': 123,
}
result = lgb2_c3.cv(df_train, hparams)
lgb2_c3.train(df_train)
df_train = dproc.join_and_assign(
    df_train,
    lgb2_c3.cv_best_['prd'].iloc[:, :1].rename(columns = lambda x: 't2_0_2')
)
df_test = dproc.join_and_assign(
    df_test,
    sc.cache_result(
        't2_test2',
        lambda : lgb2_c3.get_predictor()(df_test).iloc[:, :1].rename(columns = lambda x: 't2_0_2'), rerun = 0
    )
)

In [19]:
X_clf = ['t0', 't1', 't2', 't5_0', 't5_1', 't5_2', 't5_3', 't5_4', 't2_0']

In [20]:
X_clf2 = ['{}_2'.format(i) for i in X_clf]

# Main Model

## LR

In [21]:
lr = sgml.CVModel.load_or_create('result', 'lr', kf, config, lr_adapter)

In [22]:
hparams = {
    'X_mm': ['Previous Claims_f', 'Credit Score_f', 'Annual Income_fl', 'Annual Income_Freqz'],
    'X_num': ['Year_2019', 'Annual Income_isna', 'Credit Score_isna', 'Health Score_isna', 'Health Score_50'] + ['t0', 't1', 't5_0', 't5_1', 't5_2', 't5_3', 't2_0'],
    'X_ohe': ['Marital_Feedback'], 'ohe': {'drop': 'first'},
}
result = lr.cv(df_train, hparams)
result['valid_scores']

[-1.020907947738057,
 -1.0244113313734193,
 -1.0266929097821098,
 -1.027685938981645,
 -1.0274033734243435]

## LR2

In [23]:
lr2 = sgml.CVModel.load_or_create('result', 'lr2', kf, config, lr_adapter)
hparams = {
    'X_mm': ['Previous Claims_f2', 'Credit Score_f2', 'Annual Income_fl2', 'Annual Income_Freqz'],
    'X_num': ['Year_2019', 'Annual Income_isna', 'Credit Score_isna', 'Health Score_isna', 'Health Score_502'] + ['t0_2', 't1_2', 't5_0_2', 't5_1_2', 't5_2_2', 't5_3_2', 't2_0_2'],
    'X_ohe': ['Marital_Feedback'], 'ohe': {'drop': 'first'},
}
result = lr2.cv(df_train, hparams)
result['valid_scores']

[-1.0209981802987307,
 -1.0242917289700002,
 -1.026519528835961,
 -1.0277978112709072,
 -1.0273116323377496]

## LGB1

In [24]:
lgb1 = sgml.CVModel.load_or_create('result', 'lgb1', kf, config, lgb_adapter)

In [25]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 127, 'learning_rate': 0.007},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb1.adhoc(df_train, ss, hparams)
result = lgb1.cv(df_train, hparams)
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

[-1.0194825509544807,
 -1.0229964657428703,
 -1.0252964849015755,
 -1.0265707613818353,
 -1.0259000542442873]

## LGB2

In [26]:
lgb2 = sgml.CVModel.load_or_create('result', 'lgb2', kf, config, lgb_adapter)

In [27]:
hparams = {
    'model_params': {'n_estimators': 1000, 'num_leaves': 63, 'learning_rate': 0.007},
    'X_num': X_num + X_freq + X_imp2 + X_clf2,
    'X_cat': X_ord + X_bool + X_comb, 
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = lgb2.adhoc(df_train, ss, hparams)
result = lgb2.cv(df_train, hparams)
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

Round:   0%|          | 0/1000 [00:00<?, ?it/s]

[-1.0196454000717559,
 -1.0230054346561774,
 -1.0252564018679065,
 -1.0264906829663503,
 -1.0259224138735792]

## CB1

In [28]:
cb1 = sgml.CVModel.load_or_create('result', 'cb1', kf, config, cb_adapter)

In [29]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 9, 'learning_rate': 0.04},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp + X_clf, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb1.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb1.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-1.0195329716539823,
 -1.023208763213835,
 -1.0255487417970068,
 -1.0265576422608642,
 -1.0258635511682561]

## CB2

In [30]:
cb2 = sgml.CVModel.load_or_create('result', 'cb2', kf, config, cb_adapter)

In [31]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp + X_clf, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb2.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb2.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-1.0198124603222762,
 -1.0233958352835582,
 -1.0257110484408136,
 -1.0267100133305842,
 -1.0261159618593299]

## CB3

In [32]:
cb3 = sgml.CVModel.load_or_create('result', 'cb3', kf, config, cb_adapter)

In [33]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_ord': ['Health Score_c'], 'ord': {'min_frequency': 5, 'unknown_value': -1, 'handle_unknown': "use_encoded_value"},
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb3.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb3.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-1.025042488333008,
 -1.0219134920026633,
 -1.0240171078328728,
 -1.0217235015230068,
 -1.0290155064007989]

## CB4

In [34]:
cb4 = sgml.CVModel.load_or_create('result', 'cb4', kf, config, cb_adapter)

In [35]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_ord': ['Health Score_c'], 'ord': {'min_frequency': 7, 'unknown_value': -1, 'handle_unknown': "use_encoded_value"},
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb4.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb4.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-1.0195740108384055,
 -1.0212720754574072,
 -1.0233300397270204,
 -1.0262117440585858,
 -1.0242325734967839]

## CB5

In [68]:
cb5 = sgml.CVModel.load_or_create('result', 'cb5', kf, config, cb_adapter)

In [69]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_ord': ['Health Score_c'], 'ord': {'min_frequency': 15, 'unknown_value': -1, 'handle_unknown': "use_encoded_value"},
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb4.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb5.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-1.0202103090565935,
 -1.0236188697776036,
 -1.0259082541852604,
 -1.0271804203847197,
 -1.0268048773138716]

## CB6

In [95]:
cb6 = sgml.CVModel.load_or_create('result', 'cb6', kf, config, cb_adapter)

In [96]:
hparams = {
    'model_params': {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04},
    'X_tgt': ['Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'X_ord': ['Health Score_c', 'Annual Income_fz'], 'ord': {'min_frequency': 15, 'unknown_value': -1, 'handle_unknown': "use_encoded_value"},
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = cb6.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb6.cv(df_train, hparams, task_type='GPU')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

[-1.0203919240791612,
 -1.0236693081353079,
 -1.0262721762888998,
 -1.0272071407716024,
 -1.0268210944227152]

## XGB1

In [36]:
xgb1 = sgml.CVModel.load_or_create('result', 'xgb1', kf, config, xgb_adapter)

In [37]:
hparams = {
    'model_params': {'n_estimators': 1100, 'max_depth': 5, 'learning_rate': 0.01},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = xgb1.adhoc(df_train, ss, hparams, device='cuda')
result = xgb1.cv(df_train, hparams, device='cuda')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

Round:   0%|          | 0/1100 [00:00<?, ?it/s]

[-1.0194211586161719,
 -1.0229488691452266,
 -1.0252834881715636,
 -1.0264535390314664,
 -1.025776468979094]

## XGB2

In [38]:
xgb2 = sgml.CVModel.load_or_create('result', 'xgb2', kf, config, xgb_adapter)

In [39]:
hparams = {
    'model_params': {'n_estimators': 1300, 'max_depth': 4, 'learning_rate': 0.02},
    'X_tgt': ['Health Score', 'Annual Income', 'Credit Score'], 'tgt': {'target_type': 'continuous'},
    'X_num': X_num + X_freq + X_imp2 + X_clf2, 
    'X_cat': X_ord + X_bool + X_comb, 
    'random_state': 123,
    #'validation_fraction': 0.1
}
#result = xgb2.adhoc(df_train, ss, hparams, device='cuda')
result = xgb2.cv(df_train, hparams, device='cuda')
result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

Round:   0%|          | 0/1300 [00:00<?, ?it/s]

[-1.0194907344791828,
 -1.0229514329107898,
 -1.0251839565724474,
 -1.0264971475686988,
 -1.025827776049124]

## NN1

In [None]:
#nn1 = sgml.CVModel.load_or_create('result', 'nn1', kf, config, nn_adapter)
"""
def nn_cat_param(df, name, size):
    return name, len(df[name].cat.categories), size

X_cat = ['Marital_Feedback']
X_nn_emb = [
    ('Marital_Feedback', 3)
]

nn_params = {
    'embedding':[(1, c, es, 0, 0) for _, c, es in [nn_cat_param(df_train, v, n) for v, n in X_nn_emb]], 
    'config':  [
        {'unit': 16, 'activation': 'relu', 'batch_norm': False},
    ]
}

hparams = {
    'model_params': {
        'model_params': nn_params,
        'epochs': 30,
        'optimizer': ('Adam', {'learning_rate': 0.001}),
        'batch_size': 2048, 'shuffle_size': 102400,
        'early_stopping': None, 'reduce_lr_on_plateau': None, 'lr_scheduler': None
    }, 
    'X_cat': [i for i, _ in X_nn_emb],
    'X_tgt': ['Health Score', 'Annual Income'], 'tgt': {'target_type': 'continuous'},
    'X_mm': ['Previous Claims_f', 'Credit Score_f', 'Annual Income_fl', 'Annual Income_Freqz'],
    'X_num': X_bool + X_menc + ['t0', 't1', 't5_0', 't5_1', 't5_2', 't5_3', 't2_0'],
    'X_ohe': X_ord, 
    'ohe': {'drop': 'first', 'min_frequency': 50, 'sparse_output': False},
    #'validation_fraction': 0.1
}

##nn1.adhoc(df_train, ss, hparams)
result = nn1.cv(df_train, hparams)
result['valid_scores']
"""

# Ensemble

## stk1

In [113]:
models = [lr2, lgb1, xgb1, cb1, cb2, lgb2, xgb2, cb3, cb4, cb5, cb6]
pd.Series({i.name: i.cv_best_['score'] for i in models}).rename('score').to_frame().T.sort_values('score', axis=1)

Unnamed: 0,lr2,cb6,cb5,cb2,cb3,cb1,lgb2,lgb1,xgb2,xgb1,cb4
score,-1.025378,-1.024872,-1.024745,-1.024349,-1.024342,-1.024142,-1.024064,-1.024049,-1.02399,-1.023977,-1.022924


In [114]:
df_stk = sgml.stack_cv(models, df_train[target])
df_stk.corr()

Unnamed: 0,lr2,lgb1,xgb1,cb1,cb2,lgb2,xgb2,cb3,cb4,cb5,cb6,Premium Amount_l
lr2,1.0,0.986127,0.988087,0.98267,0.98111,0.987984,0.987346,0.893229,0.958015,0.989673,0.989013,0.352753
lgb1,0.986127,1.0,0.995389,0.988369,0.987087,0.99707,0.994223,0.900946,0.962067,0.988992,0.988081,0.355954
xgb1,0.988087,0.995389,1.0,0.989606,0.988119,0.996755,0.997719,0.902038,0.963309,0.990482,0.989526,0.356127
cb1,0.98267,0.988369,0.989606,1.0,0.995282,0.989358,0.989218,0.899479,0.960098,0.98649,0.985426,0.355736
cb2,0.98111,0.987087,0.988119,0.995282,1.0,0.98796,0.98772,0.898342,0.959006,0.985437,0.984336,0.355247
lgb2,0.987984,0.99707,0.996755,0.989358,0.98796,1.0,0.995501,0.901733,0.963037,0.990116,0.989182,0.355925
xgb2,0.987346,0.994223,0.997719,0.989218,0.98772,0.995501,1.0,0.901926,0.96312,0.990165,0.989185,0.356096
cb3,0.893229,0.900946,0.902038,0.899479,0.898342,0.901733,0.901926,1.0,0.957871,0.901973,0.902662,0.360874
cb4,0.958015,0.962067,0.963309,0.960098,0.959006,0.963037,0.96312,0.957871,1.0,0.965813,0.965811,0.360664
cb5,0.989673,0.988992,0.990482,0.98649,0.985437,0.990116,0.990165,0.901973,0.965813,1.0,0.995374,0.354287


In [115]:
lr_stk = sgml.CVModel.load_or_create('result', 'lr_stk', kf, config, lr_adapter)

In [116]:
result = lr_stk.cv(df_stk, {
    'model_params': {},
    'X_num': ['lr2', 'lgb1', 'xgb1', 'cb1', 'cb2', 'lgb2', 'xgb2', 'cb3', 'cb4', 'cb5', 'cb6']
})
np.mean(result['valid_scores'])

-1.0188137343357986

In [None]:
for i in models:
    if i.name.startswith('cb'):
        i.train(df_train, task_type='GPU')
    elif i.name.startswith('xgb'):
        i.train(df_train, device='cuda')
    else:
        i.train(df_train)
lr_stk.train(df_stk)

In [90]:
df_vt = pd.concat([
    i.get_predictor()(df_test).rename(i.name)
    for i in models
], axis=1)

In [56]:
s_prd = (np.exp(
    lr_stk.get_predictor()(df_vt)
) - 1).clip(20, 5000).rename('Premium Amount')

In [58]:
s_prd.to_frame().to_csv(os.path.join('result', 'submission10.csv'))

In [None]:
#!kaggle competitions submit -c playground-series-s4e12 -f submission10.csv -m "10"