In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import matplotlib as mpl
import matplotlib.pyplot as plt

import sgml, sgutil

print(sys.version)
for i in [pd, np, sklearn, lgb, xgb, cb, mpl]:
    print(i.__name__, i.__version__)

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
numpy 1.26.4
sklearn 1.5.2
lightgbm 4.3.0
xgboost 2.1.1
catboost 1.2.5
matplotlib 3.8.4


# 초기화

In [2]:
data_path = 'data'
result_path = 'result'
img_path = 'img'

if not os.path.isdir(data_path):
    os.mkdir(data_path)

files = {
    k: os.path.join(p, f)
    for k, p, f in [
        ('train', data_path, 'train.csv'),
        ('test', data_path, 'test.csv'),
        ('org_train', data_path, 'train_org.csv'),
        ('org_test', data_path, 'test_org.csv'),
        ('train_pkl', data_path, 'train.pkl'),
        ('org_pkl', data_path, 'org.pkl'),
        ('test_pkl', data_path, 'test.pkl'),
        ('var_pkl', data_path, 'var.pkl'),
        ('model_pkl', result_path, 'stk_s4_ep7.pkl'),
        ('model1_pkl', result_path, 'stk1_s4_ep7.pkl'),
        ('model2_pkl', result_path, 'stk2_s4_ep7.pkl')
    ]
}
sc = sgutil.SGCache(img_path, result_path)

In [3]:
df_test = pd.read_pickle(files['test_pkl'])
df_var = pd.read_pickle(files['var_pkl'])

In [4]:
df_var

Unnamed: 0,var_type,Description,min,max,na,count,n_unique,f32,i32,i16,i8,dtype,src
Age,(continous),Age of the Customer.,20.0,85.0,0.0,19682810.0,66.0,True,True,True,True,Int8,
Gender,(dichotomous),Gender of the Customer.,,,0.0,19682810.0,2.0,True,True,True,True,Categorical,
Driving_License,(dichotomous),"0 for customer not having DL, 1 for customer h...",0.0,1.0,0.0,19682810.0,2.0,True,True,True,True,Int8,
Region_Code,(nominal),Unique code for the region of the customer.,0.0,52.0,0.0,19682810.0,53.25,True,True,True,True,Categorical,
Previously_Insured,(dichotomous),"0 for customer not having vehicle insurance, 1...",0.0,1.0,0.0,19682810.0,2.0,True,True,True,True,Categorical,
Vehicle_Age,(nominal),Age of the vehicle.,,,0.0,19682810.0,3.0,True,True,True,True,Categorical,
Vehicle_Damage,(dichotomous),Customer got his/her vehicle damaged in the pa...,,,0.0,19682810.0,2.0,True,True,True,True,Categorical,
Annual_Premium,(continous),The amount customer needs to pay as premium in...,2630.0,540165.0,0.0,19682810.0,46734.25,True,True,False,False,Float32,
Policy_Sales_Channel,(nominal),Anonymized Code for the channel of outreaching...,1.0,163.0,0.0,19682810.0,150.75,True,True,True,False,Categorical,
Vintage,(continous),"Number of Days, Customer has been associated w...",10.0,299.0,0.0,19682810.0,290.0,True,True,True,False,Int16,


In [5]:
target = 'Response'
X_cat = df_var.loc[df_var['dtype'] == 'Categorical'].index.tolist()
X_bool = df_var.loc[(df_var['n_unique'] == 2) & (~df_var['dtype'].isin(X_cat))].index.tolist() + ['is_Annual_Premium_mode']
X_bool = [i for i in X_bool if i != target and i not in X_cat]
X_num = df_var.loc[~df_var.index.isin(X_cat + X_bool + ['id', 'Response (Dependent Feature)'])].index.tolist()
X_all = df_test.columns.tolist()
print("Target:", target)
print("Categorical:", np.array(X_cat))
print("Boolean:", np.array(X_bool))
print("Number:", np.array(X_num))

Target: Response
Categorical: ['Gender' 'Region_Code' 'Previously_Insured' 'Vehicle_Age'
 'Vehicle_Damage' 'Policy_Sales_Channel' 'Annual_Premium_c' 'Age_c'
 'Vintage_c' 'VA' 'VAge' 'PA' 'VV' 'PAge' 'VP' 'PVc' 'PP' 'RV' 'V2V' 'RP'
 'PV2' 'PV' 'GV2' 'GP' 'AA' 'AV' 'VPAV' 'VPPV' 'VPPA' 'VPVGAP' 'VPVGAV'
 'VPVGAR' 'VPRV' 'VPVA' 'VPPAc' 'VPRAc']
Boolean: ['Driving_License' 'is_Annual_Premium_mode' 'is_Annual_Premium_mode']
Number: ['Age' 'Annual_Premium' 'Vintage']


In [6]:
df_test

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,...,VPAV,VPPV,VPPA,VPVGAP,VPVGAV,VPVGAR,VPRV,VPVA,VPPAc,VPRAc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228,...,Yes_1_20_142,Yes_0_28.0_34,No_0_124.0_36,No_0_< 1 Year_Female_61_32.0,Yes_1_< 1 Year_Female_22_222,No_0_1-2 Year_Female_34_6.0,Yes_0_51.0_136,No_0_< 1 Year_2630.0,No_0_124.0_16278.0,No_0_35.0_16353.0
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123,...,No_1_57_81,No_1_90.0_79,No_1_119.0_75,No_0_< 1 Year_Female_24_158.0,No_1_1-2 Year_Male_45_157,No_0_< 1 Year_Female_61_38.0,No_1_9.0_191,No_1_> 2 Years_41510.0,No_1_50.0_38872.0,No_1_20.0_33542.0
11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271,...,Yes_1_59_293,Yes_1_90.0_131,No_1_119.0_79,No_0_< 1 Year_Female_34_87.0,Yes_1_1-2 Year_Male_61_269,Yes_0_< 1 Year_Male_83_38.0,Yes_1_27.0_77,No_0_< 1 Year_15246.0,No_0_124.0_15246.0,No_0_35.0_17806.0
11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115,...,No_1_49_190,No_1_21.0_250,No_0_135.0_34,No_0_< 1 Year_Female_41_163.0,No_1_1-2 Year_Female_65_210,No_0_1-2 Year_Female_35_49.0,No_1_7.0_252,No_0_1-2 Year_27181.0,No_0_159.0_29717.0,No_0_41.0_47907.0
11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148,...,No_1_80_26,No_1_57.0_188,No_1_101.0_82,No_0_< 1 Year_Female_25_124.0,No_1_> 2 Years_Male_50_122,Yes_0_> 2 Years_Male_85_7.0,No_1_32.0_136,No_1_1-2 Year_36335.0,No_1_128.0_25179.0,No_1_24.0_31713.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19174659,Male,57,1,28.0,0,1-2 Year,Yes,51661.0,124.0,109,...,No_1_44_195,No_1_59.0_145,Yes_0_25.0_29,No_0_< 1 Year_Female_25_67.0,No_1_1-2 Year_Female_35_261,No_0_< 1 Year_Female_66_47.0,No_1_26.0_123,Yes_1_< 1 Year_39797.0,Yes_1_18.0_25159.0,Yes_1_45.0_31315.0
19174660,Male,28,1,50.0,1,< 1 Year,No,25651.0,152.0,184,...,Yes_0_46_160,Yes_0_22.0_194,No_0_129.0_46,No_0_< 1 Year_Female_42_25.0,Yes_0_1-2 Year_Female_47_42,No_0_> 2 Years_Male_75_11.0,Yes_0_25.0_122,No_0_1-2 Year_40953.0,No_0_131.0_27898.0,No_0_2.0_37310.0
19174661,Male,47,1,33.0,1,1-2 Year,No,2630.0,138.0,63,...,No_0_68_192,No_0_105.0_40,No_1_70.0_20,Yes_0_< 1 Year_Male_52_163.0,No_0_> 2 Years_Female_47_238,No_1_> 2 Years_Male_69_35.0,No_0_0.0_42,No_0_< 1 Year_15297.0,No_0_124.0_19186.0,No_0_35.0_17690.0
19174662,Male,30,1,28.0,0,< 1 Year,Yes,38866.0,124.0,119,...,No_1_53_117,No_1_129.0_15,No_0_68.0_41,No_0_< 1 Year_Female_21_98.0,No_1_1-2 Year_Male_22_79,No_0_< 1 Year_Female_54_28.0,No_1_36.0_213,Yes_0_< 1 Year_20687.0,No_1_43.0_64590.0,No_1_52.0_32967.0


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

cv = StratifiedKFold(5, shuffle=True, random_state=123)
ss = StratifiedShuffleSplit(1, train_size=0.8, random_state=123)

def predict(m, df_valid, X):
    return pd.Series(m.predict_proba(df_valid[X])[:, 1], index=df_valid.index)

def eval_metric(y_true, prds):
    return roc_auc_score(y_true[target].sort_index(), prds.sort_index())

def print_metrics(m):
    if len(m['train_metrics']) > 0:
        print("Valid. Score: {:.5f}±{:.5f}, Train Score: {:.5f}±{:.5f}".format(
            np.mean(m['valid_metrics']), np.std(m['valid_metrics']),
            np.mean(m['train_metrics']), np.std(m['train_metrics']))
        )
    else:
        print("Valid. Score: {:.5f}±{:.5f}".format(
            np.mean(m['valid_metrics']), np.std(m['valid_metrics'])
        ))

In [8]:
if os.path.isfile(files['model_pkl']):
    stk = sgml.SGStacking.load_model(files['model_pkl'])
else:
    stk = sgml.SGStacking(pd.read_pickle(files['train_pkl']), target, cv, predict, eval_metric, greater_better=True, return_train_scores=False)

## Logistic Regression

In [9]:
X_tgt = ['VPVA', 'VAge', 'PA', 'VV', 'PAge', 'VP', 'PVc', 'PP', 'RV', 'RP',  'VPPV', 'VPPA', 'VPVGAP', 'VPVGAV', 'VPVGAR', 'VPRV', 'VPAV', 'AA', 'AV', 'VPPAc', 'VPRAc']
X_ohe = ['Driving_License']
ct = ColumnTransformer([
    ('tgt', TargetEncoder(), X_tgt), 
    ('ohe', OneHotEncoder(drop='first'), X_ohe)
])
m, train_result = stk.eval_model('lr', LogisticRegression, {'C':100}, X_all, ct, progress_callback = sgml.ProgressCallBack())
print_metrics(m)

Round:   0%|          | 0/5 [00:00<?, ?it/s]

Valid. Score: 0.85545±0.00523


In [10]:
if not 'lr' in stk.get_selected_model():
    m = stk.select_model('lr')
    stk.save_model(files['model_pkl'])

## LightGBM

### LGBM1

In [11]:
X_lgb_tgt = ['VPVA', 'VAge', 'PA', 'VV', 'PAge', 'VP', 'PVc', 'PP', 'RV', 'RP',  'VPPV', 'VPPA', 'VPVGAP', 'VPVGAV', 'VPVGAR', 'VPRV', 'VPAV', 'AA', 'AV', 'VPPAc', 'VPRAc']
X_lgb_cat = ['V2V', 'PV2', 'PV', 'GV2', 'GP', 'Driving_License', 'Vehicle_Age', 'Gender', 'Region_Code', 'Policy_Sales_Channel', 'Previously_Insured', 'Vehicle_Damage', 'Vintage_c']
X_lgb_num = ['Annual_Premium', 'Age']
X_lgb = X_lgb_tgt + X_lgb_cat + X_lgb_num
result = stk.eval_model('lgbm1', lgb.LGBMClassifier, 
    {'verbose': -1, 'num_leaves': 255, 'n_estimators': 500, 'learning_rate': 0.01, 'colsample_bytree': 0.5, 'min_child_samples': 16}, 
    X_lgb, 
    preprocessor=ColumnTransformer([
        ('tgt', TargetEncoder(), X_lgb_tgt),
        ('pt', 'passthrough', X_lgb_cat + X_lgb_num)
    ]).set_output(transform="pandas"), 
    result_proc = sgml.lgb_learning_result, 
    train_params={
        'fit_params': {
            'categorical_feature': ['pt__' + i for i in X_lgb_cat], 'eval_metric': 'auc', 
            'callbacks': [sgml.LGBMFitProgressbar()]
        },
        'valid_config_proc': sgml.gb_valid_config
    }, progress_callback = sgml.ProgressCallBack()
)

Round:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/500 [00:00<?, ?it/s]

Round:   0%|          | 0/500 [00:00<?, ?it/s]

Round:   0%|          | 0/500 [00:00<?, ?it/s]

Round:   0%|          | 0/500 [00:00<?, ?it/s]

Round:   0%|          | 0/500 [00:00<?, ?it/s]

In [12]:
if not 'lgbm1' in stk.get_selected_model():
    m = stk.select_model('lgbm1')
    stk.save_model(files['model_pkl'])

Round:   0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
X_cat = ['Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Vintage']
X_num = ['Age']
X_tgt = ['Annual_Premium_c']
X_lgb = X_cat + X_num + X_tgt
ct = ColumnTransformer([
    ('pt', 'passthrough', X_cat + X_num),
    ('tgt', make_pipeline(TargetEncoder(), KBinsDiscretizer(n_bins=255, encode='ordinal', strategy='quantile')), X_tgt)
]).set_output(transform="pandas")

stk.eval_model_cv(ss, lgb.LGBMClassifier, 
    {'verbose': -1, 'n_estimators': 3000, 'learning_rate': 0.02, 'num_leaves': 511, 'min_child_samples': 32, 'max_bin': 255}, 
    X_lgb, 
    preprocessor=ct,
    result_proc = sgml.lgb_learning_result, 
    train_params = {'fit_params': {'categorical_feature': ['pt__' + i for i in X_cat]}}
)

In [None]:
X_cat = ['Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Annual_Premium', 'Vintage']
X_num = ['Age']
X_lgb = X_cat + X_num
stk.eval_model_cv(ss, lgb.LGBMClassifier, 
    {'verbose': -1, 'n_estimators': 3000, 'learning_rate': 0.03, 'num_leaves': 96, 'min_child_samples': 64, 'colsample_bytree': 0.5}, 
    X_lgb, 
    result_proc = sgml.lgb_learning_result, 
    train_params = {'fit_params': {'categorical_feature': X_cat}}
)

In [16]:
stk.get_model_results('lgbm1')

Unnamed: 0,model,preprocessor,model_params,X,train_metrics,valid_metrics,train_info
0,LGBMClassifier,,"{'verbose': -1, 'n_estimators': 3000, 'learnin...","Gender,Region_Code,Vehicle_Age,Policy_Sales_Ch...",0.91024±0.00011,0.89139±0.00016,{'result_proc': <function lgb_learning_result ...


# CatBoost

In [14]:
import catboost as cb
from sklearn.model_selection import train_test_split

In [15]:
def m_learning_result(m, train_result):
    return m, train_result

In [16]:
X_cat = ['Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Annual_Premium_c', 'Vintage_c', 'Age_c']
X_num = ['Age', 'Annual_Premium']
X_cb = X_cat + X_num
result = stk.eval_model('cb1', cb.CatBoostClassifier, 
    {'n_estimators': 3000, 'learning_rate': 0.06, 'max_depth': 9, 'task_type': 'GPU', 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'verbose' : False}, 
    X_cb, 
    result_proc = sgml.cb_learning_result, 
    train_params = {
        'fit_params': {'cat_features': X_cat, 'verbose': 500},
    }
)
result

({'model': catboost.core.CatBoostClassifier,
  'preprocessor': None,
  'model_param': {'n_estimators': 3000,
   'learning_rate': 0.06,
   'max_depth': 9,
   'task_type': 'GPU',
   'loss_function': 'Logloss',
   'eval_metric': 'AUC',
   'verbose': False},
  'train_metrics': [0.9287528528571969,
   0.9292573693743261,
   0.9286679661215391,
   0.9297874580132984,
   0.9289578693902849],
  'valid_metrics': [0.8948233787575748,
   0.8945409478969022,
   0.8948517263507246,
   0.8946383469231585,
   0.8950575702216534]},
 None)

In [17]:
stk.get_model_results('cb1')

Unnamed: 0,model,preprocessor,model_params,X,train_metrics,valid_metrics,train_info
0,CatBoostClassifier,,"{'n_estimators': 3000, 'learning_rate': 0.06, ...","Gender,Region_Code,Vehicle_Age,Policy_Sales_Ch...",0.92908±0.00041,0.89478±0.00018,{'result_proc': <function cb_learning_result a...


In [18]:
if not 'cb1' in stk.get_selected_model():
    m = stk.select_model('cb1')
    stk.save_model(files['model_pkl'])

In [14]:
from sklearn.linear_model import LogisticRegression

stk.eval_meta_model(LogisticRegression, {}, ['lgbm1', 'lr'])

([0.8553576621274659,
  0.855758232890083,
  0.8612539645685355,
  0.8577486686046295,
  0.8599255774437096],
 [0.8692264343151918,
  0.8673389398241469,
  0.8459082009377867,
  0.8578928984259936,
  0.8502713369511593],
 id
 351         0.206733
 11614       0.039473
 12284       0.044243
 12605       0.150068
 13517       0.050472
               ...   
 11455268    0.268146
 11471197    0.040254
 11480601    0.257491
 11480796    0.038037
 11503386    0.418166
 Length: 11505, dtype: float64,
 [{'variables': ['lgbm1', 'lr'],
   'train_shape': (9204, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['lgbm1', 'lr'],
   'train_shape': (9204, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['lgbm1', 'lr'],
   'train_shape': (9204, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['lgbm1', 'lr'],
   'train_shape': (9204, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['lgbm1', 'lr'],
   'train_shape': (9

# XGBooost

In [20]:
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [36]:
X_cat = ['Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Vintage', 'Annual_Premium_c']
X_num = ['Age']
X_xgb = X_cat + X_num
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), X_cat),
    ('pt', 'passthrough', X_num)
])
stk.eval_model_cv(ss, xgb.XGBClassifier, 
    {'n_estimators': 4000, 'learning_rate': 0.05, 'max_depth': 9, 'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'auc'}, 
    X_xgb, 
    preprocessor=ct,
    result_proc = sgml.cb_learning_result, 
    train_params = {
        'valid_splitter': lambda x: train_test_split(x, train_size=0.9, stratify=x[target], random_state=123),
        'valid_config_proc': sgml.gb_valid_config,
        'fit_params': {'verbose': 100}
    }
)

[0]	validation_0-auc:0.85704	validation_1-auc:0.85661
[100]	validation_0-auc:0.87016	validation_1-auc:0.86931
[200]	validation_0-auc:0.87500	validation_1-auc:0.87376
[300]	validation_0-auc:0.87738	validation_1-auc:0.87585
[400]	validation_0-auc:0.87888	validation_1-auc:0.87714
[500]	validation_0-auc:0.87997	validation_1-auc:0.87803
[600]	validation_0-auc:0.88088	validation_1-auc:0.87875
[700]	validation_0-auc:0.88164	validation_1-auc:0.87935
[800]	validation_0-auc:0.88233	validation_1-auc:0.87988
[900]	validation_0-auc:0.88292	validation_1-auc:0.88032
[1000]	validation_0-auc:0.88347	validation_1-auc:0.88070
[1100]	validation_0-auc:0.88398	validation_1-auc:0.88106
[1200]	validation_0-auc:0.88446	validation_1-auc:0.88139
[1300]	validation_0-auc:0.88490	validation_1-auc:0.88167
[1400]	validation_0-auc:0.88533	validation_1-auc:0.88195
[1500]	validation_0-auc:0.88572	validation_1-auc:0.88220
[1600]	validation_0-auc:0.88609	validation_1-auc:0.88242
[1700]	validation_0-auc:0.88645	validation_

({'model': xgboost.sklearn.XGBClassifier,
  'preprocessor': ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                   ['Gender', 'Region_Code', 'Vehicle_Age',
                                    'Policy_Sales_Channel', 'Driving_License',
                                    'Previously_Insured', 'Vehicle_Damage',
                                    'Vintage', 'Annual_Premium_c']),
                                  ('pt', 'passthrough', ['Age'])]),
  'model_param': {'n_estimators': 4000,
   'learning_rate': 0.05,
   'max_depth': 9,
   'tree_method': 'hist',
   'device': 'cuda',
   'eval_metric': 'auc'},
  'train_metrics': [0.891216451465306],
  'valid_metrics': [0.8857080638355445]},
 [(metric          auc             
   set    validation_0 validation_1
   0          0.857042     0.856610
   1          0.857768     0.857279
   2          0.858002     0.857370
   3          0.858272     0.857664
   4          0.858402     0.857815
