In [1]:
import os, sys
import pandas as pd
import numpy as np
import sklearn
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import matplotlib as mpl
import matplotlib.pyplot as plt

import sgml, sgutil

# Record the interpreter and library versions this run was produced with.
print(sys.version)
for lib in (pd, np, sklearn, lgb, xgb, cb, mpl):
    print(lib.__name__, lib.__version__)

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
pandas 2.2.3
numpy 1.26.4
sklearn 1.5.2
lightgbm 4.3.0
xgboost 2.1.1
catboost 1.2.5
matplotlib 3.8.4


# 초기화

In [2]:
# Directory names used by the notebook: input data, model artifacts, images.
data_path = 'data'
result_path = 'result'
img_path = 'img'

# makedirs(exist_ok=True) does nothing when the directory is already there and
# avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(data_path, exist_ok=True)

# Logical name -> file path lookup, built from (key, directory, file name) triples.
files = {
    k: os.path.join(p, f)
    for k, p, f in [
        ('train', data_path, 'train.csv'),
        ('test', data_path, 'test.csv'),
        ('org_train', data_path, 'train_org.csv'),
        ('org_test', data_path, 'test_org.csv'),
        ('train_pkl', data_path, 'train.pkl'),
        ('org_pkl', data_path, 'org.pkl'),
        ('test_pkl', data_path, 'test.pkl'),
        ('var_pkl', data_path, 'var.pkl'),
        ('model_pkl', result_path, 'stk_s4_ep7.pkl'),
        ('model1_pkl', result_path, 'stk1_s4_ep7.pkl'),
        ('model2_pkl', result_path, 'stk2_s4_ep7.pkl'),
    ]
}
# Project-local SGCache object built from the image and result directories
# (presumably a cache for figures and results -- confirm in sgutil).
sc = sgutil.SGCache(img_path, result_path)

In [3]:
# Load the pickled train frame, test frame and variable-description table,
# in that order.
df_train, df_test, df_var = (
    pd.read_pickle(files[key]) for key in ('train_pkl', 'test_pkl', 'var_pkl')
)

In [4]:
# Display the variable-description table.
df_var

Unnamed: 0,type,Description,n_unique,src,dtype
Age,(continous),Age of the Customer.,66.0,org,Int8
Gender,(dichotomous),Gender of the Customer.,2.0,org,Categorical
Driving_License,(dichotomous),"0 for customer not having DL, 1 for customer h...",2.0,org,Int8
Region_Code,(nominal),Unique code for the region of the customer.,53.25,org,Categorical
Previously_Insured,(dichotomous),"0 for customer not having vehicle insurance, 1...",2.0,org,Categorical
Vehicle_Age,(nominal),Age of the vehicle.,3.0,org,Categorical
Vehicle_Damage,(dichotomous),Customer got his/her vehicle damaged in the pa...,2.0,org,Categorical
Annual_Premium,(continous),The amount customer needs to pay as premium in...,46734.25,org,Float32
Policy_Sales_Channel,(nominal),Anonymized Code for the channel of outreaching...,150.75,org,Categorical
Vintage,(continous),"Number of Days, Customer has been associated w...",290.0,org,Int16


In [5]:
# Remove the low-frequency categories: Policy_Sales_Channel 5.0, 6.0, 33.0
# and Region_Code 39.2.
rare_channel = df_train['Policy_Sales_Channel'].isin(['5.0', '6.0', '33.0'])
rare_region = df_train['Region_Code'] == '39.2'
df_train.drop(index=df_train.index[rare_channel | rare_region], inplace=True)

In [6]:
target = 'Response'

# Categorical features: variables that df_var labels with dtype 'Categorical'.
X_cat = df_var.loc[df_var['dtype'] == 'Categorical'].index.tolist()

# Boolean features: two-valued variables that are not categorical, plus the
# flag 'is_Annual_Premium_mode'.
# The earlier mask tested df_var['dtype'].isin(X_cat), which compares dtype
# labels against column names and therefore never excluded anything; the
# exclusion is now done on the variable names (the index).
is_binary = df_var['n_unique'] == 2
is_categorical = df_var.index.isin(X_cat)
X_bool = df_var.loc[is_binary & ~is_categorical].index.tolist() + ['is_Annual_Premium_mode']
# dict.fromkeys keeps first-seen order while removing a duplicate of the
# appended flag should df_var also list it as two-valued.
X_bool = list(dict.fromkeys(c for c in X_bool if c != target and c not in X_cat))

# Numeric features: everything left in df_var except the id and target rows.
X_num = df_var.loc[~df_var.index.isin(X_cat + X_bool + ['id', 'Response (Dependent Feature)'])].index.tolist()

# Every column available at prediction time.
X_all = df_test.columns.tolist()

print("Target:", target)
print("Categorical:", np.array(X_cat))
print("Boolean:", np.array(X_bool))
print("Number:", np.array(X_num))

Target: Response
Categorical: ['Gender' 'Region_Code' 'Previously_Insured' 'Vehicle_Age'
 'Vehicle_Damage' 'Policy_Sales_Channel']
Boolean: ['Driving_License' 'is_Annual_Premium_mode']
Number: ['Age' 'Annual_Premium' 'Vintage' 'Previously_Insured_Vehicle_Damage'
 'log_Annual_Premium']


In [7]:
# Display the test feature frame.
df_test

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,is_Annual_Premium_mode,Previously_Insured_Vehicle_Damage,log_Annual_Premium
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228,1,0.0,10.511115
11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123,0,37483.0,10.531644
11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271,1,2630.0,10.511115
11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115,0,0.0,10.106511
11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148,0,0.0,10.437493
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19174659,Male,57,1,28.0,0,1-2 Year,Yes,51661.0,124.0,109,0,51661.0,10.852459
19174660,Male,28,1,50.0,1,< 1 Year,No,25651.0,152.0,184,0,0.0,10.152339
19174661,Male,47,1,33.0,1,1-2 Year,No,2630.0,138.0,63,1,0.0,10.511115
19174662,Male,30,1,28.0,0,< 1 Year,Yes,38866.0,124.0,119,0,38866.0,10.567876


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

# Shuffled 5-fold stratified CV and a single stratified split with 80% of the
# rows in the training part; both use the same fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
ss = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=123)

def predict(m, df_valid, X):
    """Return the second `predict_proba` column of `m` as a Series.

    Args:
        m: fitted estimator exposing `predict_proba`.
        df_valid: frame holding the rows to score.
        X: column selection passed to `df_valid[...]`.

    Returns:
        pd.Series of `predict_proba(...)[:, 1]`, indexed like `df_valid`.
    """
    proba = m.predict_proba(df_valid[X])
    second_column = proba[:, 1]
    return pd.Series(second_column, index=df_valid.index)

def eval_metric(y_true, prds):
    """ROC AUC of `prds` against the `target` column of `y_true`.

    Both inputs are sorted by index first so that labels and scores line up.
    """
    labels = y_true[target].sort_index()
    scores = prds.sort_index()
    return roc_auc_score(labels, scores)

def print_metrics(m):
    """Print mean±std of the validation and training metrics stored in `m`.

    Args:
        m: mapping with 'valid_metrics' and 'train_metrics' sequences.
    """
    stats = []
    for key in ('valid_metrics', 'train_metrics'):
        stats.append(np.mean(m[key]))
        stats.append(np.std(m[key]))
    print("Valid. Score: {:.5f}±{:.5f}, Train Score: {:.5f}±{:.5f}".format(*stats))

In [9]:
# Start a fresh stacking object when no saved one exists; otherwise resume
# from the saved file.
# NOTE(review): load_model presumably unpickles the file -- only load
# artifacts you produced yourself.
if not os.path.isfile(files['model_pkl']):
    stk = sgml.SGStacking(df_train, target, cv, predict, eval_metric, greater_better=True)
else:
    stk = sgml.SGStacking.load_model(files['model_pkl'])

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

## Logistic Regression

In [10]:
# Preprocessing for the logistic-regression baseline: one-hot encode the
# categorical columns, min-max scale the log premium, target-encode Vintage
# and Age, and pass the two binary flags through unchanged.
ct = ColumnTransformer(transformers=[
    (
        'ohe',
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        ['Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Vehicle_Damage', 'Previously_Insured'],
    ),
    ('mm', MinMaxScaler(), ['log_Annual_Premium']),
    ('tgt', TargetEncoder(), ['Vintage', 'Age']),
    ('pt', 'passthrough', ['Driving_License', 'is_Annual_Premium_mode']),
])
m, train_result = stk.eval_model('lr', LogisticRegression, {'C': 100}, X_all, ct)
print_metrics(m)

Valid. Score: 0.86990±0.00027, Train Score: 0.86996±0.00009


In [11]:
# Select 'lr' and persist the stacking object, unless it is already selected.
if 'lr' not in stk.get_selected_model():
    m = stk.select_model('lr')
    stk.save_model(files['model_pkl'])

## LightGBM

### LGBM1

In [12]:
# LGBM1: every feature except Age is handed to LightGBM as categorical
# (this includes Annual_Premium and Vintage).
X_cat = [
    'Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License',
    'Previously_Insured', 'Vehicle_Damage', 'Annual_Premium', 'Vintage',
]
X_num = ['Age']
X_lgb = [*X_cat, *X_num]
stk.eval_model(
    'lgbm1',
    lgb.LGBMClassifier,
    {
        'verbose': -1,
        'n_estimators': 3000,
        'learning_rate': 0.03,
        'num_leaves': 127,
        'min_child_samples': 32,
        'colsample_bytree': 0.5,
    },
    X_lgb,
    result_proc=sgml.lgb_learning_result,
    train_params={'fit_params': {'categorical_feature': X_cat}},
)

({'model': lightgbm.sklearn.LGBMClassifier,
  'preprocessor': None,
  'model_param': {'verbose': -1,
   'n_estimators': 3000,
   'learning_rate': 0.03,
   'num_leaves': 127,
   'min_child_samples': 32,
   'colsample_bytree': 0.5},
  'train_metrics': [0.9100471725961341,
   0.9102638426335096,
   0.9102944181120641,
   0.9103928759368277,
   0.9101980245874196],
  'valid_metrics': [0.8914266094417738,
   0.8911112790196013,
   0.8915992217565045,
   0.8913563222047797,
   0.8914712584687462]},
 None)

In [13]:
# Select 'lgbm1' and persist the stacking object, unless it is already selected.
if 'lgbm1' not in stk.get_selected_model():
    m = stk.select_model('lgbm1')
    stk.save_model(files['model_pkl'])

In [14]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.pipeline import make_pipeline

In [None]:
# LightGBM variant: Annual_Premium_c is target-encoded and then quantile-binned
# into ordinal codes; the remaining columns pass through unchanged.
X_cat = [
    'Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License',
    'Previously_Insured', 'Vehicle_Damage', 'Vintage',
]
X_num = ['Age']
X_tgt = ['Annual_Premium_c']
X_lgb = [*X_cat, *X_num, *X_tgt]
ct = ColumnTransformer(transformers=[
    ('pt', 'passthrough', X_cat + X_num),
    (
        'tgt',
        make_pipeline(
            TargetEncoder(),
            KBinsDiscretizer(n_bins=255, encode='ordinal', strategy='quantile'),
        ),
        X_tgt,
    ),
]).set_output(transform="pandas")

stk.eval_model_cv(
    ss,
    lgb.LGBMClassifier,
    {
        'verbose': -1,
        'n_estimators': 3000,
        'learning_rate': 0.02,
        'num_leaves': 511,
        'min_child_samples': 32,
        'max_bin': 255,
    },
    X_lgb,
    preprocessor=ct,
    result_proc=sgml.lgb_learning_result,
    # The transformer prefixes output columns with the step name, so the
    # categorical columns are referenced as 'pt__<name>'.
    train_params={'fit_params': {'categorical_feature': ['pt__' + col for col in X_cat]}},
)

In [None]:
# Evaluate a smaller-leaf LightGBM configuration with the `ss` splitter; every
# feature except Age is passed as categorical.
X_cat = [
    'Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License',
    'Previously_Insured', 'Vehicle_Damage', 'Annual_Premium', 'Vintage',
]
X_num = ['Age']
X_lgb = [*X_cat, *X_num]
stk.eval_model_cv(
    ss,
    lgb.LGBMClassifier,
    {
        'verbose': -1,
        'n_estimators': 3000,
        'learning_rate': 0.03,
        'num_leaves': 96,
        'min_child_samples': 64,
        'colsample_bytree': 0.5,
    },
    X_lgb,
    result_proc=sgml.lgb_learning_result,
    train_params={'fit_params': {'categorical_feature': X_cat}},
)

In [16]:
# Show the results recorded for the 'lgbm1' model.
stk.get_model_results('lgbm1')

Unnamed: 0,model,preprocessor,model_params,X,train_metrics,valid_metrics,train_info
0,LGBMClassifier,,"{'verbose': -1, 'n_estimators': 3000, 'learnin...","Gender,Region_Code,Vehicle_Age,Policy_Sales_Ch...",0.91024±0.00011,0.89139±0.00016,{'result_proc': <function lgb_learning_result ...


# CatBoost

In [14]:
import catboost as cb
from sklearn.model_selection import train_test_split

In [15]:
def m_learning_result(m, train_result):
    """Return the model and its training result unchanged, as a 2-tuple."""
    passthrough = (m, train_result)
    return passthrough

In [16]:
# CB1: all features, including Age, are passed to CatBoost as categorical.
X_cat = [
    'Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License',
    'Previously_Insured', 'Vehicle_Damage', 'Annual_Premium_c', 'Vintage', 'Age',
]
X_num = []  # alternative: ['Age']
X_cb = [*X_cat, *X_num]
result = stk.eval_model(
    'cb1',
    cb.CatBoostClassifier,
    {
        'n_estimators': 3000,
        'learning_rate': 0.06,
        'max_depth': 9,
        'task_type': 'GPU',
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'verbose': False,
    },
    X_cb,
    result_proc=sgml.cb_learning_result,
    train_params={
        'fit_params': {'cat_features': X_cat, 'verbose': 500},
    },
)
result

({'model': catboost.core.CatBoostClassifier,
  'preprocessor': None,
  'model_param': {'n_estimators': 3000,
   'learning_rate': 0.06,
   'max_depth': 9,
   'task_type': 'GPU',
   'loss_function': 'Logloss',
   'eval_metric': 'AUC',
   'verbose': False},
  'train_metrics': [0.9287528528571969,
   0.9292573693743261,
   0.9286679661215391,
   0.9297874580132984,
   0.9289578693902849],
  'valid_metrics': [0.8948233787575748,
   0.8945409478969022,
   0.8948517263507246,
   0.8946383469231585,
   0.8950575702216534]},
 None)

In [17]:
# Show the results recorded for the 'cb1' model.
stk.get_model_results('cb1')

Unnamed: 0,model,preprocessor,model_params,X,train_metrics,valid_metrics,train_info
0,CatBoostClassifier,,"{'n_estimators': 3000, 'learning_rate': 0.06, ...","Gender,Region_Code,Vehicle_Age,Policy_Sales_Ch...",0.92908±0.00041,0.89478±0.00018,{'result_proc': <function cb_learning_result a...


In [18]:
# Select 'cb1' and persist the stacking object, unless it is already selected.
if 'cb1' not in stk.get_selected_model():
    m = stk.select_model('cb1')
    stk.save_model(files['model_pkl'])

In [19]:
from sklearn.linear_model import LogisticRegression

# Evaluate a LogisticRegression meta-model (no extra hyperparameters) over the
# 'cb1' and 'lgbm1' base models.
stk.eval_meta_model(LogisticRegression, {}, ['cb1', 'lgbm1'])

([0.8949176042818464,
  0.8949892040887373,
  0.8949015303650533,
  0.8949520808429793,
  0.8948659304076494],
 [0.8949551467440692,
  0.8946705131634524,
  0.895033416750021,
  0.8948215248038708,
  0.895146839514495],
 id
 1           0.461058
 15          0.071781
 20          0.026933
 31          0.099489
 35          0.027781
               ...   
 11504765    0.026887
 11504767    0.026890
 11504772    0.235799
 11504780    0.027099
 11504795    0.026919
 Length: 11504790, dtype: float64,
 [{'variables': ['cb1', 'lgbm1'],
   'train_shape': (9203832, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['cb1', 'lgbm1'],
   'train_shape': (9203832, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['cb1', 'lgbm1'],
   'train_shape': (9203832, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['cb1', 'lgbm1'],
   'train_shape': (9203832, 2),
   'target': 'Response',
   'target_func': None},
  {'variables': ['cb1', 'lgbm1'],
 

# XGBoost

In [26]:
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [21]:
# XGBoost on one-hot encoded categoricals; Age and Annual_Premium stay numeric.
X_cat = ['Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Vintage']
X_num = ['Age', 'Annual_Premium']
X_xgb = X_cat + X_num
ct = ColumnTransformer([
    # handle_unknown='ignore' encodes a category missing from the fitting split
    # as all zeros instead of raising (the default is 'error'); this matches
    # the encoder used by the other XGBoost experiment in this notebook.
    ('ohe', OneHotEncoder(handle_unknown='ignore'), X_cat),
    ('pt', 'passthrough', X_num)
])
stk.eval_model_cv(ss, xgb.XGBClassifier, 
    {'n_estimators': 3000, 'learning_rate': 0.1, 'max_depth': 9, 'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'auc'}, 
    X_xgb, 
    preprocessor=ct,
    result_proc = sgml.cb_learning_result, 
    train_params = {
        # Splits the rows it receives 90/10, stratified on the target.
        'valid_splitter': lambda x: train_test_split(x, train_size=0.9, stratify=x[target], random_state=123),
        'valid_config_proc': sgml.gb_valid_config,
        'fit_params': {'verbose': 100}
    }
)

[0]	validation_0-auc:0.85707	validation_1-auc:0.85663
[100]	validation_0-auc:0.87579	validation_1-auc:0.87417
[200]	validation_0-auc:0.88001	validation_1-auc:0.87723
[300]	validation_0-auc:0.88221	validation_1-auc:0.87843
[400]	validation_0-auc:0.88391	validation_1-auc:0.87923
[500]	validation_0-auc:0.88516	validation_1-auc:0.87963
[600]	validation_0-auc:0.88622	validation_1-auc:0.87991
[700]	validation_0-auc:0.88719	validation_1-auc:0.88021
[800]	validation_0-auc:0.88808	validation_1-auc:0.88046
[900]	validation_0-auc:0.88888	validation_1-auc:0.88063
[1000]	validation_0-auc:0.88960	validation_1-auc:0.88075
[1100]	validation_0-auc:0.89037	validation_1-auc:0.88089
[1200]	validation_0-auc:0.89094	validation_1-auc:0.88094
[1300]	validation_0-auc:0.89156	validation_1-auc:0.88098
[1400]	validation_0-auc:0.89217	validation_1-auc:0.88103
[1500]	validation_0-auc:0.89274	validation_1-auc:0.88107
[1600]	validation_0-auc:0.89324	validation_1-auc:0.88107
[1700]	validation_0-auc:0.89384	validation_

KeyboardInterrupt: 

In [36]:
# XGBoost variant: Annual_Premium_c joins the one-hot encoded categoricals and
# Age is the only numeric passthrough column.
X_cat = [
    'Gender', 'Region_Code', 'Vehicle_Age', 'Policy_Sales_Channel', 'Driving_License',
    'Previously_Insured', 'Vehicle_Damage', 'Vintage', 'Annual_Premium_c',
]
X_num = ['Age']
X_xgb = [*X_cat, *X_num]
ct = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), X_cat),
    ('pt', 'passthrough', X_num),
])
stk.eval_model_cv(
    ss,
    xgb.XGBClassifier,
    {
        'n_estimators': 4000,
        'learning_rate': 0.05,
        'max_depth': 9,
        'tree_method': 'hist',
        'device': 'cuda',
        'eval_metric': 'auc',
    },
    X_xgb,
    preprocessor=ct,
    result_proc=sgml.cb_learning_result,
    train_params={
        # Splits the rows it receives 90/10, stratified on the target.
        'valid_splitter': lambda df: train_test_split(
            df, train_size=0.9, stratify=df[target], random_state=123
        ),
        'valid_config_proc': sgml.gb_valid_config,
        'fit_params': {'verbose': 100},
    },
)

[0]	validation_0-auc:0.85704	validation_1-auc:0.85661
[100]	validation_0-auc:0.87016	validation_1-auc:0.86931
[200]	validation_0-auc:0.87500	validation_1-auc:0.87376
[300]	validation_0-auc:0.87738	validation_1-auc:0.87585
[400]	validation_0-auc:0.87888	validation_1-auc:0.87714
[500]	validation_0-auc:0.87997	validation_1-auc:0.87803
[600]	validation_0-auc:0.88088	validation_1-auc:0.87875
[700]	validation_0-auc:0.88164	validation_1-auc:0.87935
[800]	validation_0-auc:0.88233	validation_1-auc:0.87988
[900]	validation_0-auc:0.88292	validation_1-auc:0.88032
[1000]	validation_0-auc:0.88347	validation_1-auc:0.88070
[1100]	validation_0-auc:0.88398	validation_1-auc:0.88106
[1200]	validation_0-auc:0.88446	validation_1-auc:0.88139
[1300]	validation_0-auc:0.88490	validation_1-auc:0.88167
[1400]	validation_0-auc:0.88533	validation_1-auc:0.88195
[1500]	validation_0-auc:0.88572	validation_1-auc:0.88220
[1600]	validation_0-auc:0.88609	validation_1-auc:0.88242
[1700]	validation_0-auc:0.88645	validation_

({'model': xgboost.sklearn.XGBClassifier,
  'preprocessor': ColumnTransformer(transformers=[('ohe', OneHotEncoder(handle_unknown='ignore'),
                                   ['Gender', 'Region_Code', 'Vehicle_Age',
                                    'Policy_Sales_Channel', 'Driving_License',
                                    'Previously_Insured', 'Vehicle_Damage',
                                    'Vintage', 'Annual_Premium_c']),
                                  ('pt', 'passthrough', ['Age'])]),
  'model_param': {'n_estimators': 4000,
   'learning_rate': 0.05,
   'max_depth': 9,
   'tree_method': 'hist',
   'device': 'cuda',
   'eval_metric': 'auc'},
  'train_metrics': [0.891216451465306],
  'valid_metrics': [0.8857080638355445]},
 [(metric          auc             
   set    validation_0 validation_1
   0          0.857042     0.856610
   1          0.857768     0.857279
   2          0.858002     0.857370
   3          0.858272     0.857664
   4          0.858402     0.857815
