In [1]:
import os
import pandas as pd
import numpy as np
from functools import partial
import dproc, sgutil, sgnn, sgml

import seaborn as sns
import matplotlib.pyplot as plt

import shap

2024-12-05 07:34:51.716508: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733384091.813601   58205 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733384091.838255   58205 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 07:34:52.098142: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Root directory for every data artefact used by this notebook.
data_path = 'data'
# Logical name -> path on disk. Every file lives directly under `data_path`,
# so the directory is joined once here instead of being repeated per entry.
files = {
    key: os.path.join(data_path, filename)
    for key, filename in [
        ('train', 'train.csv'),
        ('test', 'test.csv'),
        ('org_train', 'train_org.csv'),
        ('org_test', 'test_org.csv'),
        ('train_parquet', 'train.parquet'),
        ('org_pkl', 'org.pkl'),
        ('test_parquet', 'test.parquet'),
        ('var_pkl', 'var.pkl'),
    ]
}
# Training frame, indexed by its `id` column.
df_train = pd.read_parquet(files['train_parquet']).set_index('id')
# Saved variable-processing definitions; resolved relative to `data_path`
# (previously a second hard-coded 'data' literal).
pd_procs = dproc.PD_Vars.load(os.path.join(data_path, 'vars'))
# Name of the label column used by the scoring and splitting helpers.
target = 'Response'
# procs_all returns a pair; only the first element (the processed frame) is kept.
df_train, _ = pd_procs.procs_all(df_train)
# Last expression: display the variable table held by the processor.
pd_procs.df_var

Unnamed: 0,Description,src,min,max,na,count,n_unique,dtype,f32,i32,i16,i8
Age,,,,,0.0,0.0,0.0,Int8,0.0,0.0,0.0,0.0
Age_S,Age의 범주형 변수,cat_proc,20.0,85.0,0.0,11504798.0,66.0,Categorical,,,,
Annual_Premium,,,,,0.0,0.0,0.0,Float32,0.0,0.0,0.0,0.0
Annual_Premium_S,Annual_Premium의 범주형 변수,cat_proc,2630.0,540165.0,0.0,11504798.0,43119.0,Categorical,,,,
Driving_License,,,,,0.0,0.0,0.0,Int8,0.0,0.0,0.0,0.0
Gender,,,,,0.0,0.0,0.0,Categorical(ordering='physical'),0.0,0.0,0.0,0.0
Policy_Sales_Channel,,,,,0.0,0.0,0.0,Categorical(ordering='physical'),0.0,0.0,0.0,0.0
Policy_Sales_Channel_S,Policy_Sales_Channel의 범주형 변수,cat_proc,,,0.0,11504798.0,141.0,Categorical,,,,
Previously_Insured,,,,,0.0,0.0,0.0,Categorical(ordering='physical'),0.0,0.0,0.0,0.0
Region_Code,,,,,0.0,0.0,0.0,Categorical(ordering='physical'),0.0,0.0,0.0,0.0


In [3]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import sgml

def get_validation_splitter(validation_fraction):
    """Return a function that splits a frame into train / validation parts.

    The returned callable takes a DataFrame `x`, holds out `validation_fraction`
    of it via `train_test_split`, and stratifies on the module-level `target` column.
    """
    def split(x):
        return train_test_split(
            x,
            test_size=validation_fraction,
            stratify=x[target],
        )

    return split

# Shared settings passed to every sgml.CVModel.load_or_create call in this notebook.
config = {
    # Positive-class probability (column 1 of predict_proba) as a Series carrying df's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict_proba(df[X])[:, 1], index=df.index),
    # ROC AUC; labels and predictions are both sorted by index first so they line up row-for-row.
    'score_func': lambda df, prds: roc_auc_score(df[target].sort_index(), prds.sort_index()),
    # A factory, not a splitter: called with a validation fraction, it returns the split function.
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    # Name of the label column.
    'y': target,
}

# One adapter per model family, each given the library's classifier class.
# NOTE(review): the first three pass the class as `model=`, the last one positionally —
# presumably equivalent; confirm against the sgml adapter signatures.
lgb_adapter = sgml.LGBMAdapter(model=lgb.LGBMClassifier)
xgb_adapter = sgml.XGBAdapter(model=xgb.XGBClassifier) 
cb_adapter = sgml.CBAdapter(model=cb.CatBoostClassifier)
lr_adapter = sgml.SklearnAdapter(LogisticRegression)


def print_metrics(title, m):
    """Print the mean and standard deviation of the scores in `m`, labelled with `title`.

    Both figures are formatted with five decimal places.
    """
    mean_score = np.mean(m)
    score_std = np.std(m)
    print(f"roc_auc, {title}: {mean_score:.5f}±{score_std:.5f}")

# Stratified 5-fold splitter handed to every CVModel.load_or_create call;
# shuffling uses a fixed seed so the folds are repeatable.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)
# Single stratified split with 80% of the rows for training; passed to the `.adhoc(...)` runs.
ss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=0.8,
    random_state=123,
)

In [4]:
import tensorflow as tf
import sgnn
from functools import partial

class ResponseModel(tf.keras.Model):
    """Sigmoid-output network over embedded categorical inputs plus an optional continuous block.

    Categorical inputs are embedded and concatenated. When `left_layers` is given, the
    concatenated embeddings are fed to two dense stacks ("left" and "right"). The first
    `f_size` outputs of each stack are multiplied element-wise; the remaining outputs of
    both stacks are passed through unchanged. The resulting pieces, plus the continuous
    input if configured, are concatenated, run through `top_model` and a sigmoid.

    Args:
        emb_cols: iterable of `(input_key, input_dim, output_dim)` triples, one per
            categorical input, or None to use no embeddings.
        cont_cols: key of the continuous-feature entry in the input mapping, or None.
        f_size: number of leading outputs of each stack that take part in the product.
        left_layers: list of `Dense` keyword dicts for the left stack, or None to feed the
            concatenated embeddings straight into the top model.
        right_layers: list of `Dense` keyword dicts for the right stack; only read when
            `left_layers` is given.
        top_layers: list of `Dense` keyword dicts for the top model.
        **argv: accepted and ignored.
    """

    def __init__(self, emb_cols, cont_cols, f_size, left_layers, right_layers, top_layers, **argv):
        super().__init__()
        if emb_cols is not None:
            # One embedding layer per categorical input, keyed (and named) by the input key.
            self.emb_layers = {
                v: tf.keras.layers.Embedding(c, s, name=v) for v, c, s in emb_cols
            }
            self.emb_cc = tf.keras.layers.Concatenate(axis=-1)
            if left_layers is not None:
                self.left_model = tf.keras.models.Sequential([
                    tf.keras.layers.Dense(**params) for params in left_layers
                ])
                self.right_model = tf.keras.models.Sequential([
                    tf.keras.layers.Dense(**params) for params in right_layers
                ])
                self.mul = tf.keras.layers.Multiply()
                # Each stack's output is cut at f_size: [:f_size] goes into the product,
                # [f_size:] is forwarded untouched.
                self.left_slice_layer_1 = tf.keras.layers.Lambda(lambda x: x[:, :f_size])
                self.left_slice_layer_2 = tf.keras.layers.Lambda(lambda x: x[:, f_size:])
                self.right_slice_layer_1 = tf.keras.layers.Lambda(lambda x: x[:, :f_size])
                self.right_slice_layer_2 = tf.keras.layers.Lambda(lambda x: x[:, f_size:])
            else:
                self.left_model = None
        else:
            self.emb_layers = None
            # Keep the attribute defined on every construction path.
            self.left_model = None
        self.top_cc = tf.keras.layers.Concatenate(axis=-1)
        self.top_model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(**params) for params in top_layers
        ])
        self.f_size = f_size
        self.cont_cols = cont_cols
        self.sigmoid = tf.keras.layers.Activation('sigmoid')

    # NOTE(review): this overrides `__call__` itself, so `tf.keras.Model.__call__` is never
    # invoked; Keras' documented extension point is `call`. Left as-is to preserve the
    # current behaviour — confirm whether that is intentional.
    def __call__(self, X):
        """Run the forward pass on a mapping of input tensors and return sigmoid outputs.

        Args:
            X: mapping indexed by the embedding keys and, if configured, by `cont_cols`.
                Each categorical entry is assumed to carry a length-1 axis before the
                embedding axis (it is squeezed out with `axis=-2`) — TODO confirm shapes.

        Raises:
            ValueError: if the model was built with neither embeddings nor continuous input.
        """
        cc_list = []
        if self.emb_layers is not None:
            X_emb = self.emb_cc([tf.squeeze(v(X[k]), axis=-2) for k, v in self.emb_layers.items()])
            if self.left_model is not None:
                X_left = self.left_model(X_emb)
                X_right = self.right_model(X_emb)
                X_left_1 = self.left_slice_layer_1(X_left)
                X_left_2 = self.left_slice_layer_2(X_left)

                X_right_1 = self.right_slice_layer_1(X_right)
                X_right_2 = self.right_slice_layer_2(X_right)
                X_mul = self.mul([X_left_1, X_right_1])
                cc_list = [X_left_2, X_right_2, X_mul]
            else:
                cc_list = [X_emb]
        if self.cont_cols is not None:
            cc_list.append(X[self.cont_cols])
        if not cc_list:
            # Fail with a clear message instead of an obscure error from Concatenate.
            raise ValueError("ResponseModel needs emb_cols and/or cont_cols to have any input")
        # Concatenate only when there is more than one piece to join.
        top_input = cc_list[0] if len(cc_list) == 1 else self.top_cc(cc_list)
        return self.sigmoid(self.top_model(top_input))

def to_tf_dataset(X, Y=None, sample_weights=None, cont=(), cat=()):
    """Wrap a feature matrix, with optional labels and weights, in a `tf.data.Dataset`.

    Columns of `X` are read by position: column `i` is the categorical feature `cat[i]`,
    and every column from `len(cat)` onwards forms the continuous block.

    Args:
        X: 2-D array supporting `X[:, i]` indexing (assumed to be a NumPy array — TODO confirm).
        Y: labels, or None to build a features-only dataset. A pandas Series is
            unwrapped to its underlying array.
        sample_weights: optional per-row weights, unwrapped like `Y`. Ignored when `Y` is None.
        cont: names of the continuous columns; only its length is inspected.
        cat: names of the categorical columns, in the column order of `X`.

    Returns:
        A dataset of `features`, `(features, Y)` or `(features, Y, sample_weights)`, where
        `features` maps each name in `cat` to an (n, 1) column and, when `cont` is
        non-empty, the key 'Continous' to the remaining columns.
    """
    features = {
        name: np.expand_dims(X[:, idx], axis=-1) for idx, name in enumerate(cat)
    }
    if len(cont) > 0:
        # Slice by the number of categorical columns rather than a leftover loop index,
        # so a dataset with continuous features only (empty `cat`) works as well.
        features['Continous'] = X[:, len(cat):]

    if Y is None:
        return tf.data.Dataset.from_tensor_slices(features)

    def _unwrap(values):
        # Series -> ndarray; anything else is handed to TensorFlow unchanged.
        return values.values if isinstance(values, pd.Series) else values

    if sample_weights is None:
        return tf.data.Dataset.from_tensor_slices((features, _unwrap(Y)))
    return tf.data.Dataset.from_tensor_slices(
        (features, _unwrap(Y), _unwrap(sample_weights))
    )

def nn_cat_param(df, name, size):
    """Describe one categorical column as `(name, number of categories, size)`.

    `df[name]` must be a categorical column; `size` is passed through unchanged.
    """
    n_categories = len(df[name].cat.categories)
    return (name, n_categories, size)

# Adapter for the Keras model. `to_tf_dataset` is pre-bound to fixed column lists: it reads
# the feature matrix by position, the `cat` columns first (in this order), then the rest as
# the continuous block; of `cont` it only inspects the length.
# NOTE(review): these lists repeat the `X_cat` / `X_mm` lists of the NN1 hparams cell and
# presumably have to stay in the same order — confirm how sgnn orders the columns it passes in.
nn_adapter = sgnn.NNAdapter(
    sgnn.NNClassifier, 
    to_tf_dataset=partial(to_tf_dataset, 
            # NOTE(review): the 'mm__' prefix presumably names the scaled `X_mm` columns — verify.
            cont=['mm__{}'.format(i) for i in ['Annual_Premium', 'Vintage', 'Age', 'Driving_License']], 
            cat= ['Annual_Premium_S', 'Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
                'Gender', 'VA_Age']
    )
)

# LR

In [5]:
# Logistic-regression baseline registered as 'LR' under 'result', using the shared
# 5-fold splitter, config and sklearn adapter. Going by the method name, an existing
# saved model is presumably reused — confirm in sgml.
lr = sgml.CVModel.load_or_create('result', 'LR', skf, config, lr_adapter)

In [6]:
# Hyper-parameters for the logistic-regression baseline.
# NOTE(review): the X_* keys look like feature groups per preprocessing step
# (tgt = target encoding, mm = min-max scaling, ohe = one-hot encoding) — inferred from
# the names only; confirm against sgml.
hparams = {
    'model_params': {},
    'X_tgt': ['Vintage_S', 'Annual_Premium_S'],
    'X_mm': ['Annual_Premium', 'Vintage'], 
    'X_ohe': ['Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'Driving_License'],
    # presumably options for the one-hot encoder — verify in sgml
    'ohe': {'drop': 'first', 'handle_unknown': 'ignore'},
    'tgt': {}
}
# NOTE(review): `progress_callback` is also set in the shared `config`; confirm the
# explicit argument here is needed.
result = lr.cv(df_train, hparams, result_proc=sgml.m_learning_result, progress_callback=sgml.ProgressCallBack())
# Displayed as (mean validation score, mean training score); the score is ROC AUC per `config`.
np.mean(result['valid_scores']), np.mean(result['train_scores'])

(0.8841227265739551, 0.8864103836873618)

# CB1

In [7]:
# First CatBoost experiment, registered as 'CB1' under 'result' with the shared splitter and config.
cb1 = sgml.CVModel.load_or_create('result', 'CB1', skf, config, cb_adapter)

In [19]:
# CB1 variant with an extra 'VP' feature under 'X_tgt' and a 0.15 learning rate.
# The ad-hoc run below is disabled, and in notebook order the next cell rebinds `hparams`,
# so this dictionary is not used by any training call visible here.
hparams = {
    'model_params': {'n_estimators': 2000, 'learning_rate': 0.15, 'max_depth': 9},
    'X_tgt': ['VP'],
    'X_num': ['Annual_Premium', 'Vintage', 'Age'], 
    'X_cat': ['Annual_Premium_S', 'Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'Driving_License', 'VA_Age'],
}

# Disabled single-split run on the 80/20 splitter `ss`:
# result = cb1.adhoc(df_train, ss, hparams, task_type='GPU')
# result['valid_scores'], result['train_scores']

In [9]:
# CB1 cross-validation settings: 2000 trees, learning rate 0.2, depth 9, no 'X_tgt' group.
hparams = {
    'model_params': {'n_estimators': 2000, 'learning_rate': 0.2, 'max_depth': 9},
    'X_num': ['Annual_Premium', 'Vintage', 'Age'], 
    'X_cat': ['Annual_Premium_S', 'Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'Driving_License', 'VA_Age'],
}

# NOTE(review): unlike the LR and NN cells, the adapter is passed again as a third positional
# argument — confirm this matches the `cv` signature. `task_type='GPU'` is CatBoost's GPU
# switch and is presumably forwarded to the model by sgml.
# The saved output shows this run was interrupted (KeyboardInterrupt), so `result` may be stale.
result = cb1.cv(df_train, hparams, cb_adapter, task_type='GPU')
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

# CB2

In [17]:
# Second CatBoost experiment, registered as 'CB2' under 'result' with the shared splitter and config.
cb2 = sgml.CVModel.load_or_create('result', 'CB2', skf, config, cb_adapter)

In [18]:
# CB2 settings: 1000 trees, learning rate 0.1, depth 9.
hparams = {
    'model_params': {'n_estimators': 1000, 'learning_rate': 0.1, 'max_depth': 9},
    'X_num': ['Annual_Premium', 'Vintage', 'Age'], 
    'X_cat': ['Annual_Premium_S', 'Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'Driving_License', 'VA_Age'],
    # presumably the fraction handed to config['validation_splitter'] — verify in sgml
    'validation_fraction': 0.1
}

# Single-split run on the 80/20 splitter `ss`, not the 5-fold CV.
result = cb2.adhoc(df_train, ss, hparams, task_type='GPU')
# Displayed as (validation scores, training scores), one entry per split.
result['valid_scores'], result['train_scores']

Fold:   0%|          | 0/1 [00:00<?, ?it/s]

([0.894539082173708], [0.9221860641104421])

In [24]:
# Index label at which the ('Logloss', 'validation') column of the first model's validation
# log is smallest. Reads whatever `result` was bound last, so it is only meaningful straight
# after the CB2 ad-hoc cell. The saved output (997 with n_estimators=1000) suggests the
# loss was still falling near the end of training.
result['model_result'][0]['valid_result'][('Logloss', 'validation')].idxmin()

997

# LGB1

In [None]:
# LightGBM experiment, registered as 'LGB1' under 'result' with the shared splitter and config.
lgb1 = sgml.CVModel.load_or_create('result', 'LGB1', skf, config, lgb_adapter)

In [None]:
# LGB1 settings: 1000 trees, learning rate 0.05, 255 leaves, plus the 'VP' feature under 'X_tgt'.
hparams = {
    'model_params': {'n_estimators': 1000, 'learning_rate': 0.05, 'num_leaves': 255},
    'X_tgt': ['VP'],
    'X_num': ['Annual_Premium', 'Vintage', 'Age'], 
    'X_cat': ['Annual_Premium_S', 'Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'Driving_License', 'VA_Age'],
}

# NOTE(review): `task_type` is a CatBoost parameter name; LightGBM selects the device with
# `device_type` (alias `device`). Confirm what sgml does with this keyword before relying
# on GPU training here. This cell has no saved execution count or output.
result = lgb1.cv(df_train, hparams, lgb_adapter, task_type='GPU')
result['valid_scores'], result['train_scores']

# XGB1

In [None]:
# XGBoost experiment, registered as 'XGB1' under 'result' with the shared splitter and config.
xgb1 = sgml.CVModel.load_or_create('result', 'XGB1', skf, config, xgb_adapter)

In [None]:
# XGB1 settings: 1000 trees, learning rate 0.05, depth 8. Three features go under 'X_tgt'
# and the remaining categoricals under 'X_ohe'.
hparams = {
    'model_params': {'n_estimators': 1000, 'learning_rate': 0.05, 'max_depth': 8},
    'X_tgt': ['VP', 'VA_Age', 'Annual_Premium_S'],
    'X_num': ['Annual_Premium', 'Vintage', 'Age'], 
    'X_ohe': ['Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'Driving_License'],
    # presumably options for the one-hot encoder — verify in sgml
    'ohe': {'drop': 'if_binary'}
}

# Run on the XGB1 model created for this section. Calling `lgb1.cv` here would attach the
# XGBoost run to the LightGBM experiment and leave `xgb1` unused.
result = xgb1.cv(df_train, hparams, xgb_adapter, device='cuda')
result['valid_scores'], result['train_scores']

# NN1

In [7]:
# Neural-network experiment, registered as 'NN1' under 'result' with the shared splitter,
# config and the Keras adapter.
nn = sgml.CVModel.load_or_create('result', 'NN1', skf, config, nn_adapter)

In [11]:
# (categorical column, embedding width) pairs; the category count is looked up from
# df_train by nn_cat_param when the hparams are built.
X_nn_emb = [
    ('Annual_Premium_S', 3), ('Vintage_S', 2), ('Vehicle_Damage', 1), ('Previously_Insured', 1), ('Policy_Sales_Channel_S', 2), ('Age_S', 2), ('Vehicle_Age', 1), ('Region_Code_S', 2), 
    ('Gender', 1), ('VA_Age', 4)
]
hparams = {
    'model_params': {
        'model': ResponseModel,
        # Keyword arguments for ResponseModel.__init__.
        'model_params': {
            'emb_cols': [
                nn_cat_param(df_train, v, n) for v, n in X_nn_emb
            ], 
            # Must equal the key written by to_tf_dataset, misspelling included.
            'cont_cols':  'Continous',
            # The first 16 outputs of each stack are multiplied; the rest are passed through.
            'f_size':  16, 
            # Left stack ends in 32 units: 16 go into the product, 16 are passed through.
            'left_layers': [
                {'units': 32, 'activation': 'relu', 'kernel_initializer': 'he_uniform'}, 
                {'units': 32, 'activation': 'relu', 'kernel_initializer': 'he_uniform'}, 
                {'units': 32, 'kernel_initializer': 'glorot_uniform'}]
            , 
            # NOTE(review): the right stack ends in 16 units == f_size, so its pass-through
            # slice (x[:, f_size:]) has zero width — confirm this asymmetry is intended.
            'right_layers': [
                {'units': 32, 'activation': 'relu', 'kernel_initializer': 'he_uniform'}, 
                {'units': 32, 'activation': 'relu', 'kernel_initializer': 'he_uniform'}, 
                {'units': 16, 'kernel_initializer': 'glorot_uniform'}
            ], 
            # The final layer has one unit and no activation; ResponseModel applies the sigmoid.
            'top_layers': [
                {'units': 32, 'activation': 'relu', 'kernel_initializer': 'he_uniform'}, 
                {'units': 16, 'activation': 'relu', 'kernel_initializer': 'he_uniform'}, 
                {'units': 1, 'kernel_initializer': 'glorot_uniform'}
            ]
        },
        'batch_size': 512,
        'shuffle_size': 204800,
        'optimizer': ('Adam', {'learning_rate': 0.0005}),
        'epochs': 5
    },
    # NOTE(review): these two lists repeat the `cont` / `cat` lists bound into nn_adapter and
    # presumably must stay in the same order — confirm how sgnn orders the columns.
    'X_mm': ['Annual_Premium', 'Vintage', 'Age', 'Driving_License'], 
    'X_cat': ['Annual_Premium_S', 'Vintage_S', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel_S', 'Age_S', 'Vehicle_Age', 'Region_Code_S', 
              'Gender', 'VA_Age'],
}
result = nn.cv(df_train, hparams, progress_callback=sgml.ProgressCallBack())
# Displayed as (mean validation score, mean training score).
np.mean(result['valid_scores']), np.mean(result['train_scores'])

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Step:   0%|          | 0/17977 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Step:   0%|          | 0/17977 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Step:   0%|          | 0/17977 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Step:   0%|          | 0/17977 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Step:   0%|          | 0/17977 [00:00<?, ?it/s]

(0.8906805650521935, 0.8994111951152824)

In [10]:
# Per-fold validation scores of whichever run bound `result` last (the NN1 CV when the
# cells are run in notebook order).
result['valid_scores']

[0.8904762718319137,
 0.8905780191107577,
 0.8906179017310272,
 0.8906771654051973,
 0.890385738839719]