In [None]:
# Boost models

In [13]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

df = pd.read_parquet('./data/sub_train.parquet')

target_name = df.columns[-1]

# X = df.drop([target_name,'id'],axis=1)
X = df.drop([target_name],axis=1)

y = df[target_name]

df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6926847,Male,21,1,8,1,< 1 Year,No,43872,160,106,0
2606866,Male,50,1,28,0,1-2 Year,Yes,40378,26,281,0
9356482,Female,24,1,29,1,< 1 Year,No,43801,152,165,0
11367445,Male,71,1,28,1,1-2 Year,No,2630,26,197,0
6003615,Male,36,1,45,0,1-2 Year,Yes,24647,124,126,0


In [14]:
cat_feat = X.select_dtypes(include=['category']).columns
num_feat = X.select_dtypes(include=['int8','int16','int32']).columns

In [15]:
from sklearn.model_selection import train_test_split

# We need to train on smaller set for speed
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=100_000, random_state=7)

In [16]:
# Define models to test
metric = 'auc'
models = {
    # 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    "AdaBoost": AdaBoostClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(verbose=0),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        ('cat', OrdinalEncoder(), cat_feat)
    ])

# Iterate over models to create pipelines
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Preprocess the data
        ('classifier', model)            # Classifier
    ])
    
    # Perform cross-validation
    # with parallel_backend('loky', n_jobs=-1):
    y_pred = cross_val_predict(pipeline, X_train, y_train, cv=5)
    metric = 'roc_auc'
    accuracy = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=metric)
    f1 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_weighted')
    
    # Print metrics
    print(f'=== {name} ===')
    print(f'{metric}: {np.mean(accuracy):.4f} (+/- {np.std(accuracy):.4f})')
    print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
    print(classification_report(y_train, y_pred))
    print('\n' + '='*30 + '\n')



=== AdaBoost ===
roc_auc: 0.8587 (+/- 0.0023)
F1 Score: 0.7976 (+/- 0.0022)
              precision    recall  f1-score   support

           0       0.91      0.67      0.77     50166
           1       0.74      0.94      0.82     49834

    accuracy                           0.80    100000
   macro avg       0.82      0.80      0.80    100000
weighted avg       0.82      0.80      0.80    100000



=== GradientBoosting ===
roc_auc: 0.8636 (+/- 0.0021)
F1 Score: 0.7993 (+/- 0.0016)
              precision    recall  f1-score   support

           0       0.90      0.68      0.77     50166
           1       0.74      0.93      0.82     49834

    accuracy                           0.80    100000
   macro avg       0.82      0.80      0.80    100000
weighted avg       0.82      0.80      0.80    100000



=== XGBoost ===
roc_auc: 0.8691 (+/- 0.0017)
F1 Score: 0.8035 (+/- 0.0025)
              precision    recall  f1-score   support

           0       0.89      0.70      0.78     5016