In [1]:
# Base Models to test
import numpy as np
import pandas as pd

# CPU opt
from joblib import parallel_backend

from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Data
# Active source is the parquet sub-sample. Alternative CSV sources kept for
# reference: './data/train.csv' and './data/sub_train.csv'.
df = pd.read_parquet('./data/sub_train.parquet')

# The target is taken to be the last column of the frame.
target_name = df.columns[-1]

# Features are every column except the target.
# NOTE(review): a commented-out variant also dropped an 'id' column; that is
# presumably needed when 'id' is a regular column rather than the index — confirm
# before switching data source.
X = df.drop(columns=[target_name])
y = df[target_name]

# Preview the first rows of the loaded frame.
df.head()

  from pandas.core import (


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6926847,Male,21,1,8,1,< 1 Year,No,43872,160,106,0
2606866,Male,50,1,28,0,1-2 Year,Yes,40378,26,281,0
9356482,Female,24,1,29,1,< 1 Year,No,43801,152,165,0
11367445,Male,71,1,28,1,1-2 Year,No,2630,26,197,0
6003615,Male,36,1,45,0,1-2 Year,Yes,24647,124,126,0


In [2]:
# Show the feature columns left in X after the target column was removed.
X.columns

Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage'],
      dtype='object')

In [3]:
# Print the index range, per-column dtypes and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2830118 entries, 6926847 to 11504796
Data columns (total 11 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
 10  Response              int8    
dtypes: category(3), int16(2), int32(1), int8(5)
memory usage: 64.8 MB


In [4]:
# Count rows per target value to check the class balance.
df[target_name].value_counts()

Response
0    1415059
1    1415059
Name: count, dtype: int64

In [8]:
# We need to train on smaller set for speed.
# `train_test_split` is imported in the notebook's import cell so that all
# dependencies are visible in one place.
TRAIN_SIZE = 100_000  # absolute number of rows kept for model comparison
SPLIT_SEED = 7        # fixed seed so the same subset is drawn on every run

# An integer train_size is an absolute row count; all remaining rows end up in
# X_test / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, random_state=SPLIT_SEED
)

In [9]:
# Split the feature columns into categorical and numeric groups by dtype.
cat_feat = X.select_dtypes(include=['category']).columns
# Select every numeric dtype rather than a hard-coded list of integer widths,
# so columns stored as e.g. int64 or float64 are not missed.
num_feat = X.select_dtypes(include='number').columns

# A column that belongs to neither group would be silently discarded by a
# ColumnTransformer with its default remainder='drop', so fail loudly instead.
unassigned = X.columns.difference(cat_feat.union(num_feat))
assert unassigned.empty, f'Columns not assigned to a feature group: {list(unassigned)}'

In [10]:
# Confirm the (rows, columns) size of the training subset.
X_train.shape

(100000, 10)

In [11]:
# Define models to test
RANDOM_STATE = 7  # fixed seed so the randomized models give repeatable scores
N_SPLITS = 5      # number of cross-validation folds

models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB()
}


# Define the preprocessing for numeric and categorical features
# NOTE(review): OrdinalEncoder with default categories presumably assigns codes
# in sorted category order, which may not match the natural order of a column
# such as Vehicle_Age — consider passing explicit `categories=`; confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        ('cat', OrdinalEncoder(), cat_feat)
    ])

# One explicit, unshuffled splitter shared by scoring and prediction so that
# both operate on exactly the same folds.
cv = StratifiedKFold(n_splits=N_SPLITS)
metric = 'roc_auc'

# Iterate over models to create pipelines
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),  # Preprocess the data
        ('classifier', model)            # Classifier
    ])

    # Perform cross-validation ONCE per model: both metrics are scored in a
    # single pass and the fitted fold estimators are kept, instead of refitting
    # every fold three times (predictions, roc_auc, f1).
    # Pass n_jobs=-1 here to fit the folds in parallel if memory allows.
    cv_results = cross_validate(
        pipeline, X_train, y_train,
        cv=cv,
        scoring=[metric, 'f1_weighted'],
        return_estimator=True,
    )
    auc_scores = cv_results[f'test_{metric}']
    f1 = cv_results['test_f1_weighted']

    # Rebuild out-of-fold predictions from the already-fitted estimators.
    # `cv` is deterministic, so re-splitting yields the folds used above, and
    # the estimators are returned in split order.
    y_pred = np.empty(len(y_train), dtype=y_train.dtype)
    fold_indices = cv.split(X_train, y_train)
    for fitted_pipeline, (_, test_idx) in zip(cv_results['estimator'], fold_indices):
        y_pred[test_idx] = fitted_pipeline.predict(X_train.iloc[test_idx])

    # Print metrics
    print(f'=== {name} ===')
    print(f'{metric}: {np.mean(auc_scores):.4f} (+/- {np.std(auc_scores):.4f})')
    print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
    print(classification_report(y_train, y_pred))
    print('\n' + '='*30 + '\n')

=== Logistic Regression ===
roc_auc: 0.8236 (+/- 0.0025)
F1 Score: 0.7785 (+/- 0.0015)
              precision    recall  f1-score   support

           0       0.97      0.59      0.74     50166
           1       0.71      0.98      0.82     49834

    accuracy                           0.79    100000
   macro avg       0.84      0.79      0.78    100000
weighted avg       0.84      0.79      0.78    100000



=== Random Forest ===
roc_auc: 0.8482 (+/- 0.0018)
F1 Score: 0.7882 (+/- 0.0022)
              precision    recall  f1-score   support

           0       0.86      0.70      0.77     50166
           1       0.74      0.88      0.81     49834

    accuracy                           0.79    100000
   macro avg       0.80      0.79      0.79    100000
weighted avg       0.80      0.79      0.79    100000



=== SVM ===
roc_auc: 0.8424 (+/- 0.0021)
F1 Score: 0.7940 (+/- 0.0018)
              precision    recall  f1-score   support

           0       0.92      0.66      0.77     