In [None]:
# Catboost optimization

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import classification_report, accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer

df = pd.read_parquet('./data/sub_train.parquet')

target_name = df.columns[-1]

# X = df.drop([target_name,'id'],axis=1)
X = df.drop([target_name],axis=1)

y = df[target_name]

df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6926847,Male,21,1,8,1,< 1 Year,No,43872,160,106,0
2606866,Male,50,1,28,0,1-2 Year,Yes,40378,26,281,0
9356482,Female,24,1,29,1,< 1 Year,No,43801,152,165,0
11367445,Male,71,1,28,1,1-2 Year,No,2630,26,197,0
6003615,Male,36,1,45,0,1-2 Year,Yes,24647,124,126,0


In [12]:
from sklearn.model_selection import train_test_split

cat_feat = X.select_dtypes(include=['category']).columns
num_feat = X.select_dtypes(include=['int8','int16','int32']).columns

# We need to train on smaller set for speed
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=100_000, random_state=7)

In [18]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

model = LGBMClassifier(verbose=0)

# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        ('cat', OrdinalEncoder(), cat_feat)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    # 'classifier__num_leaves': [31, 50, 70],
    # 'classifier__min_data_in_leaf': [20, 30, 40],
    # 'classifier__max_depth': [-1, 10, 20],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__n_estimators': [100, 200, 300],
    # 'classifier__subsample': [0.6, 0.8, 1.0],
    # 'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    # 'classifier__reg_alpha': [0, 0.1, 0.5],
    # 'classifier__reg_lambda': [0, 0.1, 0.5],
}

# Stratified k-fold cross-validation
cv = 5

# Perform grid search with cross-validation
metric = 'roc_auc'
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=metric, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation AUC: {:.4f}".format(grid_search.best_score_))

# Evaluate the best model on the test set
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# Print classification report
# print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters found:  {'classifier__learning_rate': 0.05, 'classifier__n_estimators': 300}
Best cross-validation AUC: 0.8714


In [19]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

model = LGBMClassifier(verbose=0)

# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        ('cat', OrdinalEncoder(), cat_feat)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__num_leaves': [31, 50, 70],
    # 'classifier__min_data_in_leaf': [20, 30, 40],
    'classifier__max_depth': [-1, 10, 20],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__n_estimators': [100, 200, 300],
    # 'classifier__subsample': [0.6, 0.8, 1.0],
    # 'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    # 'classifier__reg_alpha': [0, 0.1, 0.5],
    # 'classifier__reg_lambda': [0, 0.1, 0.5],
}

# Stratified k-fold cross-validation
cv = 5

# Perform grid search with cross-validation
metric = 'roc_auc'
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=metric, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation AUC: {:.4f}".format(grid_search.best_score_))

# Evaluate the best model on the test set
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# Print classification report
# print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters found:  {'classifier__learning_rate': 0.05, 'classifier__max_depth': -1, 'classifier__n_estimators': 300, 'classifier__num_leaves': 31}
Best cross-validation AUC: 0.8714


In [20]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

model = LGBMClassifier(verbose=0)

# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        ('cat', OrdinalEncoder(), cat_feat)
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__num_leaves': [31, 50, 70],
    # 'classifier__min_data_in_leaf': [20, 30, 40],
    'classifier__max_depth': [-1, 10, 20],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__n_estimators': [100, 200, 300],
    # 'classifier__subsample': [0.6, 0.8, 1.0],
    # 'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__reg_alpha': [0, 0.1, 0.5],
    'classifier__reg_lambda': [0, 0.1, 0.5],
}

# Stratified k-fold cross-validation
cv = 5

# Perform grid search with cross-validation
metric = 'roc_auc'
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=metric, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation AUC: {:.4f}".format(grid_search.best_score_))

# Evaluate the best model on the test set
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# Print classification report
# print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best parameters found:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 10, 'classifier__n_estimators': 200, 'classifier__num_leaves': 31, 'classifier__reg_alpha': 0.5, 'classifier__reg_lambda': 0.5}
Best cross-validation AUC: 0.8717


In [17]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8283455305802404

In [18]:
# Save submission file
dfk = pd.read_csv('./data/test.csv')
X_kaggle = dfk.drop('id',axis=1)
y_kaggle = best_model.predict(X_kaggle)

In [19]:
df_sub = pd.DataFrame({'id':dfk['id'].values,'Target':y_kaggle.flatten()})
df_sub.to_csv(f'./submissions/catboost{round(accuracy,4)}.csv',index=False)