In [25]:
# Catboost optimization

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer

# df = pd.read_parquet('./data/train.parquet')
# df = pd.read_parquet('./data/sub_train.parquet')

df = pd.read_parquet('./data/norm_train.parquet')

target_name = df.columns[-1]

# X = df.drop([target_name,'id'],axis=1)
X = df.drop([target_name],axis=1)

y = df[target_name]

df.head()

  from pandas.core import (


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,-1.824556,1,0.670557,0,1-2 Year,Yes,2.301567,-0.276998,0.202098,0
1,Male,0.340163,1,-0.027604,0,> 2 Years,Yes,1.940704,-1.077446,1.744448,1
2,Female,-0.574595,1,-0.761353,1,< 1 Year,No,0.518764,0.49639,1.002056,0
3,Female,-0.037646,1,-2.40855,0,1-2 Year,Yes,-5.199338,1.321946,-0.93391,0
4,Female,-0.001255,1,-0.691121,1,1-2 Year,No,0.011561,0.49639,2.085356,0


In [14]:
X.shape

(11504798, 10)

In [13]:
df[target_name].value_counts()

Response
0    10089739
1     1415059
Name: count, dtype: int64

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11504798 entries, 0 to 11504797
Data columns (total 10 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   float64 
 2   Driving_License       int8    
 3   Region_Code           float64 
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        float64 
 8   Policy_Sales_Channel  float64 
 9   Vintage               float64 
dtypes: category(3), float64(5), int8(2)
memory usage: 581.5 MB


In [12]:
X.nunique()

Gender                      2
Age                        62
Driving_License             2
Region_Code                53
Previously_Insured          2
Vehicle_Age                 3
Vehicle_Damage              2
Annual_Premium          51374
Policy_Sales_Channel      152
Vintage                   290
dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

cat_feat = X.select_dtypes(include=['int8','category']).columns
num_feat = X.select_dtypes(include=['int16','int32','float64']).columns
# num_feat = X.select_dtypes(include=['int8','int16','int32','float64']).columns
# bin_cat = X.select_dtypes(include=['int8']).columns

# We need to train on smaller set for speed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

print(cat_feat, num_feat)

Index(['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age',
       'Vehicle_Damage'],
      dtype='object') Index(['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel',
       'Vintage'],
      dtype='object')


In [8]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
model = LGBMClassifier(learning_rate=0.1, n_estimators=300)


# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        # ('cat', OneHotEncoder(), cat_feat),
        # ('bin_cat', OneHotEncoder(), bin_cat),
        ('cat', OrdinalEncoder(), cat_feat),
    ],  remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])


# Define the parameter grid
param_grid = {
    # 'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    # 'classifier__n_estimators': [100, 200, 300, 400],
    # 'num_leaves': [31, 50, 70, 100],
    # 'max_depth': [-1, 5, 10, 20],
    # 'min_child_samples': [20, 30, 40, 50],
    # 'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0],
    # 'reg_alpha': [0, 0.1, 0.5, 1],
    # 'reg_lambda': [0, 0.1, 0.5, 1],
    'classifier__scale_pos_weight': [1, 10, 20]  # Adjust this based on the imbalance ratio
}

# # Create the grid search
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)
pipeline.fit(X_train,y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]


# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred)

# metric

# Print metrics
print(f'=== {str(model)} ===')
print(f'ROC_AUC: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, y_pred))
print('\n' + '='*30 + '\n')

[LightGBM] [Info] Number of positive: 1132626, number of negative: 8071212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 747
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.123060 -> initscore=-1.963765
[LightGBM] [Info] Start training from score -1.963765
=== LGBMClassifier(n_estimators=300) ===
ROC_AUC: 0.8787 (+/- 0.0000)
F1 Score: 0.1693 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.89      0.99      0.94   2018527
           1       0.58      0.10      0.17    282433

    accuracy                           0.88   2300960
   macro avg       0.73      0.54      0.55   2300960
weighted avg       0.85      0.88      0.84   2300960





In [23]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
model = LGBMClassifier(learning_rate=0.1, n_estimators=300)


# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        ('cat', OneHotEncoder(), cat_feat),
        # ('cat', OneHotEncoder(), bin_cat),
        # ('cat', OrdinalEncoder(), cat_feat),
    ],  remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])


# Define the parameter grid
param_grid = {
    # 'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    # 'classifier__n_estimators': [100, 200, 300, 400],
    # 'num_leaves': [31, 50, 70, 100],
    # 'max_depth': [-1, 5, 10, 20],
    # 'min_child_samples': [20, 30, 40, 50],
    # 'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0],
    # 'reg_alpha': [0, 0.1, 0.5, 1],
    # 'reg_lambda': [0, 0.1, 0.5, 1],
    'classifier__scale_pos_weight': [1, 10, 20]  # Adjust this based on the imbalance ratio
}

# # Create the grid search
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)
pipeline.fit(X_train,y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]


# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred)

# metric

# Print metrics
print(f'=== {str(model)} ===')
print(f'ROC_AUC: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, y_pred))
print('\n' + '='*30 + '\n')

[LightGBM] [Info] Number of positive: 1132626, number of negative: 8071212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 749
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.123060 -> initscore=-1.963765
[LightGBM] [Info] Start training from score -1.963765
=== LGBMClassifier(n_estimators=300) ===
ROC_AUC: 0.8787 (+/- 0.0000)
F1 Score: 0.1693 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.89      0.99      0.94   2018527
           1       0.58      0.10      0.17    282433

    accuracy                           0.88   2300960
   macro avg       0.73      0.54      0.55   2300960
weighted avg       0.85      0.88      0.84   2300960





In [17]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
model = LGBMClassifier(learning_rate=0.1, n_estimators=300)


# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        # ('cat', OneHotEncoder(), cat_feat)
        ('cat', OrdinalEncoder(), cat_feat)
        
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])


# Define the parameter grid
param_grid = {
    # 'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    # 'classifier__n_estimators': [100, 200, 300, 400],
    # 'num_leaves': [31, 50, 70, 100],
    # 'max_depth': [-1, 5, 10, 20],
    # 'min_child_samples': [20, 30, 40, 50],
    # 'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0],
    # 'reg_alpha': [0, 0.1, 0.5, 1],
    # 'reg_lambda': [0, 0.1, 0.5, 1],
    'classifier__scale_pos_weight': [1, 10, 20]  # Adjust this based on the imbalance ratio
}

# # Create the grid search
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)
pipeline.fit(X_train,y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:,1]

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred)

# metric

# Print metrics
print(f'=== {str(model)} ===')
print(f'ROC_AUC: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, y_pred))
print('\n' + '='*30 + '\n')

[LightGBM] [Info] Number of positive: 1131903, number of negative: 1132191
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 744
[LightGBM] [Info] Number of data points in the train set: 2264094, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499936 -> initscore=-0.000254
[LightGBM] [Info] Start training from score -0.000254
=== LGBMClassifier(n_estimators=300) ===
ROC_AUC: 0.8780 (+/- 0.0000)
F1 Score: 0.8318 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.91      0.70      0.79    282868
           1       0.75      0.93      0.83    283156

    accuracy                           0.81    566024
   macro avg       0.83      0.81      0.81    566024
weighted avg       0.83      0.81      0.81    566024





In [85]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
model = LGBMClassifier(learning_rate=0.1, n_estimators=300, class_weight='balanced')


# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        # ('cat', OneHotEncoder(), cat_feat)
        ('cat', OrdinalEncoder(), cat_feat)
        
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])


# Define the parameter grid
param_grid = {
    # 'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    # 'classifier__n_estimators': [100, 200, 300, 400],
    # 'num_leaves': [31, 50, 70, 100],
    # 'max_depth': [-1, 5, 10, 20],
    # 'min_child_samples': [20, 30, 40, 50],
    # 'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0],
    # 'reg_alpha': [0, 0.1, 0.5, 1],
    # 'reg_lambda': [0, 0.1, 0.5, 1],
    'classifier__scale_pos_weight': [1, 10, 20]  # Adjust this based on the imbalance ratio
}

# # Create the grid search
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)
pipeline.fit(X_train,y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)
y_pred = pipeline.predict(X_test)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, y_pred))
print('\n' + '='*30 + '\n')

[LightGBM] [Info] Number of positive: 1131824, number of negative: 8072014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.097843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 718
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
=== LGBMClassifier(class_weight='balanced', n_estimators=300) ===
roc_auc: 0.8125 (+/- 0.0000)
F1 Score: 0.4541 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.99      0.70      0.82   2017725
           1       0.30      0.93      0.45    283235

    accuracy                           0.73   2300960
   macro avg       0.64      0.81      0.64   2300960
weighted avg       0.90      0.73      0.7

In [66]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
model = LGBMClassifier(learning_rate=0.1, n_estimators=300)


# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        # ('cat', OneHotEncoder(), cat_feat)
        ('cat', OrdinalEncoder(), cat_feat)
        
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])


# Define the parameter grid
param_grid = {
    # 'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    # 'classifier__n_estimators': [100, 200, 300, 400],
    # 'num_leaves': [31, 50, 70, 100],
    # 'max_depth': [-1, 5, 10, 20],
    # 'min_child_samples': [20, 30, 40, 50],
    # 'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0],
    # 'reg_alpha': [0, 0.1, 0.5, 1],
    # 'reg_lambda': [0, 0.1, 0.5, 1],
    'classifier__scale_pos_weight': [1, 10, 20]  # Adjust this based on the imbalance ratio
}

# # Create the grid search
# grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'ROC AUC Score: {roc_auc}')
print(f'F1 Score: {f1}')

[LightGBM] [Info] Number of positive: 1131824, number of negative: 8072014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 718
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122973 -> initscore=-1.964573
[LightGBM] [Info] Start training from score -1.964573
Best parameters found:  {'classifier__scale_pos_weight': 1}
ROC AUC Score: 0.5438551039525084
F1 Score: 0.16713556571587437


In [15]:
dfk = pd.read_parquet('./data/norm_test.parquet')
dfk.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11504798,Female,-5.199338,1,1.644369,0,< 1 Year,No,-5.199338,1.798663,0.607463
11504799,Male,0.556927,1,-0.006273,0,1-2 Year,Yes,0.498157,-0.251016,-0.385726
11504800,Male,0.556927,1,1.137677,0,1-2 Year,Yes,-5.199338,-1.053106,1.278137
11504801,Female,-1.355761,1,1.644369,1,< 1 Year,No,-0.747697,0.502074,-0.45982
11504802,Male,0.793602,1,-0.5467,0,1-2 Year,No,0.188388,-0.251016,-0.148584


In [18]:
dfk.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7669866 entries, 11504798 to 19174663
Data columns (total 10 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   float64 
 2   Driving_License       int8    
 3   Region_Code           float64 
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        float64 
 8   Policy_Sales_Channel  float64 
 9   Vintage               float64 
dtypes: category(3), float64(5), int8(2)
memory usage: 387.7 MB


In [16]:
# Save submission file
# dfk = pd.read_parquet('./data/test.parquet')
X_kaggle = dfk[X.columns]
y_kaggle = pipeline.predict(X_kaggle)

In [17]:
df_sub = pd.DataFrame({'id':dfk.index.values,'Response':y_kaggle})
# df_sub = pd.concat([dfk,df_sub],axis=1)
df_sub.to_parquet(f'./submissions/lightGBM_good_{round(roc_auc,4)}.parquet',index=False)
df_sub.head()

Unnamed: 0,id,Response
0,11504798,0
1,11504799,0
2,11504800,0
3,11504801,0
4,11504802,0
