In [1]:
# Catboost optimization

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer

# df = pd.read_parquet('./data/train.parquet')
df = pd.read_parquet('./data/sub_train.parquet')


target_name = df.columns[-1]

# X = df.drop([target_name,'id'],axis=1)
X = df.drop([target_name],axis=1)

y = df[target_name]

df.head()

  from pandas.core import (


Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6926847,Male,21,1,8,1,< 1 Year,No,43872,160,106,0
2606866,Male,50,1,28,0,1-2 Year,Yes,40378,26,281,0
9356482,Female,24,1,29,1,< 1 Year,No,43801,152,165,0
11367445,Male,71,1,28,1,1-2 Year,No,2630,26,197,0
6003615,Male,36,1,45,0,1-2 Year,Yes,24647,124,126,0


In [2]:
X.shape

(2830118, 10)

In [3]:
df[target_name].value_counts()

Response
0    1415059
1    1415059
Name: count, dtype: int64

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2830118 entries, 6926847 to 11504796
Data columns (total 10 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
dtypes: category(3), int16(2), int32(1), int8(4)
memory usage: 62.1 MB


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

cat_feat = X.select_dtypes(include=['category']).columns
num_feat = X.select_dtypes(include=['int8','int16','int32']).columns

# We need to train on smaller set for speed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

print(cat_feat, num_feat)

Index(['Gender', 'Vehicle_Age', 'Vehicle_Damage'], dtype='object') Index(['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage'],
      dtype='object')


In [6]:
# Define the CatBoost model with GPU support
# catboost = CatBoostClassifier(verbose=0, random_seed=7)

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
model = LGBMClassifier(learning_rate=0.1, n_estimators=300)
# model = LGBMClassifier(learning_rate=0.1, n_estimators=300,  is_unbalance=True)



# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        # ('cat', OneHotEncoder(), cat_feat)
        ('cat', OrdinalEncoder(), cat_feat)
        
    ])

min_pipe = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', model)            # Classifier
        ])


# # Fit the model
# grid_search.fit(X_train, y_train)
min_pipe.fit(X_train,y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)
y_pred = min_pipe.predict(X_test)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

metric = 'roc_auc'

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, y_pred))
print('\n' + '='*30 + '\n')

[LightGBM] [Info] Number of positive: 1132330, number of negative: 1131764
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065326 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 741
[LightGBM] [Info] Number of data points in the train set: 2264094, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500125 -> initscore=0.000500
[LightGBM] [Info] Start training from score 0.000500
=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8125 (+/- 0.0000)
F1 Score: 0.8318 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.91      0.70      0.79    283295
           1       0.75      0.93      0.83    282729

    accuracy                           0.81    566024
   macro avg       0.83      0.81      0.81    566024
weighted avg       0.83      0.81      0.81    566024





In [7]:
proba_minority = min_pipe.predict_proba(X_test)[:, 1]
ensemble_predictions = (proba_minority >= 0.5).astype(int)


roc_auc = roc_auc_score(y_test, ensemble_predictions)
f1 = f1_score(y_test, ensemble_predictions)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, ensemble_predictions))
print('\n' + '='*30 + '\n')

=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8125 (+/- 0.0000)
F1 Score: 0.8318 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.91      0.70      0.79    283295
           1       0.75      0.93      0.83    282729

    accuracy                           0.81    566024
   macro avg       0.83      0.81      0.81    566024
weighted avg       0.83      0.81      0.81    566024





In [10]:
df = pd.read_parquet('./data/train.parquet')


# Number of random data points you want to select
n = 2_000_000

# Select n random data points from majority class
sample_df = df.sample(n=n, random_state=7)  # random_state is used for reproducibility

target_name = sample_df.columns[-1]

# X = df.drop([target_name,'id'],axis=1)
X_sample = sample_df.drop([target_name],axis=1)
y_sample = sample_df[target_name]


sample_df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1320709,Female,29,1,30,1,1-2 Year,Yes,2630,156,176,0
2265982,Female,24,1,8,1,< 1 Year,No,29801,152,49,0
11116699,Male,21,1,6,0,< 1 Year,Yes,31742,160,229,0
1777034,Male,67,1,28,1,1-2 Year,No,55409,124,56,0
7222744,Male,28,1,28,0,< 1 Year,Yes,37365,152,194,0


In [12]:
y_pred_sample = min_pipe.predict(X_sample)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_sample, y_pred_sample)
f1 = f1_score(y_sample, y_pred_sample)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_sample, y_pred_sample))
print('\n' + '='*30 + '\n')

=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8126 (+/- 0.0000)
F1 Score: 0.4538 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.99      0.70      0.82   1753818
           1       0.30      0.93      0.45    246182

    accuracy                           0.72   2000000
   macro avg       0.64      0.81      0.63   2000000
weighted avg       0.90      0.72      0.77   2000000





In [10]:
# 2nd Model, lookign for high prec on class==1


# load inbalanced dataset
df = pd.read_parquet('./data/train.parquet')
target_name = df.columns[-1]
X = df.drop([target_name],axis=1)
y = df[target_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

cat_feat = X.select_dtypes(include=['category']).columns
num_feat = X.select_dtypes(include=['int8','int16','int32']).columns

# model = LGBMClassifier(learning_rate=0.1, n_estimators=300, verbose=0)
maj_model = LGBMClassifier(learning_rate=0.1, n_estimators=300, class_weight='balanced')


# Define the preprocessing for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_feat),
        # ('cat', OneHotEncoder(), cat_feat)
        ('cat', OrdinalEncoder(), cat_feat)
        
    ])

maj_pipe = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', maj_model)            # Classifier
        ])


# # Fit the model
# grid_search.fit(X_train, y_train)
maj_pipe.fit(X_train,y_train)

# # Get the best parameters
# best_params = grid_search.best_params_
# print("Best parameters found: ", best_params)

# # Predict on the test data
# y_pred = grid_search.predict(X_test)
y_pred = maj_pipe.predict(X_test)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, y_pred))
print('\n' + '='*30 + '\n')

[LightGBM] [Info] Number of positive: 1131824, number of negative: 8072014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 718
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8125 (+/- 0.0000)
F1 Score: 0.4541 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.99      0.70      0.82   2017725
           1       0.30      0.93      0.45    283235

    accuracy                           0.73   2300960
   macro avg       0.64      0.81      0.64   2300960
weighted avg       0.90      0.73      0.77   2300960





In [12]:
proba_majority = maj_pipe.predict_proba(X_test)[:, 1]
ensemble_predictions = (proba_majority >= 0.5).astype(int)


roc_auc = roc_auc_score(y_test, ensemble_predictions)
f1 = f1_score(y_test, ensemble_predictions)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, ensemble_predictions))
print('\n' + '='*30 + '\n')

=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8125 (+/- 0.0000)
F1 Score: 0.4541 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.99      0.70      0.82   2017725
           1       0.30      0.93      0.45    283235

    accuracy                           0.73   2300960
   macro avg       0.64      0.81      0.64   2300960
weighted avg       0.90      0.73      0.77   2300960





In [13]:
df = pd.read_parquet('./data/train.parquet')


# Number of random data points you want to select
n = 2_000_000

# Select n random data points from majority class
sample_df = df.sample(n=n, random_state=7)  # random_state is used for reproducibility

sample_df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1320709,Female,29,1,30,1,1-2 Year,Yes,2630,156,176,0
2265982,Female,24,1,8,1,< 1 Year,No,29801,152,49,0
11116699,Male,21,1,6,0,< 1 Year,Yes,31742,160,229,0
1777034,Male,67,1,28,1,1-2 Year,No,55409,124,56,0
7222744,Male,28,1,28,0,< 1 Year,Yes,37365,152,194,0


In [15]:
target_name = sample_df.columns[-1]

# X = df.drop([target_name,'id'],axis=1)
X_sample = sample_df.drop([target_name],axis=1)
y_sample = sample_df[target_name]

In [16]:
y_pred = maj_pipe.predict(X_sample)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_sample, y_pred)
f1 = f1_score(y_sample, y_pred)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_sample, y_pred))
print('\n' + '='*30 + '\n')

=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8125 (+/- 0.0000)
F1 Score: 0.4541 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.99      0.70      0.82   1753818
           1       0.30      0.93      0.45    246182

    accuracy                           0.73   2000000
   macro avg       0.64      0.81      0.64   2000000
weighted avg       0.90      0.73      0.77   2000000





In [17]:
y_pred = min_pipe.predict(X_sample)

# Calculate evaluation metrics
roc_auc = roc_auc_score(y_sample, y_pred)
f1 = f1_score(y_sample, y_pred)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_sample, y_pred))
print('\n' + '='*30 + '\n')

=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.8126 (+/- 0.0000)
F1 Score: 0.4538 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.99      0.70      0.82   1753818
           1       0.30      0.93      0.45    246182

    accuracy                           0.72   2000000
   macro avg       0.64      0.81      0.63   2000000
weighted avg       0.90      0.72      0.77   2000000





In [None]:
# predict class==1
proba_minority = min_pipe.predict_proba(X_test)[:, 1]
ensemble_min = (proba_minority >= 0.5).astype(int)

# predict class==0
proba_majority = maj_pipe.predict_proba(X_test)[:, 1]
ensemble_aj = (proba_majority >= 0.5).astype(int)



In [40]:
proba_majority = maj_pipe.predict_proba(X_test)[:, 1]
proba_minority = pipeline.predict_proba(X_test)[:, 0]

In [41]:
ensemble_proba = 0.5 * proba_majority + 0.5 * proba_minority

# Threshold for converting probabilities to binary predictions
threshold = 0.5
ensemble_predictions = (ensemble_proba <= threshold).astype(int)

roc_auc = roc_auc_score(y_test, ensemble_predictions)
f1 = f1_score(y_test, ensemble_predictions)

# Print metrics
print(f'=== {str(model)} ===')
print(f'{metric}: {np.mean(roc_auc):.4f} (+/- {np.std(roc_auc):.4f})')
print(f'F1 Score: {np.mean(f1):.4f} (+/- {np.std(f1):.4f})')
print(classification_report(y_test, ensemble_predictions))
print('\n' + '='*30 + '\n')

=== LGBMClassifier(n_estimators=300) ===
roc_auc: 0.4561 (+/- 0.0000)
F1 Score: 0.2015 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.42      0.01      0.02   2017725
           1       0.11      0.90      0.20    283235

    accuracy                           0.12   2300960
   macro avg       0.27      0.46      0.11   2300960
weighted avg       0.38      0.12      0.04   2300960





In [40]:
# Save submission file
dfk = pd.read_parquet('./data/test.parquet')
X_kaggle = dfk[X.columns]
y_kaggle = pipeline.predict(X_kaggle)

In [46]:
df_sub = pd.DataFrame({'id':dfk.index.values,'Target':y_kaggle})
# df_sub = pd.concat([dfk,df_sub],axis=1)
df_sub.to_csv(f'./submissions/lightGBM{round(roc_auc,4)}.csv',index=False)
df_sub.head()

Unnamed: 0,id,Target
0,11504798,0
1,11504799,1
2,11504800,1
3,11504801,0
4,11504802,0
