In [None]:
# Cleaned LGBM Model

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score


# Load the training table from parquet.
# Other inputs that were tried: './data/train.parquet', './data/norm_train.parquet'.
df = pd.read_parquet('./data/sub_train.parquet')

# The target is taken to be the last column of the frame.
target_name = df.columns[-1]

# Separate the label from the features. Only the target column is dropped;
# an earlier variant also dropped an 'id' column.
y = df[target_name]
X = df.drop(columns=[target_name])

df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6926847,Male,21,1,8,1,< 1 Year,No,43872,160,106,0
2606866,Male,50,1,28,0,1-2 Year,Yes,40378,26,281,0
9356482,Female,24,1,29,1,< 1 Year,No,43801,152,165,0
11367445,Male,71,1,28,1,1-2 Year,No,2630,26,197,0
6003615,Male,36,1,45,0,1-2 Year,Yes,24647,124,126,0


In [9]:
# Print the frame's index range, per-column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2830118 entries, 6926847 to 11504796
Data columns (total 11 columns):
 #   Column                Dtype   
---  ------                -----   
 0   Gender                category
 1   Age                   int8    
 2   Driving_License       int8    
 3   Region_Code           int8    
 4   Previously_Insured    int8    
 5   Vehicle_Age           category
 6   Vehicle_Damage        category
 7   Annual_Premium        int32   
 8   Policy_Sales_Channel  int16   
 9   Vintage               int16   
 10  Response              int8    
dtypes: category(3), int16(2), int32(1), int8(5)
memory usage: 64.8 MB


In [10]:
# Feature selection: partition the columns of X by dtype.
categorical_dtypes = ['category']
numeric_dtypes = ['int8', 'int16', 'int32', 'float64']

cat_feat = X.select_dtypes(include=categorical_dtypes).columns
num_feat = X.select_dtypes(include=numeric_dtypes).columns

# Hold out 20% of the rows for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

print(cat_feat, num_feat)

Index(['Gender', 'Vehicle_Age', 'Vehicle_Damage'], dtype='object') Index(['Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Annual_Premium', 'Policy_Sales_Channel', 'Vintage'],
      dtype='object')


In [18]:
# Model

# Preprocessing for the two feature groups:
#   - numeric columns are standardised with StandardScaler
#   - categorical columns are integer-encoded with OrdinalEncoder
#     (OneHotEncoder was considered as an alternative)
# Any column not listed in either group is passed through unchanged.
numeric_step = ('num', StandardScaler(), num_feat)
categorical_step = ('cat', OrdinalEncoder(), cat_feat)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)



def fit_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` behind the preprocessing step and score it on hold-out data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``Pipeline.fit``.
    X_test, y_test
        Hold-out features and labels, used only for scoring.
    model
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        It becomes the final pipeline step and is fitted in place.

    Returns
    -------
    tuple
        ``(fitted_pipeline, roc_auc)``: the fitted ``Pipeline`` and the ROC AUC
        on the hold-out data, computed from column 1 of ``predict_proba``.

    Side effects
    ------------
    Prints the ROC AUC, the F1 score and the full classification report.
    """
    # Imported locally: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    # Pipeline.fit fits its steps in place. Cloning the module-level
    # `preprocessor` gives every call its own copy, so a later call cannot
    # re-fit the preprocessing inside a pipeline returned earlier.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),  # Preprocess the data
        ('classifier', model),                  # Classifier
    ])

    # Fit the model
    best_model = pipeline.fit(X_train, y_train)

    # Predict hard labels and positive-column probabilities on the test data
    y_pred = best_model.predict(X_test)
    y_pred_prob = best_model.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics (both are single scalars)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    f1 = f1_score(y_test, y_pred)

    # Print metrics. Each metric comes from one split, so no spread is shown:
    # a standard deviation over a single value is always zero.
    print(f'=== {str(model)} ===')
    print(f'ROC_AUC: {roc_auc:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(classification_report(y_test, y_pred))
    print('\n' + '='*30 + '\n')

    return best_model, roc_auc

In [19]:
# Baseline LightGBM classifier: 300 estimators at a learning rate of 0.1.
lgbm_params = {'learning_rate': 0.1, 'n_estimators': 300}
model = LGBMClassifier(**lgbm_params)

# fit_model returns the fitted pipeline and the hold-out ROC AUC.
trained_model, metric = fit_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=model,
)

[LightGBM] [Info] Number of positive: 1132330, number of negative: 1131764
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090896 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 741
[LightGBM] [Info] Number of data points in the train set: 2264094, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500125 -> initscore=0.000500
[LightGBM] [Info] Start training from score 0.000500
=== LGBMClassifier(n_estimators=300) ===
ROC_AUC: 0.8786 (+/- 0.0000)
F1 Score: 0.8318 (+/- 0.0000)
              precision    recall  f1-score   support

           0       0.91      0.70      0.79    283295
           1       0.75      0.93      0.83    282729

    accuracy                           0.81    566024
   macro avg       0.83      0.81      0.81    566024
weighted avg       0.83      0.81      0.81    566024





In [20]:
# Score the Kaggle test set and write the submission file.
dfk = pd.read_parquet('./data/test.parquet')

# Select the same feature columns, in the same order, as the training features.
X_kaggle = dfk[X.columns]

# Keep column 1 of predict_proba as the submitted score.
kaggle_proba = trained_model.predict_proba(X_kaggle)
y_kaggle = kaggle_proba[:, 1]

# The ids are taken from the test frame's index.
df_sub = pd.DataFrame({'id': dfk.index.values, 'Response': y_kaggle})

# The hold-out metric, rounded to 4 decimals, is embedded in the file name.
submission_path = f'./submissions/lightGBM_base_{round(metric,4)}.parquet'
df_sub.to_parquet(submission_path, index=False)
df_sub.head()

Unnamed: 0,id,Response
0,11504798,0.042889
1,11504799,0.837848
2,11504800,0.716822
3,11504801,0.000839
4,11504802,0.271018
