### CatBoost GBM Classifier Model Training

In [2]:
# Load necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier

In [None]:
df = pd.read_csv("C:/Users/win10/Desktop/Project_Aug25/data/accidents_cleaned.csv")
# df = df.sample(100000, random_state=42)


target = 'Severity'
X = df.drop(columns=target)
y = df[target]

In [4]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()

In [5]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [7]:
clf_catboost = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=8,
        loss_function='MultiClass',
        eval_metric='Accuracy',
        random_seed=42,
        verbose=100,
        early_stopping_rounds=30
    ))
])

clf_catboost.fit(X_train, y_train)

0:	learn: 0.7961047	total: 4.81s	remaining: 23m 57s
100:	learn: 0.8219207	total: 7m 55s	remaining: 15m 37s
200:	learn: 0.8321481	total: 15m 33s	remaining: 7m 40s
299:	learn: 0.8375243	total: 26m 57s	remaining: 0us


0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [9]:
y_pred_cb = clf_catboost.predict(X_test)

print('CatBoost Classifier Accuracy:', accuracy_score(y_test, y_pred_cb))
print('\nClassification Report:\n', classification_report(y_test, y_pred_cb))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, y_pred_cb))

CatBoost Classifier Accuracy: 0.8370394782488032

Classification Report:
               precision    recall  f1-score   support

           1       0.63      0.04      0.08      2551
           2       0.85      0.97      0.91    229484
           3       0.72      0.37      0.49     48776
           4       0.64      0.03      0.06      7449

    accuracy                           0.84    288260
   macro avg       0.71      0.36      0.39    288260
weighted avg       0.82      0.84      0.81    288260


Confusion Matrix:
 [[   114   2319    118      0]
 [    36 222645   6732     71]
 [    22  30408  18274     72]
 [    10   6837    350    252]]
