In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, precision_recall_curve, roc_curve, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the train and test CSVs from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/accidents-toronto/train.csv')
df_test = pd.read_csv('/kaggle/input/accidents-toronto/test.csv')
# Preview the first rows of the training frame.
df.head()

In [None]:
class Cleaner:
    """Clean a raw accident frame so it can be fed to the modelling pipeline.

    ``clean()`` runs, in order: calendar features from ``DATE``, imputation
    from rows that share ``ACCNUM`` / ``(DATE, STREET1)``, a ``"No"`` fill for
    single-valued columns, KNN imputation of ``DISTRICT`` (test frames only),
    removal of identifier columns and, for training frames, binarisation of
    ``ACCLASS``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame. A copy is taken so the caller's object is never mutated.
    test : bool, default False
        ``True`` for the unlabeled frame: ``ACCLASS`` is not expected, only the
        ``(DATE, STREET1)`` fill is applied, and ``DISTRICT`` is imputed.
    """

    def __init__(self, df: pd.DataFrame, test = False):
        # Work on a private copy: the cleaning steps assign columns in place
        # and would otherwise modify the frame the caller still holds.
        self.df = df.copy()
        self.test = test

    def date_time_clean(self):
        """Parse ``DATE`` and derive MONTH, DAY (weekday 0-6) and QUARTER; reduce ``TIME``."""
        self.df["DATE"] = pd.to_datetime(self.df["DATE"])

        self.df["MONTH"] = self.df["DATE"].dt.month
        self.df["DAY"] = self.df["DATE"].dt.weekday
        self.df["QUARTER"] = self.df["DATE"].dt.quarter

        # Keeps only the FIRST character of TIME as an integer.
        # NOTE(review): if TIME is an HHMM number (e.g. 1450) this yields 1, not
        # the hour 14 -- confirm the column format and switch to `TIME // 100`
        # if an hour-of-day feature is what was intended.
        self.df["TIME"] = self.df["TIME"].apply(lambda x: int(str(x)[0]))

    def _fill_within_groups(self, keys, cols):
        """Forward- then backward-fill ``cols`` strictly inside each ``keys`` group.

        Rows whose key is missing belong to no group and keep their original
        values (restored through ``combine_first``).
        """
        keys = [keys] if isinstance(keys, str) else list(keys)
        # Key columns are constant inside a group, so filling them is a no-op.
        value_cols = [col for col in cols if col not in keys]
        if not value_cols:
            return

        original = self.df[value_cols]
        filled = self.df.groupby(keys)[value_cols].transform(
            lambda group: group.ffill().bfill()
        )
        # `df[list] = frame` assigns columns by position, so pin the order.
        self.df[value_cols] = filled.combine_first(original)[value_cols]

    def ffill_bfill(self):
        """Impute missing values from other rows of the same accident.

        Training frames are filled per ``ACCNUM`` and per ``(DATE, STREET1)``
        on every column except ``ACCLASS``, ``INJURY`` and ``FATAL_NO``. Both
        training and test frames then get a final per-``(DATE, STREET1)`` fill
        over all remaining columns.

        The previous ``groupby(...).ffill().bfill()`` chain applied the
        ``bfill`` to the whole frame rather than per group, so a gap could be
        filled from an unrelated accident further down the table. Both fill
        directions are now confined to the group.
        """
        excluded = {"INJURY", "FATAL_NO"}
        if not self.test:
            excluded.add("ACCLASS")
        cols_to_fill = [col for col in self.df.columns if col not in excluded]

        if not self.test:
            self._fill_within_groups("ACCNUM", cols_to_fill)
            self._fill_within_groups(["DATE", "STREET1"], cols_to_fill)

        # `transform` without a column selection returns only the non-key
        # columns, so DATE and STREET1 are absent from `self.df` after this
        # line (and every remaining column, including the excluded ones, is
        # filled). Kept as-is because downstream cells rely on the result.
        grouped = self.df.groupby(["DATE", "STREET1"])
        self.df = grouped.transform(lambda group: group.ffill().bfill())

    def fill_unique1(self):
        """Fill gaps with ``"No"`` in columns that hold exactly one distinct non-null value."""
        one_val_cols = [col for col in self.df.columns if self.df[col].nunique() == 1]

        self.df[one_val_cols] = self.df[one_val_cols].fillna("No")

    def fill_missing_districts(self):
        """Impute missing ``DISTRICT`` values with a KNN vote on LATITUDE/LONGITUDE.

        Does nothing when no district is missing (predicting on an empty frame
        raises in scikit-learn) or when no labelled row exists to learn from.
        """
        missing_mask = self.df['DISTRICT'].isna()
        if not missing_mask.any() or missing_mask.all():
            return

        coord_cols = ['LATITUDE', 'LONGITUDE']
        known_rows = self.df.loc[~missing_mask]

        # Never ask for more neighbours than there are labelled rows.
        knn = KNeighborsClassifier(n_neighbors = min(3, len(known_rows)))
        knn.fit(known_rows[coord_cols], known_rows['DISTRICT'])

        self.df.loc[missing_mask, 'DISTRICT'] = knn.predict(
            self.df.loc[missing_mask, coord_cols]
        )

    def clean(self):
        """Run every cleaning step and return the cleaned frame."""
        self.date_time_clean()
        self.ffill_bfill()
        self.fill_unique1()
        # NOTE(review): DISTRICT is only imputed for the test frame -- confirm
        # the training frame is meant to keep its missing districts.
        if self.test:
            self.fill_missing_districts()
        self.df = self.df.drop(['ACCNUM', 'OBJECTID', 'INDEX_', 'OFFSET', 'X', 'Y'], axis = 1)

        if not self.test:
            # 0 for 'Non-Fatal Injury', 1 for every other value (missing included).
            self.df['ACCLASS'] = (self.df['ACCLASS'] != 'Non-Fatal Injury').astype(int)

        return self.df

In [None]:
# One Cleaner per split; `test = True` switches the Cleaner to its test-frame code path.
train_cleaner = Cleaner(df, test = False)
test_cleaner = Cleaner(df_test, test = True)

In [None]:
# Run the cleaning steps. Note that `df` and `df_test` are rebound from the raw
# frames to the cleaned ones here.
df = train_cleaner.clean()
df_test = test_cleaner.clean()

In [None]:
# Every non-numeric column of the cleaned training frame is treated as categorical.
categorical_features = [
    column for column in df.select_dtypes(exclude = np.number).columns
]

print(f'\nCategorical features:\n{categorical_features}')

In [None]:
def pipeline_builder(classifier, preprocessor):
    """Chain a preprocessor and a classifier into one scikit-learn Pipeline.

    The steps are registered as ``'preprocessor'`` and ``'classifier'``, in
    that order, and the unfitted Pipeline is returned.
    """
    ordered_steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
    return Pipeline(steps = ordered_steps)

In [None]:
# NOTE(review): this cell repeats an earlier cell verbatim (same computation,
# same print). It is harmless but redundant and could be removed.
categorical_features = df.select_dtypes(exclude = np.number).columns.tolist()

print(f'\nCategorical features:\n{categorical_features}')

In [None]:
# One-hot encode the categorical columns. `handle_unknown = 'ignore'` encodes
# categories unseen during fit as all zeros instead of raising, and
# `sparse_output = False` makes the encoder return a dense array.
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False))
])

In [None]:
# Apply the one-hot pipeline to the categorical columns only.
# NOTE(review): ColumnTransformer's `remainder` defaults to 'drop', so every
# column NOT listed in `categorical_features` (i.e. all numeric columns) is
# discarded before the classifier sees the data. Confirm this is intended; if
# numeric features should be used, list them explicitly rather than passing
# everything through, so that label-derived columns are not leaked.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# Separate the target from the predictors; the cleaned test frame is scored later.
y = df['ACCLASS']
X = df.drop(columns = ['ACCLASS'])
X_val = df_test

In [None]:
# Hold out 20% of the labelled rows for evaluation. The split is stratified on
# `y` so the train and hold-out parts keep the same class ratio, which makes
# the hold-out metrics comparable with the stratified CV used for tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, shuffle = True, stratify = y
)

In [None]:
# Baseline XGBoost classifier: DART booster with histogram tree building on
# the GPU (`device = 'cuda'`) for faster training on Kaggle's T4 x2.
xgb_classifier = xgb.XGBClassifier(
    # ensemble
    booster = 'dart',
    rate_drop = 0.2,
    n_estimators = 1000,
    # tree shape and regularisation
    max_depth = 4,
    min_child_weight = 10,
    gamma = 1.4,
    # row / column sampling
    subsample = 0.8,
    colsample_bytree = 0.6,
    # runtime
    tree_method = 'hist',
    device = 'cuda',
    n_jobs = -1,
    verbosity = 0,
    random_state = 42,
)

In [None]:
# Wrap the encoder and the classifier in one pipeline and fit it on the training split.
xgb_pipeline = pipeline_builder(xgb_classifier, preprocessor)
xgb_pipeline.fit(X_train, y_train)

In [None]:
# Hold-out accuracy using the classifier's own predicted labels.
y_pred = xgb_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Hold-out confusion matrix. Constructing ConfusionMatrixDisplay alone draws
# nothing; `.plot()` is what renders the figure.
ConfusionMatrixDisplay(
    confusion_matrix(y_test, y_pred),
    display_labels = xgb_pipeline.named_steps['classifier'].classes_,
).plot()
plt.show()

In [None]:
# Out-of-fold predicted probabilities from 4-fold cross-validation on the
# training split; the second column is kept as the score for the ROC / PR
# analysis below.
y_probas_ = cross_val_predict(xgb_pipeline, X_train, y_train, cv = 4, method = "predict_proba")
y_scores = y_probas_[:, 1]

In [None]:
# ROC operating points computed from the out-of-fold scores.
fpr, tpr, thresholds = roc_curve(y_train, y_scores)

def plot_roc_curve(fpr, tpr, label = None):
    """Draw a ROC curve on the current axes, together with the chance diagonal.

    ``fpr`` / ``tpr`` are the curve coordinates and ``label`` is the legend
    entry for the curve. Both axes are fixed to the range [0, 1].
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth = 2, label = label)
    # Dashed black diagonal from (0, 0) to (1, 1).
    axes.plot([0, 1], [0, 1], 'k--')
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel('False Positive Rate', fontsize=16)
    axes.set_ylabel('True Positive Rate', fontsize=16)

# Plot the ROC curve of the cross-validated XGBoost scores.
plot_roc_curve(fpr, tpr, "XGBoost")
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# Area under the ROC curve for the out-of-fold scores.
auc = roc_auc_score(y_train, y_scores)
print(auc)

In [None]:
# Confusion matrix of the out-of-fold label predictions (4-fold CV on the
# training split). `.plot()` is required to actually render the display.
cross_val_preds = cross_val_predict(xgb_pipeline, X_train, y_train, cv = 4)
ConfusionMatrixDisplay(
    confusion_matrix(y_train, cross_val_preds),
    display_labels = xgb_pipeline.named_steps['classifier'].classes_,
).plot()
plt.show()

In [None]:
# Precision/recall at every candidate threshold of the out-of-fold scores.
# Note: this rebinds `thresholds`, which previously held the ROC thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision (dashed blue) and recall (solid green) against the threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting, so both arrays are expected to be one element longer than
    ``thresholds``. The y-axis is fixed to [0, 1] and a grid is drawn.
    """
    axes = plt.gca()
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for series, line_format, legend_name in curves:
        axes.plot(thresholds, series[:-1], line_format, label = legend_name)
    axes.set_xlabel("Threshold")
    axes.legend(loc = "best")
    axes.set_ylim([0, 1])
    axes.grid(True)

# Precision and recall as a function of the decision threshold.
plt.figure(figsize = (8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

plt.show()

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y) against recall (x) on the current axes.

    Both axes are fixed to the range [0, 1].
    """
    axes = plt.gca()
    axes.plot(recalls, precisions, "b-", linewidth=2)
    axes.set_xlabel("Recall", fontsize=16)
    axes.set_ylabel("Precision", fontsize=16)
    axes.axis([0, 1, 0, 1])


# Precision-recall trade-off of the out-of-fold scores.
plot_precision_vs_recall(precisions, recalls)
plt.show()

In [None]:
# Boolean mask of out-of-fold scores above the hand-picked 0.285 cut-off,
# followed by the precision and recall that cut-off achieves on the training split.
recall_threshhold = (y_scores > 0.285)
print(precision_score(y_train,recall_threshhold))
print(recall_score(y_train,recall_threshhold))
recall_threshhold

In [None]:
# Apply the same 0.285 cut-off to the hold-out probabilities of the fitted pipeline.
y_scores_thresh = xgb_pipeline.predict_proba(X_test)[:, 1]
y_pred_thresh = (y_scores_thresh > 0.285).astype(int)

In [None]:
# Hold-out confusion matrix at the 0.285 cut-off. `.plot()` is required to
# actually render the display.
ConfusionMatrixDisplay(
    confusion_matrix(y_test, y_pred_thresh),
    display_labels = xgb_pipeline.named_steps['classifier'].classes_,
).plot()
plt.show()

In [None]:
# Hold-out accuracy at the 0.285 cut-off.
accuracy_score(y_test, y_pred_thresh)

In [None]:
# Per-class precision / recall / F1 on the hold-out split at the 0.285 cut-off.
print(classification_report(y_test, y_pred_thresh))

In [None]:
# Score the cleaned test frame with the fitted baseline pipeline and apply the 0.285 cut-off.
y_scores_val = xgb_pipeline.predict_proba(X_val)[:, 1]
y_pred_val = (y_scores_val > 0.285).astype(int)
y_pred_val

In [None]:
# Hyper-parameter search space. The `classifier__` prefix routes each entry to
# the pipeline step named 'classifier'.
params = {
    'classifier__min_child_weight': [1, 5, 10],
    'classifier__gamma': [1.5, 2, 5],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__max_depth': [3, 4, 5, 7, 10]
}


In [None]:
# Fresh, unfitted classifier and pipeline for the search. The tunable
# hyper-parameters are left at their defaults here and supplied by `params`.
# Histogram tree building on the GPU for faster training on Kaggle's T4 x2.
# Note: this rebinds `xgb_classifier` and `xgb_pipeline`, so the fitted
# baseline pipeline is no longer reachable through these names.
xgb_classifier = xgb.XGBClassifier(n_estimators= 1000, random_state = 42, device = 'cuda', 
                                   verbosity = 0, n_jobs = -1, booster = 'dart', 
                                   rate_drop = 0.2, tree_method = 'hist')
xgb_pipeline = pipeline_builder(xgb_classifier, preprocessor)

In [None]:
folds = 3
param_comb = 3
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# Set up RandomizedSearchCV: `param_comb` random draws from `params`, scored
# by ROC AUC. The splitter object itself is passed as `cv` instead of
# `skf.split(...)`: that call returns a one-shot generator, so re-running
# `search.fit` would find no folds left. With a fixed `random_state` the
# folds produced are the same.
search = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions = params,
    n_iter = param_comb,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = skf,
    verbose = 1,
    random_state = 42,
)

In [None]:
# Run the randomized search on the training split.
search.fit(X_train, y_train)

In [None]:
# Best pipeline found by the search.
xgb_tuned = search.best_estimator_
# Take the classifier from the tuned pipeline: `xgb_pipeline` itself is never
# fitted by the search, which works on clones of it.
best_xgb = xgb_tuned.named_steps['classifier']
# Store the tuned model's probabilities so the thresholded predictions below
# come from the tuned model rather than from the earlier baseline scores.
y_scores_val = xgb_tuned.predict_proba(X_val)[:, 1]
y_pred_val = (y_scores_val > 0.285).astype(int)

In [None]:
# Split-count ('weight') importance of the tuned model, top 40 features.
# The booster is read from the fitted `search.best_estimator_`, so this cell
# does not depend on an unfitted classifier object.
tuned_pipeline = search.best_estimator_
booster = tuned_pipeline.named_steps['classifier'].get_booster()
importance_dict = booster.get_score(importance_type='weight')
keys = list(importance_dict.keys())
values = list(importance_dict.values())

# When the booster carries no feature names, its keys are positional ('f<N>').
# Translate those into the encoder's output column names for a readable plot;
# any key that does not fit that pattern is left as it is.
if booster.feature_names is None:
    encoded_names = tuned_pipeline.named_steps['preprocessor'].get_feature_names_out()
    keys = [
        encoded_names[int(key[1:])]
        if key.startswith('f') and key[1:].isdigit() and int(key[1:]) < len(encoded_names)
        else key
        for key in keys
    ]

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)
data.nlargest(40, columns="score").plot(kind='barh', figsize = (20,10))

In [None]:
# Build the submission: re-read the raw test file for its OBJECTID column,
# attach the predicted class as text and write the two columns to CSV.
df_op = pd.read_csv('/kaggle/input/accidents-toronto/test.csv')
df_op['ACCLASS'] = [
    'Non-Fatal Injury' if predicted == 0 else 'Fatal'
    for predicted in y_pred_val
]

df_output = df_op.loc[:, ['OBJECTID', 'ACCLASS']]
df_output.to_csv('submission_tuned_xgb.csv', index=False)