## **Imports**

In [6]:
import os
import joblib
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

## **Paths & Global Variables**

In [7]:
ROOT_PATH = r"C:\Users\mario\OneDrive\Documents\Work\Side Hustles\Kaggle\titanic"

import sys

sys.path.append(ROOT_PATH)

from titanic.config import PROCESSED_DATA_DIR, MODELS_DIR

## **Reading Data**

In [8]:
os.chdir(PROCESSED_DATA_DIR)

features_selected = pd.read_json("knn_features.json").index.to_list()

X_train = pd.read_parquet("X_train.parquet", columns=features_selected)
y_train = pd.read_parquet("y_train.parquet")

X_test = pd.read_parquet("X_val.parquet", columns=features_selected)
y_test = pd.read_parquet("y_val.parquet")

# **K-Nearest Neighbors**

In [9]:
cols_to_remove = [
    "passenger_class_ord",
]
X_train.drop(columns=cols_to_remove, inplace=True)
X_test.drop(columns=cols_to_remove, inplace=True)

## **Model Fitting**

In [10]:
knn = KNeighborsClassifier()
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

## **Hyperparameter Tuning**

In [11]:
grid = {
    "n_neighbors": list(range(20)),
    "weights": ["uniform", "distance", "uniform"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": list(range(10)),
    "p": [1, 2],
}

In [13]:
clf = GridSearchCV(knn, grid, cv=10, scoring="roc_auc")
clf.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__algorithm': 'auto',
 'estimator__leaf_size': 30,
 'estimator__metric': 'minkowski',
 'estimator__metric_params': None,
 'estimator__n_jobs': None,
 'estimator__n_neighbors': 5,
 'estimator__p': 2,
 'estimator__weights': 'uniform',
 'estimator': KNeighborsClassifier(),
 'n_jobs': None,
 'param_grid': {'n_neighbors': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19],
  'weights': ['uniform', 'distance', 'uniform'],
  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
  'leaf_size': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  'p': [1, 2]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': 'roc_auc',
 'verbose': 0}

In [14]:
clf.fit(X_train, y_train.values.ravel())
best_model = clf.best_estimator_
print(best_model)
print(clf.best_params_)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=19, p=1,
                     weights='distance')
{'algorithm': 'brute', 'leaf_size': 1, 'n_neighbors': 19, 'p': 1, 'weights': 'distance'}


6960 fits failed out of a total of 48000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4800 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mario\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mario\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\mario\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_

## **Model Testing**

In [15]:
y_predicted = best_model.predict(X_test)
y_predicted_probs = best_model.predict_proba(X_test)[:, 1]

In [16]:
y_predicted_df = pd.DataFrame(y_predicted, columns=["predicted"], index=X_test.index)
y_predicted_probs_df = pd.DataFrame(
    y_predicted_probs, columns=["probability"], index=X_test.index
)
y_predicted_df = pd.concat([y_predicted_df, y_predicted_probs_df], axis=1)
y_predicted_df.head()

Unnamed: 0_level_0,predicted,probability
passenger_id,Unnamed: 1_level_1,Unnamed: 2_level_1
161,False,0.042896
127,False,0.333333
429,False,0.333333
423,False,0.044568
566,False,0.078308


## **Evaluation Metrics**

In [18]:
best_score = clf.best_score_
test_score = clf.score(X_test, y_test)
print(best_score)
print(test_score)

0.8719048527914808
0.8261363636363637


In [17]:
test_conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_predicted),
    index=["actual no", "actual yes"],
    columns=["predicted no", "predicted yes"],
)

test_conf_matrix

Unnamed: 0,predicted no,predicted yes
actual no,94,16
actual yes,18,50


In [19]:
metrics_dict = {
    "Accuracy": accuracy_score(y_test, y_predicted),
    "Precision": precision_score(y_test, y_predicted),
    "Recall": recall_score(y_test, y_predicted),
    "F1 Score": f1_score(y_test, y_predicted),
}

metrics_df = pd.DataFrame(
    metrics_dict.values(), index=metrics_dict.keys(), columns=["Value"]
)

roc_auc = roc_auc_score(y_test, y_predicted_probs)
gini = 2 * roc_auc - 1

metrics_df.loc["ROC AUC"] = roc_auc
metrics_df.loc["Gini"] = gini

metrics_df

Unnamed: 0,Value
Accuracy,0.808989
Precision,0.757576
Recall,0.735294
F1 Score,0.746269
ROC AUC,0.826136
Gini,0.652273


## **Model & Prediction Export**

In [20]:
os.chdir(MODELS_DIR)
# Save model
joblib.dump(best_model, "knn-model.joblib")
y_predicted_df.to_parquet("knn-y_predicted.parquet")