## KNeighborsClassifier

#### Author: Mughees Warraich
#### Part of Step 4: Train Algorithms

In [22]:
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

import math
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from creditcard_preparation import create_creditcard_pipeline, prepare_creditcard_data

In [23]:
X_train, X_dev, X_test, y_train, y_dev, y_test = prepare_creditcard_data((1/10,1/10))

## Grid search for best hyperparameters

In [3]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Handle NaN values in y_train
nan_indices = np.isnan(y_train)
X_train_clean = X_train[~nan_indices]
y_train_clean = y_train[~nan_indices]

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Create a KNeighborsClassifier instance
knn = KNeighborsClassifier()

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Perform grid search on the cleaned data
grid_search.fit(X_train_clean, y_train_clean)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits
Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best Score: 0.9982371294851795
Test Accuracy: 0.9982942962932146


In [3]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Handle NaN values in y_train
nan_indices = np.isnan(y_train)
X_train_clean = X_train[~nan_indices]
y_train_clean = y_train[~nan_indices]

# Define the parameter grid
param_grid = {
    'n_neighbors': [3],
    'weights': ['uniform'],
    'algorithm': ['auto'],
    'leaf_size': [10],
    'p': [1]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Create a KNeighborsClassifier instance
knn = KNeighborsClassifier()

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Perform grid search on the cleaned data
grid_search.fit(X_train_clean, y_train_clean)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best Score: 0.9981487259490379
Test Accuracy: 0.9984607064109497


In [27]:
# Create the pipeline
pipeline = create_creditcard_pipeline()

nan_indices = np.isnan(y_train)
X_train_clean = X_train[~nan_indices]
y_train_clean = y_train[~nan_indices]

# Create algorithm with parameters found during grid search
# Leave all parameters as default besides criterion
KNC_model = KNeighborsClassifier(
     n_neighbors = 2,
     weights  = 'uniform',
     algorithm = 'auto',
     leaf_size = 10,
     p = 1  # 1 for Manhattan distance, 2 for Euclidean distance
)


# Combine the pipeline and the algorithm
pipeline_with_algo = Pipeline(steps=[
    ('preprocessor', pipeline),
    ('algo', KNC_model)
])

pipeline_with_algo.fit(X_train_clean, y_train_clean)
y_test_pred = pipeline_with_algo.predict(X_test)

In [28]:
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9997087822939634
Precision: 1.0
Recall: 0.8205128205128205
F1 Score: 0.9014084507042254
