# K Nearest Neighbour

## Importing packages

In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

## Importing data

In [2]:
# Read the CSV into a DataFrame; the path is relative, so it resolves against
# the current working directory of the kernel.
data = pd.read_csv(filepath_or_buffer='../data/6_train_dataset_augmented.csv')

## KNN

Split the data into the feature matrix `X` and the target vector `y` (`is_fraud`)

In [3]:
# Target vector: the `is_fraud` column.
y = data['is_fraud']
# Feature matrix: every remaining column.
X = data.drop(columns='is_fraud')

Initialise base KNN instance

In [4]:
# Untuned KNN estimator (library defaults); used as the template for the grid search.
knn = KNeighborsClassifier()

Initialise parameter grid for grid search

In [5]:
# Search space: 20 neighbour counts x 2 weighting schemes x 2 Minkowski powers
# (p=1 Manhattan, p=2 Euclidean) = 80 candidate configurations.
parameter_grid = dict(
    n_neighbors=range(1, 21),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

Run grid search on KNN

In [6]:
# Exhaustive search over `parameter_grid`, ranking candidates by recall.
# verbose=4 logs the score and elapsed time of every individual fit.
gs = GridSearchCV(
    knn,
    parameter_grid,
    scoring="recall",
    verbose=4,
)

In [7]:
# Run the cross-validated search on the full feature matrix and target.
gs.fit(X=X, y=y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[CV 1/5] END n_neighbors=1, p=1, weights=uniform;, score=0.551 total time=   6.0s
[CV 2/5] END n_neighbors=1, p=1, weights=uniform;, score=0.633 total time=   4.8s
[CV 3/5] END n_neighbors=1, p=1, weights=uniform;, score=0.647 total time=   5.2s
[CV 4/5] END n_neighbors=1, p=1, weights=uniform;, score=0.639 total time=   6.0s
[CV 5/5] END n_neighbors=1, p=1, weights=uniform;, score=0.638 total time=   6.9s
[CV 1/5] END n_neighbors=1, p=1, weights=distance;, score=0.551 total time=   5.3s
[CV 2/5] END n_neighbors=1, p=1, weights=distance;, score=0.633 total time=   4.8s
[CV 3/5] END n_neighbors=1, p=1, weights=distance;, score=0.647 total time=   5.6s
[CV 4/5] END n_neighbors=1, p=1, weights=distance;, score=0.639 total time=   5.2s
[CV 5/5] END n_neighbors=1, p=1, weights=distance;, score=0.638 total time=   4.7s
[CV 1/5] END n_neighbors=1, p=2, weights=uniform;, score=0.535 total time=   2.7s
[CV 2/5] END n_neighbors=1, p=2, weights=uniform;, score=0.611 total time=   2.2s
[CV 3/5] EN

Get results from grid search

In [8]:
# best_score_ is the cross-validated score of the winning candidate under the
# search's scoring metric (recall here); express it as a percentage.
best_recall_pct = round(gs.best_score_ * 100, 2)
print(f'Best hyper-parameters are: {gs.best_params_}\nRecall is: {best_recall_pct}%')

Best hyper-parameters are: {'n_neighbors': 19, 'p': 1, 'weights': 'uniform'}
Recall is: 73.9%


## Ideal model

Initialising KNN with ideal hyper-parameters

In [9]:
# Build the final estimator directly from the grid-search winner instead of
# hand-copying the printed values, so this cell stays correct if the data or
# the search space changes. With the results recorded in this notebook this
# resolves to n_neighbors=19, p=1, weights='uniform'.
knn_ideal = KNeighborsClassifier(**gs.best_params_)

Training and getting predictions

In [10]:
# Hold out 30% of the rows for evaluation. `stratify=y` keeps the is_fraud
# class ratio the same in both partitions, and the fixed `random_state` makes
# the split (and therefore the reported metrics) reproducible across runs.
# NOTE(review): the hyper-parameters were tuned by cross-validating on the full
# X, y (gs.fit(X, y)), which includes the rows that land in X_test here, so the
# scores on this split are likely optimistic. Consider splitting before tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# fit() returns the fitted estimator, so `model` and `knn_ideal` are the same object.
model = knn_ideal.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

Evaluate the test-set predictions (accuracy, recall, precision and F1)

In [11]:
# Score the test-set predictions against the true labels.
acc_result = accuracy_score(y_test, y_test_pred)
recall_result = recall_score(y_test, y_test_pred)
precision_result = precision_score(y_test, y_test_pred)
f1_result = f1_score(y_test, y_test_pred)

# Report each metric as a percentage rounded to two decimals.
for metric_name, metric_value in (
    ('Accuracy', acc_result),
    ('Recall', recall_result),
    ('Precision', precision_result),
    ('F1', f1_result),
):
    print(f'{metric_name} score is : {round(metric_value*100, 2)}%')

Accuracy score is : 64.74%
Recall score is : 73.94%
Precision score is : 62.46%
F1 score is : 67.72%
