In [88]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split


In [89]:
# Load the transaction data into a DataFrame; the bare `df` on the last
# line makes the notebook render a preview of the table.
df = pd.read_csv(filepath_or_buffer="fraud_prediction.csv")
df

Unnamed: 0,step,amount,oldbalanceorg,newbalanceorig,oldbalancedest,newbalancedest,isfraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,181.00,181.0,0.00,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0
1,1,181.00,181.0,0.00,21182.0,0.0,1,0.0,1.0,0.0,0.0,0.0
2,1,2806.00,2806.0,0.00,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0
3,1,2806.00,2806.0,0.00,26202.0,0.0,1,0.0,1.0,0.0,0.0,0.0
4,1,20128.00,20128.0,0.00,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20208,7,9443.02,898.0,0.00,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0
20209,7,9105.47,358278.0,349172.53,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0
20210,7,2761.71,0.0,0.00,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0
20211,7,10204.13,0.0,0.00,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0


In [90]:
# Separate the target from the predictors.
# y holds the "isfraud" label column; X holds every remaining column.
# Both are taken as NumPy arrays via `.values`.
y = df["isfraud"].values
X = df.drop(columns="isfraud").values

In [91]:
# Display the feature matrix and the label vector as a quick sanity check
X, y

(array([[1.000000e+00, 1.810000e+02, 1.810000e+02, ..., 0.000000e+00,
         0.000000e+00, 1.000000e+00],
        [1.000000e+00, 1.810000e+02, 1.810000e+02, ..., 0.000000e+00,
         0.000000e+00, 0.000000e+00],
        [1.000000e+00, 2.806000e+03, 2.806000e+03, ..., 0.000000e+00,
         0.000000e+00, 1.000000e+00],
        ...,
        [7.000000e+00, 2.761710e+03, 0.000000e+00, ..., 0.000000e+00,
         1.000000e+00, 0.000000e+00],
        [7.000000e+00, 1.020413e+04, 0.000000e+00, ..., 0.000000e+00,
         1.000000e+00, 0.000000e+00],
        [7.000000e+00, 1.140002e+04, 5.140000e+02, ..., 0.000000e+00,
         1.000000e+00, 0.000000e+00]]),
 array([1, 1, 1, ..., 0, 0, 0], dtype=int64))

In [92]:
# Hold out 30% of the rows for testing. The split is seeded (random_state=42)
# so it is repeatable, and it is stratified on y so that both partitions keep
# the same fraud / non-fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

`stratify=y` makes the split stratified: the proportion of each class (0 and 1) in `y` is preserved in both the training data and the test data.

In [93]:
# Peek at the training features and labels produced by the split
X_train,y_train

(array([[6.0000000e+00, 9.1832500e+03, 1.1539840e+04, ..., 0.0000000e+00,
         1.0000000e+00, 0.0000000e+00],
        [7.0000000e+00, 2.7608100e+03, 1.0439000e+04, ..., 0.0000000e+00,
         1.0000000e+00, 0.0000000e+00],
        [7.0000000e+00, 2.6754300e+03, 3.3052500e+05, ..., 0.0000000e+00,
         1.0000000e+00, 0.0000000e+00],
        ...,
        [7.0000000e+00, 3.1059685e+05, 2.4732933e+06, ..., 0.0000000e+00,
         0.0000000e+00, 0.0000000e+00],
        [7.0000000e+00, 4.1369219e+05, 2.2680510e+04, ..., 0.0000000e+00,
         0.0000000e+00, 0.0000000e+00],
        [4.0000000e+00, 6.6503500e+03, 0.0000000e+00, ..., 0.0000000e+00,
         1.0000000e+00, 0.0000000e+00]]),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int64))

In [94]:
# Build a k-NN classifier that uses the 3 nearest neighbours,
# then train it on the training split.
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X=X_train, y=y_train)


In [95]:
# Accuracy of the 3-neighbour model on the held-out test split,
# printed as a percentage with two decimals.
score = knn_classifier.score(X=X_test, y=y_test)
print(f"score:{score*100:.2f}")

score:97.77


Fine-tuning the Parameters of the k-NN algorithm

In [96]:
import numpy as np
from sklearn.model_selection import GridSearchCV

In [97]:
# Candidate values for k: every integer from 1 through 24
# (np.arange excludes the stop value 25).
grid = {
    'n_neighbors': np.arange(1, 25),
}

# Fresh, unfitted k-NN estimator to be tuned over the grid
knn_classifier = KNeighborsClassifier()


In [98]:
# Search the grid with 10-fold cross-validation on the training split
knn = GridSearchCV(estimator=knn_classifier, param_grid=grid, cv=10)
knn.fit(X=X_train, y=y_train)

# Show the parameter setting that scored best during the search
knn.best_params_

{'n_neighbors': 1}

In [99]:
#Extracting the accuracy score for optimal number of neighbors
# NOTE(review): per the scikit-learn docs, best_score_ is the mean cross-validated
# score of the best parameter setting, so it is computed on folds of the
# training split rather than on X_test.
best_score = knn.best_score_
print(f"Best_score:{best_score*100:.2f}")

Best_score:97.74


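###### The cross-validated score (97.74%) is slightly lower than the test-set score obtained with k=3 (97.77%),
###### but this tuning chooses k by 10-fold cross-validation on the training data instead of by guesswork, which helps guard against underfitting and overfitting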

Scaling for optimized performance

The k-NN algorithm is an algorithm that works based on distance. When a new data point
is thrown into the dataset and the algorithm is given the task of classifying this new data
point, it uses distance to check the points that are closest to it.
If we have features with very different ranges of values – for example, feature one ranges
from 0 to 800 while feature two ranges from 1 to 5 – the distance is dominated by the
feature with the larger range and stops being meaningful. We want all the features to have
a comparable range of values so that the distance metric is on level terms across all features.
One way to do this is to subtract the mean of each feature from every value of that feature and
divide the result by the standard deviation of that feature: $z = (x - \mu) / \sigma$. This is called "standardization".

In [100]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [101]:
# Define the processing steps in execution order: standardize the features
# first, then classify with a 1-nearest-neighbour model.
pipeline_order = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=1)),
]

# Chain the steps into a single estimator
pipeline = Pipeline(steps=pipeline_order)


In [102]:
# Fit the classifier on the scaled dataset: the pipeline runs its scaler step
# and then trains the k-NN step on the training split. `fit` returns the
# pipeline itself, so knn_classifier_scaled refers to the fitted pipeline.
knn_classifier_scaled = pipeline.fit(X=X_train, y=y_train)

In [106]:
# Accuracy of the scaled pipeline on the held-out test split,
# reported as a percentage with two decimals.
score_scaled = knn_classifier_scaled.score(X=X_test, y=y_test)
print(f"Knn classified score: {score_scaled*100:.2f}%")

Knn classified score: 99.49%


This resulted in an accuracy score of 99.49% on the test set, a substantial improvement over the
roughly 97.7% obtained without scaling (97.77% on the test set with k=3, 97.74% cross-validated with k=1). Thus, we see how scaling the data results in improved performance.

The Pipeline class, as the name implies, chains multiple processing steps (here a scaler
followed by a classifier) into a single estimator and executes them in the order we specify.
This helps us streamline and automate common machine learning tasks.