Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import (precision_recall_curve,PrecisionRecallDisplay)
import matplotlib.pyplot as plt
import seaborn as sns

Load Data

In [2]:
# Read the training and testing workbooks (paths are relative to the working directory).
training_data, testing_data = (
    pd.read_excel(workbook)
    for workbook in ('training_data.xlsx', 'testing_data.xlsx')
)

In [3]:
# Report how many rows each partition holds and what share of all rows is used for testing.
n_train, n_test = len(training_data), len(testing_data)
print(f"No. of training examples: {n_train}")
print(f"No. of testing examples: {n_test}")
print(f"testing %: {100*n_test/(n_train+n_test)}")

No. of training examples: 32950
No. of testing examples: 8238
testing %: 20.000971156647566


Separate the target class, "y" column, from the rest of the attributes

In [4]:
# Columns 0..18 are the features; everything from column 19 onward is the target.
train_values, test_values = training_data.values, testing_data.values
xs_train, xs_test = train_values[:, :19], test_values[:, :19]
# Flatten the target to 1-D, otherwise scikit-learn emits a data conversion warning.
ys_train, ys_test = train_values[:, 19:].reshape(-1), test_values[:, 19:].reshape(-1)
print('training set shape:\t', xs_train.shape)
print('test set shape:\t\t', xs_test.shape)

training set shape:	 (32950, 19)
test set shape:		 (8238, 19)


Normalize the data

In [5]:
# Standardise every feature using statistics computed on the training set only,
# and apply the same shift/scale to the test set so both live in the same space.
mu = np.mean(xs_train, axis=0)
sigma = np.std(xs_train, axis=0)
# A (near-)constant feature has a standard deviation of ~0; dividing by it would
# fill the column with NaN/inf and break the distance computations used by kNN.
# Use a scale of 1 for such columns instead, so they are only centred.
sigma = np.where(sigma < 10 * np.finfo(float).eps, 1.0, sigma)

xs_train = (xs_train - mu)/sigma
xs_test = (xs_test - mu)/sigma


Values should be close to 0

In [6]:
# Per-feature mean of the standardised training set (sanity check for the normalisation).
xs_train.mean(axis=0)

array([-2.58986897e-16,  2.93813195e-17, -1.09977783e-17,  1.48928248e-18,
        6.25363865e-18,  8.95456361e-17,  8.73352983e-17,  4.43145773e-17,
       -1.76072274e-16,  3.50419407e-17, -6.29676719e-17, -8.32380868e-17,
        2.17799139e-17,  9.73626844e-17, -5.28324644e-17, -3.80587822e-15,
        3.59799865e-16,  4.73335753e-17, -8.07420224e-15])

If the dataset has been sampled properly, the mean vector of the test set should also contain values close to zero

In [7]:
# Per-feature mean of the test set after applying the training-set shift and scale.
xs_test.mean(axis=0)

array([ 0.00072709,  0.01390276,  0.01242855, -0.0160084 , -0.01068466,
       -0.02077284, -0.0053078 ,  0.00663729,  0.00360701,  0.01758527,
       -0.01291794,  0.001612  ,  0.00219581, -0.00048917,  0.00267624,
        0.00676035, -0.00552899,  0.00069305, -0.00085189])

Knn algorithm and hyperparameter tuning

In [8]:
# Baseline kNN classifier: 5 nearest neighbours under the Euclidean metric.
knn_clf = KNeighborsClassifier(metric='euclidean', n_neighbors=5)



In [9]:
# Fit the baseline classifier on the standardised training features and labels.
knn_clf.fit(X=xs_train, y=ys_train)

KNeighborsClassifier(metric='euclidean')

In [10]:
# Predict the test labels with the baseline model and report plain accuracy.
ys_test_pred = knn_clf.predict(xs_test)
print('Test accuracy of kNN', accuracy_score(y_true=ys_test, y_pred=ys_test_pred))

Test accuracy of kNN 0.8895362952172857


In [11]:
# Manual experiment with different hyperparameters: a single nearest neighbour
# under the cosine metric. fit() returns the estimator, so it can be chained.
knn_clf = KNeighborsClassifier(metric='cosine', n_neighbors=1).fit(xs_train, ys_train)
ys_test_pred = knn_clf.predict(xs_test)
print('Test accuracy of kNN', accuracy_score(y_true=ys_test, y_pred=ys_test_pred))

Test accuracy of kNN 0.8437727603787327


In [12]:
# Search space tried exhaustively by the grid search:
# 3 metrics x 10 neighbour counts x 2 weighting schemes = 60 candidates.
param_grid = [{
    'metric': ['euclidean', 'manhattan', 'cosine'],
    'n_neighbors': range(1, 11),  # k = 1, 2, ..., 10 (the upper bound is exclusive)
    'weights': ["uniform", "distance"],
}]

# Estimator with default settings; the search sets each candidate's hyperparameters on it.
knn_clf = KNeighborsClassifier()
# Works like a nested for-loop over the grid, scoring every candidate with
# 5-fold cross-validation (60 x 5 = 300 fits); verbose=2 logs each individual fit.
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5, verbose=2)
grid_search.fit(xs_train, ys_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   1.8s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   1.7s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   1.7s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   1.7s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   1.7s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   1.7s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   1.7s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   1.7s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   1.6s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   1.7s
[CV] END ...metric=euclidean, n_neighbors=2, weights=uniform; total time=   2.6s
[CV] END ...metric=euclidean, n_neighbors=2, we

[CV] END ...metric=manhattan, n_neighbors=1, weights=uniform; total time=   2.1s
[CV] END ...metric=manhattan, n_neighbors=1, weights=uniform; total time=   2.1s
[CV] END ...metric=manhattan, n_neighbors=1, weights=uniform; total time=   2.1s
[CV] END ...metric=manhattan, n_neighbors=1, weights=uniform; total time=   2.1s
[CV] END ..metric=manhattan, n_neighbors=1, weights=distance; total time=   2.1s
[CV] END ..metric=manhattan, n_neighbors=1, weights=distance; total time=   2.1s
[CV] END ..metric=manhattan, n_neighbors=1, weights=distance; total time=   2.1s
[CV] END ..metric=manhattan, n_neighbors=1, weights=distance; total time=   2.1s
[CV] END ..metric=manhattan, n_neighbors=1, weights=distance; total time=   2.1s
[CV] END ...metric=manhattan, n_neighbors=2, weights=uniform; total time=   3.0s
[CV] END ...metric=manhattan, n_neighbors=2, weights=uniform; total time=   3.0s
[CV] END ...metric=manhattan, n_neighbors=2, weights=uniform; total time=   3.0s
[CV] END ...metric=manhattan

[CV] END ......metric=cosine, n_neighbors=1, weights=uniform; total time=   1.8s
[CV] END ......metric=cosine, n_neighbors=1, weights=uniform; total time=   1.8s
[CV] END .....metric=cosine, n_neighbors=1, weights=distance; total time=   1.9s
[CV] END .....metric=cosine, n_neighbors=1, weights=distance; total time=   1.8s
[CV] END .....metric=cosine, n_neighbors=1, weights=distance; total time=   1.7s
[CV] END .....metric=cosine, n_neighbors=1, weights=distance; total time=   1.8s
[CV] END .....metric=cosine, n_neighbors=1, weights=distance; total time=   1.7s
[CV] END ......metric=cosine, n_neighbors=2, weights=uniform; total time=   2.6s
[CV] END ......metric=cosine, n_neighbors=2, weights=uniform; total time=   2.6s
[CV] END ......metric=cosine, n_neighbors=2, weights=uniform; total time=   2.6s
[CV] END ......metric=cosine, n_neighbors=2, weights=uniform; total time=   2.6s
[CV] END ......metric=cosine, n_neighbors=2, weights=uniform; total time=   2.7s
[CV] END .....metric=cosine,

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid=[{'metric': ['euclidean', 'manhattan', 'cosine'],
                          'n_neighbors': range(1, 11),
                          'weights': ['uniform', 'distance']}],
             verbose=2)

In [13]:
grid_search.best_estimator_  # estimator with the hyperparameters that scored the best cross-validated accuracy on this data

KNeighborsClassifier(metric='euclidean', n_neighbors=10)

In [14]:
# Rebuild the configuration reported by the grid search and fit it on the training set.
knn_clf = KNeighborsClassifier(n_neighbors=10, metric='euclidean').fit(xs_train, ys_train)

ys_train_pred, ys_test_pred = knn_clf.predict(xs_train), knn_clf.predict(xs_test)

# Training accuracy is the optimistic figure (the model has already seen these points);
# test accuracy is the realistic estimate for unseen data from the same distribution.
print('Train accuracy of kNN', accuracy_score(y_true=ys_train, y_pred=ys_train_pred))
print('Test accuracy of kNN', accuracy_score(y_true=ys_test, y_pred=ys_test_pred))
# The per-class reports show precision/recall for each label, which plain accuracy hides.
print("Training classification_report:\n", classification_report(y_true=ys_train, y_pred=ys_train_pred))
print("Testing classification_report:\n", classification_report(y_true=ys_test, y_pred=ys_test_pred))

Train accuracy of kNN 0.9055538694992413
Test accuracy of kNN 0.8968196164117505
Training classification_report:
               precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     29225
         1.0       0.72      0.27      0.39      3725

    accuracy                           0.91     32950
   macro avg       0.82      0.63      0.67     32950
weighted avg       0.89      0.91      0.89     32950

Testing classification_report:
               precision    recall  f1-score   support

         0.0       0.91      0.98      0.94      7323
         1.0       0.59      0.22      0.33       915

    accuracy                           0.90      8238
   macro avg       0.75      0.60      0.63      8238
weighted avg       0.87      0.90      0.88      8238



### Try Support Vector Machines

Here I will show you how to use the Support Vector Machine (SVM) implementation of scikit-learn.

Note how we set the $C$ hyper-parameter of the SVM. $C$ controls the trade-off between a narrow, strict
margin and a wider, looser margin. Below we set $C$ to infinity, which makes the margin infinitely strict.
This means that, depending on the dataset, fitting the SVM may fail if the training algorithm cannot separate all
the training examples perfectly.

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with C set to infinity, i.e. an infinitely strict margin.
svm_clf = SVC(
    C=float("inf"),
    kernel="linear",
)

### Same procedure as previous knn