In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from k_nearest_neighbors import KNearestNeighbor

In [2]:
# Load the exam results table; the first CSV column is used as the row index.
df = pd.read_csv('datasets/results.csv', index_col=[0])
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
0,65,10,59,3,71,37,245,0,3
1,94,56,4,67,91,50,362,1,1
2,7,85,76,99,60,25,352,0,2
3,88,46,59,94,52,38,377,1,1
4,39,81,37,38,6,54,255,1,3
...,...,...,...,...,...,...,...,...,...
995,55,89,68,58,13,13,296,0,2
996,42,62,25,87,51,68,335,1,2
997,9,83,70,14,11,78,265,0,3
998,23,82,31,42,84,52,314,0,2


In [3]:
# Rows where Div is 0 while Results is 1.
df.query("Div == 0 and Results == 1")

Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
243,3,35,38,49,62,83,270,1,0
660,37,29,79,87,40,88,360,1,0


In [4]:
# All rows labelled Results == 0.
df.query("Results == 0")

Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
0,65,10,59,3,71,37,245,0,3
2,7,85,76,99,60,25,352,0,2
6,73,48,6,38,50,21,236,0,3
7,18,23,97,65,15,20,238,0,3
8,15,64,14,43,59,59,254,0,3
...,...,...,...,...,...,...,...,...,...
990,30,68,30,14,16,6,164,0,0
991,98,53,29,61,40,11,292,0,2
995,55,89,68,58,13,13,296,0,2
997,9,83,70,14,11,78,265,0,3


In [5]:
# All rows labelled Results == 1.
df.query("Results == 1")

Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
1,94,56,4,67,91,50,362,1,1
3,88,46,59,94,52,38,377,1,1
4,39,81,37,38,6,54,255,1,3
5,51,43,14,53,64,59,284,1,2
9,53,85,96,76,83,62,455,1,1
...,...,...,...,...,...,...,...,...,...
992,55,94,59,68,82,89,447,1,1
993,72,30,34,90,71,36,333,1,2
994,94,62,68,84,13,35,356,1,2
996,42,62,25,87,51,68,335,1,2


Notice that the total mark doesn't matter, so we will work in a 6-dimensional space (one dimension per subject mark).

In [6]:
# Two rows with similar totals and the same Div but different Results.
df.loc[[4, 997], :]

Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
4,39,81,37,38,6,54,255,1,3
997,9,83,70,14,11,78,265,0,3


We'll predict the `Results` column (0 or 1), so this is a binary classification problem.

In [7]:
# The six per-subject marks used as features. Selecting them by name (rather
# than by the positional slice [:, :6]) guarantees that 'Total', 'Results' and
# 'Div' can never slip into the feature matrix if the column order changes.
# NOTE: 'Geograpgy' is spelled exactly as in the CSV header.
feature_columns = ['Hindi', 'English', 'Science', 'Maths', 'History', 'Geograpgy']
label_column = 'Results'

model = KNearestNeighbor()
k = 5  # passed to model.predict below; presumably the number of neighbours -- confirm in k_nearest_neighbors

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_data = train_df[feature_columns].to_numpy()
train_labels = train_df[label_column].to_numpy()
test_data = test_df[feature_columns].to_numpy()
test_labels = test_df[label_column].to_numpy()

In [8]:
# Fit on the training split, then predict a label for every test row with the chosen k.
model.fit(train_data, train_labels)
predict = model.predict(test_data, k)
# Bare expression so the predicted labels are shown as the cell output.
predict

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0.])

In [9]:
# Element-wise check: True where the predicted label equals the true label.
np.equal(predict, test_labels)

array([ True,  True,  True, False,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,

In [10]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=test_labels, y_pred=predict)

0.86