In [1]:
import pandas as pd
import numpy as np

In [2]:
def distance(x, y):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x, y : array_like
        Coordinates of the two points. Shapes must be equal or broadcastable.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Compute in float64 so integer inputs cannot wrap around: unsigned dtypes
    # underflow on subtraction and narrow integer dtypes can overflow when
    # squared. np.asarray also lets plain lists/tuples be passed in.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt((diff ** 2).sum())

In [3]:
# Sanity check on two opposite corners of the unit square.
x, y = np.array([0, 0]), np.array([1, 1])
distance(x, y)

1.4142135623730951

In [4]:
# Load the exam-results table; the path is relative to the working directory.
df = pd.read_csv('datasets/results.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
0,0,65,10,59,3,71,37,245,0,3
1,1,94,56,4,67,91,50,362,1,1
2,2,7,85,76,99,60,25,352,0,2
3,3,88,46,59,94,52,38,377,1,1
4,4,39,81,37,38,6,54,255,1,3
...,...,...,...,...,...,...,...,...,...,...
995,995,55,89,68,58,13,13,296,0,2
996,996,42,62,25,87,51,68,335,1,2
997,997,9,83,70,14,11,78,265,0,3
998,998,23,82,31,42,84,52,314,0,2


In [5]:
# Rows where Div is 0 while Results is 1.
df.loc[df['Div'].eq(0) & df['Results'].eq(1)]

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
243,243,3,35,38,49,62,83,270,1,0
660,660,37,29,79,87,40,88,360,1,0


In [6]:
# Rows whose Results label is 0.
df.loc[df['Results'].eq(0)]

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
0,0,65,10,59,3,71,37,245,0,3
2,2,7,85,76,99,60,25,352,0,2
6,6,73,48,6,38,50,21,236,0,3
7,7,18,23,97,65,15,20,238,0,3
8,8,15,64,14,43,59,59,254,0,3
...,...,...,...,...,...,...,...,...,...,...
990,990,30,68,30,14,16,6,164,0,0
991,991,98,53,29,61,40,11,292,0,2
995,995,55,89,68,58,13,13,296,0,2
997,997,9,83,70,14,11,78,265,0,3


In [7]:
# Rows whose Results label is 1.
df.loc[df['Results'].eq(1)]

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
1,1,94,56,4,67,91,50,362,1,1
3,3,88,46,59,94,52,38,377,1,1
4,4,39,81,37,38,6,54,255,1,3
5,5,51,43,14,53,64,59,284,1,2
9,9,53,85,96,76,83,62,455,1,1
...,...,...,...,...,...,...,...,...,...,...
992,992,55,94,59,68,82,89,447,1,1
993,993,72,30,34,90,71,36,333,1,2
994,994,94,62,68,84,13,35,356,1,2
996,996,42,62,25,87,51,68,335,1,2


We can see that the total mark alone does not determine the result, so we will ignore it and work in the 6-dimensional space of the individual subject marks.

In [8]:
# Show rows 4 and 997 side by side to compare their Total, Results and Div values.
df.loc[[4, 997], :]

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
4,4,39,81,37,38,6,54,255,1,3
997,997,9,83,70,14,11,78,265,0,3


We'll predict both result and division

result = 1 or 0 (binary classification)

division = 0, 1, 2, 3 (multi-class classification)

In [9]:
# Number of nearest neighbours that take part in the vote.
k = 5

# The six marks of the object to classify, and their sum.
new_obj = np.array([45, 49, 72, 90, 55, 40])
new_total = np.sum(new_obj)

# Placeholders: both labels are unknown until the neighbours have voted.
new_result = new_division = np.nan

Now I'm going to create a matrix whose rows are the vectors of subject marks.

In [10]:
# Select the six subject columns by name instead of by position, so the
# feature matrix does not depend on how many index-like columns precede the
# marks in the CSV. 'Geograpgy' is spelled exactly as in the file's header.
subject_columns = ['Hindi', 'English', 'Science', 'Maths', 'History', 'Geograpgy']
matrix = df[subject_columns].to_numpy()
matrix

array([[65, 10, 59,  3, 71, 37],
       [94, 56,  4, 67, 91, 50],
       [ 7, 85, 76, 99, 60, 25],
       ...,
       [ 9, 83, 70, 14, 11, 78],
       [23, 82, 31, 42, 84, 52],
       [75, 78, 68, 33, 79, 91]], dtype=int64)

Calculating distances

In [11]:
# Pair each row's distance to new_obj with that row's position in matrix.
# Iterating over matrix itself (rather than a hard-coded row count) keeps the
# cell correct for any dataset size.
distances_list = [(distance(new_obj, row), index) for index, row in enumerate(matrix)]
distances_list[:k]

[(99.61927524329818, 0),
 (94.86305919587456, 1),
 (55.560777532356404, 2),
 (45.34313619501854, 3),
 (87.0976463516667, 4)]

Finding k nearest neighbors

In [12]:
# Order the (distance, row) pairs by ascending distance, in place, and keep
# the first k of them.
distances_list.sort(key=lambda pair: pair[0])
nearest_neighbors = distances_list[0:k]
nearest_neighbors

[(22.956480566497994, 416),
 (24.145392935299274, 940),
 (26.758176320519304, 348),
 (27.676705006196094, 903),
 (30.380915061926625, 504)]

In [13]:
# Keep only the row ids of the selected neighbours. Unpacking the pairs
# directly (instead of indexing with range(k)) also works when fewer than k
# neighbours are available.
nearest_neighbors_rows = [row for _dist, row in nearest_neighbors]
nearest_neighbors_rows

[416, 940, 348, 903, 504]

In [14]:
# Full records of the k nearest neighbours, in order of increasing distance.
df.loc[nearest_neighbors_rows, :]

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
416,416,26,53,75,88,44,36,322,1,2
940,940,51,62,85,83,59,52,392,1,1
348,348,22,42,68,98,52,33,315,0,2
903,903,59,45,87,93,63,24,371,1,1
504,504,26,50,62,95,35,46,314,1,2


Counting the nearest results

In [15]:
# Results labels of the nearest neighbours, fetched with a single .loc lookup.
res_list = df.loc[nearest_neighbors_rows, 'Results'].tolist()
res_list

[1, 1, 0, 1, 1]

In [16]:
# Votes per Results class; position in the array equals the class label (0, 1).
counts = np.array([res_list.count(label) for label in (0, 1)])
counts

array([1, 4])

In [17]:
# Majority vote: the class with the most neighbours wins (argmax returns the
# lowest label on a tie).
new_result = np.argmax(counts)
new_result

1

Counting the nearest divisions

In [18]:
# Div labels of the nearest neighbours, fetched with a single .loc lookup.
div_list = df.loc[nearest_neighbors_rows, 'Div'].tolist()
div_list

[2, 1, 2, 1, 2]

In [19]:
# Votes per Div class; position in the array equals the class label (0..3).
counts = np.array([div_list.count(label) for label in (0, 1, 2, 3)])
counts

array([0, 2, 3, 0])

In [20]:
# Majority vote over the divisions (argmax returns the lowest label on a tie).
new_division = np.argmax(counts)
new_division

2

In [21]:
# Assemble the complete table row in one step: leading id 1000, the marks,
# then total, predicted result and predicted division.
# NOTE: new_obj is rebound to the longer row, so re-running this cell would
# extend it again.
new_obj = np.concatenate(([1000], new_obj, [new_total, new_result, new_division]))
new_obj

array([1000,   45,   49,   72,   90,   55,   40,  351,    1,    2],
      dtype=int64)

In [22]:
# Write the assembled row under label 1000. If that label is not yet in the
# index, .loc enlarges the frame; otherwise the existing row is overwritten.
# Either way df is mutated in place.
df.loc[1000] = new_obj

In [23]:
# Display the frame again after the row assignment above.
df

Unnamed: 0.1,Unnamed: 0,Hindi,English,Science,Maths,History,Geograpgy,Total,Results,Div
0,0,65,10,59,3,71,37,245,0,3
1,1,94,56,4,67,91,50,362,1,1
2,2,7,85,76,99,60,25,352,0,2
3,3,88,46,59,94,52,38,377,1,1
4,4,39,81,37,38,6,54,255,1,3
...,...,...,...,...,...,...,...,...,...,...
996,996,42,62,25,87,51,68,335,1,2
997,997,9,83,70,14,11,78,265,0,3
998,998,23,82,31,42,84,52,314,0,2
999,999,75,78,68,33,79,91,424,1,1


In [24]:
# Report both predicted labels, one per line.
for caption, value in (('New result =', new_result), ('New division =', new_division)):
    print(caption, value)

New result = 1
New division = 2
