In [113]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [114]:
# Pin NumPy's global random state so the synthetic samples drawn in the
# following cells are identical on every Restart & Run All.
np.random.seed(seed=42)

In [115]:
# Draw n synthetic 2-D samples (randomly generated, not a real dataset),
# split evenly between the two classes.
n = 100

# Class 0: first dimension centred on -2 (std 2), second on 0 (std 1)
X1 = np.random.normal(-2.0, 2, n // 2)
Y1 = np.random.normal(0, 1, n // 2)

# Class 1: first dimension centred on +2 (std 2), second on 0 (std 1)
X2 = np.random.normal(2.0, 2.0, n // 2)
Y2 = np.random.normal(0, 1.0, n // 2)

In [116]:
# Assemble the dataset as a 2-D array: one row per sample, one column per
# dimension/feature, and the class label (0 or 1) in the last column.
# Class 0 rows come first, followed by the class 1 rows.
#
# The array is built directly from the sample vectors instead of filling an
# np.empty buffer in a loop, so it can never contain uninitialised rows
# (which the buffer approach leaves behind when n is odd).
size = len(X1)
ds1 = np.vstack((
    np.column_stack((X1, Y1, np.zeros(len(X1)))),
    np.column_stack((X2, Y2, np.ones(len(X2)))),
))

In [117]:
# Wrap the raw array in a labelled frame; copy=True keeps df independent of ds1.
df = pd.DataFrame(ds1, columns=['Dim 1', 'Dim 2', 'Class labels'], copy=True)
print(df)

       Dim 1     Dim 2  Class labels
0  -1.006572  0.324084           0.0
1  -2.276529 -0.385082           0.0
2  -0.704623 -0.676922           0.0
3   1.046060  0.611676           0.0
4  -2.468307  1.031000           0.0
..       ...       ...           ...
95  3.563646  0.385317           1.0
96 -0.473901 -0.883857           1.0
97 -0.640913  0.153725           1.0
98  3.043883  0.058209           1.0
99  2.593969 -1.142970           1.0

[100 rows x 3 columns]


In [118]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='Class labels'),   # feature columns
    df['Class labels'],                # target column
    test_size=0.2,
    random_state=0,
)

In [121]:
def knn(newObservation, referenceData, k=3):
    """Classify a point by majority vote among its k nearest reference rows.

    Distances are Euclidean and computed over every feature column of
    ``referenceData``. The function depends only on its arguments (not on
    notebook globals) and accesses rows by position, so frames of any length
    and with any index (e.g. the shuffled output of a train/test split) work.

    Parameters
    ----------
    newObservation : array-like
        Feature values of the point to classify (list, NumPy array or pandas
        Series). It must provide at least as many values as ``referenceData``
        has feature columns; only the leading ones are used, so a full row
        (features followed by the label) is accepted as well.
    referenceData : pandas.DataFrame
        Labelled samples. Every column except the last is a feature; the last
        column holds the class label (0 or 1).
    k : int, default 3
        Number of nearest neighbours taking part in the vote.

    Returns
    -------
    str
        ``'Class 0'`` if label 0 has a strict majority among the k nearest
        rows, otherwise ``'Class 1'`` (a tied vote therefore yields
        ``'Class 1'``).

    Raises
    ------
    ValueError
        If ``referenceData`` has no rows, if ``k`` is not between 1 and the
        number of reference rows, or if ``newObservation`` has fewer values
        than there are feature columns.
    """
    # Work on plain arrays: purely positional, independent of the frame's index.
    features = referenceData.iloc[:, :-1].to_numpy(dtype=float)
    labels = referenceData.iloc[:, -1].to_numpy()

    size, n_features = features.shape
    if size == 0:
        raise ValueError('referenceData must contain at least one row')
    if not 1 <= k <= size:
        raise ValueError('k must be between 1 and {}, got {}'.format(size, k))

    # np.asarray avoids integer-key lookups on a label-indexed Series.
    point = np.asarray(newObservation, dtype=float).ravel()
    if point.size < n_features:
        raise ValueError(
            'newObservation has {} values but referenceData has {} feature columns'
            .format(point.size, n_features)
        )
    point = point[:n_features]

    # Euclidean distance from the query point to every reference row.
    distances = np.sqrt(np.square(features - point).sum(axis=1))

    # Sort by distance; equal distances are ordered by label (lexsort treats
    # its LAST key as the primary one).
    order = np.lexsort((labels, distances))
    nearest_labels = labels[order[:k]]

    c0_count = int(np.count_nonzero(nearest_labels == 0))
    c1_count = int(np.count_nonzero(nearest_labels == 1))

    if c0_count > c1_count:
        return 'Class 0'
    return 'Class 1'

In [122]:
# Sanity check: classify the first held-out point with a 50-neighbour vote.
# NOTE(review): `df` still contains the test rows, so the query point is one of
# its own neighbours; consider voting against the training split only.
knn(newObservation=x_test.iloc[0], referenceData=df, k=50)

'Class 0'

In [130]:
# Classify every held-out point with a 20-neighbour vote and print the result.
# The loop bound comes from x_test itself rather than a literal 20, so it stays
# correct if test_size or n changes.
# NOTE(review): `df` still contains the test rows, so each query point is one
# of its own neighbours; consider voting against the training split only.
for i in range(x_test.shape[0]):
    result = knn(x_test.iloc[i], df, k=20)
    print('Data point : ' + str(x_test.iloc[i].values))
    print(result)

Data point : [-4.30198715  0.08704707]
Class 0
Data point : [0.43349342 0.47323762]
Class 1
Data point : [-0.70462292 -0.676922  ]
Class 1
Data point : [ 2.80810171 -0.71435142]
Class 1
Data point : [6.38091125 0.82718325]
Class 1
Data point : [ 2.36926772 -1.24573878]
Class 1
Data point : [-4.02566224 -0.07201012]
Class 0
Data point : [-0.80370213  0.34115197]
Class 0
Data point : [1.67742858 0.29307247]
Class 1
Data point : [3.56364574 0.38531738]
Class 1
Data point : [0.39544546 0.2322537 ]
Class 1
Data point : [-1.21496647  0.21409374]
Class 0
Data point : [ 2.19930273 -0.26465683]
Class 1
Data point : [-5.82656049 -1.19620662]
Class 0
Data point : [-0.46513054 -0.30921238]
Class 1
Data point : [-3.20341322 -0.21967189]
Class 0
Data point : [-1.86494359 -0.03582604]
Class 0
Data point : [-3.08876545 -2.6197451 ]
Class 0
Data point : [-4.11542186 -0.51827022]
Class 0
Data point : [-2.93894877  0.33126343]
Class 0


In [144]:
# Number of held-out samples, then each test point followed by its true label.
print(len(x_test))

for point, label in zip(x_test.to_numpy(), y_test.to_numpy()):
    print(point)
    print(label)

20
[-4.30198715  0.08704707]
0.0
[0.43349342 0.47323762]
1.0
[-0.70462292 -0.676922  ]
0.0
[ 2.80810171 -0.71435142]
1.0
[6.38091125 0.82718325]
1.0
[ 2.36926772 -1.24573878]
1.0
[-4.02566224 -0.07201012]
0.0
[-0.80370213  0.34115197]
1.0
[1.67742858 0.29307247]
1.0
[3.56364574 0.38531738]
1.0
[0.39544546 0.2322537 ]
1.0
[-1.21496647  0.21409374]
1.0
[ 2.19930273 -0.26465683]
1.0
[-5.82656049 -1.19620662]
0.0
[-0.46513054 -0.30921238]
0.0
[-3.20341322 -0.21967189]
0.0
[-1.86494359 -0.03582604]
0.0
[-3.08876545 -2.6197451 ]
0.0
[-4.11542186 -0.51827022]
0.0
[-2.93894877  0.33126343]
0.0
