# K-NN Classification

Import Libraries

In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Load Data

In [3]:
# The first CSV column has no header (pandas labels it "Unnamed: 0"); it is the row index
# written when the file was saved, so read it back as the index instead of as a data column.
df = pd.read_csv('HDB_database_final.csv', index_col=0)
df.head()

Unnamed: 0,town,flat_type,floor_area_sqm,remaining_lease_months,resale_price,distanceWithMrt,distanceWithRaffles,distanceWithGdPri
0,ANG MO KIO,3 ROOM,68.0,714,274000.0,881.003375,8703.096482,1041.995726
1,ANG MO KIO,3 ROOM,68.0,714,315000.0,985.490578,8617.203081,1048.316784
2,ANG MO KIO,3 ROOM,70.0,1093,500000.0,689.479121,9113.711978,1199.433818
3,ANG MO KIO,3 ROOM,82.0,713,315000.0,881.003375,8703.096482,1041.995726
4,ANG MO KIO,3 ROOM,67.0,696,235000.0,1072.597799,8872.72856,1542.266971


Preprocess data

In [4]:
def myfunction(x):
    """Map a resale price to an ordinal price band.

    Upper bounds are inclusive: 0 for prices up to 325000, 1 up to 390000,
    2 up to 455000, 3 up to 555000, and 4 for every other value.
    """
    upper_bounds = (325000, 390000, 455000, 555000)
    for band, bound in enumerate(upper_bounds):
        if x <= bound:
            return band
    # No bound matched, so the value belongs to the top band.
    return len(upper_bounds)

# Apply the price-band mapping.
# Keep the original prices in their own column and always bin from it, so that
# re-running this cell does not re-bin the band labels (0-4 are all <= 325000,
# which would collapse every row to band 0).
if "resale_price_raw" not in df.columns:
    df["resale_price_raw"] = df["resale_price"]
df["resale_price"] = df["resale_price_raw"].apply(myfunction)
df.head()

Unnamed: 0,town,flat_type,floor_area_sqm,remaining_lease_months,resale_price,distanceWithMrt,distanceWithRaffles,distanceWithGdPri
0,ANG MO KIO,3 ROOM,68.0,714,0,881.003375,8703.096482,1041.995726
1,ANG MO KIO,3 ROOM,68.0,714,0,985.490578,8617.203081,1048.316784
2,ANG MO KIO,3 ROOM,70.0,1093,3,689.479121,9113.711978,1199.433818
3,ANG MO KIO,3 ROOM,82.0,713,0,881.003375,8703.096482,1041.995726
4,ANG MO KIO,3 ROOM,67.0,696,0,1072.597799,8872.72856,1542.266971


Model Training

In [5]:
def KNNClass(x, y, n_neighbors=5, random_state=0):
    """Fit a k-NN classifier on a train/test split and report its test performance.

    Parameters
    ----------
    x : feature table handed to ``train_test_split``.
    y : target labels aligned with ``x``.
    n_neighbors : int, default 5
        Number of neighbours used by ``KNeighborsClassifier``.
    random_state : int, default 0
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    The accuracy on the held-out split. The accuracy and the confusion
    matrix are also printed.
    """
    # NOTE(review): features are used unscaled, so columns with large magnitudes
    # dominate the neighbour distances -- consider standardising before fitting.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=random_state)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: ' + str(accuracy))
    print(confusion_matrix(y_test, y_pred))
    return accuracy

Model Validation

In [6]:
# Baseline: evaluate k-NN on all five numeric features.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt","distanceWithRaffles","distanceWithGdPri","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.39603960396039606
[[86 28  9  7  5]
 [36 39 38 16  9]
 [14 26 38 18  9]
 [15 11 29 37 22]
 [15 17 16 26 40]]
training time: 0.02712416648864746 s


# Best Combination Based on the Parameter Test

In [7]:
# Selected feature subset: floor area + remaining lease.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.5726072607260726
[[111  21   1   1   1]
 [ 23  73  34   3   5]
 [  3  26  46  23   7]
 [  4  16  25  53  16]
 [  1  10  15  24  64]]
training time: 0.023949146270751953 s


# Loop for the best k value

In [8]:
# Sweep odd k values from 1 to 19 on the selected two-feature subset.
k_array = np.arange(1,21,2)

# Select the features explicitly so this cell does not depend on whichever
# cell happened to assign x / y last.
x = df[["floor_area_sqm","remaining_lease_months"]]
y = df["resale_price"]
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=0)

for k in k_array:
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    t0 = time.perf_counter()
    knn_ex = KNeighborsClassifier(n_neighbors = k)
    knn_ex.fit(x_train, y_train)
    y_ex_pred = knn_ex.predict(x_test)
    ac = accuracy_score(y_test, y_ex_pred)
    print(k)
    print(ac)
    t1 = time.perf_counter()
    t2 = t1-t0
    # The interval covers fit, predict, scoring and the two prints above.
    print (f"training time: {t2} s")

1
0.5957095709570958
training time: 0.019944190979003906 s
3
0.570957095709571
training time: 0.016803741455078125 s
5
0.5726072607260726
training time: 0.017281293869018555 s
7
0.5742574257425742
training time: 0.018280029296875 s
9
0.5874587458745875
training time: 0.017794132232666016 s
11
0.5742574257425742
training time: 0.01779794692993164 s
13
0.5792079207920792
training time: 0.018801450729370117 s
15
0.570957095709571
training time: 0.018799781799316406 s
17
0.570957095709571
training time: 0.018789291381835938 s
19
0.5726072607260726
training time: 0.01983809471130371 s


In [9]:
# Compare k = 1 against k = 2 on the same train/test split as the sweep above.
k_depth = np.array([1,2])

for k in k_depth:
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    t0 = time.perf_counter()
    knn_ex = KNeighborsClassifier(n_neighbors = k)
    knn_ex.fit(x_train, y_train)
    y_ex_pred = knn_ex.predict(x_test)
    ac = accuracy_score(y_test, y_ex_pred)
    print(k)
    print(ac)
    t1 = time.perf_counter()
    t2 = t1-t0
    # The interval covers fit, predict, scoring and the two prints above.
    print (f"training time: {t2} s")

1
0.5957095709570958
training time: 0.01798248291015625 s
2
0.570957095709571
training time: 0.016783475875854492 s


# k = 1 gives the best combination of accuracy and run time

In [10]:
# Fit the final model with k = 1 and report accuracy and confusion matrix on the test split.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()
knn_final = KNeighborsClassifier(n_neighbors = 1)
knn_final.fit(x_train, y_train)
y_final_pred = knn_final.predict(x_test)
print(accuracy_score(y_test, y_final_pred))
print(confusion_matrix(y_test, y_final_pred))
t1 = time.perf_counter()
t2 = t1-t0
# The interval covers fit, predict, scoring and printing, not the fit alone.
print (f"training time: {t2} s")

0.5957095709570958
[[109  20   4   2   0]
 [ 20  68  29  12   9]
 [  4  18  47  28   8]
 [  4  15  17  60  18]
 [  0   4   9  24  77]]
training time: 0.019856691360473633 s


# Prediction Test

In [11]:
# Query the model with DataFrames that carry the training column names: the model was
# fitted on a DataFrame, so bare lists rely on implicit column order and make recent
# scikit-learn versions warn about missing feature names.
query_columns = x_train.columns
sample_a = pd.DataFrame([[68, 714]], columns=query_columns)
sample_b = pd.DataFrame([[70, 1093]], columns=query_columns)

# Only the first prediction is timed; perf_counter() is monotonic and high-resolution.
t0 = time.perf_counter()
print(knn_final.predict(sample_a))
t1 = time.perf_counter()
t2 = t1-t0
print (f"predict time: {t2} s")
print(knn_final.predict(sample_b))

[0]
predict time: 0.0015342235565185547 s
[3]


# Parameter Test

# Reduce 1 parameter

In [65]:
# Four features: everything except remaining_lease_months.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt","distanceWithRaffles","distanceWithGdPri"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.3943894389438944
[[81 23 12  8 11]
 [31 40 36 20 11]
 [16 25 38 21  5]
 [19 14 21 40 20]
 [11 19 18 26 40]]
training time: 0.03452014923095703 s


In [66]:
# Selected: four features, everything except distanceWithGdPri.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt","distanceWithRaffles","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.4273927392739274
[[90 25  9  8  3]
 [39 44 33 15  7]
 [ 9 22 44 25  5]
 [15 19 24 39 17]
 [10 16 22 24 42]]
training time: 0.034853458404541016 s


In [67]:
# Four features: everything except distanceWithRaffles.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt","distanceWithGdPri","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.4158415841584158
[[84 24 15  8  4]
 [32 34 37 21 14]
 [14 21 47 15  8]
 [10 15 23 45 21]
 [ 9 17 22 24 42]]
training time: 0.03301548957824707 s


In [68]:
# Four features: everything except distanceWithMrt.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithRaffles","distanceWithGdPri","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.40594059405940597
[[90 22 14  4  5]
 [33 46 25 21 13]
 [12 20 38 24 11]
 [20 12 26 34 22]
 [15 25 12 24 38]]
training time: 0.03525519371032715 s


In [69]:
# Four features: everything except floor_area_sqm.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["distanceWithMrt","distanceWithRaffles","distanceWithGdPri","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.38943894389438943
[[86 27  9  8  5]
 [36 38 36 19  9]
 [16 25 36 19  9]
 [17 12 27 36 22]
 [15 18 14 27 40]]
training time: 0.02692699432373047 s


# Reduce 2 parameters

In [71]:
# Three features: floor area, MRT distance, Raffles distance.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt","distanceWithRaffles"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.37623762376237624
[[75 24 12 11 13]
 [37 41 25 17 18]
 [11 24 41 25  4]
 [18 21 25 31 19]
 [13 14 23 24 40]]
training time: 0.022464275360107422 s


In [72]:
# Selected: three features, floor area, MRT distance, remaining lease.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.4768976897689769
[[103  19   7   5   1]
 [ 31  48  28  18  13]
 [ 10  15  47  23  10]
 [  3  14  31  45  21]
 [  3   7  25  33  46]]
training time: 0.03193092346191406 s


In [73]:
# Three features: floor area, Raffles distance, remaining lease.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithRaffles","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.41254125412541254
[[94 24  7  9  1]
 [35 41 35 22  5]
 [11 21 40 28  5]
 [11 12 41 34 16]
 [ 9 11 28 25 41]]
training time: 0.030837535858154297 s


In [74]:
# Three features: MRT distance, Raffles distance, remaining lease.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["distanceWithMrt","distanceWithRaffles","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.4158415841584158
[[89 24 11  8  3]
 [40 41 31 18  8]
 [10 23 42 24  6]
 [17 20 22 38 17]
 [12 13 24 23 42]]
training time: 0.025930404663085938 s


# Reduce 3 parameters

In [75]:
# Two features: floor area and MRT distance.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","distanceWithMrt"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.4521452145214521
[[104  13  10   5   3]
 [ 34  47  28  16  13]
 [  3  23  43  23  13]
 [  9  19  33  33  20]
 [  8  13  19  27  47]]
training time: 0.024005651473999023 s


In [76]:
# Selected: two features, floor area and remaining lease.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["floor_area_sqm","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.5726072607260726
[[111  21   1   1   1]
 [ 23  73  34   3   5]
 [  3  26  46  23   7]
 [  4  16  25  53  16]
 [  1  10  15  24  64]]
training time: 0.03191089630126953 s


In [77]:
# Two features: MRT distance and remaining lease.
# perf_counter() is a monotonic, high-resolution clock, which suits intervals of a
# few milliseconds better than the wall-clock time.time().
t0 = time.perf_counter()

x = df[["distanceWithMrt","remaining_lease_months"]]
y = df["resale_price"]
KNNClass(x,y)

t1 = time.perf_counter()
t2 = t1-t0
# The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
print (f"training time: {t2} s")

Accuracy: 0.3481848184818482
[[77 30  7 15  6]
 [33 30 38 20 17]
 [16 26 36 15 12]
 [16 21 18 32 27]
 [12 17 26 23 36]]
training time: 0.02438187599182129 s


# Reduce 4 Parameters/Test with 1 Parameter

In [78]:
# Evaluate each candidate feature on its own.
z = df[["floor_area_sqm","distanceWithMrt","distanceWithRaffles","distanceWithGdPri","remaining_lease_months"]]
# Iterate over column names rather than reusing `x` as the loop variable: `x` holds the
# feature frame in other cells, and overwriting it with an int breaks any re-run of them.
for feature_name in z.columns:
    # perf_counter() is monotonic and high-resolution, unlike time.time().
    t0 = time.perf_counter()
    x4 = z[[feature_name]]
    y4 = df["resale_price"]
    # Label the result so each accuracy/confusion matrix can be matched to its feature.
    print(f"Feature: {feature_name}")
    KNNClass(x4,y4)
    t1 = time.perf_counter()
    t2 = t1-t0
    # The interval covers everything KNNClass does (split, fit, predict, printing), not the fit alone.
    print (f"training time: {t2} s")

Accuracy: 0.48514851485148514
[[112  13   7   1   2]
 [ 33  41  57   2   5]
 [  4  13  68  12   8]
 [  5   6  56  30  17]
 [  5  14  23  29  43]]
training time: 0.021500587463378906 s
Accuracy: 0.24917491749174916
[[40 33 23 21 18]
 [39 33 30 16 20]
 [29 21 28 15 12]
 [30 26 19 21 18]
 [32 18 20 15 29]]
training time: 0.020919084548950195 s
Accuracy: 0.2986798679867987
[[61 18 21 12 23]
 [41 33 24 25 15]
 [21 22 35 21  6]
 [24 16 36 25 13]
 [23 19 23 22 27]]
training time: 0.019947052001953125 s
Accuracy: 0.24092409240924093
[[49 30 15 22 19]
 [43 28 25 26 16]
 [22 24 21 18 20]
 [31 20 27 21 15]
 [24 14 21 28 27]]
training time: 0.01894998550415039 s
Accuracy: 0.36633663366336633
[[87 13 15  9 11]
 [36 30 39 13 20]
 [14 23 42 14 12]
 [17 22 33 20 22]
 [14 15 25 17 43]]
training time: 0.019979000091552734 s
