# K-Nearest Neighbor

KNN merupakan algoritma yang digunakan untuk memecahkan permasalahan klasifikasi, sehingga menghasilkan output diskrit. Output diskrit adalah output yang nilainya pasti, seperti ketika menghitung 1 + 1 = 2: jawabannya tepat 2, bukan mendekati 2. KNN melakukan klasifikasi terhadap suatu objek berdasarkan data pembelajaran yang jaraknya paling dekat dengan objek tersebut.

KNN bekerja berdasarkan jarak minimum dari data baru ke data training untuk menentukan k tetangga terdekat. Setelah itu, kelas mayoritas dari tetangga-tetangga tersebut diambil sebagai hasil prediksi untuk data baru tadi.

Berikut ini adalah langkah-langkah dalam menyelesaikan permasalahan klasifikasi menggunakan KNN:

### 1. Membaca data train

In [1]:
import pandas as pd

# Load the labelled training set from 'dataTrain.csv' in the working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column holding the row
# number. euclideanDistance() compares every element except the last one, so
# this column appears to take part in the distance -- confirm whether both CSV
# files should instead be read with index_col=0.
data_train_from_csv = pd.read_csv('dataTrain.csv')

# Display the first rows as a quick sanity check of the columns.
data_train_from_csv.head()

Unnamed: 0,atribut 1,atribut 2,atribut 3,atribut 4,kelas
0,17.4,61.25,10.4,21.0,1
1,16.2,34.375,14.8,15.6,1
2,11.4,46.875,10.8,9.9,0
3,12.6,76.25,18.0,25.5,1
4,16.2,55.0,9.8,15.9,0


### 2. Menghitung Euclidean Distance

In [2]:
import numpy as np
def euclideanDistance(data_train, data_test):
    """Return the Euclidean distance between two rows, ignoring the last slot.

    The final element of ``data_train`` is skipped (the callers in this
    notebook pass rows whose last value is the class label) and only the
    matching leading elements of ``data_test`` are compared. Every other
    element takes part in the distance, so identifier-like columns must be
    removed by the caller if they should not influence the result.

    Args:
        data_train: List, NumPy array or pandas Series of numbers followed by
            one trailing element that is ignored.
        data_test: List, array or Series with at least
            ``len(data_train) - 1`` leading numeric elements.

    Returns:
        The distance as a NumPy float (``0.0`` when there is nothing to
        compare).

    Raises:
        IndexError: If ``data_test`` has fewer elements than are compared.
    """
    feature_count = max(len(data_train) - 1, 0)

    # Convert to plain arrays so elements are always taken by position.
    # Indexing a label-indexed pandas Series with integers (``row[i]``) is
    # deprecated and no longer means "i-th element" in newer pandas releases.
    # Slice before casting so a non-numeric trailing label is never converted.
    train_values = np.asarray(data_train)[:feature_count].astype(float)
    test_values = np.asarray(data_test)[:feature_count].astype(float)

    if len(test_values) < feature_count:
        # Without this check NumPy could silently broadcast a shorter row.
        raise IndexError(
            "data_test has fewer elements than the features being compared"
        )

    return np.sqrt(np.sum((train_values - test_values) ** 2))

### 3. KNN Algorithm

In [3]:
import operator
def kNearestNeighbor(data_train, data_test, k):
    """Predict the class of one row by majority vote of its k nearest rows.

    Args:
        data_train: pandas DataFrame of labelled rows; the last column is
            read as the class label and each row is passed to
            ``euclideanDistance`` as-is.
        data_test: The row to classify, passed to ``euclideanDistance``
            as-is.
        k: Number of nearest training rows that vote. Values larger than the
            number of training rows are treated as "use every row".

    Returns:
        The label with the most votes among the k nearest rows. When several
        labels share the highest count, the one whose first vote came from
        the farthest neighbour wins (same tie-breaking as the stable sort
        used below).

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``data_train`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(data_train) == 0:
        raise ValueError("data_train must contain at least one row")

    # Pair every training row position with its distance to the query row.
    distances = [
        (position, euclideanDistance(data_train.iloc[position], data_test))
        for position in range(len(data_train))
    ]

    # sorted() is stable, so equally distant rows keep their original order.
    # Slicing (rather than indexing k times) tolerates k > len(data_train).
    nearest = sorted(distances, key=operator.itemgetter(1))[:k]

    vote_class = {}
    for position, _ in nearest:
        # .iloc[-1] reads the last value by position; plain [-1] on a
        # label-indexed Series is a deprecated positional fallback.
        class_in_datatrain = data_train.iloc[position].iloc[-1]
        vote_class[class_in_datatrain] = vote_class.get(class_in_datatrain, 0) + 1

    # Ascending by vote count; the last entry is the winning class.
    sorted_vote_class = sorted(vote_class.items(), key=operator.itemgetter(1))
    return sorted_vote_class[-1][0]

### 4. Menghitung akurasi

In [4]:
def predictionAccuracy(prediction_data, data_test):
    """Return the percentage of predictions that match the true labels.

    Args:
        prediction_data: Sequence of predicted labels, one per row of
            ``data_test`` and in the same order.
        data_test: pandas DataFrame whose last column holds the true labels.

    Returns:
        Accuracy in percent (0-100) as a float. ``0.0`` is returned when
        ``prediction_data`` is empty, since there is nothing to score.

    Raises:
        IndexError: If ``prediction_data`` is longer than ``data_test``.
    """
    total = len(prediction_data)
    if total == 0:
        # Avoid dividing by zero when there are no predictions.
        return 0.0

    # Read the label column once, by position, instead of materialising a
    # row Series for every prediction and indexing it with [-1].
    actual_labels = data_test.iloc[:, -1].to_numpy()

    accurate = sum(
        1
        for i, predicted in enumerate(prediction_data)
        if predicted == actual_labels[i]
    )
    return (accurate / total) * 100

### 5. Cross Validation & Tuning Parameter

In [18]:
def crossValFtTunParam(data_train_from_csv, best_k=51, fold_size=800):
    """Measure KNN accuracy for one k on the held-out tail of the data.

    Despite its name, this function neither rotates over folds nor searches
    over k: the first ``4 * fold_size`` rows are used for training and all
    remaining rows (the fifth fold) for validation, with a single fixed k.
    To compare several values of k, call it once per candidate.

    Args:
        data_train_from_csv: pandas DataFrame of labelled rows.
        best_k: Number of neighbours passed to ``kNearestNeighbor``.
        fold_size: Number of rows per fold; four folds form the training
            split.

    Returns:
        A tuple ``(best_k, accuracy_crossval)`` where ``accuracy_crossval``
        is a one-element list holding the validation accuracy in percent.

    Raises:
        ValueError: If no rows remain for validation after the training
            split.
    """
    train_end = 4 * fold_size
    data_train = data_train_from_csv.iloc[:train_end]
    data_test = data_train_from_csv.iloc[train_end:]

    if len(data_test) == 0:
        raise ValueError(
            "not enough rows: nothing is left for validation after taking "
            "4 * fold_size training rows"
        )

    # Classify every validation row against the training split.
    prediction_data = [
        kNearestNeighbor(data_train, data_test.iloc[x], best_k)
        for x in range(len(data_test))
    ]

    # Kept as a list so callers can keep reading accuracy_crossval[0].
    accuracy_crossval = [predictionAccuracy(prediction_data, data_test)]

    return best_k, accuracy_crossval
  

# Run the validation once; keep k and the accuracy list for the cells below.
best_k, accuracy = crossValFtTunParam(data_train_from_csv)

In [40]:
# Report the k that was used and the first (only) validation accuracy, in percent.
print('Best k : ', best_k)
print('Accuracy : ', accuracy[0], '%')

Best k :  51
Accuracy :  71.375 %


### Calculate Data Test CSV

In [27]:
# Load the rows to classify from 'dataTest.csv'; the preview shows the same
# columns as the training data with the 'kelas' column left empty.
data_test_from_csv = pd.read_csv('dataTest.csv')

# Display the first rows as a quick sanity check of the columns.
data_test_from_csv.head()

Unnamed: 0,atribut 1,atribut 2,atribut 3,atribut 4,kelas
0,8.4,54.375,13.6,13.5,
1,10.8,67.5,14.4,22.2,
2,21.6,62.5,13.6,20.7,
3,10.8,46.875,9.0,16.8,
4,10.8,58.75,14.2,13.5,


In [34]:
# Train on the first 3200 labelled rows and classify every row of the test
# file with the k returned by the validation step.
data_train = data_train_from_csv[:3200]
prediction_data = [
    kNearestNeighbor(data_train, data_test_from_csv.iloc[row], best_k)
    for row in range(len(data_test_from_csv))
]

In [38]:
# Store the predicted labels in the 'kelas' column of the test data.
data_test_from_csv['kelas'] = prediction_data

# Write the labelled test data to disk.
# NOTE(review): to_csv() writes the DataFrame index by default, which is why
# the file shows an extra "Unnamed: 0.1" column when it is read back. Confirm
# the expected file layout before passing index=False.
data_test_from_csv.to_csv('Prediksi_Tugas2AI_1301160790.csv')

### Hasil Prediksi

In [39]:
# Read the written prediction file back and display it to verify its contents.
data_hasil_from_csv = pd.read_csv('Prediksi_Tugas2AI_1301160790.csv')
data_hasil_from_csv

Unnamed: 0.1,Unnamed: 0,atribut 1,atribut 2,atribut 3,atribut 4,kelas
0,0,8.4,54.375,13.6,13.5,0.0
1,1,10.8,67.500,14.4,22.2,1.0
2,2,21.6,62.500,13.6,20.7,1.0
3,3,10.8,46.875,9.0,16.8,0.0
4,4,10.8,58.750,14.2,13.5,0.0
5,5,21.6,61.875,11.2,18.3,1.0
6,6,6.0,54.375,14.2,18.3,0.0
7,7,7.8,45.000,10.2,21.6,0.0
8,8,21.0,42.500,14.6,18.3,0.0
9,9,6.0,58.750,12.8,21.3,0.0
