In [111]:
# Import necessary libraries/packages
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import random
import os
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [112]:
# Read data from csv file
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [113]:
le = LabelEncoder()
sc = StandardScaler()

In [114]:
# Preprocess data
x = data.drop('label', axis=1)
y = data['label'].values

sc.fit(x)

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=1, stratify=y)

x_train_std, x_test_std = sc.transform(x_train), sc.transform(x_test)

In [115]:
def _train(n, train_set, test_set, train_ans, test_ans):
    knn = KNeighborsClassifier(n_neighbors=n, p=2)
    knn.fit(train_set, train_ans)
    train_pred = knn.predict(train_set)
    test_pred = knn.predict(test_set)
    
    print('Current N: {} '.format(n), end='')
    print('Train : {} '.format(accuracy_score(train_ans, train_pred)), end='')
    print('Test : {}'.format(accuracy_score(test_ans, test_pred)))


In [116]:
for i in range (2, 21):
    _train(i, x_train, x_test, y_train, y_test)


Current N: 2 Train : 0.9878787878787879 Test : 0.9676767676767677
Current N: 3 Train : 0.9939393939393939 Test : 0.9777777777777777
Current N: 4 Train : 0.9861471861471861 Test : 0.9777777777777777
Current N: 5 Train : 0.9896103896103896 Test : 0.9777777777777777
Current N: 6 Train : 0.9800865800865801 Test : 0.9717171717171718
Current N: 7 Train : 0.9844155844155844 Test : 0.9737373737373738
Current N: 8 Train : 0.9783549783549783 Test : 0.9656565656565657
Current N: 9 Train : 0.9818181818181818 Test : 0.9696969696969697
Current N: 10 Train : 0.9757575757575757 Test : 0.9676767676767677
Current N: 11 Train : 0.9800865800865801 Test : 0.9696969696969697
Current N: 12 Train : 0.9766233766233766 Test : 0.9696969696969697
Current N: 13 Train : 0.9774891774891775 Test : 0.9717171717171718
Current N: 14 Train : 0.9722943722943723 Test : 0.9595959595959596
Current N: 15 Train : 0.9748917748917749 Test : 0.9696969696969697
Current N: 16 Train : 0.9722943722943723 Test : 0.9575757575757575
Cur

In [117]:
for i in range (2, 21):
    _train(i, x_train_std, x_test_std, y_train, y_test)


Current N: 2 Train : 0.9887445887445887 Test : 0.9575757575757575
Current N: 3 Train : 0.9930735930735931 Test : 0.9717171717171718
Current N: 4 Train : 0.9835497835497835 Test : 0.9555555555555556
Current N: 5 Train : 0.9852813852813853 Test : 0.9616161616161616
Current N: 6 Train : 0.9818181818181818 Test : 0.9575757575757575
Current N: 7 Train : 0.9818181818181818 Test : 0.9555555555555556
Current N: 8 Train : 0.9774891774891775 Test : 0.9474747474747475
Current N: 9 Train : 0.9748917748917749 Test : 0.9454545454545454
Current N: 10 Train : 0.9688311688311688 Test : 0.9454545454545454
Current N: 11 Train : 0.9679653679653679 Test : 0.9434343434343434
Current N: 12 Train : 0.9662337662337662 Test : 0.9373737373737374
Current N: 13 Train : 0.9670995670995671 Test : 0.9373737373737374
Current N: 14 Train : 0.9601731601731601 Test : 0.9353535353535354
Current N: 15 Train : 0.9584415584415584 Test : 0.9333333333333333
Current N: 16 Train : 0.9549783549783549 Test : 0.9272727272727272
Cur

In [118]:
knn = KNeighborsClassifier(n_neighbors=4, p=2)
knn.fit(x_train, y_train)
res = knn.predict(test)
res = pd.DataFrame(data=res, columns=['label'])
res.to_csv('submit.csv', index_label='id')