Summary:
* kNN is more accurate when the dataset is bigger.
* kNN is slow on large datasets.
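* `predict_proba()` is useful for checking whether the number of neighbors is too small (e.g. with 3 neighbors the output below only contains coarse values such as 0, 0.33 and 1).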
* Increasing the number of neighbors only increases prediction time slightly, because kNN calculates the distance between the target point and all of the training data either way.
* Increasing the number of neighbors will NOT necessarily increase precision, because irrelevant points are then taken into consideration.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing

import numpy as np
import pandas as pd

In [10]:
# Read the training table and integer-encode the 'target' column as class labels.
train = pd.read_csv('train.csv')
le = preprocessing.LabelEncoder()
labels = le.fit_transform(train.target.values)

# Remove 'id' and 'target'; every remaining column goes into the feature matrix.
train = train.drop(['id', 'target'], axis=1)

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable).
X, X_test, y, y_test = train_test_split(
    train.values,
    labels.astype('int32'),
    test_size=0.20,
    random_state=1234,
)

In [15]:
from time import time
from sklearn.neighbors import KNeighborsClassifier

# Baseline experiment with 3 neighbors: take a wall-clock timestamp around
# fit, predict and predict_proba so each phase can be reported separately.
neigh = KNeighborsClassifier(n_neighbors=3)

t_before_fit = time()
neigh.fit(X, y)
t_after_fit = time()

y_pred = neigh.predict(X_test)
t_after_pred = time()

Y_proba = neigh.predict_proba(X_test)
t_after_pred_proba = time()

# Turn consecutive timestamps into per-phase durations (seconds).
fit_seconds = t_after_fit - t_before_fit
predict_seconds = t_after_pred - t_after_fit
predict_proba_seconds = t_after_pred_proba - t_after_pred
print("train time %s, predict time %s, predict probality time %s" % (
    fit_seconds, predict_seconds, predict_proba_seconds))

train time 2.6832759380340576, predict time 93.1043176651001, predict probality time 93.31927824020386


In [16]:
from sklearn.metrics import precision_score

# Single overall precision figure for the predictions currently held in y_pred,
# using 'weighted' averaging across the classes.
precision_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.77829641505454594

In [13]:
# average=None skips averaging and yields one precision value per class.
precision_score(y_true=y_test, y_pred=y_pred, average=None)

array([ 0.5194508 ,  0.70176406,  0.49967511,  0.61129568,  0.95143885,
        0.94111425,  0.76355748,  0.92121588,  0.85387674])

In [18]:
# Show the predicted class probabilities for the first 30 test rows.
print(Y_proba[:30])

[[ 0.          0.          0.          0.          0.          0.          0.
   1.          0.        ]
 [ 0.          0.          0.          0.          0.          1.          0.
   0.          0.        ]
 [ 0.          0.          0.          0.          0.          1.          0.
   0.          0.        ]
 [ 0.          1.          0.          0.          0.          0.          0.
   0.          0.        ]
 [ 0.          1.          0.          0.          0.          0.          0.
   0.          0.        ]
 [ 0.          0.          0.          0.          0.          1.          0.
   0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          1.        ]
 [ 0.          1.          0.          0.          0.          0.          0.
   0.          0.        ]
 [ 0.          0.          0.          0.          1.          0.          0.
   0.          0.        ]
 [ 0.33333333  0.          0.          0.          0.  

In [19]:
# Repeat the experiment with 5 neighbors: time fit and predict, then report precision.
neigh = KNeighborsClassifier(n_neighbors=5)

t_before_fit = time()
neigh.fit(X, y)
t_after_fit = time()

y_pred = neigh.predict(X_test)
t_after_pred = time()

# Per-phase durations in seconds.
fit_seconds = t_after_fit - t_before_fit
predict_seconds = t_after_pred - t_after_fit
print("train time %s, predict time %s" % (fit_seconds, predict_seconds))

# First the 'weighted' overall precision, then (average=None) one value per class.
for averaging in ('weighted', None):
    print(precision_score(y_test, y_pred, average=averaging))

train time 2.62996768951416, predict time 96.38402724266052
0.779322887615
[ 0.56200528  0.6945629   0.5118004   0.70666667  0.94802867  0.93412754
  0.75475687  0.91153142  0.84501481]


In [20]:
# Repeat the experiment with 9 neighbors: time fit and predict, then report precision.
neigh = KNeighborsClassifier(n_neighbors=9)

t_before_fit = time()
neigh.fit(X, y)
t_after_fit = time()

y_pred = neigh.predict(X_test)
t_after_pred = time()

# Per-phase durations in seconds.
fit_seconds = t_after_fit - t_before_fit
predict_seconds = t_after_pred - t_after_fit
print("train time %s, predict time %s" % (fit_seconds, predict_seconds))

# First the 'weighted' overall precision, then (average=None) one value per class.
for averaging in ('weighted', None):
    print(precision_score(y_test, y_pred, average=averaging))

train time 2.620821237564087, predict time 99.30895853042603
0.778557931723
[ 0.53846154  0.68986852  0.52485795  0.77272727  0.94954955  0.94113475
  0.71784232  0.89928058  0.82449373]


In [21]:
# Repeat the experiment with 111 neighbors: time fit and predict, then report precision.
neigh = KNeighborsClassifier(n_neighbors=111)

t_before_fit = time()
neigh.fit(X, y)
t_after_fit = time()

y_pred = neigh.predict(X_test)
t_after_pred = time()

# Per-phase durations in seconds.
fit_seconds = t_after_fit - t_before_fit
predict_seconds = t_after_pred - t_after_fit
print("train time %s, predict time %s" % (fit_seconds, predict_seconds))

# First the 'weighted' overall precision, then (average=None) one value per class.
for averaging in ('weighted', None):
    print(precision_score(y_test, y_pred, average=averaging))

train time 2.423783540725708, predict time 119.24104928970337
0.749388554041
[ 0.56734694  0.62605228  0.5282838   0.7173913   0.92293907  0.93763518
  0.66442953  0.86662715  0.79378531]
