## KNN with large data set

In [21]:
# import everything first
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [22]:
# We are going to use the Iris dataset that ships with sklearn as our example
from sklearn import datasets
iris = datasets.load_iris()

In [23]:
iris # see what the dataset is like (NOTE: this dumps the whole object; the output starts with its 'data' array)

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [24]:
# Build a DataFrame from the raw measurements.
# The first four columns are the features; the last column ('class') is the target we want to predict.
# Column labels are taken from the dataset's own metadata (iris.feature_names) rather than
# retyped by hand, so they cannot drift from the data they describe.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['class'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [25]:
print(iris.target_names) # class names; the name at position i corresponds to numeric label i (0, 1, 2)

['setosa' 'versicolor' 'virginica']


### Train our Model

In [26]:
# we'll use the sepal dimensions (length and width) as the features
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# we start with k = 5, which means each prediction is decided by the 5 nearest neighbors.
knn = KNeighborsClassifier(n_neighbors = 5) 

In [27]:
# Split the data into training and test sets.
# Only the first two columns (sepal length and sepal width) are used as features;
# random_state pins the shuffle so the same split is produced on every run.
sepal_features = df.iloc[:, :2]
X_train, X_test, y_train, y_test = train_test_split(
    sepal_features,
    df['class'],
    random_state=42,
)
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm)
4,5.0,3.6
32,5.2,4.1
142,5.8,2.7
85,6.0,3.4
86,6.7,3.1


In [28]:
# Fit the k = 5 classifier on the training split (the repr shown below lists its settings).
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

### Test our Model

In [29]:
# Predict a class for every sample in the test split.
y_pred = knn.predict(X_test)

# Print our predictions first, then the actual values, for a side-by-side look.
print(y_pred, y_test, sep='\n')

[1 0 2 1 1 0 1 2 1 2 2 0 0 0 0 2 2 1 2 2 0 1 0 2 2 1 1 2 0 0 0 0 1 0 0 2 2
 0]
73     1
18     0
118    2
78     1
76     1
31     0
64     1
141    2
68     1
82     1
110    2
12     0
36     0
9      0
19     0
56     1
104    2
69     1
55     1
132    2
29     0
127    2
26     0
128    2
131    2
145    2
108    2
143    2
45     0
30     0
22     0
15     0
65     1
11     0
42     0
146    2
51     1
27     0
Name: class, dtype: int32


In [30]:
# Measure how accurate the model is: the fraction of test samples
# whose predicted class equals the actual class.

from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.8157894736842105


An accuracy score closer to 1 means better predictions. Our model has an accuracy score of approximately 0.816.

### Explore more about model

We can try different k values for our model
and check its accuracy with each of these k values.<br>

We will try the odd values of k between 1 and 20: a smaller k means that noisy points have a larger influence on the prediction, while a larger k means that the computation becomes more expensive.

In [31]:
# Candidate k values: start at 1 and step by 2, stopping before 21 -> the odd numbers 1, 3, ..., 19.
k_array = np.arange(1, 21, 2)

k_array

array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19])

In [20]:
# Try every candidate k in k_array (the odd values 1-19) and print k followed by the
# accuracy it reaches on the test split, so the scores can be compared to pick a k.
# NOTE: each candidate is scored on the test split itself, so a k picked this way is tuned
# to that split; a separate validation split or cross-validation would give a fairer estimate.

for k in k_array:
    knn_ex = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    ac = accuracy_score(y_test, knn_ex.predict(X_test))
    print(k, ac, sep='\n')

1
0.7105263157894737
3
0.7631578947368421
5
0.8157894736842105
7
0.7631578947368421
9
0.8157894736842105
11
0.8157894736842105
13
0.7631578947368421
15
0.8157894736842105
17
0.7368421052631579
19
0.7894736842105263


In [32]:
# Refit the k = 1 model on its own so that its predictions (y_pred1)
# are available for closer inspection in the following cells.
knn_1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred1 = knn_1.predict(X_test)
print(accuracy_score(y_test, y_pred1))

0.7105263157894737


The accuracy of the model varies with the number of neighbors (k). Choosing a well-tuned value of k for our model is important.

### Validation with Confusion Matrix
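We can use a confusion matrix to see where the predictions are right and where they go wrong.
For `confusion_matrix(y_test, y_pred)` the rows are the actual classes and the columns are the predicted classes, so the matrix has the format:

|                   |predicted classA | predicted classB|
|-------------------|-----------------|-----------------|
|actual classA      |                 |                 |
|actual classB      |                 |                 |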



In [33]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes (y_test); columns are the predicted classes (y_pred).
confusion_matrix(y_test, y_pred)

array([[15,  0,  0],
       [ 0,  7,  4],
       [ 0,  3,  9]], dtype=int64)

This confusion matrix shows that all 15 actual class 0 samples are predicted correctly.
Of the 11 actual class 1 samples, 7 are predicted correctly and 4 are wrongly predicted as class 2.
Of the 12 actual class 2 samples, 9 are predicted correctly and 3 are wrongly predicted as class 1.


In [34]:
# The confusion matrix when k = 1 (y_pred1 holds the predictions of the k = 1 model);
# rows are the actual classes, columns are the predicted classes.
confusion_matrix(y_test, y_pred1)

array([[15,  0,  0],
       [ 0,  6,  5],
       [ 0,  6,  6]], dtype=int64)

In [35]:
# The F1 score is the harmonic mean of precision and recall;
# it reaches its best value at 1 and its worst value at 0.
# NOTE: with average = 'micro' on this single-label multiclass problem the result is the
# same number as the plain accuracy of the k = 1 model (0.7105...), so it adds no new
# information here; average = 'macro' or per-class scores would.
from sklearn.metrics import f1_score
f1_score(y_test, y_pred1, average = 'micro')

0.7105263157894737

### Conclusion

Depending on the value of k, the accuracy score of our model ranges from about 0.71 to about 0.82.