In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('iris.csv')
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
data.shape

(150, 6)

In [4]:
data['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [6]:
data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [8]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [9]:
x = data.iloc[:,1:5]
x.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [10]:
y = data.iloc[:,-1]
y.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object

# Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [12]:
x = scaler.fit_transform(x)
x[0:5]

array([[-0.90068117,  1.03205722, -1.3412724 , -1.31297673],
       [-1.14301691, -0.1249576 , -1.3412724 , -1.31297673],
       [-1.38535265,  0.33784833, -1.39813811, -1.31297673],
       [-1.50652052,  0.10644536, -1.2844067 , -1.31297673],
       [-1.02184904,  1.26346019, -1.3412724 , -1.31297673]])

# Train Data

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2)

In [15]:
x_train.shape

(120, 4)

In [16]:
x_test.shape

(30, 4)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=1)
model.fit(x_train, y_train)

In [18]:
pred = model.predict(x_test)
pred[0:5]

array(['Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa'], dtype=object)

In [19]:
y_test[0:5]

67     Iris-versicolor
146     Iris-virginica
97     Iris-versicolor
51     Iris-versicolor
12         Iris-setosa
Name: Species, dtype: object

# Accuracy

In [20]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,pred)
accuracy 

0.9333333333333333

In [23]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,pred)
cm

array([[ 9,  0,  0],
       [ 0,  9,  0],
       [ 0,  2, 10]], dtype=int64)

In [25]:
result = pd.DataFrame(data=[y_test.values,pred], index = ['y_test', 'pred'])
result.transpose()

Unnamed: 0,y_test,pred
0,Iris-versicolor,Iris-versicolor
1,Iris-virginica,Iris-versicolor
2,Iris-versicolor,Iris-versicolor
3,Iris-versicolor,Iris-versicolor
4,Iris-setosa,Iris-setosa
5,Iris-setosa,Iris-setosa
6,Iris-setosa,Iris-setosa
7,Iris-setosa,Iris-setosa
8,Iris-versicolor,Iris-versicolor
9,Iris-versicolor,Iris-versicolor


# Find most suitable K value for the model

In [27]:
correct_sum = []
for i in range(1,20):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    correct = np.sum(pred == y_test)
    correct_sum.append(correct)

In [28]:
correct_sum

[28, 27, 28, 27, 28, 28, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 29]

In [29]:
result = pd.DataFrame(data = correct_sum)
result_index = result.index+1
result.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,28,27,28,27,28,28,29,29,29,29,29,29,29,28,28,28,28,28,29


In [30]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train, y_train)
pred = model.predict(x_test)

In [31]:
accuracy_score(y_test,pred)

0.9666666666666667

In [32]:
confusion_matrix(y_test,pred)

array([[ 9,  0,  0],
       [ 0,  9,  0],
       [ 0,  1, 11]], dtype=int64)