In [104]:
import sklearn
import numpy as np
import matplotlib as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold

In [105]:
iris = pd.read_csv('./datasets/iris.csv')
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [106]:
iris.shape

(150, 6)

In [107]:
iris['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [108]:
iris.isna().sum(axis=0)

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [109]:
iris[iris.duplicated()]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species


In [110]:
iris['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [111]:
# 데이터 전처리

df = iris.copy()

In [112]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [113]:
le = LabelEncoder()
le.fit(df['Species'])

df['Species'] = le.transform(df['Species'])
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0


In [114]:
df = df.drop(['Id'], axis = 1)

In [115]:
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [116]:
x = df.iloc[:, 0:4]
y = df.iloc[:, -1]

In [117]:
## 데이터 분리

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y, shuffle=True)

In [118]:
x_train.shape, x_test.shape

((120, 4), (30, 4))

In [119]:
y_train.shape, y_test.shape

((120,), (30,))

In [120]:
## 스케일링

In [121]:
ss = StandardScaler()
ss.fit(x_train)
x_train_scaled = ss.transform(x_train)

y_train = y_train.to_numpy()

In [122]:
## 모델 학습

In [123]:
model = KNeighborsClassifier()
model.fit(x_train_scaled, y_train)

In [124]:
x_test.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
38,4.4,3.0,1.3,0.2
127,6.1,3.0,4.9,1.8
57,4.9,2.4,3.3,1.0
93,5.0,2.3,3.3,1.0
42,4.4,3.2,1.3,0.2


In [125]:
x_test_scaled = ss.transform(x_test)
x_test_scaled[:5]

array([[-1.72156775, -0.0995174 , -1.40385252, -1.32016847],
       [ 0.30848902, -0.0995174 ,  0.64155823,  0.78333648],
       [-1.12449223, -1.45145189, -0.26751321, -0.268416  ],
       [-1.00507713, -1.67677431, -0.26751321, -0.268416  ],
       [-1.72156775,  0.35112743, -1.40385252, -1.32016847]])

In [126]:
y_pred = model.predict(x_test_scaled)

In [127]:
y_pred[:10]

array([0, 2, 1, 1, 0, 1, 0, 0, 2, 1])

In [128]:
accuracy_score(y_test, y_pred)

0.9333333333333333

In [129]:
hist = []

for k in tqdm(range(3, 50)):
    model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(model, x_train_scaled, y_train, cv=5, scoring='accuracy')  # cv = cross validation, scoring -> 정확도로 설정
    
    hist.append(scores.mean())


100%|█████████████████████████████████████████████████████████████████████████████████| 47/47 [00:00<00:00, 153.36it/s]


In [139]:
max(hist)

0.9666666666666668

In [140]:
np.argmax(hist)

2

In [136]:
model = KNeighborsClassifier(n_neighbors=10)
model.fit(x_train_scaled, y_train)

In [137]:
x_test_scaled = ss.transform(x_test)
y_pred = model.predict(x_test_scaled)
accuracy_score(y_test, y_pred)

0.9666666666666667