# KNN

In [None]:
import os
import numpy as np
import pandas as pd
from plt_rcs import *
import hds

In [None]:
# Show the current working directory before changing it.
os.getcwd()

In [None]:
# Move to the data folder. The path is relative to the current working
# directory, so re-running this cell changes directory again.
os.chdir('../../data')

In [None]:
# List the files in the data folder in sorted order.
sorted(os.listdir())

In [None]:
# Load the pickled objects.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
objs = pd.read_pickle('WhiteWine.pkl')

In [None]:
# Bind every entry of `objs` as a global variable (this requires `objs` to be
# a mapping or an iterable of key/value pairs).
globals().update(objs)

In [None]:
# IPython magic: list the variables currently defined in the session.
%whos

In [None]:
# Self-assignment: each name is rebound to its current value, so this line has
# no runtime effect. Presumably kept to make explicit which names the rest of
# the notebook expects from the pickle — TODO confirm.
X_train, X_valid, y_train, y_valid = X_train, X_valid, y_train, y_valid

## 데이터 표준화

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the standardization object.
scaler = StandardScaler()

In [None]:
# Standardize: fit the scaler on the training set only, then apply the same
# fitted transformation to the validation set.
# NOTE: X_train / X_valid are rebound here, so re-running this cell would
# transform already-transformed data.
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

In [None]:
# Summary statistics of the standardized training features.
pd.DataFrame(data=X_train).describe().round(3)

In [None]:
# Summary statistics of the validation features after the same transformation.
pd.DataFrame(data=X_valid).describe().round(3)

## 가중치 없는 KNN 분류 모델 학습

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Create a KNN classifier without distance weighting (all constructor defaults).
model_unif = KNeighborsClassifier()

In [None]:
# Inspect the hyperparameters the estimator was created with.
model_unif.get_params()

In [None]:
# Train the model.
model_unif.fit(X=X_train, y=y_train)

In [None]:
# Check accuracy on the training set and on the validation set.
model_unif.score(X=X_train, y=y_train)
# recorded output: 0.8865227537922987
model_unif.score(X=X_valid, y=y_valid)
# recorded output: 0.8061224489795918

In [None]:
# Get, for every validation sample, the distances to its nearest training
# neighbours and the indices of those neighbours.
distances, indices = model_unif.kneighbors(X=X_valid)

In [None]:
# Neighbour distances and indices for the first validation sample.
distances[0]
# recorded output: array([0.84149839, 1.18088628, 1.25421004, 1.25455938, 1.25766403])
indices[0]
# recorded output: array([1833, 1041, 1351, 3203, 2240])

## 최근접 이웃과의 거리 분포 확인

In [None]:
# Take the last column of `distances`, i.e. the last neighbour returned for
# each validation sample (the largest distance in the recorded first row),
# and summarize it.
kth_distances = pd.Series(data=distances[:, -1])
kth_distances.describe().round(3)

In [None]:
# The five largest values: validation samples whose neighbours are far away.
kth_distances.sort_values().tail()

In [None]:
# Set the default figure size to 4 x 4.
# NOTE(review): `plt` is not imported explicitly in this file; presumably it
# comes from `from plt_rcs import *` — confirm.
plt.rc(group='figure', figsize=(4, 4))

In [None]:
# Histogram of the distances over the range 0-6 with bins of width 0.5.
# NOTE(review): `sns` is presumably also provided by `plt_rcs` — confirm.
sns.histplot(x=kth_distances, binrange=(0, 6), binwidth=0.5)
plt.show()

## 가중치 있는 KNN 분류 모델 학습

In [None]:
# Create a distance-weighted KNN classifier: with weights='distance',
# scikit-learn weights each neighbour's vote by the inverse of its distance.
model_dist = KNeighborsClassifier(weights='distance')

In [None]:
# Train the distance-weighted KNN classifier.
model_dist.fit(X_train, y_train)

In [None]:
# Check the accuracy of the distance-weighted model.
model_dist.score(X=X_train, y=y_train)
# recorded output: 1.0
# (presumably because each training sample is its own zero-distance neighbour
# and therefore dominates the weighted vote — TODO confirm)
model_dist.score(X=X_valid, y=y_valid)
# recorded output: 0.8469387755102041

## 분류 모델 성능 평가

In [None]:
# Predicted class labels on the validation set from both models.
y_pred_unif = model_unif.predict(X_valid)
y_pred_dist = model_dist.predict(X_valid)

In [None]:
# Classification metrics for the uniform-weight model.
# NOTE(review): `hds.stat.clfmetrics` is a project helper; its exact output is
# not visible from this file.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_unif)

In [None]:
# Classification metrics for the distance-weighted model.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_dist)

## ROC 곡선

In [None]:
# Predicted class probabilities on the validation set (one column per class).
y_prob_unif = model_unif.predict_proba(X_valid)
y_prob_dist = model_dist.predict_proba(X_valid)

In [None]:
# ROC curves: uniform-weight model in red, distance-weighted model in blue.
# NOTE(review): `hds.plot.roc_curve` is a project helper; presumably both calls
# draw on the same axes — confirm.
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_unif, color='red')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_dist, color='blue')

In [None]:
# Class proportions in the validation set.
y_valid.value_counts(normalize=True)
# recorded output:
# grade
# 0    0.787075
# 1    0.212925
# Name: proportion, dtype: float64

In [None]:
# Use a cutoff close to the recorded proportion of class 1 (about 0.21)
# instead of the default majority vote: predict 1 whenever the second
# probability column reaches the cutoff.
cutoff = 0.21
y_pred_dist_2 = np.where(y_prob_dist[:, 1] >= cutoff, 1, 0)

In [None]:
# Classification metrics of the distance-weighted model under the new cutoff.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_dist_2)

## PR 곡선

In [None]:
# Precision-recall curves: uniform-weight model in red, distance-weighted
# model in blue.
# NOTE(review): `hds.plot.pr_curve` is a project helper; presumably both calls
# draw on the same axes — confirm.
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_unif, color='red')
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_dist, color='blue')

## 최적의 이웃 개수(k) 탐색

In [None]:
def valid_score(k):
    """Return the validation accuracy of a distance-weighted KNN with k neighbours.

    A new classifier is fitted on the notebook-level ``X_train``/``y_train``
    and scored on ``X_valid``/``y_valid``.

    Args:
        k: Number of neighbours passed as ``n_neighbors``.

    Returns:
        The value of ``score`` on the validation set.
    """
    knn = KNeighborsClassifier(weights='distance', n_neighbors=k)
    # fit() returns the fitted estimator, so scoring can be chained onto it.
    return knn.fit(X=X_train, y=y_train).score(X=X_valid, y=y_valid)

In [None]:
# Range of k values to search: the odd numbers 1, 3, ..., 99.
ks = range(1, 100, 2)
# Build a list holding the validation accuracy for each k
# (one model is fitted per k).
vl_acc = [valid_score(k) for k in ks]

In [None]:
# Plot validation accuracy against k.
sns.lineplot(x=ks, y=vl_acc)
plt.show()

In [None]:
# Maximum validation accuracy.
np.max(vl_acc)

In [None]:
# Position of the maximum validation accuracy; np.argmax returns the first
# occurrence, so ties resolve to the smallest k.
index = np.argmax(vl_acc)

In [None]:
# The k at which validation accuracy is highest.
best_k = ks[index]

## 최적의 KNN 분류 모델 학습

In [None]:
# Distance-weighted KNN using the selected number of neighbours.
model_best = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
# Train with the selected k.
model_best.fit(X=X_train, y=y_train)

In [None]:
# Check accuracy on the training set and on the validation set.
model_best.score(X=X_train, y=y_train)
# recorded output: 1.0
model_best.score(X=X_valid, y=y_valid)
# recorded output: 0.8727891156462585

## F1 스코어 값 출력 함수 생성

In [None]:
from sklearn.metrics import f1_score

In [None]:
def valid_f1_score(k):
    """Return the validation F1 score of a distance-weighted KNN with k neighbours.

    A new classifier is fitted on the notebook-level ``X_train``/``y_train``;
    its predictions for ``X_valid`` are compared with ``y_valid`` using
    ``f1_score`` with that function's default settings.

    Args:
        k: Number of neighbours passed as ``n_neighbors``.

    Returns:
        The F1 score on the validation set.
    """
    knn = KNeighborsClassifier(weights='distance', n_neighbors=k)
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    valid_pred = knn.fit(X=X_train, y=y_train).predict(X=X_valid)
    return f1_score(y_true=y_valid, y_pred=valid_pred)

In [None]:
# Validation F1 score for each candidate k in `ks`.
vl_f1s = [valid_f1_score(k) for k in ks]

In [None]:
# Plot validation F1 score against k.
sns.lineplot(x=ks, y=vl_f1s)
plt.show()

In [None]:
# Maximum validation F1 score.
np.max(vl_f1s)

In [None]:
# Position of the maximum validation F1 score (first occurrence on ties).
# NOTE: this rebinds `index`, replacing any value assigned earlier.
index = np.argmax(vl_f1s)

In [None]:
# The k with the highest validation F1 score; rebinds `best_k`.
best_k = ks[index]

In [None]:
# Refit the distance-weighted model with the F1-based k; rebinds `model_best`.
model_best = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
model_best.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the training set and on the validation set.
model_best.score(X=X_train, y=y_train)
model_best.score(X=X_valid, y=y_valid)

## 데이터 균형화

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Create the SMOTE resampler (5 neighbours, fixed seed for reproducibility).
smote = SMOTE(k_neighbors=5, random_state=0)

In [None]:
# Oversample the minority class up to the size of the majority class.
# Only the training data is resampled.
X_bal, y_bal = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Check the class proportions after resampling.
y_bal.value_counts(normalize=True)

## 균형화된 데이터로 모델 학습

In [None]:
# Distance-weighted KNN trained on the resampled training data, using
# whatever value `best_k` currently holds.
model_bal = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
model_bal.fit(X=X_bal, y=y_bal)

In [None]:
# Accuracy on the resampled training data and on the validation set.
model_bal.score(X=X_bal, y=y_bal)
# recorded output: 1.0
model_bal.score(X=X_valid, y=y_valid)
# recorded output: 0.7952380952380952

In [None]:
# Validation predictions from the model trained on resampled data.
y_pred_bal = model_bal.predict(X=X_valid)

In [None]:
# Classification metrics for the model trained on resampled data.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_bal)

In [None]:
# Validation predictions from `model_best` (trained on the original training
# data), for comparison.
y_pred_best = model_best.predict(X=X_valid)

In [None]:
# Classification metrics for `model_best`.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_best)