# KNN 회귀

In [None]:
import os
import pandas as pd
import numpy as np
import hds
from plt_rcs import *

In [None]:
# Make every figure 4x4 inches by default. `plt` is not imported by name in
# this notebook; presumably it comes from `from plt_rcs import *` - TODO confirm.
plt.rc(group='figure', figsize=(4,4))

In [None]:
# Show the current working directory before it is changed below.
os.getcwd()

In [None]:
# Switch to the data folder. The path is relative to the current working
# directory, so re-running this cell resolves it against the new location.
os.chdir('../../data')

In [None]:
# List the entries of the (new) working directory in sorted order.
sorted(os.listdir())

In [None]:
# Load the pickled object stored in the working directory.
# NOTE(review): unpickling can execute arbitrary code - load trusted files only.
objs = pd.read_pickle('Diabetes.pkl')

In [None]:
# Publish every key of `objs` as a notebook-level variable; any existing
# global with the same name is overwritten.
globals().update(objs)

In [None]:
# IPython magic: list the variables defined in the interactive namespace.
%whos

In [None]:
# Self-assignment: each name is rebound to its own value, so nothing changes.
# Presumably kept to spell out which injected names the notebook relies on;
# it raises NameError if any of the four is not defined at this point.
X_train, X_valid, y_train, y_valid = X_train, X_valid, y_train, y_valid

## 데이터 표준화

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then reuse that fitted scaler
# on the validation features. Both names are overwritten with the scaled
# result, so re-running this cell would scale already-scaled data.
X_train = scaler.fit_transform(X=X_train)
X_valid = scaler.transform(X=X_valid)

In [None]:
# Summary statistics of the scaled validation features, rounded to 3 decimals.
pd.DataFrame(data=X_valid).describe().round(3)

## 가중치 없는 KNN 회귀 모델 학습

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Only `p=1` is set here; `n_neighbors` and `weights` keep the library
# defaults (5 and 'uniform' per the scikit-learn documentation).
model_unif = KNeighborsRegressor(p=1)

In [None]:
model_unif.fit(X=X_train, y=y_train)

In [None]:
# Score on the training data and then on the validation data. The numbers in
# the comments are previously recorded outputs. In a notebook only the last
# expression of a cell is echoed, so the first value was presumably captured
# by running the lines separately.
model_unif.score(X=X_train, y=y_train)
# 0.5783751948120422
model_unif.score(X=X_valid, y=y_valid)
# 0.34874094754106255

In [None]:
# For every validation row, get the distances to its neighbors in the fitted
# model and the indices of those neighbors.
distances, indices = model_unif.kneighbors(X=X_valid)

In [None]:
# Neighbor distances for the first validation row.
distances[0]

In [None]:
# Keep only the last column of `distances`, one value per validation row.
# Presumably this is the farthest of the k neighbors - TODO confirm that
# kneighbors returns neighbors in ascending distance order.
kth_distance = pd.Series(data=distances[:, -1])
kth_distance

In [None]:
kth_distance.describe().round(3)

In [None]:
# The five largest values (sorted ascending, then the tail).
kth_distance.sort_values().tail()

In [None]:
# Histogram with bins of width 0.5 between 1 and 7. `sns` and `plt` are
# presumably provided by `from plt_rcs import *` - TODO confirm.
# NOTE(review): values outside 1-7 may be left out of the plot; compare with
# the describe() output above.
sns.histplot(x=kth_distance, binrange=(1, 7), binwidth=0.5)
plt.show()

## 가중치 있는 KNN 회귀 모델

In [None]:
# Same settings as `model_unif` except that neighbors are weighted by
# distance (`weights='distance'`).
model_dist = KNeighborsRegressor(weights='distance', p=1)

In [None]:
model_dist.fit(X=X_train, y=y_train)

In [None]:
# Recorded outputs are kept in the comments. The training score of 1.0 is
# consistent with each training row being its own zero-distance neighbor, so
# it says nothing about generalization; the validation score is the useful one.
model_dist.score(X=X_train, y=y_train)
# 1.0
model_dist.score(X=X_valid, y=y_valid)
# 0.35088449132245114

In [None]:
# Validation predictions from both models, for a side-by-side comparison.
y_pred_unif = model_unif.predict(X=X_valid)
y_pred_dist = model_dist.predict(X=X_valid)

In [None]:
# `hds.stat.regmetrics` is a project helper not shown here; presumably it
# reports regression error metrics for the given predictions - TODO confirm.
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_unif)

In [None]:
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_dist)

## 최적의 이웃 개수 탐색

In [None]:
def valid_score(k):
    """Return the validation score of a distance-weighted KNN regressor.

    A fresh ``KNeighborsRegressor`` with ``n_neighbors=k``,
    ``weights='distance'`` and ``p=1`` is fitted on the notebook-level
    ``X_train``/``y_train`` and scored on ``X_valid``/``y_valid``.

    Args:
        k: Number of neighbors passed to the regressor.

    Returns:
        The value of ``score`` on the validation data.
    """
    knn = KNeighborsRegressor(n_neighbors=k, weights='distance', p=1)
    # fit() returns the fitted estimator, so scoring can be chained onto it.
    return knn.fit(X=X_train, y=y_train).score(X=X_valid, y=y_valid)

In [None]:
# Candidate neighbor counts: 1 through 99 (the upper bound 100 is excluded).
ks = range(1, 100)

In [None]:
# Validation score for each candidate k, in the same order as `ks`.
# Each call fits a new model, so this cell refits 99 times.
vl_rsq = [valid_score(k) for k in ks]

In [None]:
# Best validation score found; the comment keeps a previously recorded output.
np.max(vl_rsq)
# np.float64(0.36093215506286147)

In [None]:
# Position of the best score within `vl_rsq` (the first one if there are ties).
index = np.argmax(vl_rsq)

In [None]:
# Map the position back to the neighbor count it belongs to.
best_k = ks[index]
best_k

In [None]:
# Validation score as a function of k.
sns.lineplot(x=ks, y=vl_rsq)
plt.show()

In [None]:
# Refit with the neighbor count selected above, keeping the same distance
# weighting and `p=1` that `valid_score` used during the search.
model_best = KNeighborsRegressor(n_neighbors=best_k, p=1, weights='distance')
model_best.fit(X=X_train, y=y_train)

In [None]:
# Recorded outputs are kept in the comments. The validation value matches the
# maximum found during the search.
# NOTE(review): `best_k` was chosen on this same validation set, so this score
# is an optimistic estimate of performance on unseen data.
model_best.score(X=X_train, y=y_train)
# 1.0
model_best.score(X=X_valid, y=y_valid)
# 0.36093215506286147

In [None]:
y_pred_best = model_best.predict(X_valid)

In [None]:
# `hds.stat.regmetrics` is a project helper not shown here; presumably it
# reports regression error metrics for the given predictions - TODO confirm.
hds.stat.regmetrics(y_true=y_valid, y_pred=y_pred_best)