# K-Nearest Neighbors


## 1.환경준비

### (1) Import

In [1]:
#라이브러리들을 불러오자.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### (2) data loading

In [2]:
# Boston housing CSV hosted in the DA4BAM dataset repository; read straight from the URL.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/boston.csv'
data = pd.read_csv(filepath_or_buffer=path)

|	변수	|	설명	|
|	----	|	----	|
|	**medv**	|	**타운별 집값(중위수), target**	|
|	crim	|	범죄율	|
|	zn	|	25,000 평방피트를 초과한 거주지역 비율	|
|	indus	|	비소매상업지역 면적 비율, 편의시설(관공서, 주요 시설)	|
|	chas	|	찰스강변 위치(범주 : 강변1, 아니면 0)	|
|	nox	|	일산화질소 농도	|
|	rm	|	주택당 방 수	|
|	age	|	1940년 이전에 건축된 주택의 비율	|
|	dis	|	직업센터의 거리	|
|	rad	|	방사형 고속도로 접근성 지수	|
|	tax	|	재산세율	|
|	ptratio	|	학생/교사 비율	|
|	black	|	인구 중 흑인 비율	|
|	lstat	|	인구 중 하위 계층 비율	|


## 2.데이터 이해

### (1) 둘러보기

In [3]:
# Peek at the first five rows (use data.tail() to see the last rows instead).
data.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Check each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


## 3.데이터 준비

### (1) 데이터 정리

In [5]:
# Look at the first five rows again before cleaning up columns.
data.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [6]:
# Remove the 'black' column from the working DataFrame in place.
data.drop(columns=['black'], inplace=True)

### (2) 데이터분할1 : x, y 나누기

In [7]:
# Separate the target column (median home value) from the feature columns.
target = 'medv'
y = data[target]
x = data.drop(columns=target)

### (3) NA 조치

### (4) 가변수화

### (5) 데이터분할2 : train : validation 나누기

In [16]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2022,
)

### (6) Scaling
KNN은 거리 기반 알고리즘이므로, 변수마다 값의 범위가 다르면 범위가 큰 변수가 거리 계산을 지배합니다. 따라서 KNN을 적용하기 전에 스케일링을 해야 합니다.

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits so no validation information leaks in.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_s1 = scaler.transform(x_train)
# transform() hands back a NumPy array; scikit-learn accepts either form, but
# .describe() needs a DataFrame, so the arrays are wrapped again further down.
x_val_s1 = scaler.transform(x_val)

In [18]:
from sklearn.preprocessing import StandardScaler

# Same procedure as above with standardisation: fit on the training split,
# then apply the fitted scaler to both the training and validation features.
scaler2 = StandardScaler()
scaler2.fit(x_train)
x_train_s2 = scaler2.transform(x_train)
x_val_s2 = scaler2.transform(x_val)

In [19]:
# The scalers return plain NumPy arrays; wrap them back into DataFrames so that
# .describe() can be used on them.
# The validation sets are converted together with the training sets: a model
# fitted on a DataFrame remembers the column names, and predicting on a bare
# ndarray afterwards makes scikit-learn warn that the feature names are missing.
feature_names = list(x)
x_train_s1 = pd.DataFrame(x_train_s1, columns=feature_names)
x_val_s1 = pd.DataFrame(x_val_s1, columns=feature_names)
x_train_s2 = pd.DataFrame(x_train_s2, columns=feature_names)
x_val_s2 = pd.DataFrame(x_val_s2, columns=feature_names)

In [20]:
# Summary statistics of the unscaled features, for comparison with the scaled versions below.
x.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97


In [23]:
# MinMax-scaled training features: each column's min should be 0 and max should be 1.
x_train_s1.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,0.035949,0.114788,0.377422,0.070621,0.330976,0.503112,0.654099,0.28918,0.349546,0.40113,0.614166,0.286954
std,0.096155,0.229696,0.252717,0.256554,0.238801,0.141718,0.296748,0.220639,0.365802,0.313395,0.233143,0.185959
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000737,0.0,0.16404,0.0,0.122407,0.416804,0.393085,0.108411,0.130435,0.170172,0.468085,0.138107
50%,0.002398,0.0,0.289223,0.0,0.280083,0.482517,0.732447,0.232989,0.173913,0.270554,0.664894,0.253725
75%,0.029021,0.2,0.646628,0.0,0.466805,0.566794,0.93484,0.42767,0.304348,0.476099,0.808511,0.388797
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Standardised training features: each column's mean should be ~0 and std ~1.
x_train_s2.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
count,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0,354.0
mean,-1.3799380000000001e-17,7.02514e-17,8.969598e-17,-1.3328950000000002e-17,-3.048409e-16,-6.443371e-16,-1.79392e-16,6.711518000000001e-17,-1.1290400000000001e-17,-5.0179570000000006e-17,3.070363e-16,2.195356e-16
std,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415,1.001415
min,-0.374396,-0.5004462,-1.49557,-0.2756589,-1.387952,-3.55512,-2.207344,-1.312501,-0.9569127,-1.281763,-2.638016,-1.545286
25%,-0.3667255,-0.5004462,-0.8455478,-0.2756589,-0.8746373,-0.6098789,-0.8808263,-0.8204568,-0.5998357,-0.7379981,-0.6274599,-0.8015607
50%,-0.3494235,-0.5004462,-0.3494974,-0.2756589,-0.2134189,-0.1455293,0.2643949,-0.2550316,-0.48081,-0.4172382,0.2178877,-0.1789413
75%,-0.07215742,0.3715014,1.066752,-0.2756589,0.5696029,0.4499863,0.9473991,0.6285651,-0.1237329,0.239556,0.834763,0.5484387
max,10.04022,3.859292,2.467023,3.627671,2.805565,3.511133,1.167289,3.226202,1.780678,1.913617,1.657263,3.839852


## 4.모델링 : KNN

### (1) import

In [24]:
# 모델링용
from sklearn.neighbors import KNeighborsRegressor    

# 회귀모델 평가용
from sklearn.metrics import * 

### (2) 모델선언

In [25]:
knn = KNeighborsRegressor() # n_neighbors (k) is left at its default of 5

## 스케일링 1) MinMaxScaler

### (3) 모델링(학습)

In [33]:
# Train on the MinMax-scaled training features.
knn.fit(X=x_train_s1, y=y_train)

KNeighborsRegressor()

### (4) 검증 : 예측

In [34]:
# Predict the validation targets from the MinMax-scaled validation features.
pred = knn.predict(X=x_val_s1)
pred

array([19.5 , 13.88, 18.88, 30.2 , 14.92, 14.24, 24.06, 16.22,  8.72,
       19.08, 23.02, 22.82, 19.56, 16.2 , 19.66, 29.46, 10.56, 13.82,
       17.6 , 20.44, 21.8 , 19.68, 11.84, 34.54, 32.24, 43.84, 24.66,
       18.74, 19.34, 26.08, 23.  , 23.06, 22.02, 19.48, 11.2 , 13.14,
       14.34, 19.7 , 14.66, 32.44, 10.46, 11.9 , 18.38,  8.94, 14.24,
       29.42, 19.78, 12.5 , 14.76, 17.98, 22.4 , 11.18, 27.28, 29.24,
       24.02, 19.2 , 24.2 , 25.58, 25.46, 20.  , 14.8 , 14.02, 14.24,
       27.62, 21.8 , 18.52, 26.14, 20.16, 26.6 , 17.62, 13.3 , 23.26,
       11.32, 23.02, 24.98, 15.88, 15.3 , 10.28, 11.04, 24.86, 15.88,
       21.34, 18.6 , 33.82, 15.56, 11.82, 24.06, 21.46, 13.14, 34.8 ,
       21.34, 23.16, 23.16, 14.24, 14.1 , 24.18, 24.6 , 15.86, 13.38,
       22.64, 29.1 , 13.38, 40.92, 22.84, 21.42, 16.64, 41.16, 23.2 ,
       15.3 , 30.08, 15.7 , 12.48, 10.02, 21.2 , 24.72, 24.1 , 22.4 ,
       15.86, 18.92, 30.86, 31.34, 21.92, 24.66, 21.1 , 21.5 , 22.66,
        9.76, 21.66,

### (5) 검증 : 평가

In [35]:
# RMSE
# RMSE, computed as the square root of the MSE.
# The `squared=False` shortcut was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the explicit square root is used to stay version-independent.
np.sqrt(mean_squared_error(y_val, pred))

4.227564933050456

In [36]:
# MAE: mean absolute error, in the same units as the target.
mean_absolute_error(y_true=y_val, y_pred=pred)

3.035

In [37]:
# MAPE: mean absolute error expressed as a fraction of the true value.
mean_absolute_percentage_error(y_true=y_val, y_pred=pred)

0.16339021011462934

In [38]:
# 1 - MAPE: used here as a rough "accuracy" figure.
1 - mean_absolute_percentage_error(y_true=y_val, y_pred=pred)

0.8366097898853706

## 스케일링 2) StandardScaler

### (3) 모델링(학습)

In [39]:
# Refit the same estimator on the standardised training features.
knn.fit(X=x_train_s2, y=y_train)

KNeighborsRegressor()

### (4) 검증 : 예측

In [40]:
# Predict the validation targets from the standardised validation features.
pred = knn.predict(X=x_val_s2)
pred

array([19.5 , 15.46, 21.46, 30.66, 12.6 , 14.24, 24.06, 16.22,  8.72,
       18.4 , 21.58, 22.82, 19.3 , 15.04, 19.66, 31.32, 11.96, 10.34,
       19.28, 20.4 , 21.8 , 18.44, 13.3 , 31.98, 32.24, 43.84, 23.58,
       18.74, 19.34, 27.42, 23.  , 23.48, 21.9 , 19.48, 10.34, 14.38,
       14.64, 17.98, 14.48, 38.88, 10.66, 12.08, 18.3 ,  7.8 , 14.24,
       30.72, 19.8 , 13.32, 14.54, 17.98, 22.4 , 11.18, 28.96, 29.24,
       22.84, 19.26, 23.36, 25.58, 25.46, 19.58, 14.88, 14.02, 14.24,
       27.62, 21.8 , 19.52, 28.28, 20.16, 23.68, 18.42, 13.3 , 22.2 ,
       13.32, 23.02, 24.98, 15.24, 15.3 , 12.5 , 13.3 , 24.46, 15.88,
       20.86, 19.8 , 28.24, 15.54, 11.88, 24.5 , 19.84, 13.66, 34.8 ,
       20.56, 23.16, 22.84, 14.24, 14.24, 24.12, 25.94, 15.86, 14.84,
       21.84, 30.8 , 13.6 , 43.1 , 22.84, 22.26, 15.88, 42.56, 24.66,
       15.3 , 30.08, 15.32, 12.5 ,  9.24, 21.46, 25.96, 22.44, 22.4 ,
       15.5 , 17.86, 28.84, 35.44, 23.26, 24.46, 21.1 , 21.86, 21.84,
       10.82, 22.78,

### (5) 검증 : 평가

In [41]:
# RMSE
# RMSE, computed as the square root of the MSE.
# The `squared=False` shortcut was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the explicit square root is used to stay version-independent.
np.sqrt(mean_squared_error(y_val, pred))

4.191138835309815

In [42]:
# MAE: mean absolute error, in the same units as the target.
mean_absolute_error(y_true=y_val, y_pred=pred)

2.971710526315789

In [43]:
# MAPE: mean absolute error expressed as a fraction of the true value.
mean_absolute_percentage_error(y_true=y_val, y_pred=pred)

0.17378681288843553

In [44]:
# 1 - MAPE: used here as a rough "accuracy" figure.
1 - mean_absolute_percentage_error(y_true=y_val, y_pred=pred)

0.8262131871115644

## 5.Hyper Parameter

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

* n_neighbors : 이웃의 개수(k). k가 달라지면 예측결과도 달라지고, 성능도 달라집니다!
* metric : 거리계산 방식.
    * euclidean : 유클리디안 거리 :  sqrt(a^2 + b^2)  (a, b는 두 점의 변수별 값 차이)
    * manhattan : 맨하탄거리 : |a| + |b|

In [45]:
# model1: k = 10 neighbours, Euclidean distance, MinMax-scaled features.
# fit() returns the estimator itself, so construction and training are chained.
model1 = KNeighborsRegressor(metric='euclidean', n_neighbors=10).fit(x_train_s1, y_train)
pred1 = model1.predict(x_val_s1)

In [46]:
# model2: k = 10 neighbours, Manhattan distance, MinMax-scaled features.
# fit() returns the estimator itself, so construction and training are chained.
model2 = KNeighborsRegressor(metric='manhattan', n_neighbors=10).fit(x_train_s1, y_train)
pred2 = model2.predict(x_val_s1)

In [47]:
# Compare validation RMSE of the two distance metrics.
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(f'metric = euclidean : rmse {np.sqrt(mean_squared_error(y_val, pred1))}')
print(f'metric = manhattan : rmse {np.sqrt(mean_squared_error(y_val, pred2))}')

metric = euclidean : rmse 4.841955074901916
metric = manhattan : rmse 4.420933680260671


In [48]:
# Compare validation MAE of the two distance metrics.
print(f'metric = euclidean : mae {mean_absolute_error(y_true=y_val, y_pred=pred1)}')
print(f'metric = manhattan : mae {mean_absolute_error(y_true=y_val, y_pred=pred2)}')

metric = euclidean : mae 3.462763157894737
metric = manhattan : mae 3.2408552631578953


In [49]:
# Compare validation MAPE of the two distance metrics.
print(f'metric = euclidean : mape {mean_absolute_percentage_error(y_true=y_val, y_pred=pred1)}')
print(f'metric = manhattan : mape {mean_absolute_percentage_error(y_true=y_val, y_pred=pred2)}')

metric = euclidean : mape 0.18505949742155064
metric = manhattan : mape 0.17819962243785456


## 6.연습문제
* 다음의 조건을 조정하며 모델을 생성하고 성능을 비교해 봅시다.
* 조건
    * 스케일링 데이터 : 하이퍼파라미터는 default로 두고, 스케일링 데이터만 달리하며 비교해 봅시다.
        * 정규화 : x_train_s1, x_val_s1
        * 표준화 : x_train_s2, x_val_s2
    * k : 
        * k 값을 1에서 50까지 1씩 증가시켜가며 
        * 성능 rmse, mae, mape를 구하고 최적의 k 값을 찾아 봅시다.

### (1) 스케일링 데이터 비교.

In [None]:
# 정규화




In [None]:
# 표준화




### (2) k 값