##### 실습 - 농어 무게 예측 모델 비교

농어의 길이, 높이, 너비값을 사용해 무게를 예측하는 문제를 최근접이웃모델과 선형회귀모델로 해결해보세요.

성능 평가지표로는 평균제곱오차(MSE), 평균절대오차(MAE), 제곱평균제곱근오차(RMSE), 평균제곱로그오차(MSLE), 제곱평균제곱근로그오차(RMSLE), 결정계수(R²)를 사용하고, 결과를 출력하세요.

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
perch_df = pd.read_csv('./data/perch_full.csv')
perch_df

Unnamed: 0,length,height,width
0,8.4,2.11,1.41
1,13.7,3.53,2.0
2,15.0,3.82,2.43
3,16.2,4.59,2.63
4,17.4,4.59,2.94
5,18.0,5.22,3.32
6,18.7,5.2,3.12
7,19.0,5.64,3.05
8,19.6,5.14,3.04
9,20.0,5.08,2.77


In [7]:
perch_weight = np.array([
    5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,
    110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0,
    130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0,
    197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0,
    514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0,
    820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0,
    1000.0, 1000.0])

In [8]:
perch_full = perch_df.to_numpy()

In [9]:
from sklearn.model_selection import train_test_split

# 훈련/테스트 데이터 분리
train_input, test_input, train_label, test_label = train_test_split(perch_full, perch_weight, random_state=42)
train_input.shape, test_input.shape, train_label.shape, test_label.shape

((42, 3), (14, 3), (42,), (14,))

In [10]:
# 특성 스케일링
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
# scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_input)
test_scaled = scaler.transform(test_input)

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_log_error
from sklearn.metrics import mean_squared_log_error, root_mean_squared_error

# 평가 함수 (MSE, MAE, RMSE, MSLE, RMSLE, R^2)
def evaluate_regressor(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    # log 함수에 음수값이 전달해 처리할 수 없다
    # - 세밀한 제어를 위해 꼭 log함수로 필요하다면, 양수로 전환하기 위한 상수를 더해준다.
    y_true += 1000
    y_pred += 1000
    msle = mean_squared_log_error(y_true, y_pred)
    rmsle = root_mean_squared_log_error(y_true, y_pred)

    print('mse={0:.4f}, mae={1:.4f}, rmse={2:.4f}, msle={3:.4f}, rmsle={4:.4f}, r^2={5:.4f}'.format(mse, mae, rmse, msle, rmsle, r2))

In [12]:
# 훈련
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

knr = KNeighborsRegressor()
knr.fit(train_scaled, train_label)

print('KNeighborsRegressor 모델 평가 :')
y_pred = knr.predict(train_scaled)
evaluate_regressor(train_label, y_pred)
y_pred = knr.predict(test_scaled)
evaluate_regressor(test_label, y_pred)

lr = LinearRegression()
lr.fit(train_scaled, train_label)

print('\nLinearRegression 모델 평가 : ')
y_pred = lr.predict(train_scaled)
evaluate_regressor(train_label, y_pred)
y_pred = lr.predict(test_scaled)
evaluate_regressor(test_label, y_pred)

KNeighborsRegressor 모델 평가 :
mse=1701.2457, mae=29.0952, rmse=41.2462, msle=0.0006, rmsle=0.0250, r^2=0.9862
mse=973109.0129, mae=985.9143, rmse=986.4629, msle=0.3375, rmsle=0.5810, r^2=-8.6950

LinearRegression 모델 평가 : 
mse=5437.9005, mae=57.7017, rmse=73.7421, msle=0.0009, rmsle=0.0308, r^2=0.9559
mse=12080.6531, mae=78.2072, rmse=109.9120, msle=0.0031, rmsle=0.0557, r^2=0.8796
