In [1]:
# 당뇨병데이터에 대해서
# 모든 Feature
# - Linear 선형회귀
# - Ridge
# - Lasso
# - SV
# - RF 
# - XGB
# 에 대해 
# R^2(결정계수), MSE, 예측치 비교

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import load_diabetes 
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df['target'] = diabetes.target 
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [4]:
X = df.iloc[:, :-1].values
y = df.target.values

In [5]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=2023
)

In [6]:
from sklearn.metrics import r2_score, mean_squared_error 

In [7]:
# Linear
from sklearn.linear_model import LinearRegression 
linear = LinearRegression()
linear.fit(X_train, y_train)
pred_lr = linear.predict(X_test)
r2_lr = r2_score(y_test, pred_lr)
mse_lr = mean_squared_error(y_test, pred_lr)

In [8]:
# Ridge
from sklearn.linear_model import Ridge 
ridge = Ridge(random_state=2023)
ridge.fit(X_train, y_train)
pred_rg = ridge.predict(X_test)
r2_rg = r2_score(y_test, pred_rg)
mse_rg = mean_squared_error(y_test, pred_rg)

In [9]:
# Lasso
from sklearn.linear_model import Lasso 
lasso = Lasso(random_state=2023)
lasso.fit(X_train, y_train)
pred_ls = lasso.predict(X_test)
r2_ls = r2_score(y_test, pred_ls)
mse_ls = mean_squared_error(y_test, pred_ls)

In [10]:
# SV
from sklearn.svm import SVR 
sv = SVR()
sv.fit(X_train, y_train)
pred_sv = sv.predict(X_test)
r2_sv = r2_score(y_test, pred_sv)
mse_sv = mean_squared_error(y_test, pred_sv)

In [11]:
# RF
from sklearn.ensemble import RandomForestRegressor 
rfr = RandomForestRegressor(random_state=2023)
rfr.fit(X_train, y_train)
pred_rfr = rfr.predict(X_test)
r2_rfr = r2_score(y_test, pred_rfr)
mse_rfr = mean_squared_error(y_test, pred_rfr)

In [12]:
# XGB
from xgboost import XGBRegressor
xgr = XGBRegressor()
xgr.fit(X_train, y_train)
pred_xgr = xgr.predict(X_test)
r2_xgr = r2_score(y_test, pred_xgr)
mse_xgr = mean_squared_error(y_test, pred_xgr)

In [13]:
# Ridge 모델이 R2값(0.4885)도 가장 크고, MSE값(3316.0657) 은 작아서 가장 예측도가 좋다.
print('\t\tLinear\t\tRidge\t\tLasso\t\tSV\t\tRF\t\tXGB')
print(f'R squared\t{r2_ls:.4f}\t\t{r2_rg:.4f}\t\t{r2_ls:.4f}\t\t{r2_sv:.4f}\t\t{r2_rfr:.4f}\t\t{r2_xgr:.4f}')
print(f'mean_squared_E\t{mse_ls:.4f}\t{mse_rg:.4f}\t{mse_ls:.4f}\t{mse_sv:.4f}\t{mse_rfr:.4f}\t{mse_xgr:.4f}')

		Linear		Ridge		Lasso		SV		RF		XGB
R squared	0.3414		0.4885		0.3414		0.1736		0.4374		0.3298
mean_squared_E	4269.5669	3316.0657	4269.5669	5357.8379	3647.4062	4345.2940


In [14]:
df = pd.DataFrame({
    'y': y_test, 'LR': pred_lr, 'RG': pred_rg, 'LS': pred_ls, 'SV': pred_sv, 'RF': pred_rfr, 'XG': pred_xgr
})
df

Unnamed: 0,y,LR,RG,LS,SV,RF,XG
0,265.0,164.438865,160.151435,163.228322,144.190589,196.66,240.086166
1,261.0,230.141881,202.152287,185.709959,157.247371,257.69,294.357269
2,160.0,112.861523,130.893089,136.437283,126.542081,82.91,69.451904
3,249.0,205.771219,193.404176,172.572808,155.329271,162.19,135.287415
4,102.0,105.694296,130.014978,125.594654,129.916467,89.69,63.781631
5,200.0,181.051091,168.462653,171.165806,144.85424,164.64,148.613083
6,42.0,82.067894,104.536203,109.488925,119.731117,96.46,97.405067
7,55.0,77.253222,94.52976,111.518618,129.977129,141.39,97.169762
8,178.0,128.891822,131.573181,138.565391,131.056091,145.14,157.598083
9,317.0,226.235038,208.643264,182.521455,150.626285,193.59,207.042252


In [15]:
# corr()  상관관계 종속 변수(y)와의 상관 관계
# corr() 함수는 데이터프레임의 각 변수들 간의 상관 관계를 계산하는 역할
df.corr()['y'].sort_values()

XG    0.617179
RF    0.664430
LS    0.700036
LR    0.744314
SV    0.760495
RG    0.761997
y     1.000000
Name: y, dtype: float64