# 목표 : 당뇨병 예측 
- 데이터 : scikit-learn 내장 데이터셋 (`load_diabetes`)
- 피쳐 : 10개
- 타겟 : 정수값 (기준 시점으로부터 1년 후 당뇨병 진행도의 정량적 측정값)  
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes

## 1. 모듈 로딩 및 데이터 준비

In [1]:
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

In [76]:
# Load the scikit-learn diabetes dataset as a dict-like object
# (keys used below: "data", "target", "feature_names").
diabetes = load_diabetes()
# Bare expression as the last line so the notebook displays the loaded object.
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [77]:
# Pull the feature matrix, the target vector and the feature column names
# out of the loaded dataset in a single unpacking step.
featureArray, targetArray, feature_names = (
    diabetes[key] for key in ("data", "target", "feature_names")
)

In [78]:
# Shape of the feature matrix: (number of samples, number of features).
featureArray.shape

(442, 10)

In [79]:
# Shape of the target vector: one value per sample.
targetArray.shape

(442,)

In [80]:
# Names of the feature columns.
feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [81]:
# Load the same dataset again, this time as pandas objects
# (features as a DataFrame, target as a Series).
# Original note: it is unclear what the target represents. Per the scikit-learn
# documentation for load_diabetes, it is a quantitative measure of disease
# progression one year after baseline.
featureDF, targetDF = load_diabetes(return_X_y = True, as_frame = True)

In [82]:
# Display the target Series.
targetDF

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [83]:
# Correlation of each feature column with the target, used below to judge
# which features look informative.
correlation_with_target = featureDF.corrwith(targetDF)
# Bare expression so the notebook displays the per-feature correlations.
correlation_with_target

age    0.187889
sex    0.043062
bmi    0.586450
bp     0.441482
s1     0.212022
s2     0.174054
s3    -0.394789
s4     0.430453
s5     0.565883
s6     0.382483
dtype: float64

In [25]:
# "sex" has by far the weakest correlation with the target (about 0.04), so
# drop it; undecided about the remaining features.
# Use columns= rather than axis=True: a boolean axis only happens to work
# because True == 1. Reassigning instead of inplace=True avoids mutating the
# loaded frame in place.
featureDF = featureDF.drop(columns="sex")
featureDF

Unnamed: 0,age,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...
437,0.041708,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [26]:
from sklearn.model_selection import train_test_split
# train:test = 8:2 (test_size=0.2); random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetDF, test_size=0.2, random_state=42)

In [84]:
from sklearn.linear_model import LinearRegression, Ridge
# Two candidate regressors, both constructed with default hyperparameters:
# plain linear regression and ridge regression.
LR_model = LinearRegression()
Ridge_model = Ridge()

<hr> 리니어

In [32]:
# Fit the linear regression model on the training split.
LR_model.fit(X_train, y_train)

In [59]:
# R^2 on the training split (score() matches r2_score, as the test-split values further down show).
LR_model.score(X_train, y_train) 

0.5107312251753249

In [46]:
# R^2 on the held-out test split, for comparison with the training score.
LR_model.score(X_test, y_test)

0.4362688983095515

train/test score의 차이가 크지 않아(약 0.51 vs 0.44) 과대적합은 아닌 것으로 보임. 다만 두 값 모두 낮아 모델의 설명력 자체는 부족함(과소적합에 가까움).

In [61]:
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

# Predict on the held-out split, then report MSE, RMSE and R^2 for the
# linear regression model.
test_y_predict = LR_model.predict(X_test)

error = mean_squared_error(y_test, test_y_predict)
rmse = error ** 0.5  # RMSE is the square root of the MSE
r2 = r2_score(y_test, test_y_predict)

print(f"mse => {error}")
print(f"RMSE => {rmse}")
print(f"R2(결정계수) => {r2}")
# Original note: the results are disappointing.

mse => 2986.7329221144396
RMSE => 54.65101025703404
R2(결정계수) => 0.4362688983095515


<hr> 릿지

In [85]:
# Fit the ridge regression model on the same training split.
Ridge_model.fit(X_train, y_train)

In [86]:
# Score of the ridge model on the training split.
Ridge_model.score(X_train, y_train) 

0.432649200412765

In [87]:
# Score of the ridge model on the held-out test split.
Ridge_model.score(X_test, y_test)

0.41379383889429944