#### 보스턴 집값 예측 모델
- 데이터셋 : boston.csv
- 학습방법 : 지도학습 >> 회귀
- 피쳐/독립 : 13개
- 타겟/종속 : 1개

[1] 데이터 준비

In [25]:
# 모듈로딩
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, minmax_scale, RobustScaler
from sklearn.model_selection import train_test_split

In [26]:
# Relative path of the Boston housing CSV file read by the cells below.
DATA_FILE = '../DATA/boston.csv'

In [27]:
# Read the CSV file into a DataFrame, then preview its first two rows.
dataDF = pd.read_csv(filepath_or_buffer=DATA_FILE)
dataDF.head(n=2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6


[2] 전처리
- [2-1] 데이터 정제

### 결측치, 중복값, 이상치, 컬럼별 고유값 추출로 이상 데이터 체크

- [2-2] 표준화 & 정규화 => 진행 여부에 따른 성능 변화는 경우마다 다르다
    * 정규분포 데이터셋을 기반으로 한 모델 => StandardScaler, MinMaxScaler, Log 변환
    * 피쳐 값의 범위 차이를 줄이기 => 피쳐 스케일링, MinMaxScaler, RobustScaler...
    * 범주형 피쳐 => 수치화 인코딩 OneHotEncoder, OrdinalEncoder
    * 문자열 타겟 => 정수 라벨인코딩 LabelEncoder

- [2-3] 피쳐와 타겟 분리

In [28]:
# Split the frame into features and target using a single column name.
# The target is selected by name, so the features are derived by dropping
# that same column instead of slicing by position; this keeps the two in
# agreement even if the column order of the CSV changes.
target_col = 'MEDV'
featureDF = dataDF.drop(columns=target_col)
targetSR = dataDF[target_col]

In [29]:
# Report the shapes of the feature frame and the target series.
print(f' featureDF : {featureDF.shape} targetSR : {targetSR.shape}')

 featureDF : (506, 13) targetSR : (506,)


[3] 학습 준비

- [3-1] 학습용 데이터셋과 테스트용 데이터셋 분리

In [30]:
# Split features and target into train and test sets. No test_size is
# passed, so the library default applies; random_state=10 is fixed so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    featureDF,
    targetSR,
    random_state=10,
)

In [31]:
# Report the shapes of the train and test splits.
print(f'X_train : {X_train.shape} y_train : {y_train.shape}')
print(f'X_test : {X_test.shape} y_test : {y_test.shape}')

X_train : (379, 13) y_train : (379,)
X_test : (127, 13) y_test : (127,)


- [3-2] 학습용 데이터셋으로 스케일러 생성

In [32]:
# The numeric features differ widely in range => apply scaling
ssScaler=StandardScaler()

# Fit the scaler on the training split only.
ssScaler.fit(X_train)

In [33]:
# Apply the already-fitted scaler to the train split and then the test split.
X_train_scaled, X_test_scaled = map(ssScaler.transform, (X_train, X_test))

[4] 학습 진행 => 교차검증으로 진행

In [40]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_validate

In [35]:
# Create the model instance
ridge_model=Ridge(alpha=1.0) # default value is 1.0

In [36]:
# Cross-validate the Ridge model on the scaled training data.
# - cv : 3 folds
# - scoring : 'neg_mean_squared_error' and 'r2'
# - return_train_score : also record the score on each training fold
cv_options = dict(
    cv=3,
    scoring=['neg_mean_squared_error', 'r2'],
    return_train_score=True,
)
result = cross_validate(ridge_model, X_train_scaled, y_train, **cv_options)

In [37]:
# Put the cross-validation result into a DataFrame and display it.
resultDF=pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.0,0.0,-17.320297,-20.143636,0.748283,0.755663
1,0.0,0.0,-22.582566,-18.210772,0.756292,0.740039
2,0.001604,0.0,-22.657585,-17.293662,0.680991,0.786097


In [42]:
# Hyper-parameter tuning: try several Ridge alpha values and compare the
# cross-validated train/test R^2 for each one.
# NOTE(review): scikit-learn's Ridge documentation advises against alpha=0
# (LinearRegression is suggested instead) - confirm the 0. entry is intended.
alpha_value = [0., 1., 10, 100, 1000]

for value in alpha_value:
    # Fresh Ridge model for this regularization strength.
    ridge_model = Ridge(alpha=value)

    # 3-fold cross-validation scored with 'neg_mean_squared_error' and 'r2';
    # train scores and the estimator fitted on each fold are returned too.
    result = cross_validate(
        ridge_model,
        X_train_scaled,
        y_train,
        cv=3,
        scoring=['neg_mean_squared_error', 'r2'],
        return_train_score=True,
        return_estimator=True,
    )

    # Keep only the R^2 columns and add the absolute train/test gap per fold.
    r2_scores = pd.DataFrame(result)[['test_r2', 'train_r2']]
    resultDF = r2_scores.assign(
        diff=(r2_scores['test_r2'] - r2_scores['train_r2']).abs()
    )

    # Show the coefficients of the estimator fitted on the first fold,
    # followed by the per-fold score table for this alpha.
    first_fold_estimator = result['estimator'][0]
    print(first_fold_estimator.coef_)
    print(f'[Ridge(alpha={value})]')
    print(resultDF, end='\n\n')

[-1.41407793  1.56590993  0.15536906  0.65522098 -2.36200159  2.31948624
  0.1173831  -3.59071105  2.71475429 -2.33252925 -1.88390034  1.04036915
 -3.50250877]
[Ridge(alpha=0.0)]
    test_r2  train_r2      diff
0  0.747022  0.755720  0.008699
1  0.756482  0.740082  0.016400
2  0.680801  0.786156  0.105355

[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[Ridge(alpha=1.0)]
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106

[-1.23221033  1.29302258 -0.12737786  0.70280521 -1.80949922  2.48028701
 -0.00860666 -2.99831755  1.75466332 -1.51704375 -1.73434856  1.00368486
 -3.30809117]
[Ridge(alpha=10)]
    test_r2  train_r2      diff
0  0.753103  0.752474  0.000629
1  0.755100  0.737457  0.017643
2  0.677471  0.783225  0.105755

[-0.78141029  0.70910255 -0.46407849  0.72503917 -0.69294458  2.41757287
 -0.

- 하이퍼파라미터 튜닝과 교차 검증을 동시에 진행

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
# Candidate Ridge hyper-parameter values explored by the grid search.
# NOTE(review): in the recorded cv_results_ the max_iter=3 and max_iter=5
# rows have identical scores for every alpha, so max_iter does not appear to
# influence these fits - confirm whether it belongs in the grid.
params = dict(
    alpha=[0., 0.1, 0.5, 1.0],
    max_iter=[3, 5],
)

In [46]:
# Base estimator whose hyper-parameters are searched.
rModel = Ridge()

# Grid search over `params` with 3-fold cross-validation; train scores are
# recorded as well. The variable name `serchCV` is kept as-is because the
# following cells refer to it.
serchCV = GridSearchCV(
    estimator=rModel,
    param_grid=params,
    cv=3,
    verbose=True,
    return_train_score=True,
)

In [47]:
# Run the grid search on the scaled training data
serchCV.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [48]:
# After fit(), inspect the parameter combination selected by the search
serchCV.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [50]:
# Keep a reference to the best estimator found by the search and display it.
bestModel=serchCV.best_estimator_
bestModel

In [51]:
# Tabulate the per-candidate cross-validation results and display them.
resultDF=pd.DataFrame(serchCV.cv_results_)
resultDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001151,0.000843,0.00019,0.000269,0.0,3,"{'alpha': 0.0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.002843,0.004021,0.000337,0.000476,0.0,5,"{'alpha': 0.0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.000338,0.000479,0.001324,0.000474,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
3,0.000664,0.000469,0.000331,0.000469,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
4,0.000999,0.000817,0.000665,0.00047,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
5,0.00066,0.000466,0.000669,0.000473,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
6,0.00066,0.000467,0.0,0.0,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
7,0.000596,0.000444,0.0,0.0,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
