#### 보스턴 집값 예측 모델
- 데이터셋 : boston.csv
- 학습방법 : 지도학습 >> 회귀
- 피쳐/독립 : 13개
- 타겟/종속 : 1개

### 1. 데이터 준비

In [12]:
# 모듈로딩
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [13]:
# Path of the Boston housing CSV that the notebook loads.
DATA_FILE = "../data/boston.csv"


In [14]:
# Load the CSV at DATA_FILE into a DataFrame, then print its summary
# (columns, non-null counts, dtypes).
datadf = pd.read_csv(DATA_FILE)
datadf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [15]:
# Count how many times each distinct row (across all columns) occurs.
datadf.value_counts()

CRIM      ZN    INDUS  CHAS  NOX    RM     AGE    DIS      RAD  TAX    PTRATIO  B       LSTAT  MEDV
0.00632   18.0  2.31   0     0.538  6.575  65.2   4.0900   1    296.0  15.3     396.90  4.98   24.0    1
1.00245   0.0   8.14   0     0.538  6.674  87.3   4.2390   4    307.0  21.0     380.23  11.98  21.0    1
1.38799   0.0   8.14   0     0.538  5.950  82.0   3.9900   4    307.0  21.0     232.60  27.71  13.2    1
1.35472   0.0   8.14   0     0.538  6.072  100.0  4.1750   4    307.0  21.0     376.73  13.04  14.5    1
1.34284   0.0   19.58  0     0.605  6.066  100.0  1.7573   5    403.0  14.7     353.89  6.43   24.3    1
                                                                                                      ..
0.11027   25.0  5.13   0     0.453  6.456  67.8   7.2255   8    284.0  19.7     396.90  6.73   22.2    1
0.10959   0.0   11.93  0     0.573  6.794  89.3   2.3889   1    273.0  21.0     393.45  6.48   22.0    1
0.10793   0.0   8.56   0     0.520  6.195  54.4   2.7778   5

### 2. 데이터 전처리
- 2-1 데이터 정제

결측치, 중복값, 이상치 처리, 컬럼별 고유값 추출로 이상 데이터 체크

#### 2-2 : 표준화 & 정규화 ---> 적용 여부에 따른 성능 변화는 경우에 따라 다름
 - 정규분포 데이터셋을 기반으로 한 모델 -> StandardScaler, Log 변환
 - 피쳐 간 값의 범위 차이 줄이기 --> 피쳐 스케일링 : MinMaxScaler, RobustScaler ...
 - 범주형 피쳐 ---> 수치화 인코딩 : OneHotEncoder, OrdinalEncoder
 - 문자열 타겟 ---> 정수 라벨 인코딩 : LabelEncoder

#### 2-3 : 피쳐와 타겟 분리

In [16]:
# Separate the target column (MEDV) from the features
# (every column except the last one), then report both shapes.
target = datadf['MEDV']
featuredf = datadf.iloc[:, :-1]
print(f'featuredf : {featuredf.shape}   target : {target.shape}')

featuredf : (506, 13)   target : (506,)


#### 3. 학습준비

3-1 학습용 데이터셋과 테스트용 데이터셋 분리

In [17]:
# Split features and target into train and test parts; random_state is fixed
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    featuredf,
    target,
    random_state=11,
)

# Report the shape of each of the four pieces.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(379, 13) (127, 13) (379,) (127,)


3-2 학습용 데이터셋으로 스케일러 생성

In [18]:
# The numeric features span very different ranges, so they are scaled.
# The scaler is fitted on the training split only.
ssScaler = StandardScaler().fit(x_train)

In [19]:
# Apply the scaler fitted on the training split to both splits.
x_train_s, x_test_s = (ssScaler.transform(part) for part in (x_train, x_test))

### 4. 학습 진행 ---> 교차검증 진행


In [35]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge, Lasso

In [36]:
# Hand-rolled hyper-parameter sweep: try several regularisation strengths
# (alpha) and compare 3-fold cross-validation scores for each one.
alpha_values = [0., 1., 10, 100, 1000]

for value in alpha_values:

    # Fresh estimator for this alpha. The estimator is a Lasso, so it is named
    # and reported as one (it was previously labelled "Ridge").
    # NOTE(review): max_iter=3 stops coordinate descent after three passes; the
    # recorded run of this cell emitted warnings from
    # cd_fast.enet_coordinate_descent, so the fits are presumably not
    # converged -- raise max_iter if converged coefficients are wanted.
    # NOTE(review): scikit-learn's Lasso documentation advises against alpha=0;
    # confirm that 0. is meant to stay in the sweep.
    lasso_model = Lasso(alpha=value, max_iter=3)

    # 3-fold CV scored with negative MSE and R^2. Training-fold scores and the
    # estimator fitted on each fold are returned as well.
    res = cross_validate(
        lasso_model, x_train_s, y_train,
        cv=3,
        scoring=['neg_mean_squared_error', 'r2'],
        return_train_score=True,
        return_estimator=True,
    )

    # Keep only the R^2 columns. diff = test - train, so a negative value means
    # the fold scored lower on its held-out part than on its training part.
    resdf = pd.DataFrame({'test_r2': res['test_r2'], 'train_r2': res['train_r2']})
    resdf['diff'] = resdf['test_r2'] - resdf['train_r2']

    # Coefficients of the estimator fitted on the first fold.
    print(res['estimator'][0].coef_)
    # The label is taken from the estimator class so it cannot drift from the
    # model that was actually fitted.
    print(f'[{type(lasso_model).__name__} alpha = {value}]')
    print(resdf, end='\n\n')


[-0.44536893  0.64235274 -0.5894122   1.00002976 -1.74553684  3.53450646
  0.36673905 -2.50600017  0.8354207  -0.70590486 -1.72392346  1.29147292
 -3.03752577]
[Ridge alpha = 0.0]
    test_r2  train_r2      diff
0  0.821414  0.697573  0.123840
1  0.623890  0.769995 -0.146104
2  0.652630  0.745696 -0.093066

[-0.         -0.         -0.          0.23179892 -0.          3.44605627
 -0.         -0.         -0.         -0.08736485 -1.29427183  0.81369581
 -2.72850689]
[Ridge alpha = 1.0]
    test_r2  train_r2      diff
0  0.741372  0.651422  0.089950
1  0.518588  0.707633 -0.189045
2  0.656041  0.695648 -0.039607

[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[Ridge alpha = 10]
    test_r2  train_r2      diff
0 -0.029650       0.0 -0.029650
1 -0.008880       0.0 -0.008880
2 -0.008048       0.0 -0.008048

[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[Ridge alpha = 100]
    test_r2  train_r2      diff
0 -0.029650       0.0 -0.029650
1 -0.008880       0.0 -0.008880
2 -0.00804

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [25]:
# Full cross_validate output as a table. `res` is rebound on every pass of the
# alpha loop, so this shows only the result of the most recent pass.
resdf=pd.DataFrame(res)
resdf

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.050559,0.000987,-19.109413,-22.886698,0.791931,0.71801
1,0.0,0.0,-30.792542,-17.156589,0.660381,0.791863
2,0.003849,0.00101,-22.263024,-21.14948,0.689283,0.770055


- 하이퍼파라미터 튜닝과 교차 검증을 동시에 진행

In [37]:
from sklearn.model_selection import GridSearchCV

In [39]:
# Hyper-parameter grid for Ridge: every alpha is paired with every max_iter.
#   (alpha=0.0, max_iter=3)   (alpha=0.0, max_iter=5)
#   (alpha=0.1, max_iter=3)   (alpha=0.1, max_iter=5)
#   (alpha=0.5, max_iter=3)   (alpha=0.5, max_iter=5)
#   (alpha=1.0, max_iter=3)   (alpha=1.0, max_iter=5)
# -> 4 alphas x 2 max_iter values = 8 candidate Ridge models
# NOTE(review): in the recorded cv_results_ table the max_iter=3 and
# max_iter=5 rows have identical scores at every alpha, so max_iter may have
# no effect with the solver in use -- confirm it belongs in the grid.
params = dict(
    alpha=[0., 0.1, 0.5, 1.0],
    max_iter=[3, 5],
)

In [41]:
# Base estimator whose hyper-parameters are searched.
rModel = Ridge()

# Grid search over `params` with 3-fold cross-validation; training-fold
# scores are recorded alongside the test-fold scores.
serchCV = GridSearchCV(
    estimator=rModel,
    param_grid=params,
    cv=3,
    verbose=True,
    return_train_score=True,
)

In [42]:
# Run the search on the scaled training features and the training target.
serchCV.fit(x_train_s, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [43]:
# After fit(): the parameter combination the search selected as best.
serchCV.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [46]:
# Keep a handle on the estimator the search exposes as its best model.
bestModel=serchCV.best_estimator_
bestModel

In [45]:
# Position of the selected candidate within the cv_results_ arrays.
serchCV.best_index_

6

In [47]:
# Per-candidate search results (timings, per-fold and mean scores, ranks) as a
# table. This rebinds `resdf`.
resdf=pd.DataFrame(serchCV.cv_results_)
resdf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001681,0.0006187083,0.000848,0.0002353036,0.0,3,"{'alpha': 0.0, 'max_iter': 3}",0.791372,0.660149,0.687806,0.713109,0.056481,7,0.718084,0.791911,0.7701,0.760032,0.030969
1,0.000962,3.462756e-05,0.000336,0.0004746297,0.0,5,"{'alpha': 0.0, 'max_iter': 5}",0.791372,0.660149,0.687806,0.713109,0.056481,7,0.718084,0.791911,0.7701,0.760032,0.030969
2,0.001,4.590203e-06,0.000661,0.000467624,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.791442,0.660176,0.687963,0.713194,0.056481,5,0.718083,0.791911,0.7701,0.760031,0.03097
3,0.001327,0.0004715406,0.000997,7.37001e-07,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.791442,0.660176,0.687963,0.713194,0.056481,5,0.718083,0.791911,0.7701,0.760031,0.03097
4,0.000997,1.461091e-06,0.000667,0.0004719468,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.791689,0.660276,0.688568,0.713511,0.056474,3,0.718064,0.791899,0.770089,0.760017,0.030973
5,0.001025,4.122382e-05,0.000663,0.0004690664,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.791689,0.660276,0.688568,0.713511,0.056474,3,0.718064,0.791899,0.770089,0.760017,0.030973
6,0.000664,0.0004694603,0.000664,0.0004698532,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.791931,0.660381,0.689283,0.713865,0.056448,1,0.71801,0.791863,0.770055,0.759976,0.030982
7,0.000997,2.973602e-07,0.0,0.0,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.791931,0.660381,0.689283,0.713865,0.056448,1,0.71801,0.791863,0.770055,0.759976,0.030982
