In [1]:
import re
import sys

# Guard: the notebook targets Python 3.5 or newer.
assert sys.version_info >= (3, 5)

import sklearn

# Compare the (major, minor) release numerically. Comparing the raw version
# strings is lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass.
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Default font sizes for axis labels and tick labels on every figure.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import tarfile
import urllib
import pandas as pd

# Single seed passed as random_state to the splitters and tree-based models below.
random_seed = 42

In [2]:
from utils import load_housing_data, save_fig, CombinedAttributesAdder

# Load the housing data through the project's own helper (defined in utils,
# not shown here); the following cells use `housing` as a pandas DataFrame.
housing = load_housing_data()

In [3]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Bucket the continuous median income into five ordinal categories so the
# split below can stratify on it.
housing["income_cat"] = pd.cut(housing["median_income"],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_seed)
# split() yields positional row numbers, so select with .iloc; .loc only picks
# the same rows when the index labels happen to equal the row positions.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set  = housing.iloc[test_index]

# Purely random split of the same frame (not used by the cells visible here).
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=random_seed)

# Separate features from the target.
# NOTE(review): "income_cat" is not dropped, so it stays in the training
# features (it shows up in the coefficient listing further down) - confirm
# that this is intended.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in this order:
#   1. fill missing values with the column median,
#   2. add extra attributes via the project's CombinedAttributesAdder,
#   3. standardise the resulting columns.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column groups: the single text column vs. every column of the numeric frame.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

# Route each group to its own transformer; the outputs are concatenated.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

## 5. 모델 훈련 (Train a Model)

드디어 모델을 훈련시킬 준비가 되었습니다! 
#### 지난 시간에 배웠던 선형회귀모델(Linear Regression)을 사용해보겠습니다.

In [6]:
from sklearn.linear_model import LinearRegression

# Linear baseline fitted on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

LinearRegression()

모델 훈련은 딱 3줄의 코드면 충분합니다!

몇 개의 샘플에 모델을 적용해 예측값을 확인해보고 실제값과 비교해보겠습니다.

In [7]:
# Show the fitted coefficients of the linear model.
lin_reg.coef_

array([-54137.20708781, -55013.28721161,  14392.68343212,  -4044.59678427,
         8934.3560095 , -44845.68344469,  44884.77513654,  63514.86266688,
        14179.47426921,   6474.20838974,    994.94379601,  10585.71987677,
       -19784.87521119, -55494.63192495, 114626.55255433, -24057.75380645,
       -15289.29161173])

In [8]:
# NOTE(review): these names must match the columns CombinedAttributesAdder
# appends, in the same order - verify against utils.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder   = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes    = num_attribs + extra_attribs + cat_one_hot_attribs
# zip() silently stops at the shorter input, which would mislabel or drop
# coefficients if the name list ever drifts from the prepared columns.
assert len(attributes) == len(lin_reg.coef_), "attribute names do not line up with coefficients"
sorted(zip(lin_reg.coef_, attributes), reverse=True)

## The larger a coefficient's absolute value, the more it moves the predicted
## price (positive raises it, negative lowers it).
## -> Do not judge whether a feature is important from coefficient size alone.

[(114626.55255432795, 'ISLAND'),
 (63514.86266687522, 'median_income'),
 (44884.775136536126, 'households'),
 (14392.683432117343, 'housing_median_age'),
 (14179.474269208375, 'income_cat'),
 (10585.719876767198, 'bedrooms_per_room'),
 (8934.35600949974, 'total_bedrooms'),
 (6474.208389735375, 'rooms_per_hhold'),
 (994.9437960060471, 'pop_per_hhold'),
 (-4044.596784266778, 'total_rooms'),
 (-15289.291611731773, 'NEAR OCEAN'),
 (-19784.875211193415, '<1H OCEAN'),
 (-24057.753806449313, 'NEAR BAY'),
 (-44845.68344469058, 'population'),
 (-54137.20708780599, 'longitude'),
 (-55013.287211605086, 'latitude'),
 (-55494.63192495342, 'INLAND')]

In [9]:
# Run the first five training rows through the fitted pipeline and the model.
some_data   = housing.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", np.round(lin_reg.predict(some_data_prepared), decimals=1))

Predictions: [203682.4 326371.4 204218.6  58685.5 194213.1]


In [10]:
# True target values of the same five rows, to compare with the predictions.
print("Labels:", list(some_labels))

Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


전체 훈련 데이터셋에 대한 RMSE를 측정해 보겠습니다.

In [11]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model measured on the data it was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse  = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse   # about $70k, i.e. roughly 80 million KRW

68376.64295459939

훈련 데이터셋에 대한 RMSE가 이처럼 크다면 모델이 훈련 데이터조차 충분히 학습하지 못한 것입니다 => 과소적합 (Underfitting)

과소 적합이 일어나는 이유?
- 특성들(Features)이 충분한 정보를 제공하지 못함
- 모델이 충분히 강력하지 못함

#### 강력한 비선형모델인 DecisionTreeRegressor를 사용해보겠습니다.

In [12]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree fitted on the same prepared features; seeded for repeatability.
tree_reg = DecisionTreeRegressor(random_state=random_seed)
tree_reg.fit(X=housing_prepared, y=housing_labels)

DecisionTreeRegressor(random_state=42)

In [13]:
# RMSE of the decision tree measured on the data it was trained on.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse  = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

이 모델이 선형모델보다 낫다고 말할 수 있을까요? 어떻게 알 수 있을까요? 훈련 데이터에 대한 오차가 0이라는 것은 모델이 훈련 데이터를 그대로 외웠을 수 있다는 뜻이므로, 과대적합(Overfitting)일 가능성이 있습니다. 다음과 같은 방법으로 확인할 수 있습니다.

- 테스트 데이터셋을 이용한 검증
- 훈련 데이터셋의 일부를 검증데이터(Validation Data)셋으로 분리해서 검증
- K-겹 교차 검증 (K-fold Cross-Validation)

### 교차 검증(Cross-Validation)을 사용한 평가

#### 결정트리 모델에 대한 평가

In [14]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer reports negated
# MSE, so flip the sign before taking the square root to obtain RMSE.
scores = cross_val_score(estimator=tree_reg, X=housing_prepared, y=housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(np.negative(scores))

In [15]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (the notebook
    passes NumPy arrays). Nothing is returned.
    """
    # Each entry is evaluated only when its line is printed, so the output
    # order and the order of the method calls stay interleaved.
    report = (
        ("Scores :", lambda s: s),
        ("Mean :", lambda s: s.mean()),
        ("Standard Deviation :", lambda s: s.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

# Summarise the cross-validated RMSE scores of the decision tree.
display_scores(tree_rmse_scores)

Scores : [70274.7991723  67258.3624668  71350.42593227 68882.91340979
 70987.99296566 74177.52037059 70788.57311306 70850.53018019
 76430.62239321 70212.6471067 ]
Mean : 71121.4387110585
Standard Deviation : 2434.3080046605132


#### 선형 모델에 대한 평가

In [16]:
# Same 10-fold evaluation for the linear model (negated MSE -> RMSE).
scores = cross_val_score(estimator=lin_reg, X=housing_prepared, y=housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(np.negative(scores))
display_scores(lin_rmse_scores)

Scores : [66877.52325028 66608.120256   70575.91118868 74179.94799352
 67683.32205678 71103.16843468 64782.65896552 67711.29940352
 71080.40484136 67687.6384546 ]
Mean : 68828.99948449328
Standard Deviation : 2662.761570610338


#### RandomForestRegressor에 대한 평가

여러 개의 결정트리(나무)를 모아 숲처럼 함께 예측하는 앙상블 모델입니다.

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees, seeded for repeatability, fitted on the prepared features.
forest_reg = RandomForestRegressor(random_state=random_seed, n_estimators=100)
forest_reg.fit(X=housing_prepared, y=housing_labels)

RandomForestRegressor(random_state=42)

In [19]:
from sklearn.model_selection import cross_val_score

# Same 10-fold evaluation for the random forest (negated MSE -> RMSE).
forest_scores = cross_val_score(estimator=forest_reg, X=housing_prepared, y=housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

Scores : [49633.454487   47565.18736589 50034.52343496 52444.25008076
 49463.18392461 53515.10766804 48577.59023701 47678.90315126
 53433.75372939 49947.64462367]
Mean : 50229.359870258486
Standard Deviation : 2082.3174179664184
