In [1]:
import sys
# Require Python 3.5 or newer.
assert sys.version_info >= (3, 5)

import sklearn


def _major_minor(version):
    """Return the leading ``(major, minor)`` integers of a dotted version string.

    Only the leading digits of each field are used, so suffixes such as
    ``rc1`` or ``dev0`` are ignored. A missing or non-numeric field counts
    as 0.
    """
    numbers = []
    for field in (version.split(".") + ["0", "0"])[:2]:
        digits = ""
        for ch in field:
            if ch not in "0123456789":
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    return tuple(numbers)


# Compare numerically rather than as strings: a plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
assert _major_minor(sklearn.__version__) >= (0, 20)

import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default font sizes for axis labels and for x/y tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import tarfile
import urllib
import pandas as pd

# Seed passed as random_state to the splitters and models in the cells below.
random_seed = 42

In [2]:
from utils import load_housing_data, save_fig, CombinedAttributesAdder, display_scores

# Load the housing dataset through the project helper from utils.
# Presumably a pandas DataFrame (later cells index it by column name) — confirm in utils.
housing = load_housing_data()

In [3]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Bucket the continuous median income into five categories so that the
# train/test split can be stratified on it.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Single stratified 80/20 split. split() yields positional row indices, so
# rows are selected with .iloc; .loc would only agree when the frame has the
# default 0..n-1 index.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_seed)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Plain random 80/20 split with the same seed.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=random_seed)

# Separate the predictors from the target for the stratified training set.
# NOTE(review): "income_cat" is not dropped after the split, so it stays in
# both sets and is passed to the model as a feature — confirm this is intended.
housing = strat_train_set.drop("median_house_value", axis=1)  # drop labels
housing_labels = strat_train_set["median_house_value"].copy()
# Everything except the text column goes through the numeric pipeline.
housing_num = housing.drop("ocean_proximity", axis=1)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing chain for the numeric columns: median imputation, then the
# project's CombinedAttributesAdder, then standardization.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column groups: the single text column is one-hot encoded, every column of
# housing_num goes through the numeric pipeline.
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing_num.columns)

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit the preprocessing on the training predictors and transform them in one go.
housing_prepared = full_pipeline.fit_transform(housing)

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with 100 trees, fitted on the prepared training data.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=random_seed)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(random_state=42)

In [7]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline forest. The scorer reports the
# negated MSE, so the sign is flipped before taking the root to get RMSE.
forest_scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

Scores : [49633.454487   47565.18736589 50034.52343496 52444.25008076
 49463.18392461 53515.10766804 48577.59023701 47678.90315126
 53433.75372939 49947.64462367]
Mean : 50229.359870258486
Standard Deviation : 2082.3174179664184


## 6. 모델 세부 튜닝 (Fine-Tune Your Model)

모델의 종류를 선택한 후에는 모델을 세부 튜닝하는 것이 필요합니다. 모델 학습을 위한 최적의 하이퍼파라미터를 찾는 과정이라고 말할 수 있습니다.

### 그리드 탐색 (Grid Search)
수동으로 하이퍼파라미터 조합을 시도하는 대신 GridSearchCV를 사용하는 것이 좋습니다.

In [8]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids of random-forest hyperparameters.
param_grid = [
    # 3 x 4 = 12 combinations with the default bootstrap setting
    {'max_features': [2, 4, 6, 8], 'n_estimators': [3, 10, 30]},
    # 2 x 3 = 6 combinations with bootstrap set to False
    {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]},
]

forest_reg = RandomForestRegressor(random_state=random_seed)

# 5-fold cross-validation over 12 + 6 candidates: (12 + 6) * 5 = 90 training rounds.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [9]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [10]:
# Estimator built with the best hyperparameter combination found by the grid search.
grid_search.best_estimator_

RandomForestRegressor(max_features=6, n_estimators=30, random_state=42)

In [11]:
# Print the cross-validated RMSE of every grid candidate next to its settings.
# Scores are negated MSE values, hence the sign flip before the square root.
cvres = grid_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), candidate)

64246.880700613496 {'max_features': 2, 'n_estimators': 3}
55869.85367924813 {'max_features': 2, 'n_estimators': 10}
53472.1282558969 {'max_features': 2, 'n_estimators': 30}
61376.36445082522 {'max_features': 4, 'n_estimators': 3}
53846.329115303764 {'max_features': 4, 'n_estimators': 10}
51270.1941502407 {'max_features': 4, 'n_estimators': 30}
59860.61532587693 {'max_features': 6, 'n_estimators': 3}
53114.42460001889 {'max_features': 6, 'n_estimators': 10}
50811.43543872171 {'max_features': 6, 'n_estimators': 30}
59220.31563298743 {'max_features': 8, 'n_estimators': 3}
52884.78697544277 {'max_features': 8, 'n_estimators': 10}
50944.39369116168 {'max_features': 8, 'n_estimators': 30}
62805.52917192821 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54462.1410888642 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
61117.32056104296 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53022.992252269294 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

### 랜덤 탐색 (Randomized Search)

하이퍼파라미터 조합의 수가 큰 경우에 유리. 지정한 횟수만큼만 평가.

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions for the search. randint's upper bound is exclusive,
# so n_estimators is drawn from 1..199 and max_features from 1..7.
param_distribs = {
    'n_estimators' : randint(low=1, high=200),
    'max_features' : randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=random_seed)
# Seed the search itself as well: without random_state the 10 sampled
# candidates differ from run to run, so the results could not be reproduced.
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                random_state=random_seed)
rnd_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe938aee460>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe933e3c430>},
                   scoring='neg_mean_squared_error')

In [13]:
# Print the cross-validated RMSE of every sampled candidate next to its settings.
# Scores are negated MSE values, hence the sign flip before the square root.
cvres = rnd_search.cv_results_
for neg_mse, candidate in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-neg_mse), candidate)

51484.00644031442 {'max_features': 6, 'n_estimators': 19}
50510.18392710172 {'max_features': 4, 'n_estimators': 89}
55161.51496501134 {'max_features': 7, 'n_estimators': 6}
50285.60888853998 {'max_features': 4, 'n_estimators': 163}
55645.51092041444 {'max_features': 3, 'n_estimators': 8}
50136.996719075054 {'max_features': 5, 'n_estimators': 168}
50119.05502384417 {'max_features': 7, 'n_estimators': 80}
53025.884642853394 {'max_features': 2, 'n_estimators': 40}
52214.7496193823 {'max_features': 4, 'n_estimators': 17}
50367.22157690333 {'max_features': 5, 'n_estimators': 92}


### 특성 중요도, 에러 분석

In [14]:
# Per-feature importances of the best forest found by the grid search,
# one value per column of the prepared feature matrix.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.79326113e-02, 6.18280724e-02, 4.33395023e-02, 1.81017027e-02,
       1.83291556e-02, 1.93269892e-02, 1.78369580e-02, 2.41360490e-01,
       1.61976585e-01, 5.35982558e-02, 1.06273526e-01, 6.14045141e-02,
       1.22353255e-02, 1.08821239e-01, 2.76143239e-05, 2.59938294e-03,
       5.00807682e-03])

In [15]:
# Rebuild the column names of the prepared matrix so each importance can be
# labelled: original numeric columns, then the derived attributes, then the
# one-hot categories of "ocean_proximity".
# NOTE(review): assumes CombinedAttributesAdder appends exactly these three
# columns in this order — confirm against utils.
cat_encoder = full_pipeline.named_transformers_["cat"]
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = [*num_attribs, *extra_attribs, *cat_one_hot_attribs]

# Most important features first.
sorted(zip(feature_importances, attributes), reverse=True)

[(0.2413604895538288, 'median_income'),
 (0.16197658459849273, 'income_cat'),
 (0.10882123891274473, 'INLAND'),
 (0.10627352591969833, 'pop_per_hhold'),
 (0.06793261134305181, 'longitude'),
 (0.061828072419167844, 'latitude'),
 (0.06140451407841603, 'bedrooms_per_room'),
 (0.05359825584988401, 'rooms_per_hhold'),
 (0.04333950231438805, 'housing_median_age'),
 (0.0193269891794112, 'population'),
 (0.018329155582427953, 'total_bedrooms'),
 (0.018101702689683707, 'total_rooms'),
 (0.017836957990116878, 'households'),
 (0.01223532548334132, '<1H OCEAN'),
 (0.005008076816921073, 'NEAR OCEAN'),
 (0.0025993829445232247, 'NEAR BAY'),
 (2.761432390218492e-05, 'ISLAND')]

## 7. 테스트 데이터셋으로 최종 평가하기

In [16]:
# Evaluate the best grid-search model on the stratified test set.
final_model = grid_search.best_estimator_

y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns=["median_house_value"])

# transform() only: the preprocessing was fitted on the training set and is
# not refitted on the test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [17]:
# RMSE of the final model on the test set.
final_rmse

49040.18609727207

## 8. 론칭, 모니터링, 시스템 유지 보수

상용환경에 배포하기 위해서 데이터 전처리와 모델의 예측이 포함된 파이프라인을 만들어 저장하는 것이 좋습니다.

In [18]:
from sklearn.linear_model import LinearRegression

# Bundle preprocessing and a predictor into one estimator so raw rows can be
# passed straight to predict().
# NOTE(review): the predictor here is a plain LinearRegression, not the tuned
# forest from the grid search — confirm this is the model meant for deployment.
full_pipeline_with_predictor = Pipeline(steps=[
    ("preparation", full_pipeline),
    ("linear", LinearRegression()),
])
full_pipeline_with_predictor.fit(housing, housing_labels)

# Quick check on the first five (unprepared) training rows.
some_data = housing.head(5)
full_pipeline_with_predictor.predict(some_data)

array([203682.37379543, 326371.39370781, 204218.64588245,  58685.4770482 ,
       194213.06443039])

In [19]:
# Show the raw rows that were passed to predict().
some_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN,2
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN,5
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN,2
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND,2
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN,3


In [20]:
# Alias for the fitted preprocessing + predictor pipeline (same object, not a copy).
my_model = full_pipeline_with_predictor

In [21]:
import joblib
# Persist the fitted pipeline to disk.
joblib.dump(my_model, "my_model.pkl")
# ...
# Load it back. joblib files are pickle-based, so only load files from a trusted source.
my_model_loaded = joblib.load("my_model.pkl")

In [22]:
# The reloaded pipeline is run on the same rows to compare with the predictions made before saving.
my_model_loaded.predict(some_data)

array([203682.37379543, 326371.39370781, 204218.64588245,  58685.4770482 ,
       194213.06443039])

### 론칭 후 시스템 모니터링

- 시간이 지나면 모델이 낙후되면서 성능이 저하
- 자동 모니터링 : 추천시스템의 경우, 추천된 상품의 판매량이 줄어드는지?
- 수동 모니터링 : 이미지 분류의 경우, 분류된 이미지들 중 일부를 전문가에게 검토시킴
- 결과가 나빠진 경우
    - 데이터 입력의 품질이 나빠졌는지? 센서 고장?
    - 트렌드의 변화? 계절적 요인?
    

### 유지 보수
- 정기적으로 새로운 데이터 수집(레이블)
- 새로운 데이터를 테스트 데이터로, 현재의 테스트 데이터는 학습데이터로 편입
- 다시 학습 후, 새로운 테스트 데이터에 기반해 현재 모델과 새 모델을 평가, 비교

### 전체 프로세스에 고르게 시간을 배분해야 합니다!