In [99]:
# Chapter 2: End-to-End Machine Learning Project
# P64 Custom Transformers

In [100]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np

In [101]:
# Download settings for the housing dataset.
# Base URL of the raw files in the remote repository.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Local directory, relative to this notebook, where the data is stored.
HOUSING_PATH = "../datasets/housing/"
# The remote location must not be derived from HOUSING_PATH: that is a local
# relative path (it starts with "../") and would produce a URL of the form
# ".../master/../datasets/housing/housing.tgz", which is not where the archive lives.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Local directory that receives both the downloaded archive and its
        extracted contents. It is created if it does not exist yet.

    Returns
    -------
    None
        The function only has side effects (directory creation, download,
        extraction, and one progress line printed to stdout).
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    print(tgz_path, housing_url)
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is never leaked.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point this function at sources you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Load the CSV from the default HOUSING_PATH directory. Nothing in this
# notebook calls fetch_housing_data(), so the file must already be on disk.
housing = load_housing_data()

In [102]:
from sklearn.base import BaseEstimator, TransformerMixin

In [103]:
# Positional column indices read by CombinedAttributesAdder.transform when it
# slices the 2-D input array.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

In [104]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    The columns are located through the module-level indices ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third ratio column (bedrooms / rooms) is appended after
        the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        slicing. ``y`` is accepted but ignored.
        """
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        appended = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            appended.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room
        # np.c_ stacks the original array and each new 1-D column side by side.
        return np.c_[(X,) + tuple(appended)]

In [105]:
# Build the adder with the bedrooms-per-room column disabled, then apply it
# directly to the underlying array of `housing` (no fitting is needed because
# fit() is a no-op).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [106]:
# Feature Scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [107]:
# This cell was inserted here so that the next cell can run.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. It is
# imported under the old name so every existing `Imputer(...)` call in this
# notebook keeps working, with a fallback for releases older than 0.20.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
# Numerical-only view of the data: drop the single text column.
housing_num = housing.drop("ocean_proximity", axis=1)

In [108]:
# Numerical preprocessing chain; the steps run in the order listed.
num_pipeline = Pipeline(steps=[
    ("imputer", Imputer(strategy="median")),          # imputation with the "median" strategy
    ("attribs_adder", CombinedAttributesAdder()),     # append the ratio columns
    ("std_scaler", StandardScaler()),                 # scale with StandardScaler
])

In [109]:
# Fit every pipeline step on the numerical frame and keep the transformed result.
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [110]:
# A full pipeline handling both numerical
# and categorical attributes may look like this:

In [111]:
# from sklearn.pipeline import FeatureUnion

In [112]:
# num_attribs = list(housing_num)
# cat_attribs = ["ocean_proximi"]

In [128]:
# This cell comes from another part (P66), by zhangming.
# `CategoricalEncoder` only existed in a development version of scikit-learn
# and is not importable from later releases. Guard the import so a failure
# does not abort this cell and leave DataFrameSelector undefined.
try:
    from sklearn.preprocessing import CategoricalEncoder
except ImportError:
    CategoricalEncoder = None
from sklearn.base import BaseEstimator, TransformerMixin


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects the given columns and returns their values.

    Parameters
    ----------
    attrbute_names : list
        Column labels to select. The parameter name (spelling included) is
        part of the public interface and is kept as is.
    """

    def __init__(self, attrbute_names):
        self.attrbute_names = attrbute_names

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X[self.attrbute_names].values``.

        ``X`` must support label-based column selection and expose
        ``.values`` (for example a pandas DataFrame).
        """
        return X[self.attrbute_names].values

In [129]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
# Column labels routed to the numerical and categorical branches, respectively.
num_attribs = list(housing_num.columns)
cat_attribs = ["ocean_proximity"]

In [130]:
# This code is wrong!!!
'''
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
])
full_pipline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipline", cat_pipline)
])
'''
# The correct version is at:
# https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb

'\nnum_pipeline = Pipeline([\n    (\'selector\', DataFrameSelector(num_attribs)),\n    (\'imputer\', Imputer(strategy="median")),\n    (\'attribs_adder\', CombinedAttributesAdder()),\n    (\'std_scaler\', StandardScaler()),\n])\ncat_pipline = Pipeline([\n    (\'selector\', DataFrameSelector(num_attribs)),\n    (\'cat_encoder\', CategoricalEncoder(encoding="onehot-dense")),\n])\nfull_pipline = FeatureUnion(transformer_list=[\n    ("num_pipeline", num_pipeline),\n    ("cat_pipline", cat_pipline)\n])\n'

In [131]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Route each group of columns to its own preprocessing branch; each entry is
# (name, transformer, columns).
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),      # numerical columns -> num_pipeline
    ("cat", OneHotEncoder(), cat_attribs),   # text column -> one-hot encoding
])

In [132]:
# Fit the column transformer on the full housing frame and transform it in one call.
housing_prepared = full_pipeline.fit_transform(housing)

In [133]:
# Show the prepared array as the cell output (the repr is abbreviated with "...").
housing_prepared

array([[-1.32783522,  1.05254828,  0.98214266, ...,  0.        ,
         1.        ,  0.        ],
       [-1.32284391,  1.04318455, -0.60701891, ...,  0.        ,
         1.        ,  0.        ],
       [-1.33282653,  1.03850269,  1.85618152, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8237132 ,  1.77823747, -0.92485123, ...,  0.        ,
         0.        ,  0.        ],
       [-0.87362627,  1.77823747, -0.84539315, ...,  0.        ,
         0.        ,  0.        ],
       [-0.83369581,  1.75014627, -1.00430931, ...,  0.        ,
         0.        ,  0.        ]])