In [70]:
# Chapter 2: End-to-End Machine Learning Project
# P64 Custom Transformers

In [71]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np

In [72]:
# Download settings.
# HOUSING_PATH is the local directory (relative to this notebook) where the
# archive and CSV are stored. It is deliberately NOT used to build the URL:
# its leading "../" would produce ".../master/../datasets/...", which is not
# where the archive sits in the repository.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "../datasets/housing/"
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        Location of ``housing.tgz``; fetched with ``urllib.request.urlretrieve``.
    housing_path : str
        Local directory that receives both the downloaded archive and its
        extracted contents. It is created when it does not exist yet.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    print(tgz_path, housing_url)
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the
    # downloaded archive -- only point housing_url at a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return the DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

# Load the housing CSV from the default local directory
housing = load_housing_data(HOUSING_PATH)

In [73]:
from sklearn.base import BaseEstimator, TransformerMixin

In [74]:
# Column positions used to index the feature array in
# CombinedAttributesAdder.transform
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

In [75]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns derived from existing columns of a 2-D array.

    ``transform`` always appends rooms-per-household and
    population-per-household; bedrooms-per-room is appended as well when
    ``add_bedrooms_per_room`` is true. Columns are located through the
    module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn from X
        return self

    def transform(self, X, y=None):
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        # Order matters: the new columns are appended in exactly this order
        pieces = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            pieces.append(X[:, bedrooms_ix] / rooms)
        return np.c_[tuple(pieces)]

In [76]:
# Apply the transformer without the bedrooms-per-room column to the raw values
attr_adder = CombinedAttributesAdder(False)
housing_extra_attribs = attr_adder.transform(X=housing.values)

In [77]:
# Feature Scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [78]:
# This cell was inserted here so that the next cell can run.
# sklearn.preprocessing.Imputer was removed from newer scikit-learn releases;
# SimpleImputer is its replacement, so prefer it and keep the old name as an
# alias so the pipelines below work unchanged on either version.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # older scikit-learn without sklearn.impute
    from sklearn.preprocessing import Imputer
# Numeric-only view of the data: drop the ocean_proximity column
housing_num = housing.drop("ocean_proximity", axis=1)

In [79]:
# Numeric preprocessing: median imputation, extra ratio columns, standardization
num_pipeline = Pipeline(steps=[
    ("imputer", Imputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [80]:
# Fit every pipeline step on the numeric columns and keep the transformed result
housing_num_tr = num_pipeline.fit_transform(X=housing_num)

In [81]:
# A full pipeline handling both numerical and
# categorical attributes may look like this:

In [82]:
from sklearn.pipeline import FeatureUnion

In [68]:
# Column names routed to the categorical and numeric branches respectively
cat_attribs = ["ocean_proximity"]
num_attribs = list(housing_num.columns)

In [87]:
# This cell comes from another part (P66), by zhangmi
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select part of ``X`` by ``attrbute_names`` and return its ``.values``.

    ``attrbute_names`` is used as-is to index ``X`` in ``transform``; this
    notebook passes lists of column names.
    """

    def __init__(self, attrbute_names):
        self.attrbute_names = attrbute_names

    def fit(self, X, y=None):
        # Stateless: selection needs no fitting
        return self

    def transform(self, X):
        chosen = X[self.attrbute_names]
        return chosen.values

In [88]:
# Numeric branch: select the numeric columns, impute medians, add the ratio
# columns, then standardize
num_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", Imputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])
# Categorical branch: the selector must pick cat_attribs (not num_attribs) so
# that the binarizer receives the ocean_proximity column rather than the
# numeric features.
# NOTE(review): in recent scikit-learn releases LabelBinarizer.fit_transform
# accepts only ``y``, so fitting it inside a Pipeline may raise TypeError --
# confirm against the installed version.
cat_pipline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizer()),
])
# Concatenate the outputs of the numeric and categorical branches
full_pipline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipline", cat_pipline),
])