### Load data

In [1]:
import os
import pandas as pd
import numpy as np
from pandas import Categorical, get_dummies
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, StandardScaler

In [2]:
print(os.listdir('./datasets/housing'))

housing = pd.read_csv('./datasets/housing/housing.csv')
housing.head()

['housing.csv', 'housing.tgz', 'README.md']


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Split data

In [3]:
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
#print(housing.income_cat)
# note this code. It's so nice that series also have where!!!
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
# housing.income_cat.hist(bins = np.arange(6)-0.5);
# plt.hist(housing.income_cat)

from sklearn.model_selection import StratifiedShuffleSplit

mysplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# since n_splits=1 we have only one list for train_index, one list for test_index
for train_index, test_index in mysplit.split(housing, housing['income_cat']):
    train_data = housing.loc[train_index]
    test_data = housing.loc[test_index]

xtrain = train_data.drop(columns=['median_house_value', 'income_cat'])
ytrain = train_data['median_house_value']

xtest = test_data.drop(columns=['median_house_value', "income_cat"])
ytest = test_data['median_house_value']

### Create pipeline

In [4]:
# Selector to just select the data
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

# column numbers of some attributes
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6    
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        
    def fit(self, X, y=None):
        return self # nothing else to do
    
    def transform(self, X, y=None):
        # 2 features are always added
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        # one feature can be added or not based on the boolean parameter
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            # np.c_ Translates slice objects to concatenation along the second axis.
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        
        else:
            #print(X.shape)
            #print(rooms_per_household.shape)
            #print(np.c_[X, rooms_per_household, population_per_household].shape)
            return np.c_[X, rooms_per_household, population_per_household]

        
class CategoricalWarrior(BaseEstimator, TransformerMixin):
    """One hot encoder for all categorical features"""
    def __init__(self, attribute_names):
        # initiate attribute names from input column names
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """
        fit function first creates a dictionary.
        Each element has: key word=categorical column name, value=unique value in the categorical column
        then fit function creates a new attribute call categoricals, which equals the dictionary
        """
        cats = {}
        for column in self.attribute_names:
            cats[column] = X[column].unique().tolist()
        self.categoricals = cats
        return self

    def transform(self, X, y=None):
        """
        transform turns first convert the categorical columns into categorical data type
        with given categories (defined from function "fit" above)
        Then it uses function get_dummies to convert to one hot encoded data
        """
        df = X.copy()
        for column in self.attribute_names:
            df[column] = Categorical(df[column], categories=self.categoricals[column])
        new_df = get_dummies(df, drop_first=False)
        # in case we need them later
        self.columns = new_df.columns
        return new_df

In [5]:
num_att = list(xtrain.drop(columns=["ocean_proximity"]))
cat_att = ["ocean_proximity"]

num_pipeline = Pipeline([("selector", DataFrameSelector(num_att)), 
                         ("imputer", Imputer(strategy="median")),
                        ("add_attributes", CombinedAttributesAdder()),
                        ("std_scaler", StandardScaler())])

cat_pipeline = Pipeline([("cat_maker", CategoricalWarrior(cat_att))])

full_pipeline = FeatureUnion(transformer_list=[("num_pipeline", num_pipeline),
                                              ("cat_pipeline", cat_pipeline)])

In [6]:
xtrain_transformed = full_pipeline.fit_transform(xtrain)
xtest_transformed = full_pipeline.transform(xtest)

In [7]:
xtrain_transformed.shape

(16512, 24)

In [8]:
xtest_transformed.shape

(4128, 24)

array([[-1.15604281e+00,  7.71949616e-01,  7.43330892e-01,
        -4.93233934e-01, -4.45438207e-01, -6.36211407e-01,
        -4.20698422e-01, -6.14937444e-01, -3.12054519e-01,
        -8.64987054e-02,  1.55317530e-01, -1.21890000e+02,
         3.72900000e+01,  3.80000000e+01,  1.56800000e+03,
         3.51000000e+02,  7.10000000e+02,  3.39000000e+02,
         2.70420000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.17602483e+00,  6.59694795e-01, -1.16531720e+00,
        -9.08966554e-01, -1.03692780e+00, -9.98331347e-01,
        -1.02222705e+00,  1.33645936e+00,  2.17683377e-01,
        -3.35339129e-02, -8.36289016e-01, -1.21930000e+02,
         3.70500000e+01,  1.40000000e+01,  6.79000000e+02,
         1.08000000e+02,  3.06000000e+02,  1.13000000e+02,
         6.42140000e+00,  1.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.18684903e+00, -1.34218285e+00,  1.86641864e