# Chapter 2 - End-to-end Machine Learning

## Overview

This chapter demonstrates an end-to-end ML project using the California Housing Prices dataset.

Chapter structure:
    
1. Look at the big picture
2. Get the data
3. Discover and visualize the data to gain insights
4. Prepare the data for Machine Learning algorithms
5. Select a model and train it
6. Fine-tune your model
7. Present your solution
8. Launch, monitor, and maintain your system

## Loading the data

In [None]:
import os
import tarfile
import pandas as pd
from six.moves import urllib

# Base URL of the repository that hosts the dataset archive; fetch_housing_data()
# appends the archive's relative path to it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"

def fetch_housing_data():
    """Download the housing archive and unpack it into the local ``data`` directory.

    The archive is fetched from ``DOWNLOAD_ROOT + "datasets/housing/housing.tgz"``,
    stored as ``data/housing.tgz`` and extracted into ``data``.

    Raises:
        Any error raised by ``urlretrieve`` (network/filesystem) or by
        ``tarfile`` (unreadable archive) is propagated to the caller.
    """
    data_dir = "data"
    tgz_path = "data/housing.tgz"
    # urlretrieve() writes straight to tgz_path and does not create missing
    # parent directories, so make sure the target directory exists first.
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    urllib.request.urlretrieve(DOWNLOAD_ROOT + "datasets/housing/housing.tgz", tgz_path)
    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it with an archive from a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=data_dir)

def load_housing_data():
    """Read ``data/housing.csv`` and return its contents as a pandas DataFrame."""
    csv_path = "data/housing.csv"
    housing_frame = pd.read_csv(csv_path)
    return housing_frame

# Download the archive and unpack it into the local "data" directory
fetch_housing_data()

# Load the extracted CSV into a pandas DataFrame
housing = load_housing_data()

### Sanity check

In [None]:
# Display the first rows of the dataset
housing.head()

In [None]:
# Print a per-column summary of the DataFrame
housing.info()

In [None]:
# Count how many rows carry each distinct ocean_proximity value
housing["ocean_proximity"].value_counts()

In [None]:
# Display descriptive statistics for the dataset
housing.describe()

## Visualise the data

Matplotlib is a useful tool for visualising the data. Here, we plot the data as histograms:

In [None]:
# Notebook magic: render Matplotlib figures inside the notebook output
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms of the dataset's columns, 50 bins each, on a 20x15 figure
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
import numpy as np

# Fix NumPy's global seed so the random permutation used by split_train_test()
# is the same on every run
np.random.seed(42)

def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    The rows are shuffled with ``np.random.permutation`` (so the result depends
    on NumPy's global random state); the first ``int(len(data) * test_ratio)``
    shuffled positions form the test part and the remainder the training part.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing.
        test_ratio: Fraction of rows to place in the test part.

    Returns:
        A ``(train, test)`` tuple of ``data.iloc[...]`` selections.
    """
    n_rows = len(data)
    permutation = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_part = data.iloc[permutation[:n_test]]
    train_part = data.iloc[permutation[n_test:]]
    return train_part, test_part

In [None]:
# Random split with a test ratio of 0.2, then report the size of each part
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")

In [None]:
import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split ``data`` into train and test parts based on a hash of each row's id.

    Every value of ``data[id_column]`` is passed to ``test_set_check``; rows for
    which it returns True go to the test part, all others to the training part.

    Args:
        data: DataFrame-like object to split.
        test_ratio: Fraction forwarded to ``test_set_check``.
        id_column: Name of the column holding the row identifiers.
        hash: Hash constructor forwarded to ``test_set_check``.

    Returns:
        A ``(train, test)`` tuple of ``data.loc[...]`` selections.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio, hash)

    test_mask = data[id_column].apply(_belongs_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [None]:
# reset_index() turns the current row index into an "index" column, which is
# used below as the row identifier
housing_with_id = housing.reset_index()
housing_with_id.head()

In [None]:
# Hash-based split on the "index" column with a test ratio of 0.2
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
print(len(train_set), "train +", len(test_set), "test")

In [None]:
# Scale and round off income values
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep values < 5 and replace everything else with 5.0.
# The result is assigned back to the column: calling where(..., inplace=True)
# on housing["income_cat"] operates on a temporary Series and is not guaranteed
# to update the DataFrame (it has no effect under pandas Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
housing["income_cat"].head()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# One stratified shuffle split with a 20% test share and a fixed random seed
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# Stratify on income_cat. split() yields arrays of positional row indices, so
# rows are selected with iloc; loc would interpret them as index labels and is
# only correct while housing still has its default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]


In [None]:
# Share of each income category in the full dataset
housing["income_cat"].value_counts() / len(housing)

In [None]:
# Share of each income category in the stratified test set, for comparison
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# Remove the helper column income_cat from both stratified sets in place.
# The loop variable is named set_ so that the builtin set() is not shadowed
# (the name would otherwise stay bound to a DataFrame after the loop).
for set_ in (strat_train_set, strat_test_set):
    set_.drop(["income_cat"], axis=1, inplace=True)

In [None]:
# Continue exploring on a copy of the stratified training set
housing = strat_train_set.copy()
# Scatter plot of latitude against longitude
housing.plot(kind="scatter", x="longitude", y="latitude")
plt.show()

In [None]:
# Same plot with alpha=0.1 so overlapping points appear darker
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
plt.show()

In [None]:
# Marker size follows population / 100; marker colour follows
# median_house_value using the "jet" colormap, with a colorbar
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population",
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             figsize=(8, 6))
plt.legend()

In [None]:
# Pairwise correlations between the numeric attributes. Non-numeric columns
# (the text attribute ocean_proximity) are excluded explicitly: DataFrame.corr()
# in pandas >= 2.0 no longer drops them silently and raises instead.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots for a hand-picked subset of attributes
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# Median house value against median income, with alpha=0.1 to reveal density
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
plt.show()

In [None]:
# Experiment with attribute combinations: add three ratio columns derived from
# the existing totals
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

In [None]:
# Inspect the updated correlation matrix, which now includes the ratio columns.
# Non-numeric columns (the text attribute ocean_proximity) are excluded
# explicitly: DataFrame.corr() in pandas >= 2.0 no longer drops them silently
# and raises instead.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix

In [None]:
# Rank the attributes by their correlation with median house value, sorted in
# descending order (most positive first, most negative last).
# NOTE(review): the original note says bedrooms_per_room is highly correlated;
# confirm its sign and magnitude against the displayed output.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Revert to a clean dataset: predictors go into housing, the target column
# median_house_value goes into housing_labels
# NB: drop() makes a copy of the dataset
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# Data cleaning
# Options are to:
# 1. use dropna()
# 2. use drop()
# 3. use fillna(median) with a pre-computed median
# 4. use SimpleImputer class
# Option 4 is the one applied below.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# Remove text attribute 'ocean_proximity'
housing_num = housing.drop("ocean_proximity", axis=1)
# Fit the imputer on the remaining columns, then display the fitted statistics
imputer.fit(housing_num)
imputer.statistics_

In [None]:
# Generate a numpy array as per the transformations applied by imputer
X = imputer.transform(housing_num)
# Convert back into a Pandas DataFrame. The original row index is passed along
# with the column names: X is a bare array, and without index= the new frame
# would get a fresh 0..n-1 index that no longer lines up with housing_num.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
# Label encoding: encode the ocean_proximity column with LabelEncoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
housing_cat_encoded

In [None]:
# Show the classes learned by the encoder
print(encoder.classes_)

In [None]:
# One-hot encoding of the label-encoded categories.
# Note that `encoder` is rebound here from the LabelEncoder to a OneHotEncoder.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# reshape(-1, 1) turns the 1-D array of codes into a single-column 2-D array
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

In [None]:
# Convert the encoder output to a dense array for display
housing_cat_1hot.toarray()

In [None]:
housing["ocean_proximity"]

In [None]:
# Same encoding applied directly to the raw ocean_proximity values, skipping
# the intermediate label encoding
housing_cat_1hot = encoder.fit_transform(housing["ocean_proximity"].values.reshape(-1, 1))
housing_cat_1hot.toarray()

In [None]:
# Custom transformers
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAttributesAdder.transform().
# NOTE(review): presumably these match the column order of housing_num — confirm.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

# BaseEstimator supplies the get_params() and set_params() functions
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Two ratios are always appended (rooms per household, population per
    household); a third (bedrooms per room) is appended when
    ``add_bedrooms_per_room`` is true. The source columns are located through
    the module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        # Column order: rooms/household, population/household, then the
        # optional bedrooms/room.
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        # np.c_ appends the new columns to X
        return np.c_[tuple([X] + derived)]

In [None]:
# Scaling and pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing chain: median imputation, then the derived ratio
# attributes, then standard scaling
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Fit the pipeline on the numeric columns and transform them in one call
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
# DataFrameSelector

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the given attributes out of its input.

    ``transform`` indexes the input with ``attribute_names`` and returns the
    ``.values`` of the selection.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the values of the selected attributes of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
from sklearn.preprocessing import LabelBinarizer

#
# Resolves 'TypeError: fit_transform() takes 2 positional arguments but 3 were given'
#
# See https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize
#
class MyLabelBinarizer(TransformerMixin):
    """Wrapper around ``LabelBinarizer`` whose ``fit``/``transform`` accept ``y``.

    All constructor arguments are forwarded to the wrapped ``LabelBinarizer``;
    the extra ``y`` argument of ``fit`` and ``transform`` is accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` and return this wrapper."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's transformation of ``x``."""
        binarized = self.encoder.transform(x)
        return binarized

In [None]:
# Define a pipeline using FeatureUnion

from sklearn.pipeline import FeatureUnion

# Column names handled by each branch: every column of housing_num for the
# numeric branch, ocean_proximity for the categorical branch
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select the columns, impute medians, add the derived ratio
# attributes, then scale
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the column, then binarize its labels
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', MyLabelBinarizer()),
])

# Combine both branches into a single transformer
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [None]:
# Run the pipeline: fit it on the training predictors and transform them
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

In [None]:
#
# Linear regression
#
from sklearn.linear_model import LinearRegression

# Train a linear model on the prepared features and the training labels
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# Take the first five training rows as a small sample
some_data = housing.iloc[:5]
some_data

In [None]:
# Use transform() (not fit_transform()) so the already-fitted pipeline is reused
some_data_prepared = full_pipeline.transform(some_data)
some_data_prepared

In [None]:
# Predictions of the linear model for the sample
lin_reg.predict(some_data_prepared)

In [None]:
# Actual labels of the same five rows, for comparison with the predictions
some_labels = housing_labels.iloc[:5]
list(some_labels)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model, measured on the same data it was trained on
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
#
# Decision tree
#
from sklearn.tree import DecisionTreeRegressor

# Train a decision tree regressor on the prepared training data
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# RMSE of the tree, measured on the same data it was trained on
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# Model has likely overfit the data
#
# Use cross validation instead
#

from sklearn.model_selection import cross_val_score

# Cross-validate the decision tree with cv=10, scored by negative MSE
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
# The scores are negated MSE values, so flip the sign before taking the root
rmse_scores = np.sqrt(-scores)
rmse_scores

In [None]:
#
# Apply cross-validation to the linear regression model
#
# Same settings as for the decision tree: cv=10, scored by negative MSE
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_scores

In [None]:
# The scores are negated MSE values, so flip the sign before taking the root
lin_rmse_scores = np.sqrt(-lin_scores)
lin_rmse_scores

In [None]:
#
# Random forest regressor
#

from sklearn.ensemble import RandomForestRegressor

# Train a random forest on the full prepared training data, then cross-validate
# it with the same settings as the other models (cv=10, negative MSE)
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_scores

In [None]:
# The scores are negated MSE values, so flip the sign before taking the root
forest_rmse_scores = np.sqrt(-forest_scores)
forest_rmse_scores