# ___

# [ Machine Learning in Geosciences ]

**Department of Applied Geoinformatics and Cartography, Charles University** 

*Lukas Brodsky lukas.brodsky@natur.cuni.cz*

    
___



# End-to-End Machine Learning Project!


1/ Describe the Task

2/ Get and Explore the Data

3/ Prepare the Data for ML Algorithms!

4/ Select and Train a Model

5/ Fine-Tune the Model!

6/ Interpret the results

___    

# 1/ The Problem 

    1/  Build a model of housing prices in California using the California census data; 
    2/  to be able to predict the median housing price in any district. 
    
    The Problem: 
    * multivariate regression (will use multiple features to make a prediction); 
    * there is no continuous flow of data -> batch learning; 



# Setup environment

In [None]:
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot 'pretty' figures
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
%matplotlib inline

# print(os.getcwd())
# SET PATH TO YOUR PROJECT DIRECTORY!!!
# machine_learning_geosciences/06_ML_workflow/
PROJECT_ROOT_DIR = "./"
if os.path.isdir(PROJECT_ROOT_DIR): 
    print('Ok continue.')
else: 
    print('Nok, set correct path to your project directory!')

# Set path to save project figures 
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name of the figure without extension.
    tight_layout : bool
        If true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Resolution passed to ``savefig`` as ``dpi``.
    """
    # savefig() does not create missing directories, so make sure the
    # target directory exists before writing the file.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="internal gelsd")

# 2/ Get and explore the Data


In [None]:
import pandas as pd

# check the datasets dir 
HOUSING_PATH = os.path.join(PROJECT_ROOT_DIR, "housing")
print(HOUSING_PATH)

# function to read the csv file 
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in *housing_path* into a DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    frame = pd.read_csv(housing_csv)
    return frame

In [None]:
# load data 
housing = load_housing_data()

# check header and some values 
housing.head()

In [None]:
# evaluate the values 
housing.info()

In [None]:
housing.describe()

In [None]:
# histograms 
housing.hist(bins=50, figsize=(20,15))
# save_fig("attribute_histogram_plots")
plt.show()

### Create a Test Set

A sampling task: 

    * random sampling (may introduce a significant sampling bias, e.g. when the dataset is small); 
    * stratified sampling (to ensure that the split sample is representative of the whole population); 
    

### Numpy random 

In [None]:
# to make this notebook's output identical at every run
np.random.seed(42)

In [None]:
import numpy as np

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly split *data* into a train part and a test part.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled rows form the test set and the
    remaining rows form the train set.

    Returns
    -------
    tuple
        ``(train_rows, test_rows)`` selected from *data* by position.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    test_part, train_part = order[:n_test], order[n_test:]
    return data.iloc[train_part], data.iloc[test_part]

In [None]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")

### Scikit learn split functions

To make repeated experiments reproducible, set `random_state` to some constant!

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
test_set.head()

In [None]:
# Check distribution
housing["median_income"].hist()

### Stratified sampling

    - Most median income values are clustered around 2–5; 
    - some median incomes go far beyond 6; 
    - should not have too many strata, and each stratum should be large enough; 
    - create an income category attribute by dividing the median income by 1.5 (rounding up using ceil);
    - merging all the categories greater than 5 into category 5. 
    

In [None]:
# Option 1: np.ceil & where

# Divide by 1.5 to limit the number of income categories
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)

# Label those at or above 5 as 5.
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: the in-place call acts on a temporary object and does not
# reliably update `housing` (it is a no-op under pandas Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)


In [None]:
housing.head()

In [None]:
# Option 2: pd.cut

# housing["income_cat"] = pd.cut(housing["median_income"],
#                                bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
#                                labels=[1, 2, 3, 4, 5])

In [None]:
housing["income_cat"].value_counts()

In [None]:
# histogram of categories!
housing["income_cat"].hist()

In [None]:
# Do stratified sampling based on the income category!
# Scikit-Learn's StratifiedShuffleSplit cross-validator provides train/test
# indices to split the data into train/test sets.

from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row numbers, so the rows are selected with .iloc;
# .loc would treat them as index labels and only works by coincidence when the
# frame has a default 0..n-1 index.
for train_index, test_index in sss.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    

In [None]:
# the income category proportions in the test set
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# the income category proportions in the full housing dataset
housing["income_cat"].value_counts() / len(housing)

In [None]:
# Is there a balance?

In [None]:
# Compare stratified sampling, and using purely random sampling!
# of the income category proportions generated 

def income_cat_proportions(data):
    """Return, for each ``income_cat`` value, its count divided by ``len(data)``."""
    category_counts = data["income_cat"].value_counts()
    n_rows = len(data)
    return category_counts / n_rows

# purely random split, to be compared with the stratified one
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# One column of income-category proportions per sample, followed by the
# relative deviation (in percent) of each test set from the overall dataset.
compare_props = (
    pd.DataFrame({
        "Overall": income_cat_proportions(housing),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(test_set),
    })
    .sort_index()
    .assign(**{
        "Rand. %error": lambda df: 100 * df["Random"] / df["Overall"] - 100,
        "Strat. %error": lambda df: 100 * df["Stratified"] / df["Overall"] - 100,
    })
)

In [None]:
# Which test set is more representative of the overall set? 
compare_props

In [None]:
# Which one do you prefer? 

In [None]:
# cleaning: `income_cat` was only used to stratify and compare the splits,
# so remove the helper column from both stratified sets (dropped in place).
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True) 

# Visualize the data to gain insights


In [None]:
housing = strat_train_set.copy()
housing.head()

In [None]:
housing.shape

In [None]:
# Geographic scatter plot; marker size is proportional to district population.
# No colour variable is plotted here, so no cmap/colorbar is requested
# (a colorbar without `c` would not correspond to any data).
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1,
             s=housing["population"]/100, label="population", figsize=(10,7))

In [None]:
# plot housing data:
#   marker size  ~ district population
#   marker color ~ median house value (this is what cmap and the colorbar encode;
#                  without `c` the colorbar would not correspond to any data)
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1,
    s=housing["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()
# save_fig("housing_prices_scatterplot")

In [None]:
# Looking for Correlations
# Correlation is only defined for numeric columns. `ocean_proximity` is text and
# makes DataFrame.corr() raise in pandas >= 2.0, so select the numeric columns first.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8), alpha=0.5)
# save_fig("scatter_matrix_plot")

In [None]:
housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])
# save_fig("income_vs_house_value_scatterplot")

## 3/ Prepare the data for Machine Learning algorithms
## Feature Engineering

In [None]:
# add some more features (relative) 
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
# Recompute the correlations including the newly added ratio features.
# Only numeric columns are used: the text column `ocean_proximity` makes
# DataFrame.corr() raise in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
housing.shape

In [None]:
housing.describe()

In [None]:
# drop labels for training set
housing = strat_train_set.drop("median_house_value", axis=1) 
housing_labels = strat_train_set["median_house_value"].copy()
housing.head()

In [None]:
housing_labels.head()

In [None]:
# Which records have None / null?

sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
# sample_incomplete_rows.dropna(subset=["total_bedrooms"])    # option 1

In [None]:
# sample_incomplete_rows.drop("total_bedrooms", axis=1)       # option 2

In [None]:
# Fill in the median value

median = housing["total_bedrooms"].median()
# option 3: fillna() with a {column: value} mapping returns a new frame.
# Calling fillna(..., inplace=True) on a column selected from a sliced frame
# acts on a temporary object and may leave `sample_incomplete_rows` unchanged.
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})
sample_incomplete_rows

In [None]:
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
# Remove the text attribute because median can only be calculated on numerical attributes:

housing_num = housing.drop('ocean_proximity', axis=1)
# alternatively: housing_num = housing.select_dtypes(include=[np.number])
# housing_num

In [None]:
# fit the imputer instance to the training data using the fit() method
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
# Check that this is the same as manually computing the median of each attribute:
housing_num.median().values

In [None]:
# Now you can use this “trained” imputer to transform the training set 
# by replacing missing values by the learned medians

X = imputer.transform(housing_num)

# Numpy array containing the transformed features
print(type(X))

In [None]:
# put it back into a Pandas DataFrame

housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)

In [None]:
sample_incomplete_rows.head()

In [None]:
housing_tr.loc[sample_incomplete_rows.index.values]

In [None]:
imputer.strategy

In [None]:
housing_tr.shape

### Transformation Pipelines

One of the most important transformations you need to apply to your data is feature scaling. Machine Learning algorithms don’t perform well when the input numerical attributes have very different scales. 

E.g. total number of rooms ranges from about 6 to 39,320, while the median incomes only range from 0 to 15; 

**Feature Scaling**: 
        * min-max scaling (normalization), values are shifted and rescaled so that they end up ranging from 0 to 1. 
        * standardization: subtracts the mean value and then divides by the standard deviation, so that the resulting distribution has zero mean and unit variance.

Scikit-learn provides `MinMaxScaler` for min-max scaling and `StandardScaler` for standardization. 

### Add extra features

In [None]:
# define function for adding extra features 

# get the right column indices: safer than hard-coding indices 3, 4, 5, 6
# They are looked up in `housing_num` because add_extra_features() is applied to
# the numeric-only array built from those columns, not to the full `housing` frame.
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing_num.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

# print(rooms_ix, bedrooms_ix, population_ix, household_ix)

def add_extra_features(X, add_bedrooms_per_room=True):
    """Append ratio features as extra columns to the 2-D array *X*.

    Appended columns, in order: rooms per household, population per
    household and, when *add_bedrooms_per_room* is true, bedrooms per room.
    Column positions are taken from the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
    """
    rooms = X[:, rooms_ix]
    households = X[:, household_ix]
    derived = [rooms / households, X[:, population_ix] / households]
    if add_bedrooms_per_room:
        derived.append(X[:, bedrooms_ix] / rooms)
    return np.c_[(X, *derived)]

### Create Pipeline 

Let's build a pipeline for preprocessing the numerical attributes (use `CombinedAttributesAdder()` or `FunctionTransformer(...)` as preferred):

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),                              # 1st process   
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False)), # 2nd process 
        ('std_scaler', StandardScaler())                                            # 3rd process 
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [None]:
housing_num_tr

In [None]:
print(housing_num_tr.shape)

Try also different transformations to different columns using a solution based on the `ColumnTransformer` class that was introduced in Scikit-Learn 0.20. 
If you are using an older version of Scikit-Learn, you can import it from `future_encoders.py`:

In [None]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20


In [None]:
# treat categorical variables 
num_attribs = list(housing_num)
print(num_attribs)
cat_attribs = ["ocean_proximity"]
print(cat_attribs)

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
print(type(housing_prepared))

In [None]:
housing_prepared.shape

# Select and train a model 

In [None]:
print(housing_prepared.shape)
print(housing_labels.shape)

In [None]:
# housing_labels.head()

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Compare against the actual values:

In [None]:
print("Labels:", list(some_labels))

In [None]:
# some_data_prepared

In [None]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
lin_rmse / housing_labels.median() * 100.

In [None]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

#### What does it mean? 

# Fine-tune the model

In [None]:
from sklearn.model_selection import cross_val_score

# Decision Tree regressor 
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation."""
    report = (
        ("Scores:", lambda s: s),
        ("Mean:", lambda s: s.mean()),
        ("Standard deviation:", lambda s: s.std()),
    )
    # each statistic is evaluated right before its line is printed
    for label, statistic in report:
        print(label, statistic(scores))

display_scores(tree_rmse_scores)

In [None]:
# Linear regression model 
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
# Try Random Forest 

**Note**: Scikit-Learn versions before 0.22 default to `n_estimators=10` and warn that the default will change to 100; from 0.22 on the default is 100. `n_estimators` is left at its default below.

In [None]:
from sklearn.ensemble import RandomForestRegressor
# n_estimators=10, 
forest_reg = RandomForestRegressor(random_state=42)


In [None]:
# BEWARE: this takes longer to run! 
# from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
# scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
# pd.Series(np.sqrt(-scores)).describe()

In [None]:
# BEWARE: this takes a long time to run!
from sklearn.model_selection import GridSearchCV

param_grid = [
    # Only `n_estimators` is searched here: 4 candidate values.
    # Disabled extension of this grid: 'max_features': [2, 4, 6, 8, 12]
    {'n_estimators': [5, 10, 30, 50] } # , 'max_features': [2, 4, 6, 8, 12] # ,
    # Disabled second grid (bootstrap set to False):
    # {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)

# train across 5 folds: 4 candidates * 5 folds = 20 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

The best hyperparameter combination found:

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

Let's look at the score of each hyperparameter combination tested during the grid search:

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# pd.DataFrame(grid_search.cv_results_)

In [None]:
# BEWARE: this takes a long time to run! 

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42, n_jobs=1)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

In [None]:
rnd_search.best_estimator_

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
# feature_importances

In [None]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# Select final model and evaluate it with test set!

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

In [None]:
display_scores(lin_rmse_scores)

In [None]:
# 1. Linear model 
print('Linear model RMSE: {}'.format(lin_rmse))
print('---')
# 2. DecisionTreeRegressor
print('DecisionTreeRegressor model RMSE: {}'.format(tree_rmse))
print('---')
# 3. Cross-validated linear model 
print('Cross-validated linear model RMSE: ')   # cv=10 
display_scores(lin_rmse_scores)
print('---')
# 4. Cross-validated decision trees 
print('Cross-validated DecisionTreeRegressor model RMSE: ')   # cv=10 
display_scores(tree_rmse_scores) # cv=10 
print('---')
# 5. RF - grid search
print('RF - grid searched model: ') 
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
print('---')
# 6. RF random search 
print('RF - random searched model: ') 
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
final_rmse / np.median(y_test) * 100. 

# Extra

## A full pipeline with both preparation and prediction

In [None]:
full_pipeline_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("linear", LinearRegression())
    ])

full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)