# ___

# [ Machine Learning in Geosciences ]

**Department of Applied Geoinformatics and Cartography, Charles University** 

*Lukas Brodsky lukas.brodsky@natur.cuni.cz*

    
___



# End-to-End Machine Learning Project!


1/ Describe the Task

2/ Get and Explore the Data

3/ Prepare the Data for ML Algorithms!

4/ Select and Train a Model

5/ Fine-Tune the Model!

6/ Interpret the results

___    

# 1/ The Problem 

**1.1 Frame the problem:** 
The task is to build a model of housing prices in California using the California census data to be able to predict the median housing price in any district. 

**Assumptions of the problem:** 
* There exists some (most likely non-linear) relationship between the input features (X) and the output target variable (y); 
* the output target is a continuous variable, hence we employ regression type of model; 
* There are multiple features, hence multivariate regression; 
* There is no continuous flow of data -> batch learning;

**1.2 The expected result:** 
The developed model shall predict housing prices based on a set of characteristics (features) with an error < 20 %. 


**Performance measures:** 
* RMSE or MAE; 
* How much do we know about the data? -> Update after EDA. 



# Setup environment

In [None]:
import os
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt 

# Sklearn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random generator to make this notebook's output stable across runs
np.random.seed(42)
# To plot 'pretty' figures: render plots inline (IPython magic) and set the
# default font sizes of axis labels and tick labels
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=12)
mpl.rc('xtick', labelsize=10)
mpl.rc('ytick', labelsize=10)

# SET PATH TO YOUR PROJECT DIRECTORY!
PROJECT_ROOT_DIR = "./"
# Tell the user whether the configured project directory actually exists
print('Ok continue.' if os.path.isdir(PROJECT_ROOT_DIR)
      else 'Nok, set correct path to your project directory!')

# Directory (inside the project root) where project figures are saved
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name of the figure without extension.
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Resolution in dots per inch passed to ``savefig``.
    """
    # Nothing in the visible part of this notebook creates the images
    # directory, and savefig fails when the target directory is missing.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998): silence every warning whose
# message starts with "internal gelsd"
import warnings
warnings.filterwarnings(action="ignore", message="internal gelsd")

# 2/ Exploratory Data Analysis


In [None]:
# Import data: the CSV file is looked up directly in the project root directory
HOUSING_PATH = os.path.join(PROJECT_ROOT_DIR)
# print(HOUSING_PATH)


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in *housing_path* and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the housing data into a DataFrame (uses the default housing path)
housing = load_housing_data()

# Check the header and the first few rows
housing.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
housing.info()

In [None]:
# Descriptive statistics of the numeric features
housing.describe()

In [None]:
# features 
# housing_columns = list(housing.columns)
# housing_columns

### 2.1 Visualize the data to gain insights

In [None]:
# Histogram of every numeric column, 50 bins each
housing.hist(bins=50, figsize=(15,10))
# save_fig("attribute_histogram_plots") 

In [None]:
# Spatial scatter plot of the housing data: marker colour encodes the median
# house value ("jet" colormap), marker size is population / 100
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1,
             c="median_house_value",
             s=housing["population"]/100, label="population", figsize=(10,7),
             cmap=plt.get_cmap("jet"), colorbar=True)
plt.legend()
# save_fig("housing_prices_scatterplot")

### 2.2 Looking for Correlations

In [None]:
# Remove the non-numeric text column `ocean_proximity` before computing correlations
housing_rval = housing.drop('ocean_proximity', axis=1)
# housing_rval.head()

In [None]:
# Pairwise correlation matrix of the remaining columns
corr_matrix = housing_rval.corr()

In [None]:
# Correlation of every column with the target, largest value first
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
# Scatter-plot matrix (every selected feature plotted against every other one)
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(7, 5), alpha=0.3)
# save_fig("scatter_matrix_plot")

In [None]:
# Scatter plot of median income against median house value with fixed axis limits
housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])
# save_fig("income_vs_house_value_scatterplot")

### 2.3 Experiment with Attributes 
.. later on in data preparation. 

# 3. Data preparation

### 3.1 Prepare the data for Machine Learning algorithms - Feature Engineering

In [None]:
# (rows, columns) of the data before new features are added
housing.shape

### New features

In [None]:
# Derive relative indicators (ratios) from the raw per-district totals
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])

In [None]:
# Column names, now including the three ratio features
housing.columns

In [None]:
# (rows, columns) after the new features were added
housing.shape

In [None]:
# Build the list of column names, including the new features.
# `housing_columns` is not defined earlier (its definition is commented out), so
# appending to it raised a NameError. The three ratio features were already
# added to `housing`, so reading the names from the DataFrame yields the
# complete list and stays correct when the cell is re-run.
housing_columns = list(housing.columns)
housing_columns

### Check correlations

In [None]:
# Drop the text column again; this time the frame includes the new ratio features
housing_rval = housing.drop('ocean_proximity', axis=1)

In [None]:
# Recompute the correlations and list them against the target, largest first
corr_matrix = housing_rval.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

### 3.2 Fill no-data values or drop incomplete records

In [None]:
# Which records contain a NaN? Keep the first few such rows as a small sample
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

In [None]:
# (rows, columns) of the sample of incomplete records
sample_incomplete_rows.shape

### Drop NaN records 

In [None]:
# Option 1: .dropna()
# sample_incomplete_rows.dropna(subset=["total_bedrooms"])    

In [None]:
# Option 2: .drop()
# sample_incomplete_rows.drop("total_bedrooms", axis=1)       

### Fill-in median value 

In [None]:
# Option 3: fill in the median value
median = housing["total_bedrooms"].median()
# `sample_incomplete_rows` is a slice of `housing`, and an in-place fillna on a
# selected column is chained assignment: pandas warns about it and, with
# Copy-on-Write enabled, the frame is left unchanged. Take an explicit copy and
# assign the filled column back instead.
sample_incomplete_rows = sample_incomplete_rows.copy()
sample_incomplete_rows["total_bedrooms"] = sample_incomplete_rows["total_bedrooms"].fillna(median)
sample_incomplete_rows

### Sklearn Imputer

In [None]:
# Import the imputer class under one name regardless of the scikit-learn version
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Imputer that replaces missing values with the column median
imputer = SimpleImputer(strategy="median")

In [None]:
# Remove the text attribute because the median can only be calculated on numerical attributes
housing_num = housing.drop('ocean_proximity', axis=1)
# housing_num

In [None]:
# Fit the imputer instance using the fit() method
# NOTE(review): this is fitted on the full numeric dataset; the train/test split
# only happens later in the notebook
imputer.fit(housing_num)

In [None]:
# Fill values learned by the imputer
imputer.statistics_

In [None]:
# Check that this is the same as manually computing the median of each attribute
housing_num.median().values

In [None]:
# Now we can use this "trained" imputer to transform the data
# by replacing missing values with the learned medians

housing_num_imp = imputer.transform(housing_num)

# Print the type of the transformed features (the next cell wraps them in a DataFrame again)
print(type(housing_num_imp))

In [None]:
# Wrap the imputed values in a DataFrame again, restoring the original
# column names and row index
housing_num_tr = pd.DataFrame(
    data=housing_num_imp,
    index=housing.index,
    columns=housing_num.columns,
)

In [None]:
# Check the imputation on the sample rows that had missing values
housing_num_tr.loc[sample_incomplete_rows.index.values]

In [None]:
# Strategy the imputer was configured with
imputer.strategy

In [None]:
# (rows, columns) of the imputed numeric data
housing_num_tr.shape

In [None]:
# Scale the numerical features: housing_num
# scaler = StandardScaler()
# scaler.fit(housing_num)
# scaler.mean_
# housing_num_scaled = scaler.transform(housing_num)

In [None]:
# type(housing_num_scaled)

In [None]:
# put it back into a Pandas DataFrame
# housing_num_scaled = pd.DataFrame(housing_num_scaled, columns=housing_num.columns,
#                           index=housing.index)

In [None]:
# housing_num_scaled.head()

In [None]:
# housing_num_scaled.shape

### 3.3 Harmonize numerical data
### Feature Encoding 

Transform categorical text feature(s) into numerical. 

In [None]:
# Import OneHotEncoder, falling back to a local backport on old scikit-learn versions
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

In [None]:
# Inspect the categorical text column
housing["ocean_proximity"]

In [None]:
# Shape of the categorical column
housing["ocean_proximity"].shape 

In [None]:
# One-hot encode `ocean_proximity`; the encoder is configured for dense output
# returned as a pandas DataFrame
# encoder = OneHotEncoder()
encoder= OneHotEncoder(sparse_output=False).set_output(transform="pandas")
encoder.fit(housing[["ocean_proximity"]])
ocean_proximity_num = encoder.transform(housing[["ocean_proximity"]])

In [None]:
# Type of the encoded result
type(ocean_proximity_num)

In [None]:
# First rows of the one-hot encoded columns
ocean_proximity_num.head()

In [None]:
# Shape of the imputed numeric data (compare the row count with the encoded frame)
housing_num_tr.shape

In [None]:
# Shape of the one-hot encoded frame
ocean_proximity_num.shape

In [None]:
# Join the imputed numeric columns and the one-hot columns side by side
housing_num_enc_merged = pd.concat(
    objs=[housing_num_tr, ocean_proximity_num],
    axis="columns",
)
housing_num_enc_merged.head()

In [None]:
# Column names of the merged frame
housing_num_enc_merged.columns

In [None]:
# (rows, columns) of the merged frame
housing_num_enc_merged.shape

In [None]:
# housing_columns.remove('ocean_proximity')

In [None]:
# for item in list(ocean_proximity_num.columns):
#         housing_columns.append(item)

In [None]:
# housing_columns

### 3.4 Split Data set into Training and Test Sets

Options: 

    * random sampling (may introduce a significant sampling bias); 
    * stratified sampling (to ensure that the split sample is representative of the whole population); 
    

### Numpy random solution

In [None]:
# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly split *data* into a training part and a test part.

    The rows are shuffled with NumPy's global random generator; the first
    ``int(len(data) * test_ratio)`` shuffled rows form the test set and the
    rest form the training set.

    Returns a ``(train, test)`` tuple selected positionally from *data*.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # Cut the shuffled positions once: head -> test, tail -> train
    test_part, train_part = np.split(order, [n_test])
    return data.iloc[train_part], data.iloc[test_part]

In [None]:
# Split the merged data 50/50 with the illustrative NumPy-based helper
train_set, test_set = split_train_test(housing_num_enc_merged, 0.5)
print(f'Train set size: {len(train_set)}, Test set size: {len(test_set)}')

### Scikit learn split functions

To make repeated experiments reproducible, set `random_state` to some constant!

In [None]:
# Random 50/50 split with scikit-learn; the fixed random_state makes it repeatable
train_set, test_set = train_test_split(housing_num_enc_merged, test_size=0.5, random_state=42)

In [None]:
# Report the sizes of the two subsets
print(f'Train set size: {len(train_set)}, Test set size: {len(test_set)}')

### Stratified sampling


In [None]:
# Check the histogram of median income (the variable binned for stratification below)
housing_num_enc_merged["median_income"].hist()

In [None]:
# Cut median income into four bins (labels 1-4, last bin open-ended) and store
# the result as the helper column `income_cat`
housing_num_enc_merged["income_cat"] = pd.cut(housing_num_enc_merged["median_income"],
                               bins=[0.0, 2.0, 3.0, 4.0,  np.inf],
                                labels=[1, 2, 3, 4])

In [None]:
# Check for NaNs (values that fell outside every bin)
housing_num_enc_merged["income_cat"].isnull().any()

In [None]:
# housing_num_enc_merged["income_cat"]
# housing_num_enc_merged[housing_num_enc_merged["income_cat"].isnull().any(axis=1)].head()
# housing_num_enc_merged[housing_num_enc_merged["income_cat"].isnull()]['median_income'].hist()

In [None]:
# Check how the records are distributed over the income bins
housing_num_enc_merged["income_cat"].hist()

In [None]:
# housing_num_enc_merged["income_cat"] 

In [None]:
# Do stratified sampling based on the income category!
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# split() yields *positional* row indices, so the rows must be selected with
# .iloc; label-based .loc only gives the same rows while the index happens to be
# the default 0..n-1 range. With n_splits=1 there is exactly one split to take.
train_index, test_index = next(
    sss.split(housing_num_enc_merged, housing_num_enc_merged["income_cat"])
)
strat_train_set = housing_num_enc_merged.iloc[train_index]
strat_test_set = housing_num_enc_merged.iloc[test_index]

In [None]:
# First rows of the stratified training set
strat_train_set.head()

In [None]:
# The income category proportions in the stratified test set
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# The income category proportions in the full housing dataset (for comparison)
housing_num_enc_merged["income_cat"].value_counts() / len(housing_num_enc_merged)

### Is there a balance?

# 4. Select and train a model 

In [None]:
# Prepare training data 
X_train = strat_train_set.drop('median_house_value', axis=1)
y_train = strat_train_set['median_house_value']

In [None]:
# and testing data 
X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value']

### Linear relationship? 

In [None]:
# Model: linear regression fitted on the training set
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [None]:
# Evaluating: training MAE (predictions on the same data the model was fitted on)
housing_predictions = lin_reg.predict(X_train)
lin_mae = mean_absolute_error(y_train, housing_predictions)
print(f'Linear model training MAE: {lin_mae}') 

In [None]:
# Evaluating: testing MAE (predictions on the held-out test set)
housing_test_predictions = lin_reg.predict(X_test)
lin_mae_test = mean_absolute_error(y_test, housing_test_predictions)
print(f'Linear model testing MAE: {lin_mae_test}') 

In [None]:
# Training MAE of the linear model as a percentage of the median training target value
print(f'Median relative error: {round(lin_mae / y_train.median() * 100., 2)} %') 

### Non-linear relationship? 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with a fixed random_state, fitted on the training set
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

In [None]:
# Evaluating: training MAE (predictions on the same data the tree was fitted on)
housing_predictions = tree_reg.predict(X_train)
tree_mae = mean_absolute_error(y_train, housing_predictions)
print(f'Tree model training MAE: {tree_mae}') 

In [None]:
# Evaluating: testing MAE (predictions on the held-out test set)
housing_test_predictions = tree_reg.predict(X_test)
tree_mae_test = mean_absolute_error(y_test, housing_test_predictions)
print(f'Tree model testing MAE: {tree_mae_test}') 

#### What does it mean? 

In [None]:
# Training MAE of the tree (not the test MAE) as a percentage of the median training target value
print(f'Median relative error: {round(tree_mae / y_train.median() * 100., 2)} %') 

# Fine-tune the model

In [None]:
# Try Random Forest (unfitted base estimator handed to the grid search below)
forest_reg = RandomForestRegressor(random_state=42)

In [None]:
# Combinations for Grid Search Cross-validation (3 x 3 x 2 = 18 candidates)
hyperparameter_space = {'n_estimators': [20, 50, 100], 
                        'max_depth': [10, 15, 20],
                        'min_samples_leaf': [2, 4]
                        }

# 10-fold cross-validation scored by negated MAE, using 4 parallel jobs;
# training scores are recorded as well
gs = GridSearchCV(forest_reg, param_grid=hyperparameter_space, n_jobs=4,
                  scoring="neg_mean_absolute_error", cv=10, return_train_score=True)

In [None]:
# Run the grid search on the training set (be aware: TAKES LONG!)
gs.fit(X_train, y_train)

In [None]:
# Optimal hyperparameter combination found by the grid search
print("Optimal hyperparameter combination: ", gs.best_params_)

In [None]:
# Feature importances of the best random forest found by the grid search
feature_importances = gs.best_estimator_.feature_importances_

In [None]:
# Pair each importance with its feature name and list them, most important first
sorted(zip(feature_importances, list(X_train.columns)), reverse=True)

In [None]:
# Display the best estimator found by the grid search
gs.best_estimator_

In [None]:
# Select the best estimator as the final model; it is evaluated on the test set below
final_model = gs.best_estimator_

In [None]:
# The grid search is scored with "neg_mean_absolute_error", so best_score_ is a
# negated MAE (an error in target units), not an accuracy: flip the sign and
# label the value accordingly.
print("Mean cross-validated MAE of the best_estimator: ", round((-gs.best_score_), 2))

In [None]:
# Evaluating: testing MAE of the final model on the held-out test set
housing_test_predictions = final_model.predict(X_test)
final_model_mae_test = mean_absolute_error(y_test, housing_test_predictions)
print(f'Final model testing MAE: {final_model_mae_test}') 

In [None]:
# Test MAE of the final model as a percentage of the median *training* target value
print(f'Median relative error: {round(final_model_mae_test / y_train.median() * 100., 2)} %') 