![](logo1.jpg)

# **shAI Training 2023 | Level 1**

## Task #8 (End-to-End ML Project {part_2})

## Welcome to the exercises for reviewing second part of end to end ML project.
**Make sure that you read and understand ch2 from the hands-on ML book (page 72 to the end of the chapter ) before start with this notebook.**

**If you stuck with anything reread that part from the book and feel free to ask about anything in the messenger group as you go along.**

 ## Good Luck : )

## first run the following cell for the first part of the project to continue your work

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [41]:
import os
import tarfile
import urllib
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):
   csv_path = os.path.join(housing_path, "housing.csv")
   return pd.read_csv(csv_path)

fetch_housing_data()
housing = load_housing_data()

rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('attribs_adder', CombinedAttributesAdder()),
 ('std_scaler', StandardScaler())])

full_pipeline = ColumnTransformer([
 ("num", num_pipeline, num_attribs),
 ("cat", OneHotEncoder(), cat_attribs)])

housing_prepared = full_pipeline.fit_transform(housing)

In [42]:
housing_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [43]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND
...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,<1H OCEAN
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,INLAND
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,<1H OCEAN


# 1- Select and Train a Model

# Let’s first train a LinearRegression model

In [44]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

# First try it out on a few instances from the training set:


In [45]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

In [46]:
some_data_prepared = full_pipeline.transform(some_data)
print('predictions : ', lin_reg.predict(some_data_prepared))
print('Labels : ', list(some_labels))

predictions :  [181746.54359616 290558.74973505 244957.50017771 146498.51061398
 163230.42393939]
Labels :  [103000.0, 382100.0, 172600.0, 93400.0, 96500.0]


# measure this regression model’s RMSE on the whole training set
* sing Scikit-Learn’s mean_squared_error() function:

In [47]:
from sklearn.metrics import mean_squared_error

In [48]:
# CODE HERE
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = mean_squared_error(housing_labels, housing_predictions, squared = False)
lin_rmse

67593.20745775253

# judge on the RMSE result for this model
write down your answar

most district 'median_houseing_value' range between $14999 and $500001, so a typical prediction error of $67593 is not very satisfying. This is because an under fitting occurred

your answer goes here

# Let’s train a Decision Tree Regressor model
## more powerful model

In [49]:
from sklearn.tree import DecisionTreeRegressor

In [50]:
# CODE HERE
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)

# Now evaluate the model on the training set
* using Scikit-Learn’s mean_squared_error() function:

In [51]:
# CODE HERE
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels,housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

# Explaine this result
write down your answar

your answer goes here

# Evaluation Using Cross-Validation

1-split the training set into 10 distinct subsets then train and evaluate the Decision Tree model

In [52]:
from sklearn.model_selection import cross_val_score

In [53]:
# CODE HERE
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring ="neg_mean_squared_error",cv = 10)
tree_rmse_scores = np.sqrt(-scores)

2- display the resultant scores and calculate its Mean and Standard deviation

In [54]:
# CODE HERE
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard Deviation: ", scores.std())
display_scores(tree_rmse_scores)

Scores:  [65705.70001763 69944.32536591 69763.499016   69383.01210253
 73060.03836308 67636.59586247 66894.16029761 69558.59734501
 67943.03506504 69712.07229675]
Mean:  68960.10357320502
Standard Deviation:  1927.9904192563984


3-repaet the same steps to compute the same scores for the Linear Regression  model

*notice the difference between the results of the two models*

In [55]:
# CODE HERE
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring ="neg_mean_squared_error",cv = 10)
lin_reg_scores = np.sqrt(-scores)
display_scores(lin_reg_scores)

Scores:  [65000.67382615 70960.56056304 67122.63935124 66089.63153865
 68402.54686442 65266.34735288 65218.78174481 68525.46981754
 72739.87555996 68957.34111906]
Mean:  67828.38677377408
Standard Deviation:  2468.0913950652275


Let's say there is no difference between linear regression and the Decision Tree, The error is considered equal.


## Let’s train one last model the RandomForestRegressor.

In [56]:
# CODE HERE
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators = 100, random_state = 42)
forest_reg.fit(housing_prepared, housing_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse


18527.322990316152

# repeat the same steps to compute the same scores its Mean and Standard deviation for the Random Forest model

In [57]:
# CODE HERE
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,scoring = "neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores:  [47341.96931397 51653.53070248 49360.29148883 51625.62777032
 52771.91063892 46989.97118038 47333.72603398 50636.24303693
 48951.73251683 50183.60590465]
Mean:  49684.86085873057
Standard Deviation:  1929.9797084102233


# Save every model you experiment with
*using the joblib library*

In [58]:
# CODE HERE
import joblib
joblib.dump(forest_reg, "housing_model.pkl")
loaded_model = joblib.load("housing_model.pkl")

## now you have a shortlist of promising models. You now need to
## fine-tune them!
# Fine-Tune Your Model

## 1- Grid Search
## evaluate all the possible combinations of hyperparameter values for the RandomForestRegressor
*It may take a long time*

In [59]:
from sklearn.model_selection import GridSearchCV

In [60]:
# CODE HERE
param_grid = [
              {'n_estimators': [3,10,30], 'max_features':[2,4,6,8]},
              {'bootstrap':[False], 'max_features':[2,3,4],'n_estimators':[3,10]}
]
forest_reg = RandomForestRegressor(random_state = 42)

grid_search = GridSearchCV(forest_reg,param_grid,cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True)
grid_search.fit(housing_prepared, housing_labels)

with the evaluation scores

In [61]:
# CODE HERE
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
  print(np.sqrt(-mean_score), params)

64878.27480854276 {'max_features': 2, 'n_estimators': 3}
55391.003575336406 {'max_features': 2, 'n_estimators': 10}
52721.66494842234 {'max_features': 2, 'n_estimators': 30}
58541.12715494087 {'max_features': 4, 'n_estimators': 3}
51623.59366665994 {'max_features': 4, 'n_estimators': 10}
49787.65951361993 {'max_features': 4, 'n_estimators': 30}
58620.88234614251 {'max_features': 6, 'n_estimators': 3}
51645.862673140065 {'max_features': 6, 'n_estimators': 10}
49917.66994061786 {'max_features': 6, 'n_estimators': 30}
58640.96129790229 {'max_features': 8, 'n_estimators': 3}
51650.365581628095 {'max_features': 8, 'n_estimators': 10}
49672.50940389753 {'max_features': 8, 'n_estimators': 30}
61580.24110015614 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53889.80996032937 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
58667.89389226964 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52764.2630869393 {'bootstrap': False, 'max_features': 3, 'n_estimators': 

# Analyze the Best Models and Their Errors
1-indicate the relative importance of each attribute

In [62]:
# CODE HERE
final_model = grid_search.best_estimator_
feature_importances = final_model.feature_importances_
feature_importances.round(4)

array([6.840e-02, 6.490e-02, 4.170e-02, 1.450e-02, 1.370e-02, 1.430e-02,
       1.300e-02, 3.718e-01, 4.950e-02, 1.098e-01, 6.120e-02, 7.400e-03,
       1.650e-01, 2.000e-04, 1.800e-03, 2.700e-03])

2-display these importance scores next to their corresponding attribute names:

In [63]:
# CODE HERE
housing_cat = housing[['ocean_proximity']]
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe_housing = ohe.fit_transform(housing_cat)
extra_attribs = ["population_per_household","bedroom_per_room","rooms_per_household"]
num_attribs = list(train_set.drop(["median_house_value"],axis = 1).columns)
cat_one_hot_attribs = list(ohe.categories_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances,attributes),reverse = True)

[(0.37183388814667373, 'median_income'),
 (0.16501259905352061,
  array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
        dtype=object)),
 (0.1097583570805625, 'population_per_household'),
 (0.06844933917056337, 'longitude'),
 (0.06491313404989336, 'latitude'),
 (0.061176949819538154, 'bedroom_per_room'),
 (0.04945029095965893, 'ocean_proximity'),
 (0.0417428332867604, 'housing_median_age'),
 (0.014515821649955954, 'total_rooms'),
 (0.014300165080528202, 'population'),
 (0.013706064997348836, 'total_bedrooms'),
 (0.012959133102106802, 'households'),
 (0.0073955403637295455, 'rooms_per_household')]

In [64]:
train_set_cat = train_set[['ocean_proximity']]
test_set_cat = test_set[['ocean_proximity']]

ohe_train = OneHotEncoder()
ohe_test = OneHotEncoder()
ohe_train_cat = ohe_train.fit_transform(train_set_cat)
ohe_test_cat = ohe_test.fit_transform(test_set_cat)

## Now is the time to evaluate the final model on the test set.
# Evaluate Your System on the Test Set

1-get the predictors and the labels from your test set

In [76]:
# CODE HERE
X_test = test_set.drop("median_house_value", axis=1)
y_test = test_set["median_house_value"].copy()

2-run your full_pipeline to transform the data

In [77]:
# CODE HERE
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

3-evaluate the final model on the test set

In [78]:
# CODE HERE
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

49198.020631676336

# compute a 95% confidence interval for the generalization error
*using scipy.stats.t.interval():*

In [79]:
from scipy import stats

In [80]:
# CODE HERE
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

array([46948.10215126, 51349.4515311 ])

# Great Job!
# #shAI_Club