In [1]:
import os
import tarfile

# import matplotlib as mpl
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint
from six.moves import urllib
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedShuffleSplit,
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor


import mlflow
import mlflow.sklearn


# import urllib.request

In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

In [3]:
# mlflow server --backend-store-uri mlruns/ --default-artifact-root mlruns/ --host 0.0.0.0 --port 5000
remote_server_uri = "http://0.0.0.0:5000" # set to your server URI
mlflow.set_tracking_uri(remote_server_uri)  # or set the MLFLOW_TRACKING_URI in the env

In [4]:
mlflow.tracking.get_tracking_uri()

'http://0.0.0.0:5000'

In [5]:
data_path = "data/"

In [6]:
exp_name = "HousingPricePrediction"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='mlruns/2', experiment_id='2', lifecycle_stage='active', name='HousingPricePrediction', tags={}>

In [7]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


fetch_housing_data(HOUSING_URL, HOUSING_PATH)


# import pandas as pd


def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


housing = load_housing_data()


In [8]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [9]:
def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

In [10]:
with mlflow.start_run(run_name='PARENT_RUN') as parent_run:
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

    housing["income_cat"] = pd.cut(
        housing["median_income"],
        bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
        labels=[1, 2, 3, 4, 5],
    )
    
    with mlflow.start_run(run_name='DataPreparation', nested=True) as child_run_1:
        #mlflow.log_param("child", "yes")
        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        for train_index, test_index in split.split(housing, housing["income_cat"]):
            strat_train_set = housing.loc[train_index]
            strat_test_set = housing.loc[test_index]
        train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
        
        compare_props = pd.DataFrame(
        {
            "Overall": income_cat_proportions(housing),
            "Stratified": income_cat_proportions(strat_test_set),
            "Random": income_cat_proportions(test_set),
        }
        ).sort_index()
        compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
        compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
        
        
        for set_ in (strat_train_set, strat_test_set):
            set_.drop("income_cat", axis=1, inplace=True)
        housing = strat_train_set.copy()
        corr_matrix = housing.corr()
        corr_matrix["median_house_value"].sort_values(ascending=False)
        housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
        housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
        housing["population_per_household"] = housing["population"] / housing["households"]

        housing = strat_train_set.drop("median_house_value", axis=1)  # drop labels for training set
        housing_labels = strat_train_set["median_house_value"].copy()
        
        imputer = SimpleImputer(strategy="median")

        housing_num = housing.drop("ocean_proximity", axis=1)

        imputer.fit(housing_num)
        X = imputer.transform(housing_num)

        housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)
        housing_tr["rooms_per_household"] = housing_tr["total_rooms"] / housing_tr["households"]
        housing_tr["bedrooms_per_room"] = housing_tr["total_bedrooms"] / housing_tr["total_rooms"]
        housing_tr["population_per_household"] = housing_tr["population"] / housing_tr["households"]

        housing_cat = housing[["ocean_proximity"]]
        housing_prepared = housing_tr.join(pd.get_dummies(housing_cat, drop_first=True))
        
        
    with mlflow.start_run(run_name='Modelling', nested=True) as child_run_2:
        param_grid = [
        # try 12 (3×4) combinations of hyperparameters
        {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
        # then try 6 (2×3) combinations with bootstrap set as False
        {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
        ]

    forest_reg = RandomForestRegressor(random_state=42)
    # train across 5 folds, that's a total of (12+6)*5=90 rounds of training
    grid_search = GridSearchCV(
        forest_reg,
        param_grid,
        cv=5,
        scoring="neg_mean_squared_error",
        return_train_score=True,
    )
    grid_search.fit(housing_prepared, housing_labels)

    grid_search.best_params_
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    feature_importances = grid_search.best_estimator_.feature_importances_
    sorted(zip(feature_importances, housing_prepared.columns), reverse=True)


    final_model = grid_search.best_estimator_

    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    X_test_num = X_test.drop("ocean_proximity", axis=1)
    X_test_prepared = imputer.transform(X_test_num)
    X_test_prepared = pd.DataFrame(X_test_prepared, columns=X_test_num.columns, index=X_test.index)
    X_test_prepared["rooms_per_household"] = X_test_prepared["total_rooms"] / X_test_prepared["households"]
    X_test_prepared["bedrooms_per_room"] = X_test_prepared["total_bedrooms"] / X_test_prepared["total_rooms"]
    X_test_prepared["population_per_household"] = X_test_prepared["population"] / X_test_prepared["households"]

    X_test_cat = X_test[["ocean_proximity"]]
    X_test_prepared = X_test_prepared.join(pd.get_dummies(X_test_cat, drop_first=True))


    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    
    
       # Log parameter, metrics, and model to MLflow

    mlflow.log_metrics({"final_mse": final_mse,"final_rmse":final_rmse})
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_artifact(data_path)
    print("Save to: {}".format(mlflow.get_artifact_uri()))

    mlflow.sklearn.log_model(grid_search, "model")


64062.72861884957 {'max_features': 2, 'n_estimators': 3}
55479.21774655382 {'max_features': 2, 'n_estimators': 10}
52966.124941638336 {'max_features': 2, 'n_estimators': 30}
58363.665560496614 {'max_features': 4, 'n_estimators': 3}
52396.19523340483 {'max_features': 4, 'n_estimators': 10}
50215.10958041685 {'max_features': 4, 'n_estimators': 30}
59020.00126496427 {'max_features': 6, 'n_estimators': 3}
52006.13620481499 {'max_features': 6, 'n_estimators': 10}
50051.59528363614 {'max_features': 6, 'n_estimators': 30}
58910.833233016325 {'max_features': 8, 'n_estimators': 3}
52362.774093865475 {'max_features': 8, 'n_estimators': 10}
50273.65306916561 {'max_features': 8, 'n_estimators': 30}
63087.194157215 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54773.96059466593 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60379.84359138394 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
51887.898627977534 {'bootstrap': False, 'max_features': 3, 'n_estimators':



In [11]:
final_predictions

array([491164.13333333, 256180.06666667, 215153.33333333, ...,
       364546.73333333, 258216.66666667, 214690.        ])

In [14]:
print("parent run_id: {}".format(parent_run.info.run_id))
print("child run_id : {}".format(child_run_1.info.run_id))
print("child run_id : {}".format(child_run_2.info.run_id))
print("--")

parent run_id: 592b655273084727a2afdae35cdb6de6
child run_id : cb995a3f693d4d42a3f35b935f995c80
child run_id : ddc0d9be23b84d698ddacaa165be89f8
--


In [21]:
query = "tags.mlflow.parentRunId = '{}'".format(parent_run.info.run_id)
results = mlflow.search_runs(filter_string=query)
print(results[["run_id", "tags.mlflow.runName"]])

                             run_id tags.mlflow.runName
0  ddc0d9be23b84d698ddacaa165be89f8           Modelling
1  cb995a3f693d4d42a3f35b935f995c80     DataPreparation
