In [1]:
#
# STEP 1 - Generating an artificial dataset:
#
# reproduced what was done in the video in python.
#
import pandas as pd
import numpy as np
import random
from math import sqrt
import matplotlib.pyplot as plt
import h2o
from h2o.estimators import H2OGradientBoostingEstimator


number_of_records = 5000

ids = [x for x in range(0,number_of_records)]
random_category = ['A', 'B', 'C', 'D']
age = [round(random.uniform(18.0, 75.0), 2) for x in range(0, number_of_records)]
healthyEating = [round(x, 0) for x in np.random.normal(loc=5, scale=1.0, size=number_of_records)]
activeLifestyle = [round(x, 0) for x in np.random.normal(loc=5, scale=1.0, size=number_of_records)]

dataframe = pd.DataFrame(ids, columns=['id'])
dataframe['random_category'] = [random_category[random.randint(0,len(random_category)-1)] for x in range(0, len(ids))]
dataframe['age'] = age
dataframe['healthyEating'] = healthyEating
dataframe['activeLifestyle'] = activeLifestyle
dataframe['activeLifestyle'] = np.where(dataframe['age'] < 30, dataframe['activeLifestyle'] + 1 ,  dataframe['activeLifestyle'])
dataframe['income'] = 20000 + ((dataframe['age']*3)**2)
dataframe['income'] = np.where(dataframe['activeLifestyle'] < 5, dataframe['income'] * 1.3, dataframe['income'])
dataframe['income'] = np.where(dataframe['activeLifestyle'] > 5, dataframe['income'] * 0.7, dataframe['income'])
dataframe['income'] = np.where(dataframe['healthyEating'] > 5, dataframe['income'] * 1.1, dataframe['income'])

#
# add noise to the income column
#
dataframe['income'] = dataframe['income'] + random.randint(0,4000)
dataframe['income'] = np.ceil(dataframe['income']/100)*100


In [2]:
#
# STEP 2 - start h2o, and import data
#

h2o.init()

#
# import  dataframe in h2o instance
#
dataframe = h2o.H2OFrame(dataframe)


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,3 hours 37 mins
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.1
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,H2O_from_python_Gerd_mpc823
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.969 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
#
# STEP 3 - Split the data. Split into train and test (for Crossvalidation).
#

train, test = dataframe.split_frame(ratios=[0.8], destination_frames=["train", "test"], seed=123)


In [4]:
#
# STEP 4 - creating a GBM classification model. Show the results, on both training and test data. 
# Show all the performance stats for training and testdata.
#

#
# ignoring irrelevant fields for training in the dataset
#
target_variable = 'income'
ignoreFields = [target_variable, 'id']
input_variables = [i for i in train.names if i not in ignoreFields]

#
# using cross validation
#
gbm_model_1 = H2OGradientBoostingEstimator(model_id = "gbm-coursera", ntrees=60, nfolds=5, max_depth=5, learn_rate=0.2)
gbm_model_1.train(input_variables, target_variable, train)

#
# showing model performance
#
print("Performance on testdata: \n", gbm_model_1.model_performance(test))
print("Performance on trainingdata: \n",gbm_model_1.model_performance(train))

#
# storing the scoring results of model 1 for comparison
#
gbm_model_1_rmse = gbm_model_1.rmse(xval = True)
gbm_model_1_mae = gbm_model_1.mae(xval = True) 

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Performance on testdata: 
 
ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 79480.17397775457
RMSE: 281.9222835778587
MAE: 201.41323693016932
RMSLE: 0.006198023640633325
Mean Residual Deviance: 79480.17397775457

Performance on trainingdata: 
 
ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 47025.55971366976
RMSE: 216.85377495831094
MAE: 157.24448661343834
RMSLE: 0.004761168440103185
Mean Residual Deviance: 47025.55971366976



In [5]:
#
# STEP 5 - try some alternative parameters, to build a different model, and show how the results differ.
#

#
# using cross validation
# changed parameters compared to the model above:
# increased ntrees to 80, increased max_depth to 6, decreased the learn_rate to 0.01 (since using higher ntrees)
# 
# Should give better results than the first model!
#
gbm_model_2 = H2OGradientBoostingEstimator(model_id = "gbm-coursera-tweaked-params", ntrees=80, nfolds=5, max_depth=6, learn_rate = 0.1)
gbm_model_2.train(input_variables, target_variable, train)

#
# showing model performance
#
print("Performance on testdata: \n", gbm_model_2.model_performance(test))
print("Performance on trainingdata: \n",gbm_model_2.model_performance(train))

#
# storing the scoring results of model 2 for comparison
#
gbm_model_2_rmse = gbm_model_2.rmse(xval = True)
gbm_model_2_mae = gbm_model_2.mae(xval = True) 


gbm Model Build progress: |███████████████████████████████████████████████| 100%
Performance on testdata: 
 
ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 40752.463626682904
RMSE: 201.87239441459772
MAE: 131.96896280739773
RMSLE: 0.004217003296384585
Mean Residual Deviance: 40752.463626682904

Performance on trainingdata: 
 
ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 17985.186729576875
RMSE: 134.10886148788555
MAE: 93.9869185063788
RMSLE: 0.002893957903522658
Mean Residual Deviance: 17985.186729576875



In [6]:
#
# This section is to compare the scorings of both the first model and the second model (with tweaked parameters)
#
print("RESULT COMPARISON:\n")
print("RMSE Model 1:",gbm_model_1_rmse,"\n","MAE Model 1:",gbm_model_1_mae,"\n","\n"
      "RMSE Model 2 (Tweaked Parameters):",gbm_model_2_rmse,"\n","MAE Model 2 (Tweaked Parameters):",gbm_model_2_mae)


RESULT COMPARISON:

RMSE Model 1: 299.07816050028225 
 MAE Model 1: 208.76301456022514 
 
RMSE Model 2 (Tweaked Parameters): 215.35022500251927 
 MAE Model 2 (Tweaked Parameters): 145.36858048765177
