In [1]:
# Session configuration for Sparkling Water: jar/zip locations, Spark/YARN settings and driver/executor sizing.
%%configure -f
{
    "conf":{
        "spark.ext.h2o.announce.rest.url": "http://ed10-mareks.qxyas5oni4vulmjdyxf3kkjrlb.ax.internal.cloudapp.net:5000/flows",
        "spark.jars":"/H2O-Sparkling-Water-files/sparkling-water-assembly-all.jar",
        "spark.submit.pyFiles":"/H2O-Sparkling-Water-files/pySparkling.zip",
        "spark.locality.wait":"3000",
        "spark.scheduler.minRegisteredResourcesRatio":"1",
        "spark.task.maxFailures":"1",
        "spark.yarn.am.extraJavaOptions":"-XX:MaxPermSize=384m",
        "spark.yarn.max.executor.failures":"1",
        "maximizeResourceAllocation": "true"
    },
    "driverMemory":"21G",
    "executorMemory":"21G",
    "numExecutors":6
}

In [2]:
# Prepare the Python environment for PySparkling; the H2OContext itself is created in the next cell.

import pyspark
import os

# NOTE(review): the value is stored literally -- os.environ does not expand "~".
os.environ.update({"PYTHON_EGG_CACHE": "~/"})

# For Azure DataLake replace wasb with adl
sc.addPyFile("wasb:///H2O-Sparkling-Water-files/pySparkling.zip")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
29,application_1574552528623_0005,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [3]:
import pysparkling
import h2o

# Obtain the H2OContext for the existing SparkContext `sc` (created on first call, reused afterwards).
h2o_context = pysparkling.H2OContext.getOrCreate(sc)

Connecting to H2O server at http://10.0.0.15:54323 ... successful.
--------------------------  ------------------------------------------------------------------
H2O cluster uptime:         13 secs
H2O cluster timezone:       Etc/UTC
H2O data parsing timezone:  UTC
H2O cluster version:        3.26.0.5
H2O cluster version age:    2 months and 7 days
H2O cluster name:           sparkling-water-yarn_application_1574552528623_0005
H2O cluster total nodes:    6
H2O cluster free memory:    112.0 Gb
H2O cluster total cores:    48
H2O cluster allowed cores:  18
H2O cluster status:         accepting new members, healthy
H2O connection url:         http://10.0.0.15:54323
H2O connection proxy:
H2O internal security:      False
H2O API Extensions:         XGBoost, Algos, Amazon S3, AutoML, Core V3, TargetEncoder, Core V4
Python version:             2.7.12 final
--------------------------  ------------------------------------------------------------------

Sparkling Water Context:
 * Sparkling Wate

In [4]:
# Load the training set (0.1m-row file) and the test set as H2O frames from CSV files
# kept in the Azure storage account attached to the cluster.
from pyspark import SparkFiles
import h2o

# The files are imported in order: training data first, then test data.
train, test = (
    h2o.import_file(csv_url)
    for csv_url in (
        "wasb://bdclusterstore@bdclusterhdistorage1.blob.core.windows.net/csvFiles/train-0.1m.csv",
        "wasb://bdclusterstore@bdclusterhdistorage1.blob.core.windows.net/csvFiles/test.csv",
    )
)

Parse progress: [#########################################################] 100%
Parse progress: [#########################################################] 100%

In [5]:
# The response is the "dep_delayed_15min" column; every other column of `train` is a predictor.
response_column = "dep_delayed_15min"
predictor_columns = train.drop(response_column).col_names

In [6]:
# Train an H2O random forest and record the wall-clock time just before training starts.
# Parameters passed to the estimator:
##  ntrees       = 100
##  max_depth    = 20
##  nbins        = 50
##  distribution = bernoulli

from h2o.estimators.random_forest import H2ORandomForestEstimator
import time

model = H2ORandomForestEstimator(
    ntrees=100,
    max_depth=20,
    nbins=50,
    distribution="bernoulli",
)

# `start` is read by the following cell to report the elapsed training time.
start = time.time()
model.train(
    x=predictor_columns,
    y=response_column,
    training_frame=train,
    validation_frame=test,
)


drf Model Build progress: [###############################################] 100%

In [7]:
# Print the wall-clock seconds elapsed since `start`, which was recorded just before model.train() in the training cell.
# NOTE(review): `end` is taken when this cell executes, so any delay between running the
# training cell and this one is included in the reported figure.
end = time.time()
print(end - start)

234.419556856

In [8]:
# Compute and display the model's performance metrics on the test frame.
# NOTE(review): `test` was also supplied as validation_frame when the model was trained.
model.model_performance(test_data=test)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.151781610747
RMSE: 0.38959159481
LogLoss: 0.486662142822
Mean Per-Class Error: 0.34857896664
AUC: 0.711090904292
pr_auc: 0.418172272964
Gini: 0.422181808584

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.186182007732:        N      Y      Error    Rate
-----  -----  -----  -------  ------------------
N      50506  28009  0.3567   (28009.0/78515.0)
Y      7314   14171  0.3404   (7314.0/21485.0)
Total  57820  42180  0.3532   (35323.0/100000.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.186182     0.445174  260
max f2                       0.0808954    0.608115  347
max f0point5                 0.340547     0.423914  151
max accuracy                 0.494404     0.79429   77
max precision                0.901481     1         0
max recall               