In [1]:
%%configure -f
{
    "conf":{
        "spark.ext.h2o.announce.rest.url": "http://ed10-mareks.qxyas5oni4vulmjdyxf3kkjrlb.ax.internal.cloudapp.net:5000/flows",
        "spark.jars":"/H2O-Sparkling-Water-files/sparkling-water-assembly-all.jar",
        "spark.submit.pyFiles":"/H2O-Sparkling-Water-files/pySparkling.zip",
        "spark.locality.wait":"3000",
        "spark.scheduler.minRegisteredResourcesRatio":"1",
        "spark.task.maxFailures":"1",
        "spark.yarn.am.extraJavaOptions":"-XX:MaxPermSize=384m",
        "spark.yarn.max.executor.failures":"1",
        "maximizeResourceAllocation": "true"
    },
    "driverMemory":"41G",
    "executorMemory":"41G",
    "numExecutors":9
}

In [2]:
# Prepare the Spark session for PySparkling: set the Python egg-cache location
# and register the PySparkling archive with the SparkContext (`sc`).

import pyspark
import os

# NOTE(review): os.environ stores this string verbatim; "~" is not expanded here.
EGG_CACHE_DIR = "~/"
# For Azure Data Lake storage, replace the wasb scheme with adl.
PYSPARKLING_ARCHIVE = "wasb:///H2O-Sparkling-Water-files/pySparkling.zip"

os.environ["PYTHON_EGG_CACHE"] = EGG_CACHE_DIR
sc.addPyFile(PYSPARKLING_ARCHIVE)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1574529212966_0006,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [3]:
# Obtain the H2OContext for the running SparkContext (`sc`); getOrCreate
# returns the existing context if there is one, otherwise it creates it.
import pysparkling
import h2o

context_factory = pysparkling.H2OContext
h2o_context = context_factory.getOrCreate(sc)

Connecting to H2O server at http://10.0.0.4:54321 ... successful.
--------------------------  ------------------------------------------------------------------
H2O cluster uptime:         13 secs
H2O cluster timezone:       Etc/UTC
H2O data parsing timezone:  UTC
H2O cluster version:        3.26.0.5
H2O cluster version age:    2 months and 7 days
H2O cluster name:           sparkling-water-yarn_application_1574529212966_0006
H2O cluster total nodes:    3
H2O cluster free memory:    80.0 Gb
H2O cluster total cores:    24
H2O cluster allowed cores:  9
H2O cluster status:         accepting new members, healthy
H2O connection url:         http://10.0.0.4:54321
H2O connection proxy:
H2O internal security:      False
H2O API Extensions:         XGBoost, Algos, Amazon S3, AutoML, Core V3, TargetEncoder, Core V4
Python version:             2.7.12 final
--------------------------  ------------------------------------------------------------------

Sparkling Water Context:
 * Sparkling Water Ve

In [4]:
# Load the training (1M-row file) and test CSVs from Azure blob storage into H2O frames.
from pyspark import SparkFiles
import h2o

TRAIN_CSV_URI = "wasb://bdclusterstore@bdclusterhdistorage1.blob.core.windows.net/csvFiles/train-1m.csv"
TEST_CSV_URI = "wasb://bdclusterstore@bdclusterhdistorage1.blob.core.windows.net/csvFiles/test.csv"

train = h2o.import_file(TRAIN_CSV_URI)
test = h2o.import_file(TEST_CSV_URI)

Parse progress: [#########################################################] 100%
Parse progress: [#########################################################] 100%

In [None]:
# Name the response column once, then treat every other column of `train` as a predictor.
response_column = "dep_delayed_15min"
predictor_columns = train.drop(response_column).col_names

In [None]:
# Train a random forest (DRF) model and print the wall-clock training time in seconds.
# Parameters used:
##  ntrees       = 90
##  max_depth    = 20
##  nbins        = 50
##  distribution = bernoulli
##  seed         = 42  (fixed so that re-runs on the same cluster setup are repeatable)
### Note: The model crashed while running with ntrees = 100, hence ntrees = 90 is the maximum used for this cluster configuration.

from h2o.estimators.random_forest import H2ORandomForestEstimator
import time

model = H2ORandomForestEstimator( ntrees = 90,
                                  max_depth = 20,
                                  nbins = 50,
                                  distribution = "bernoulli",
                                  seed = 42)

# time.time() is kept deliberately: the remote interpreter is Python 2.7,
# which has no time.perf_counter().
start = time.time()
model.train( x = predictor_columns,
             y = response_column,
             training_frame = train,
             validation_frame = test)
end = time.time()
print(end - start)

drf Model Build progress: [###############################################] 100%
696.191375971

In [None]:
# Score the trained model against the test frame and display the resulting metrics.
test_metrics = model.model_performance(test_data = test)
test_metrics


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.144498427627
RMSE: 0.380129487973
LogLoss: 0.454764904542
Mean Per-Class Error: 0.327592161165
AUC: 0.742294061584
pr_auc: 0.47528849695
Gini: 0.484588123169

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.228611101343:        N      Y      Error    Rate
-----  -----  -----  -------  ------------------
N      57774  20741  0.2642   (20741.0/78515.0)
Y      8532   12953  0.3971   (8532.0/21485.0)
Total  66306  33694  0.2927   (29273.0/100000.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.228611     0.46949   231
max f2                       0.0992576    0.62335   325
max f0point5                 0.373806     0.462857  149
max accuracy                 0.485265     0.80324   100
max precision                0.981532     1         0
max recall             