In [1]:
from pysparkling import *
from load_susy_into_df import susy_csv_to_df

In [2]:
path_to_csv = '/home/varsrao/Downloads/SUSY.csv'
chunksize = 100000
num_train_chunks = 10
num_test_chunks = 10

In [3]:
print('----Loading Dataset----')
ds_train_pd_df, ds_test_pd_df, target_col_name, target_col_idx, feature_col_names = susy_csv_to_df(
        path_to_csv, chunksize, num_train_chunks, num_test_chunks)
col_names = [target_col_name]+feature_col_names

----Loading Dataset----


In [4]:
print('----Creating H2O Context----')
hc = H2OContext.getOrCreate(spark)

----Creating H2O Context----
Connecting to H2O server at http://192.168.2.253:54323... successful.


0,1
H2O cluster uptime:,12 secs
H2O cluster timezone:,America/Toronto
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.4
H2O cluster version age:,17 days
H2O cluster name:,sparkling-water-varsrao_local-1551834688821
H2O cluster total nodes:,1
H2O cluster free memory:,778 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4



Sparkling Water Context:
 * H2O name: sparkling-water-varsrao_local-1551834688821
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,192.168.2.253,54323)
  ------------------------

  Open H2O Flow in browser: http://192.168.2.253:54323 (CMD + click in Mac OSX)

    


In [5]:
print('----Creating H2O Frame----')
import h2o
ds_f = h2o.H2OFrame(ds_train_pd_df, column_names=col_names)
ds_test_f = h2o.H2OFrame(ds_test_pd_df, column_names=col_names)
ds_test_f[target_col_name] = ds_test_f[target_col_name].asfactor()

h2o.cluster().timezone = "Etc/UTC"

----Creating H2O Frame----
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
print('----Assembling Data----')
ds_f[target_col_name] = ds_f[target_col_name].asfactor()
ds_f_splits = ds_f.split_frame(ratios=[0.8])
ds_train_f, ds_val_f = ds_f_splits
predictor_columns = ds_train_f.drop(target_col_name).col_names
response_column = target_col_name

----Assembling Data----


In [7]:
print('----Training Model----')
# Create and train GBM model
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Prepare model based on the given set of parameters
gbm_model = H2OGradientBoostingEstimator(ntrees       = 50,
                                         max_depth    = 3,
                                         learn_rate   = 0.1,
                                         distribution = "bernoulli"
                                        )

glm_model = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])

gbm_model.train(x            = predictor_columns,
            y                = response_column,
            training_frame   = ds_train_f,
            validation_frame = ds_val_f
         )

glm_model.train(x            = predictor_columns,
            y                = response_column,
            training_frame   = ds_train_f,
            validation_frame = ds_val_f
         )

----Training Model----
gbm Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%


In [8]:
print('----Testing Model----')
gbm_model.model_performance(ds_test_f)
glm_model.model_performance(ds_test_f)

----Testing Model----

ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.14851441979949484
RMSE: 0.38537568657025423
LogLoss: 0.45874377647705206
Null degrees of freedom: 999999
Residual degrees of freedom: 999984
Null deviance: 1378988.740405002
Residual deviance: 917487.552954104
AIC: 917519.552954104
AUC: 0.8575438005385447
pr_auc: 0.7656807710929513
Gini: 0.7150876010770895
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3692238803642497: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,418336.0,124377.0,0.2292,(124377.0/542713.0)
1,98971.0,358316.0,0.2164,(98971.0/457287.0)
Total,517307.0,482693.0,0.2233,(223348.0/1000000.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3692239,0.7623907,233.0
max f2,0.1885384,0.8274261,322.0
max f0point5,0.6046688,0.8030148,145.0
max accuracy,0.4750199,0.788901,190.0
max precision,0.9999094,0.9945110,0.0
max recall,0.0037385,1.0,399.0
max specificity,0.9999094,0.9995559,0.0
max absolute_mcc,0.5144700,0.5765774,176.0
max min_per_class_accuracy,0.3763389,0.7777610,230.0


Gains/Lift Table: Avg response rate: 45.73 %, avg score: 45.75 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.01,0.9999997,2.1835303,2.1835303,0.9985,1.0000000,0.9985,1.0000000,0.0218353,0.0218353,118.3530256,118.3530256
,2,0.02,0.9999902,2.1793753,2.1814528,0.9966,0.9999969,0.99755,0.9999984,0.0217938,0.0436291,117.9375316,118.1452786
,3,0.03,0.9999233,2.1717215,2.1782090,0.9931,0.9999649,0.9960667,0.9999872,0.0217172,0.0653463,117.1721479,117.8209017
,4,0.04,0.9996811,2.1690973,2.1759311,0.9919,0.9998241,0.995025,0.9999465,0.0216910,0.0870372,116.9097307,117.5931089
,5,0.05,0.9990604,2.1649424,2.1737333,0.99,0.9994103,0.99402,0.9998392,0.0216494,0.1086867,116.4942367,117.3733345
,6,0.1,0.9808670,2.1349393,2.1543363,0.97628,0.9926687,0.98515,0.9962540,0.1067470,0.2154336,113.4939327,115.4336336
,7,0.15,0.9227699,2.0643053,2.1243260,0.94398,0.9554187,0.9714267,0.9826422,0.1032153,0.3186489,106.4305349,112.4326007
,8,0.2,0.8287893,1.9432873,2.0790663,0.88864,0.8778912,0.95073,0.9564545,0.0971644,0.4158133,94.3287257,107.9066319
,9,0.3,0.6205907,1.6604452,1.9395259,0.7593,0.7228861,0.88692,0.8785983,0.1660445,0.5818578,66.0445191,93.9525943







In [9]:
print('----Testing Model v2----')
predict_table_gbm = gbm_model.predict(ds_test_f)
predict_table_glm = glm_model.predict(ds_test_f)


predict_table_gbm_df = predict_table_gbm.as_data_frame()
predictions_gbm = predict_table_gbm_df["predict"].tolist()
predict_table_glm_df = predict_table_glm.as_data_frame()
predictions_glm = predict_table_glm_df["predict"].tolist()
ground_truth = ds_test_pd_df[0]
print('GBM Accuracy = {0}'.format(sum(ground_truth==predictions_gbm)/len(ground_truth)))
print('GLM Accuracy = {0}'.format(sum(ground_truth==predictions_glm)/len(ground_truth)))

print('----End----')

----Testing Model v2----
gbm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
GBM Accuracy = 0.785505
GLM Accuracy = 0.778977
----End----
