In [1]:
from pysparkling import *
from load_susy_into_df import susy_csv_to_df

In [2]:
path_to_csv = '/home/varsrao/Downloads/SUSY.csv'
chunksize = 100000
num_train_chunks = 10
num_test_chunks = 10

In [3]:
print('----Loading Dataset----')
ds_train_pd_df, ds_test_pd_df, target_col_name, target_col_idx, feature_col_names = susy_csv_to_df(
        path_to_csv, chunksize, num_train_chunks, num_test_chunks)
col_names = [target_col_name]+feature_col_names

----Loading Dataset----


In [4]:
print('----Creating H2O Context----')
hc = H2OContext.getOrCreate(spark)

----Creating H2O Context----
Connecting to H2O server at http://192.168.2.253:54321... successful.


0,1
H2O cluster uptime:,12 secs
H2O cluster timezone:,America/Toronto
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.4
H2O cluster version age:,17 days
H2O cluster name:,sparkling-water-varsrao_local-1551834540125
H2O cluster total nodes:,1
H2O cluster free memory:,754 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4



Sparkling Water Context:
 * H2O name: sparkling-water-varsrao_local-1551834540125
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,192.168.2.253,54321)
  ------------------------

  Open H2O Flow in browser: http://192.168.2.253:54321 (CMD + click in Mac OSX)

    


In [5]:
print('----Creating H2O Frame----')
import h2o
ds_f = h2o.H2OFrame(ds_train_pd_df, column_names=col_names)
ds_test_f = h2o.H2OFrame(ds_test_pd_df, column_names=col_names)
ds_test_f[target_col_name] = ds_test_f[target_col_name].asfactor()

h2o.cluster().timezone = "Etc/UTC"

----Creating H2O Frame----
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
print('----Assembling Data----')
ds_f[target_col_name] = ds_f[target_col_name].asfactor()
ds_f_splits = ds_f.split_frame(ratios=[0.8])
ds_train_f, ds_val_f = ds_f_splits
predictor_columns = ds_train_f.drop(target_col_name).col_names
response_column = target_col_name

----Assembling Data----


In [7]:
print('----Training Model----')
from h2o.estimators.kmeans import H2OKMeansEstimator

kmeans_model = H2OKMeansEstimator(k=2, max_iterations=1000000)

kmeans_model.train(x            = predictor_columns,
            training_frame   = ds_train_f,
            validation_frame = ds_val_f
         )

----Training Model----
kmeans Model Build progress: |████████████████████████████████████████████| 100%


In [8]:
print('----Testing Model----')
predict_table = kmeans_model.predict(ds_test_f)
predict_table_df = predict_table.as_data_frame()
predictions = predict_table_df["predict"].tolist()
ground_truth = ds_test_pd_df[0]
print('KMeans Accuracy = {0}'.format(sum(ground_truth==predictions)/len(ground_truth)))

print('----End----')

----Testing Model----
kmeans prediction progress: |█████████████████████████████████████████████| 100%
KMeans Accuracy = 0.669413
----End----
