In [1]:
from pyspark.sql import SparkSession
from pysparkling import *
from pyspark.sql.types import *



## Create from Spark DataFrame

In [2]:
def toDoubleSafe(v):
    try:
        return float(v)
    except:
        return str(v) #if it is not a float type return as a string.
    
adultschema = StructType([
    StructField("age",DoubleType(),True),
    StructField("workclass",StringType(),True),
    StructField("fnlwgt",DoubleType(),True),
    StructField("education",StringType(),True),
    StructField("marital_status",StringType(),True),
    StructField("occupation",StringType(),True),
    StructField("relationship",StringType(),True),
    StructField("race",StringType(),True),
    StructField("sex",StringType(),True),
    StructField("capital_gain",DoubleType(),True),
    StructField("capital_loss",DoubleType(),True),
    StructField("hours_per_week",DoubleType(),True),
    StructField("native_country",StringType(),True),
    StructField("income",StringType(),True)
])

In [4]:
ss = SparkSession.builder.getOrCreate()
hc = H2OContext.getOrCreate()

In [5]:
#load and convert the data
census_raw = ss.sparkContext.textFile("../Data/adult.raw", 4).map(lambda x:  x.split(", "))
census_raw = census_raw.map(lambda row:  [toDoubleSafe(x) for x in row])

dfraw = ss.createDataFrame(census_raw, adultschema)

03-06 19:43:25.398 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_0 stored as values in memory (estimated size 354.2 KiB, free 366.0 MiB)
03-06 19:43:25.584 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 32.0 KiB, free 365.9 MiB)
03-06 19:43:25.586 10.0.0.115:54321      77804  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Added broadcast_0_piece0 in memory on 10.0.0.115:56039 (size: 32.0 KiB, free: 366.3 MiB)
03-06 19:43:25.590 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.SparkContext: Created broadcast 0 from textFile at NativeMethodAccessorImpl.java:0
03-06 19:43:26.030 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.sql.internal.SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
03-06 19:43:26.034 10.0.0.115:54321      7

In [11]:
adult_h2o = hc.asH2OFrame(dfraw,'adult')

03-06 19:47:51.243 10.0.0.115:54321      77804  866381-178  INFO water.default: POST /3/InitializeFrame, parms: {key=adult, columns=["age","workclass","fnlwgt","education","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","income"]}
03-06 19:47:51.297 10.0.0.115:54321      77804    Thread-4  INFO ai.h2o.sparkling.H2OFrame: H2O node http://10.0.0.115:54321/3/InitializeFrame successfully responded for the POST.
03-06 19:47:51.315 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.SparkContext: Starting job: collect at Writer.scala:169
03-06 19:47:51.318 10.0.0.115:54321      77804  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Got job 5 (collect at Writer.scala:169) with 4 output partitions
03-06 19:47:51.318 10.0.0.115:54321      77804  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Final stage: ResultStage 5 (collect at Writer.scala:169)
03-06 19:47:51.318 10.0.0.115:54321      77

In [12]:
adult_h2o

age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K




In [13]:
adult_h2o.summary()

Unnamed: 0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
type,int,enum,int,enum,enum,enum,enum,enum,enum,int,int,int,enum,enum
mins,17.0,,12285.0,,,,,,,0.0,0.0,1.0,,
mean,38.64358543876177,,189664.13459727296,,,,,,,1079.0676262233324,87.50231358257236,40.42238237582413,,
maxs,90.0,,1490400.0,,,,,,,99999.0,4356.0,99.0,,
sigma,13.710509934443552,,105604.02542315725,,,,,,,7452.019057655393,403.00455212435895,12.391444024252296,,
zeros,0,,0,,,,,,,44807,46560,0,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,39.0,State-gov,77516.0,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K


## Create from File

In [15]:
import h2o
adult_h2o= h2o.import_file('../Data/adult.raw')

03-06 19:51:37.101 10.0.0.115:54321      77804  866381-176  INFO water.default: POST /3/ImportFilesMulti, parms: {paths=[/Users/fanli/Desktop/classes/spring_1/697_DISTRIBUTED_DATA_SYSTEMS/msds697_distributed_data_systems_2022/Data/adult.raw]}
03-06 19:51:37.129 10.0.0.115:54321      77804  9866381-83  INFO water.default: POST /3/ParseSetup, parms: {single_quotes=False, source_frames=["nfs://Users/fanli/Desktop/classes/spring_1/697_DISTRIBUTED_DATA_SYSTEMS/msds697_distributed_data_systems_2022/Data/adult.raw"], check_header=0}
03-06 19:51:37.551 10.0.0.115:54321      77804  9866381-83  INFO water.default: ParseSetup heuristic: cloudSize: 1, cores: 8, numCols: 14, maxLineLength: 150, totalSize: 5787978, localParseSize: 5787978, chunkSize: 180875, numChunks: 31, numChunks * cols: 434
03-06 19:51:37.562 10.0.0.115:54321      77804  866381-176  INFO water.default: POST /3/Parse, parms: {number_columns=14, source_frames=["nfs://Users/fanli/Desktop/classes/spring_1/697_DISTRIBUTED_DATA_SYSTEM

In [8]:
import h2o
adult_h2o = h2o.import_file(path='../Data/adult.raw')

03-03 14:48:45.858 10.1.153.45:54321     4474   1421110-70  INFO water.default: POST /3/ImportFilesMulti, parms: {paths=[/Users/fanli/Desktop/classes/spring_1/697_DISTRIBUTED_DATA_SYSTEMS/msds697_distributed_data_systems_2022/Data/adult.raw]}
03-03 14:48:45.995 10.1.153.45:54321     4474   1421110-71  INFO water.default: POST /3/ParseSetup, parms: {single_quotes=False, source_frames=["nfs://Users/fanli/Desktop/classes/spring_1/697_DISTRIBUTED_DATA_SYSTEMS/msds697_distributed_data_systems_2022/Data/adult.raw"], check_header=0}
03-03 14:48:46.557 10.1.153.45:54321     4474   1421110-71  INFO water.default: ParseSetup heuristic: cloudSize: 1, cores: 8, numCols: 14, maxLineLength: 150, totalSize: 5787978, localParseSize: 5787978, chunkSize: 180875, numChunks: 31, numChunks * cols: 434
03-03 14:48:46.637 10.1.153.45:54321     4474   1421110-70  INFO water.default: POST /3/Parse, parms: {number_columns=14, source_frames=["nfs://Users/fanli/Desktop/classes/spring_1/697_DISTRIBUTED_DATA_SYSTEM

In [16]:
adult_h2o.summary()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
type,int,enum,int,enum,enum,enum,enum,enum,enum,int,int,int,enum,enum
mins,17.0,,12285.0,,,,,,,0.0,0.0,1.0,,
mean,38.64358543876172,,189664.13459727276,,,,,,,1079.0676262233324,87.50231358257237,40.422382375824085,,
maxs,90.0,,1490400.0,,,,,,,99999.0,4356.0,99.0,,
sigma,13.710509934443557,,105604.02542315726,,,,,,,7452.019057655394,403.00455212435907,12.391444024252305,,
zeros,0,,0,,,,,,,44807,46560,0,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,39.0,State-gov,77516.0,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K


In [17]:
adult_h2o.set_names(["age", "capital_gain", "capital_loss", "education", "fnlwgt", "hours_per_week", "income", "marital_status", "native_country", "occupation", "race", "relationship", "sex", "workclass"])

03-06 19:56:26.215 10.0.0.115:54321      77804  9866381-66  INFO water.default: POST /4/sessions, parms: {}
03-06 19:56:26.309 10.0.0.115:54321      77804  866381-177  INFO water.default: POST /99/Rapids, parms: {ast=(tmp= py_1_sid_8867 (colnames= adult_raw1.hex [0 1 2 3 4 5 6 7 8 9 10 11 12 13] ['age' 'capital_gain' 'capital_loss' 'education' 'fnlwgt' 'hours_per_week' 'income' 'marital_status' 'native_country' 'occupation' 'race' 'relationship' 'sex' 'workclass'])), session_id=_sid_8867}
03-06 19:56:26.338 10.0.0.115:54321      77804  9866381-66  INFO water.default: GET /3/Frames/py_1_sid_8867, parms: {column_offset=0, full_column_count=-1, row_count=10, row_offset=0, column_count=-1}


age,capital_gain,capital_loss,education,fnlwgt,hours_per_week,income,marital_status,native_country,occupation,race,relationship,sex,workclass
39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K




In [21]:
adult_h2o.show()

age,capital_gain,capital_loss,education,fnlwgt,hours_per_week,income,marital_status,native_country,occupation,race,relationship,sex,workclass
39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Convert as Spark DataFrame

In [22]:
adult_df = hc.asSparkFrame(adult_h2o)

03-06 20:02:53.115 10.0.0.115:54321      77804  866381-514  INFO water.default: GET /3/Frames/py_1_sid_8867/summary, parms: {row_count=0}
03-06 20:02:53.179 10.0.0.115:54321      77804    Thread-4  INFO ai.h2o.sparkling.H2OFrame: H2O node http://10.0.0.115:54321/3/Frames/py_1_sid_8867/summary?row_count=0 successfully responded for the GET.
03-06 20:02:53.205 10.0.0.115:54321      77804    Thread-4  INFO ai.h2o.sparkling.backend.utils.RestApiUtils: H2O node http://10.0.0.115:54321/3/Cloud successfully responded for the GET.
03-06 20:02:53.210 10.0.0.115:54321      77804  866381-514  INFO water.default: GET /3/FrameChunks/py_1_sid_8867, parms: {}
03-06 20:02:53.212 10.0.0.115:54321      77804    Thread-4  INFO ai.h2o.sparkling.H2OFrame: H2O node http://10.0.0.115:54321/3/FrameChunks/py_1_sid_8867 successfully responded for the GET.


In [23]:
adult_df.show()

03-06 20:02:56.617 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator: Code generated in 102.179459 ms
03-06 20:02:56.644 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
03-06 20:02:56.650 10.0.0.115:54321      77804  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Got job 7 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
03-06 20:02:56.650 10.0.0.115:54321      77804  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Final stage: ResultStage 7 (showString at NativeMethodAccessorImpl.java:0)
03-06 20:02:56.650 10.0.0.115:54321      77804  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Parents of final stage: List()
03-06 20:02:56.651 10.0.0.115:54321      77804  event-loop  INFO org.apache.spark.scheduler.DAGScheduler: Missing parents: List()
03-06 20:02:56.653 10.0.0.115:54321      7780

In [24]:
ss.stop()

03-06 20:03:01.278 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.ui.SparkUI: Stopped Spark web UI at http://10.0.0.115:4041
03-06 20:03:01.288 10.0.0.115:54321      77804  agerMaster  INFO org.apache.spark.storage.BlockManagerInfo: Removed broadcast_8_piece0 on 10.0.0.115:56039 in memory (size: 11.2 KiB, free: 366.3 MiB)
03-06 20:03:01.313 10.0.0.115:54321      77804  ent-loop-4  INFO org.apache.spark.MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
03-06 20:03:01.343 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.storage.memory.MemoryStore: MemoryStore cleared
03-06 20:03:01.343 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.storage.BlockManager: BlockManager stopped
03-06 20:03:01.346 10.0.0.115:54321      77804    Thread-4  INFO org.apache.spark.storage.BlockManagerMaster: BlockManagerMaster stopped
03-06 20:03:01.349 10.0.0.115:54321      77804  ent-loop-1  INFO org.apache.spark.scheduler.OutputCommitCoordinato