# Getting data into H2O

In [1]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.171-b11, mixed mode)
  Starting server from C:\Users\Protech\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Protech\AppData\Local\Temp\tmpyp_27orw
  JVM stdout: C:\Users\Protech\AppData\Local\Temp\tmpyp_27orw\h2o_Protech_started_from_python.out
  JVM stderr: C:\Users\Protech\AppData\Local\Temp\tmpyp_27orw\h2o_Protech_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Asia/Bangkok
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.2
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_Protech_ji3100
H2O cluster total nodes:,1
H2O cluster free memory:,1.755 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [2]:
datasets = "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
data = h2o.import_file(datasets + "iris.csv",
    col_names=[
        "Sepal length", "Sepal width", "Petal length", "Petal width", "Species"
        ],
    col_types=[
        "numeric", "numeric", "numeric", "numeric", "enum"
        ],
    destination_frame="iris.hex"
    )

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
patients = {'height':[188, 157, 175],  
            'age':[29, 33, 65],  
            'risk':['A', 'B', 'B']}
df = h2o.H2OFrame(patients)
df.types

Parse progress: |█████████████████████████████████████████████████████████| 100%


{'height': 'int', 'age': 'int', 'risk': 'enum'}

In [4]:
df =  h2o.H2OFrame.from_python(
    patients,
    column_types=['enum', None, None],
    destination_frame="patients"
)
df.types
df.frame_id

Parse progress: |█████████████████████████████████████████████████████████| 100%


'patients'

In [5]:
import pandas as pd
patients = pd.DataFrame({
  'height':[188, 157, 175.1],
  'age':[29, 33, 65],
  'risk':['A', 'B', 'B']
  })
df = h2o.H2OFrame(patients)
df.types
df.frame_id
df

Parse progress: |█████████████████████████████████████████████████████████| 100%


height,age,risk
188.0,29,A
157.0,33,B
175.1,65,B




In [6]:
patients = pd.DataFrame({
  'height':[188, 157, 175.1],
  'age':[29, 33, 65],
  'risk':['A', 'B', 'B']
  })
df = h2o.H2OFrame.from_python(
  patients,
  column_names=patients.columns.tolist()
  )
df.types
df.frame_id

Parse progress: |█████████████████████████████████████████████████████████| 100%


'Key_Frame__upload_a624ce575e361017eb50d58480a6434e.hex'

# Data Manipulation

In [7]:
import h2o
h2o.init()

datasets = "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
data = h2o.import_file(datasets + "iris_wheader.csv")
data.frame_id  #iris_wheader.hex

data = data[:,1:] #Drop column 0. Keep column 1 onwards.
data.frame_id  #py_2_sid_88fe

data = h2o.assign(data, "iris")
data.frame_id  #iris

h2o.ls()  #iris and iris_wheader.hex, no py_2_sid_88fe
h2o.remove("iris_wheader.hex")
h2o.ls()  #Just lists iris

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,14 secs
H2O cluster timezone:,Asia/Bangkok
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.2
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_Protech_ji3100
H2O cluster total nodes:,1
H2O cluster free memory:,1.743 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%


Unnamed: 0,key
0,Key_Frame__https___raw_githubusercontent_com_D...
1,Key_Frame__upload_9dfb237d13e176e47430c97b9268...
2,Key_Frame__upload_a624ce575e361017eb50d58480a6...
3,Key_Frame__upload_b2605783498df9a1ffcda68a60ef...
4,iris
5,iris.hex
6,patients
7,py_2_sid_b9d7


In [8]:
data.describe()

Rows:150
Cols:4




Unnamed: 0,sepal_wid,petal_len,petal_wid,class
type,real,real,real,enum
mins,2.0,1.0,0.1,
mean,3.053999999999999,3.758666666666667,1.1986666666666665,
maxs,4.4,6.9,2.5,
sigma,0.43359431136217375,1.764420419952262,0.7631607417008414,
zeros,0,0,0,
missing,0,0,0,0
0,3.5,1.4,0.2,Iris-setosa
1,3.0,1.4,0.2,Iris-setosa
2,3.2,1.3,0.2,Iris-setosa


In [9]:
data.dim

[150, 4]

In [10]:
data.nrow

150

In [11]:
data.ncol

4

In [12]:
data["petal_len"] = data["petal_len"] * 1.2

In [13]:
data.describe

sepal_wid,petal_len,petal_wid,class
3.5,1.68,0.2,Iris-setosa
3.0,1.68,0.2,Iris-setosa
3.2,1.56,0.2,Iris-setosa
3.1,1.8,0.2,Iris-setosa
3.6,1.68,0.2,Iris-setosa
3.9,2.04,0.4,Iris-setosa
3.4,1.68,0.3,Iris-setosa
3.4,1.8,0.2,Iris-setosa
2.9,1.68,0.2,Iris-setosa
3.1,1.8,0.1,Iris-setosa


<bound method H2OFrame.describe of >

In [14]:
data["ratio"] = data["petal_wid"] / data["sepal_wid"]
data["petal_len"].sd() #2.117
data["ratio"].cor(data["petal_len"])  #0.956

0.9557785904293422

In [15]:
data.head()

sepal_wid,petal_len,petal_wid,class,ratio
3.5,1.68,0.2,Iris-setosa,0.0571429
3.0,1.68,0.2,Iris-setosa,0.0666667
3.2,1.56,0.2,Iris-setosa,0.0625
3.1,1.8,0.2,Iris-setosa,0.0645161
3.6,1.68,0.2,Iris-setosa,0.0555556
3.9,2.04,0.4,Iris-setosa,0.102564
3.4,1.68,0.3,Iris-setosa,0.0882353
3.4,1.8,0.2,Iris-setosa,0.0588235
2.9,1.68,0.2,Iris-setosa,0.0689655
3.1,1.8,0.1,Iris-setosa,0.0322581




In [16]:
data["is_long"] = (data["petal_len"] > data["petal_len"].mean()[0]).ifelse(1,0)

In [17]:
data.head()

sepal_wid,petal_len,petal_wid,class,ratio,is_long
3.5,1.68,0.2,Iris-setosa,0.0571429,0
3.0,1.68,0.2,Iris-setosa,0.0666667,0
3.2,1.56,0.2,Iris-setosa,0.0625,0
3.1,1.8,0.2,Iris-setosa,0.0645161,0
3.6,1.68,0.2,Iris-setosa,0.0555556,0
3.9,2.04,0.4,Iris-setosa,0.102564,0
3.4,1.68,0.3,Iris-setosa,0.0882353,0
3.4,1.8,0.2,Iris-setosa,0.0588235,0
2.9,1.68,0.2,Iris-setosa,0.0689655,0
3.1,1.8,0.1,Iris-setosa,0.0322581,0




In [18]:
data["species"] = data["class"].ascharacter().gsub("Iris-", "")

In [19]:
data.head()

sepal_wid,petal_len,petal_wid,class,ratio,is_long,species
3.5,1.68,0.2,Iris-setosa,0.0571429,0,setosa
3.0,1.68,0.2,Iris-setosa,0.0666667,0,setosa
3.2,1.56,0.2,Iris-setosa,0.0625,0,setosa
3.1,1.8,0.2,Iris-setosa,0.0645161,0,setosa
3.6,1.68,0.2,Iris-setosa,0.0555556,0,setosa
3.9,2.04,0.4,Iris-setosa,0.102564,0,setosa
3.4,1.68,0.3,Iris-setosa,0.0882353,0,setosa
3.4,1.8,0.2,Iris-setosa,0.0588235,0,setosa
2.9,1.68,0.2,Iris-setosa,0.0689655,0,setosa
3.1,1.8,0.1,Iris-setosa,0.0322581,0,setosa




In [20]:
data.group_by("class").count().mean("petal_len").sum("is_long").frame

class,nrow,mean_petal_len,sum_is_long
Iris-setosa,50,1.7568,0
Iris-versicolor,50,5.112,43
Iris-virginica,50,6.6624,50




In [21]:
data["petal_len"].hist()

TypeError: bar() missing 1 required positional argument: 'x'

**Issue on matplotlib newer version**

There is still no solution
https://stackoverflow.com/questions/52723180/python-h2o-histogram-error-bar-missing-1-required-positional-argument-x

Should use matplotlib version 2.2.4 or lower

In [22]:
# show data more than 10
d = data.as_data_frame()
d.info()  #Describes the pandas DataFrame internals
d.corr(method="spearman").round(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
sepal_wid    150 non-null float64
petal_len    150 non-null float64
petal_wid    150 non-null float64
class        150 non-null object
ratio        150 non-null float64
is_long      150 non-null int64
species      150 non-null object
dtypes: float64(4), int64(1), object(2)
memory usage: 8.3+ KB


Unnamed: 0,sepal_wid,petal_len,petal_wid,ratio,is_long
sepal_wid,1.0,-0.3,-0.28,-0.44,-0.47
petal_len,-0.3,1.0,0.94,0.9,0.84
petal_wid,-0.28,0.94,1.0,0.97,0.84
ratio,-0.44,0.9,0.97,1.0,0.82
is_long,-0.47,0.84,0.84,0.82,1.0


In [23]:
# split data
train, test, valid = data.split_frame([0.6, 0.2])

In [29]:
import h2o

h2o.init()

datasets = "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
data = h2o.import_file(datasets + "iris_wheader.csv")

ratio_frame = data["petal_wid"] / data["sepal_wid"]
ratio_frame.col_names = ["ratio"]
data = data.cbind(ratio_frame)
data = h2o.assign(data, "iris")
ratio_frame.remove() # error

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,11 mins 01 secs
H2O cluster timezone:,Asia/Bangkok
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.2
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_Protech_ji3100
H2O cluster total nodes:,1
H2O cluster free memory:,1.743 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%


AttributeError: 'H2OFrame' object has no attribute 'remove'

In [30]:
prices = h2o.H2OFrame({
  'petal_len':[2, 3, 4, 5],
  'price':[4, 5.5, 8, 10]
  })

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [31]:
data["petal_len"] = data["petal_len"].round()
iris_prices = data.merge(prices)