In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[381]:


# Core dependencies: the H2O client and matplotlib.
import h2o
# NOTE(review): `plt` is not referenced directly anywhere else in the visible code;
# presumably it is kept for the commented-out scoring-history plots — confirm.
import matplotlib.pyplot as plt
# Jupyter line magic: select the inline matplotlib backend.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[382]:


# Step 1: start H2O, load the cacao data and split it

# Connect to the local H2O cluster (a local server is started when none is running).
h2o.init()


# In[383]:


# Import the cacao dataset from its CSV location.
cacao_url = "http://coursera.h2o.ai/cacao.882.csv"
data = h2o.import_file(path=cacao_url)


# In[384]:


# Preview the first five rows of the dataset.
data.head(rows=5)


# In[385]:


# Three-way split into train / validation / test using ratios 0.8 and 0.1
# (the remainder goes to the third frame). The seed is fixed for the split.
split_ratios = [0.8, 0.1]
train, valid, test = data.split_frame(ratios=split_ratios, seed=134)


# In[386]:


# Row counts of the train, validation and test frames, comma-separated.
split_sizes = tuple(frame.nrows for frame in (train, valid, test))
print("%d,%d,%d" % split_sizes)


# In[387]:


# Step 2: choose the response and the predictor columns
y = "Maker Location" # response variable Y
ignoreFields = [ "REF" ]
# Independent variables X: every column except the ignored ones and the response
# itself. The response is handed to train() separately as `y`, so it does not
# belong in the predictor list.
xAll = [i for i in train.names if i not in ignoreFields and i != y]


# In[388]:


from h2o.estimators.deeplearning import H2ODeepLearningEstimator


# In[389]:


# Step 3: baseline deep learning model
# Only the seed and the reproducibility flag are set explicitly; no other
# hyper-parameter is passed to the estimator.
baseline_params = dict(seed=134, reproducible=True)
base_model = H2ODeepLearningEstimator(**baseline_params)
get_ipython().run_line_magic('time', 'base_model.train(xAll, y, train, validation_frame = valid)')
# Timing recorded from an earlier run:
#   CPU times: user 280 ms, sys: 60.1 ms, total: 340 ms
#   Wall time: 23.7 s


# In[390]:


# Display the trained baseline model (performance on the training set).
base_model


# In[391]:


# Score the baseline model on the test frame.
base_model.model_performance(test_data=test)


# In[392]:


# Uncomment to plot the baseline model's scoring history (training and validation sets):
#base_model.plot()


# In[393]:


# After inspecting the variable importances, some columns are excluded from the
# predictors in an attempt to improve model performance.
ignoreFields2 = [ "REF" , "Bean Type", "Review Date", "Origin"]
# The response `y` is passed to train() separately, so it is kept out of the
# predictor list together with the ignored columns.
x2 = [i for i in train.names if i not in ignoreFields2 and i != y]


# In[394]:


# Step 4: tuned deep learning model
# epochs is set to 50 and early stopping is configured on logloss
# (stopping_rounds = 4, stopping_tolerance = 0); the seed and the
# reproducible flag match the baseline model.
tuned_params = dict(
    epochs=50,
    stopping_rounds=4,
    stopping_tolerance=0,
    stopping_metric="logloss",
    seed=134,
    reproducible=True,
    # hidden=[400, 400],
)
tuned_model = H2ODeepLearningEstimator(**tuned_params)

get_ipython().run_line_magic('time', 'tuned_model.train(x2, y, train, validation_frame = valid)')
# Timing recorded from an earlier run:
#   CPU times: user 213 ms, sys: 44.6 ms, total: 257 ms
#   Wall time: 38.7 s


# In[395]:


# Uncomment to plot the tuned model's scoring history (training and validation sets):
#tuned_model.plot()


# In[396]:


# Display the trained tuned model (performance on the training set).
tuned_model


# In[397]:


# Score the tuned model on the test frame.
tuned_model.model_performance(test_data=test)


# In[398]:


both_models = [base_model, tuned_model]
comparison_fmt = " baseline_model: %.4f -> tuned_model: %.4f"

# logloss() and mse() are called on the models themselves with no arguments.
# NOTE(review): presumably these are the training-set metrics — confirm against the H2O API.
# First printed line: logloss, baseline vs. tuned.
loglosses = [model.logloss() for model in both_models]
print(comparison_fmt % tuple(loglosses))

# Second printed line: mse, baseline vs. tuned.
mse = [model.mse() for model in both_models]
print(comparison_fmt % tuple(mse))

# Both logloss and mse are markedly lower for the tuned model.
# Output recorded from an earlier run:
#baseline_model: 0.0656 -> tuned_model: 0.0043
#baseline_model: 0.0160 -> tuned_model: 0.0005


# In[399]:


# Score both models on the test frame and compare them side by side.
test1_perf = base_model.model_performance(test_data=test)
test2_perf = tuned_model.model_performance(test_data=test)

test_fmt = " baseline_model: %.4f -> tuned_model: %.4f"

# First printed line: test-set logloss, baseline vs. tuned.
test_logloss_pair = (test1_perf.logloss(), test2_perf.logloss())
print(test_fmt % test_logloss_pair)

# Second printed line: test-set mse, baseline vs. tuned.
test_mse_pair = (test1_perf.mse(), test2_perf.mse())
print(test_fmt % test_mse_pair)

# On the test set, logloss and mse are also markedly lower for the tuned model.
# Output recorded from an earlier run:
#baseline_model: 0.6603 -> tuned_model: 0.3266
#baseline_model: 0.1544 -> tuned_model: 0.0774


# In[400]:


# Step 5: save the models
#model1_path = h2o.save_model(model=base_model, path="/Users/mike/Downloads/mymodel1", force=True)
#model2_path = h2o.save_model(model=tuned_model, path="/Users/mike/Downloads/mymodel2", force=True)


# In[401]:


# Step 6: shut down the H2O cluster used by this session
cluster = h2o.cluster()
cluster.shutdown()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_102"; Java(TM) SE Runtime Environment (build 1.8.0_102-b14); Java HotSpot(TM) 64-Bit Server VM (build 25.102-b14, mixed mode)
  Starting server from /Users/tim.wu/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/61/sfyb5b517dd3tj1_cdjsx6_w0000gn/T/tmp56enfyvw
  JVM stdout: /var/folders/61/sfyb5b517dd3tj1_cdjsx6_w0000gn/T/tmp56enfyvw/h2o_tim_wu_started_from_python.out
  JVM stderr: /var/folders/61/sfyb5b517dd3tj1_cdjsx6_w0000gn/T/tmp56enfyvw/h2o_tim_wu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.3
H2O cluster version age:,5 days
H2O cluster name:,H2O_from_python_tim_wu_hnpm9o
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


Parse progress: |█████████████████████████████████████████████████████████| 100%
1431,189,175
deeplearning Model Build progress: |██████████████████████████████████████| 100%
CPU times: user 270 ms, sys: 165 ms, total: 436 ms
Wall time: 32.9 s
deeplearning Model Build progress: |██████████████████████████████████████| 100%
CPU times: user 286 ms, sys: 153 ms, total: 439 ms
Wall time: 57.5 s
 baseline_model: 0.0656 -> tuned_model: 0.0043
 baseline_model: 0.0160 -> tuned_model: 0.0005
 baseline_model: 0.6607 -> tuned_model: 0.3266
 baseline_model: 0.1545 -> tuned_model: 0.0774
H2O session _sid_859a closed.
