# Experiment 1

Comparing H2O Random Grid Search with rBayesianOptimization 

(from this blog post https://a-ghorbani.github.io/2016/11/24/data-science-with-h2o#bayesian-optimization)


In [1]:
# Check and install other packages if needed
# list_pkgs <- c("rpart", "rattle", "rpart.plot", "RColorBrewer", "partykit",
#               "caret", "party", "rBayesianOptimization", "readr", "data.table",
#               "reshape2", "pROC", "ggplot2", "h2o")
list_pkgs <- c("rBayesianOptimization", "data.table", "h2o")
new_pkgs <- list_pkgs[!(list_pkgs %in% installed.packages()[, "Package"])]
if (length(new_pkgs) > 0) install.packages(new_pkgs)

# Load All R Packages
# library() is used instead of require(): require() only returns FALSE when a
# package cannot be loaded, which would postpone the failure to the first call
# into that package. library() stops right here with a clear error.
for (pkg in list_pkgs) {
  suppressMessages(library(pkg, character.only = TRUE))
}
rm(list_pkgs, new_pkgs, pkg)

In [2]:
# Start and connect to local H2O cluster
# (nthreads = -1 is H2O's "use all available cores" setting; the cluster
# summary printed by this call reports the allowed core count)
h2o.init(nthreads = -1)

# Disable progress bar so that cell outputs are not cluttered by H2O jobs
h2o.no_progress()


H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    /tmp/RtmpI0axxw/h2o_joe_started_from_r.out
    /tmp/RtmpI0axxw/h2o_joe_started_from_r.err


Starting H2O JVM and connecting: ... Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 seconds 877 milliseconds 
    H2O cluster version:        3.10.2.1 
    H2O cluster version age:    18 days  
    H2O cluster name:           H2O_started_from_R_joe_oue094 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   5.21 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    R Version:                  R version 3.3.2 (2016-10-31) 



In [3]:
# Import the churn data set from its public URL into the H2O cluster;
# h_churn is the handle used for all later steps
h_churn <- h2o.importFile(path = "http://www.dataminingconsultant.com/data/churn.txt")

In [4]:
# Rows x columns of the imported frame (expected: 3333 x 21)
dim(h_churn)

In [5]:
# Preview the first rows of the imported frame
head(h_churn)

State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,⋯,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
KS,128,415,382-4657,no,yes,25,265.1,110,45.07,⋯,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
OH,107,415,371-7191,no,yes,26,161.6,123,27.47,⋯,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
NJ,137,415,358-1921,no,no,0,243.4,114,41.38,⋯,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
OH,84,408,375-9999,yes,no,0,299.4,71,50.9,⋯,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
OK,75,415,330-6626,yes,no,0,166.7,113,28.34,⋯,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.
AL,118,510,391-8027,yes,no,0,223.4,98,37.98,⋯,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False.


In [6]:
# Per-column summary: type, missing/zero/infinite counts, min, max, mean,
# sigma and cardinality
h2o.describe(h_churn)

Label,Type,Missing,Zeros,PosInf,NegInf,Min,Max,Mean,Sigma,Cardinality
State,enum,0,52,0,0,0.0,50.0,,,51.0
Account Length,int,0,0,0,0,1.0,243.0,101.064806480648,39.8221059285957,
Area Code,int,0,0,0,0,408.0,510.0,437.182418241824,42.3712904856066,
Phone,string,0,0,0,0,,,,,
Int'l Plan,enum,0,3010,0,0,0.0,1.0,0.0969096909690969,0.295879145484415,2.0
VMail Plan,enum,0,2411,0,0,0.0,1.0,0.276627662766277,0.447397870380064,2.0
VMail Message,int,0,2411,0,0,0.0,51.0,8.09900990099009,13.6883653720386,
Day Mins,real,0,2,0,0,0.0,350.8,179.775097509751,54.4673892023715,
Day Calls,int,0,2,0,0,0.0,165.0,100.435643564356,20.0690842073009,
Day Charge,real,0,2,0,0,0.0,59.64,30.5623072307231,9.2594345539305,


In [7]:
# Partition the data into three frames: training (ratio 0.5), validation
# (ratio 0.25) and test (the remainder). The fixed seed keeps the partition
# reproducible across runs.
h_split <- h2o.splitFrame(
  h_churn,
  ratios             = c(0.5, 0.25),
  destination_frames = c("h_train", "h_valid", "h_test"),
  seed               = 2016
)

# Unpack the list returned by h2o.splitFrame() in the order requested above
h_train <- h_split[[1]]
h_valid <- h_split[[2]]
h_test  <- h_split[[3]]

In [8]:
# Define target and features
# The response is the "Churn?" column; every other column is used as a predictor.
# NOTE(review): this keeps "Phone", which h2o.describe() reports as a string
# column -- presumably GBM ignores it; confirm or drop it explicitly.
target <- "Churn?"
features <- setdiff(colnames(h_churn), target)
print(features)

 [1] "State"          "Account Length" "Area Code"      "Phone"         
 [5] "Int'l Plan"     "VMail Plan"     "VMail Message"  "Day Mins"      
 [9] "Day Calls"      "Day Charge"     "Eve Mins"       "Eve Calls"     
[13] "Eve Charge"     "Night Mins"     "Night Calls"    "Night Charge"  
[17] "Intl Mins"      "Intl Calls"     "Intl Charge"    "CustServ Calls"


## Experiment Setup

- **Ten** different random seeds
- Performance metrics: **AUC** and **time taken**

### Hyper-parameters Settings

Same as https://a-ghorbani.github.io/2016/11/24/data-science-with-h2o#bayesian-optimization

- max_depth   = c(2L, 8L)
- learn_rate  = c(1e-4, 0.2)
- sample_rate = c(0.4, 1)
- col_sample_rate = c(0.4, 1) 
- balance_classes = c(0L, 1L), i.e. c(FALSE, TRUE)


In [9]:
# Draw ten distinct benchmark seeds from 1..99999, starting from a fixed
# master seed so the same ten seeds are produced on every run
set.seed(1234)
rand_seeds <- sample.int(99999L, size = 10L)
print(rand_seeds)

 [1] 11371 62229 60926 62336 86088 64028   950 23254 66603 51420


## Bayesian Optimization

See https://a-ghorbani.github.io/2016/11/24/data-science-with-h2o#bayesian-optimization


In [10]:
# Optimisation function
#
# Objective passed to BayesianOptimization(): trains one H2O GBM with the
# proposed hyper-parameters and reports its AUC on the validation frame.
#
# Arguments (all forwarded to h2o.gbm()):
#   max_depth, learn_rate, sample_rate, col_sample_rate
#                     numeric hyper-parameter values proposed by the optimiser
#   balance_classes   0/1 flag, converted to FALSE/TRUE before being forwarded
#
# Globals read: features, target, h_train, h_valid and n_seed. n_seed must be
# assigned by the caller before the optimiser is started.
#
# Returns list(Score = <validation AUC>, Pred = 0). Pred is a constant
# placeholder; only Score varies between calls.
h2o_bayes <- function(max_depth, learn_rate, sample_rate, 
                      col_sample_rate, balance_classes) {
  
  bal.cl <- as.logical(balance_classes)
  
  gbm <- h2o.gbm(  
    x                   = features,
    y                   = target,
    training_frame      = h_train,
    validation_frame    = h_valid,
    ntrees              = 900,             # upper bound; early stopping below usually ends sooner
    max_depth           = max_depth,
    learn_rate          = learn_rate,
    sample_rate         = sample_rate,
    col_sample_rate     = col_sample_rate,
    score_tree_interval = 5,
    stopping_rounds     = 2,
    stopping_metric     = "logloss",
    stopping_tolerance  = 0.005,
    balance_classes     = bal.cl,
    seed                = n_seed           # using global variable
  )
  
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  score <- h2o.auc(gbm, valid = TRUE)
  list(Score = score,
       Pred  = 0)
  
}

In [11]:
# Loop for Bayesian Optimization Benchmark
# Runs one complete optimisation per seed in rand_seeds. Each run contributes
# one row to d_eval: the best hyper-parameters found, the best validation AUC
# and the elapsed time.
options(warn = -1) # warnings stay disabled for the rest of the session

# One result slot per seed; the rows are bound once after the loop instead of
# growing a data frame inside it
bayes_rows <- vector("list", length(rand_seeds))

for (i in seq_along(rand_seeds)) {
    
    # Extract Random Seed
    # n_seed is deliberately a global: h2o_bayes() reads it as the GBM seed
    n_seed <- rand_seeds[i]
    set.seed(n_seed)
    
    # Start Timer
    t_start <- proc.time()
    
    # Bayes Opt
    # Same as https://a-ghorbani.github.io/2016/11/24/data-science-with-h2o#bayesian-optimization
    OPT_Res <- BayesianOptimization(
                  h2o_bayes,
                  bounds = list(
                    max_depth   = c(2L, 8L), 
                    learn_rate  = c(1e-4, 0.2),
                    sample_rate = c(0.4, 1), 
                    col_sample_rate = c(0.4, 1), 
                    balance_classes = c(0L, 1L)),
                  init_points = 10,  n_iter = 10,   
                  acq = "ucb", kappa = 2.576, eps = 0.0,
                  verbose = FALSE)
    
    # End Timer
    t_end <- timetaken(t_start)
    
    # Evaluation Results
    # Best_Par is read by name rather than by position, so the row does not
    # depend on the order of the bounds list; [[ ]] also drops the element
    # name so it does not end up as the row name.
    best_par <- OPT_Res$Best_Par
    bayes_rows[[i]] <- data.frame(
                              # Model / Optimization Info
                              algo = "Bayes_Opt",
                              seed = n_seed,
                              max_depth = best_par[["max_depth"]],
                              learn_rate = round(best_par[["learn_rate"]], 6),
                              sample_rate = round(best_par[["sample_rate"]], 6),
                              col_sample_rate = round(best_par[["col_sample_rate"]], 6),
                              balance_classes = as.logical(best_par[["balance_classes"]]),
                              
                              # Evaluation Metrics
                              AUC = round(OPT_Res$Best_Value, 6),
                              timetaken = t_end,
        
                              stringsAsFactors = FALSE      
                             )
    
}

# Stack all per-seed rows into the evaluation table
d_eval <- do.call(rbind, bayes_rows)



 Best Parameters Found: 
Round = 18	max_depth = 4.0000	learn_rate = 0.1145	sample_rate = 0.5288	col_sample_rate = 0.5578	balance_classes = 0.0000	Value = 0.9038 

 Best Parameters Found: 
Round = 8	max_depth = 3.0000	learn_rate = 0.1764	sample_rate = 0.9991	col_sample_rate = 0.6130	balance_classes = 0.0000	Value = 0.8986 

 Best Parameters Found: 
Round = 17	max_depth = 6.0000	learn_rate = 0.0001	sample_rate = 1.0000	col_sample_rate = 0.4867	balance_classes = 0.0000	Value = 0.8989 

 Best Parameters Found: 
Round = 9	max_depth = 2.0000	learn_rate = 0.1336	sample_rate = 0.8845	col_sample_rate = 0.9303	balance_classes = 0.0000	Value = 0.8984 

 Best Parameters Found: 
Round = 16	max_depth = 3.0000	learn_rate = 0.0855	sample_rate = 0.8815	col_sample_rate = 0.7840	balance_classes = 0.0000	Value = 0.8993 

 Best Parameters Found: 
Round = 16	max_depth = 8.0000	learn_rate = 0.0710	sample_rate = 0.8933	col_sample_rate = 0.5262	balance_classes = 0.0000	Value = 0.9015 

 Best Parameters Found:

In [12]:
# Evaluation Results Table (Bayes Opt)
# Reset the row names to the default 1..n sequence before displaying the table
rownames(d_eval) <- NULL
d_eval

algo,seed,max_depth,learn_rate,sample_rate,col_sample_rate,balance_classes,AUC,timetaken
Bayes_Opt,11371,4,0.114541,0.52882,0.557812,False,0.903815,00:02:59
Bayes_Opt,62229,3,0.176366,0.999118,0.612994,False,0.898618,00:02:51
Bayes_Opt,60926,6,0.0001,1.0,0.486682,False,0.898942,00:04:01
Bayes_Opt,62336,2,0.13364,0.884505,0.930309,False,0.898388,00:03:49
Bayes_Opt,86088,3,0.085472,0.881468,0.78402,False,0.899331,00:03:44
Bayes_Opt,64028,8,0.07104,0.893326,0.526155,False,0.901458,00:03:55
Bayes_Opt,950,6,0.176404,0.960376,0.614625,False,0.898064,00:03:15
Bayes_Opt,23254,8,0.2,1.0,0.430649,False,0.901788,00:04:09
Bayes_Opt,66603,4,0.020041,0.980212,0.963159,False,0.897527,00:02:13
Bayes_Opt,51420,7,0.184445,0.497194,0.51316,True,0.900916,00:02:35


## H2O Random Grid Search

In [13]:
# Same parameters search range as above, discretised for the grid
# NOTE: seq(1e-4, 0.2, 0.01) ends at 0.1901, so the upper bound 0.2 itself is
# never proposed.
params_gbm <- list(max_depth = seq(2, 8, 1),
                   learn_rate  = seq(1e-4, 0.2, 0.01),
                   sample_rate = seq(0.4, 1, 0.05),
                   col_sample_rate = seq(0.4, 1, 0.05),
                   balance_classes = c(TRUE, FALSE))


# Loop for Random Grid Search Benchmark
# Runs one random grid search per seed in rand_seeds and appends the best
# model of each search (by validation AUC) to d_eval.
options(warn = -1) # warnings stay disabled for the rest of the session

# One result slot per seed; rows are bound once after the loop
grid_rows <- vector("list", length(rand_seeds))

for (i in seq_along(rand_seeds)) {
    
    # Extract Random Seed
    n_seed <- rand_seeds[i]
    set.seed(n_seed)
    
    # Grid identifier, used both to create the grid and to fetch it again
    grid_id <- paste0("grid_gbm_seed_", n_seed)
    
    # Search Criteria
    # NOTE(review): max_models = 10, whereas each Bayes Opt run evaluates
    # init_points + n_iter = 20 models -- confirm this budget is intended.
    search_criteria <- list(strategy = "RandomDiscrete",
                            max_models = 10,
                            seed = n_seed)
    
    # Start Timer
    t_start <- proc.time()
    
    # H2O Random Grid Search
    grid_gbm <- h2o.grid(
                     
                     # Grid search parameters
                     algorithm        = "gbm",
                     grid_id          = grid_id,
                     hyper_params     = params_gbm,
                     search_criteria  = search_criteria,
                     
                     # Core model parameters
                     x                = features,
                     y                = target,
                     training_frame   = h_train,
                     validation_frame = h_valid,
                     ntrees           = 900,
                     seed             = n_seed,
                     
                     # Early stopping parameters
                     # Same as Bayes Opt Above (stopping_tolerance was 0.05
                     # here but 0.005 in h2o_bayes(); aligned to 0.005 so both
                     # methods stop under the same rule)
                     score_tree_interval = 5,
                     stopping_metric     = "logloss",
                     stopping_tolerance  = 0.005,
                     stopping_rounds     = 2
                     
                     )
    
    # End Timer
    t_end <- timetaken(t_start)
    
    # Sort Grid (best validation AUC first)
    grid_gbm_sorted <- h2o.getGrid(grid_id, sort_by = "AUC", decreasing = TRUE)
    best <- grid_gbm_sorted@summary_table
    
    # Evaluation Results
    # Values are converted explicitly so the stacked columns stay numeric /
    # logical even if the summary table stores them as text.
    grid_rows[[i]] <- data.frame(
                              # Model / Optimization Info
                              algo = "H2O_Random_Grid",
                              seed = n_seed,
                              max_depth = as.numeric(as.character(best$max_depth[1])),
                              learn_rate = as.numeric(as.character(best$learn_rate[1])),
                              sample_rate = as.numeric(as.character(best$sample_rate[1])),
                              col_sample_rate = as.numeric(as.character(best$col_sample_rate[1])),
                              balance_classes = as.logical(as.character(best$balance_classes[1])),
                              
                              # Evaluation Metrics
                              AUC = as.numeric(as.character(best$auc[1])),
                              timetaken = t_end,
        
                              stringsAsFactors = FALSE      
                             )
    
}

# Stack: append all grid-search rows to the existing evaluation table
d_eval <- rbind(d_eval, do.call(rbind, grid_rows))

In [14]:
# Combined evaluation table: Bayes Opt rows followed by H2O Random Grid rows
rownames(d_eval) <- NULL
d_eval

algo,seed,max_depth,learn_rate,sample_rate,col_sample_rate,balance_classes,AUC,timetaken
Bayes_Opt,11371,4,0.114541,0.52882,0.557812,False,0.903815,00:02:59
Bayes_Opt,62229,3,0.176366,0.999118,0.612994,False,0.898618,00:02:51
Bayes_Opt,60926,6,0.0001,1.0,0.486682,False,0.898942,00:04:01
Bayes_Opt,62336,2,0.13364,0.884505,0.930309,False,0.898388,00:03:49
Bayes_Opt,86088,3,0.085472,0.881468,0.78402,False,0.899331,00:03:44
Bayes_Opt,64028,8,0.07104,0.893326,0.526155,False,0.901458,00:03:55
Bayes_Opt,950,6,0.176404,0.960376,0.614625,False,0.898064,00:03:15
Bayes_Opt,23254,8,0.2,1.0,0.430649,False,0.901788,00:04:09
Bayes_Opt,66603,4,0.020041,0.980212,0.963159,False,0.897527,00:02:13
Bayes_Opt,51420,7,0.184445,0.497194,0.51316,True,0.900916,00:02:35
