In [5]:
# Coursera Machine Learning Submission for H20

# Need to run install.packages("random") and install.packages("h2o")

# H2o library loading and initalization
library(h2o)
h2o.init()

# Artificial Data Creation
set.seed(123)
N <- 1000
data <- data.frame(id= 1:N)

# Name generation Random ; Trying another package
library(random)
string_5 <- as.vector(randomStrings(n=N, len=5, digits=FALSE, upperalpha=TRUE,
                                    loweralpha=FALSE, unique=TRUE, check=TRUE))
data$name = string_5

# Language of the Movie , HollyWood , Nollywood etc :D
languages <-c('English','English', 'Malayalam', 'Yourba','Yourba', 'Japanese', 'Hindi', 'Spanish')
data$language <- as.factor(languages[data$id %% length(languages) +1])

# No of Fights
v = round(rnorm(N, mean=3, sd=1))
v = pmax(v,0)
v = pmin(v,4)
table(v)
data$noOfFights = v

# No of Songs
v = round(rnorm(N, mean=2, sd=1))
v = pmax(v,0)
v = pmin(v,5)
table(v)
data$noOfSongs = v

# Gross collection depends on no of fights, songs and if english or not
v = 200000 +((data$noOfFights * 300) ^ 2) +((data$noOfSongs * 200) ^ 2)
v = v * 100 *(if( c('English') %in% data$language) 0.9 else 0.3)
data$grossCollection =v

# Convert to H2o Data Frame
as.h2o(data, destination_frame = "movies")
movies <- h2o.getFrame("movies")

# Summary
summary(movies)

parts <- h2o.splitFrame(data = movies, 
                        ratios = c(0.8,0.1),
                        destination_frames = c("movies_train","movies_valid","movies_test"),
                        seed=123)

train <-h2o.getFrame("movies_train")
valid <-h2o.getFrame("movies_valid")
test  <-h2o.getFrame("movies_test")

y<- "grossCollection"

x<-setdiff(names(train), c("id",y))
# Purposefully leaving the name column in


m1 <- h2o.gbm(x, y, train,
              model_id = "movies_r",
              validation_frame = valid)

# The model will kick out name as it is bad / constanct column
# Message : 
 # In .h2o.startModelJob(algo, params, h2oRestApiVersion) :
 # Dropping bad and constant columns: [name].

h2o.performance(m1, train = TRUE)
h2o.performance(m1, valid = TRUE)
h2o.performance(m1, test)

# Overfitting by giving more trees and more depths

m2 <- h2o.gbm(x, y, training_frame = train,
              model_id = "movies_overfit_r",
              validation_frame = valid,
              ntrees = 1000,
              max_depth = 10)


h2o.performance(m2, train = TRUE)
h2o.performance(m2, valid = TRUE)
h2o.performance(m2, test)


H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    /var/folders/61/sfyb5b517dd3tj1_cdjsx6_w0000gn/T//RtmpTRTjW2/h2o_tim_wu_started_from_r.out
    /var/folders/61/sfyb5b517dd3tj1_cdjsx6_w0000gn/T//RtmpTRTjW2/h2o_tim_wu_started_from_r.err


Starting H2O JVM and connecting: .. Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         2 seconds 326 milliseconds 
    H2O cluster timezone:       America/New_York 
    H2O data parsing timezone:  UTC 
    H2O cluster version:        3.20.0.8 
    H2O cluster version age:    2 months and 26 days  
    H2O cluster name:           H2O_started_from_R_tim.wu_hvp821 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.56 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       N

v
  0   1   2   3   4 
  5  61 229 402 303 

v
  0   1   2   3   4   5 
 67 228 375 267  56   7 



  id  name  language noOfFights noOfSongs grossCollection
1  1 JRHSU   English          2         1        54000000
2  2 QUJXB Malayalam          3         1        94500000
3  3 CYGAX    Yourba          4         2       162000000
4  4 LYSEQ    Yourba          3         2       105300000
5  5 KEBZX  Japanese          3         0        90900000
6  6 LIJHI     Hindi          4         3       180000000

[1000 rows x 6 columns] 

“Approximated quantiles computed! If you are interested in exact quantiles, please pass the `exact_quantiles=TRUE` parameter.”

 id               name language       noOfFights      noOfSongs      
 Min.   :   1.0        English  :250  Min.   :0.000   Min.   :0.000  
 1st Qu.: 250.8        Yourba   :250  1st Qu.:2.000   1st Qu.:1.000  
 Median : 500.5        Hindi    :125  Median :3.000   Median :2.000  
 Mean   : 500.5        Japanese :125  Mean   :2.937   Mean   :2.038  
 3rd Qu.: 750.2        Malayalam:125  3rd Qu.:4.000   3rd Qu.:3.000  
 Max.   :1000.0        Spanish  :125  Max.   :4.000   Max.   :5.000  
 grossCollection    
 Min.   : 18000000  
 1st Qu.: 82782000  
 Median :105181200  
 Mean   :113215500  
 3rd Qu.:151077601  
 Max.   :237600000  

“Dropping bad and constant columns: [name].
”



H2ORegressionMetrics: gbm
** Reported on training data. **

MSE:  7.742215e+12
RMSE:  2782484
MAE:  787975.1
RMSLE:  0.02901744
Mean Residual Deviance :  7.742215e+12


H2ORegressionMetrics: gbm
** Reported on validation data. **

MSE:  1.979512e+12
RMSE:  1406951
MAE:  666179.5
RMSLE:  0.01749877
Mean Residual Deviance :  1.979512e+12


H2ORegressionMetrics: gbm

MSE:  1.315718e+12
RMSE:  1147048
MAE:  555577.7
RMSLE:  0.0110369
Mean Residual Deviance :  1.315718e+12


“Dropping bad and constant columns: [name].
”



H2ORegressionMetrics: gbm
** Reported on training data. **

MSE:  6.030085e+12
RMSE:  2455623
MAE:  441534
RMSLE:  0.01922631
Mean Residual Deviance :  6.030085e+12


H2ORegressionMetrics: gbm
** Reported on validation data. **

MSE:  7.23701e+12
RMSE:  2690169
MAE:  762959.3
RMSLE:  0.03296876
Mean Residual Deviance :  7.23701e+12


H2ORegressionMetrics: gbm

MSE:  3.400839e+12
RMSE:  1844136
MAE:  346278.2
RMSLE:  0.01163702
Mean Residual Deviance :  3.400839e+12


In [2]:
install.packages("random")

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [3]:
install.packages("h2o")

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
