In [3]:
## House Price: Advanced Regression Technique using H2o

#### Implementing various Advanced regression techniques to predict target on Kaggle's House price problem.


## Import Libraries


import numpy as np    # linear algebra
import pandas as pd    # data processing, CSV file I/O (e.g. pd.read_csv)
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import os, time, sys
import warnings
warnings.filterwarnings("ignore")

## Import the dataset

train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")

train["galaxy"] = train["galaxy"].astype('category')
train["galaxy"] = train["galaxy"].cat.codes
test["galaxy"] = test["galaxy"].astype('category')
test["galaxy"] = test["galaxy"].cat.codes


test_original = test
print(train.shape)
print(train.columns)


## Clean the dataset



# Identify the columns containing NA values in both train and test dataset
train.isnull().sum().sort_values(ascending = False)
test.isnull().sum().sort_values(ascending = False)
# Remove the columns which contain NA with more than 100 rows
print(train.shape)
print(test.shape)
# Identify the columns which contain numeric values
train_numericCol = train.select_dtypes(include=[np.number]).columns.values
print(train_numericCol)
# Fill missing values in the numeric columns.
train.fillna(train.mean(),inplace = True)
test.fillna(test.mean(),inplace = True)
# check if still any NA's are available in the numerical coulumns of train data
train[train_numericCol].isnull().sum().sort_values(ascending = False)


## Initiate the h2o Server


h2o.init(ip="localhost", port=54321)


## Convert data into h2o Frame


train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)
test_original = h2o.H2OFrame(test_original)


# Split the train dataset
train, valid, test = train.split_frame(ratios=[0.7, 0.15], seed=42)

# Seperate the target data and store it into y variable
y = 'y'
Id = test['galaxy']

# remove target and Id column from the dataset and store rest of the columns in X variable
X = list(train.columns)
X.remove(y)
X.remove('galaxy')
X


## H2o Machine Learning models

#We will now perform training of the models using below H2o supervised algorithms


#Gradient Boosting Machine (RF)

#Random Forest (RF)

#Deep Learning (DL)

### 1. Gradient Boosting Machine (GBM)

# Prepare the hyperparameters
gbm_params = {
                'learn_rate': [0.01, 0.1], 
                'max_depth': [4, 5, 7],
                'sample_rate': [0.6, 0.8],               # Row sample rate
                'col_sample_rate': [0.2, 0.5, 0.9]       # Column sample rate per split (from 0.0 to 1.0)
                }



# Prepare the grid object
gbm_grid = H2OGridSearch(model=H2OGradientBoostingEstimator,   # Model to be trained
                          grid_id='gbm_grid1',                  # Grid Search ID
                          hyper_params=gbm_params,              # Dictionary of parameters
                          search_criteria={"strategy": "Cartesian"}   # RandomDiscrete
                          )



# Train the Model
start = time.time() 
gbm_grid.train(x=X,y=y, 
                training_frame=train,
                validation_frame=valid,
                ntrees=100,      # Specify other GBM parameters not in grid
                score_tree_interval=5,     # For early stopping
                stopping_rounds=3,         # For early stopping
                stopping_tolerance=0.0005,
                seed=1)

end = time.time()
(end - start)/60



# Find the Model grid performance 
gbm_gridperf = gbm_grid.get_grid(sort_by='RMSE',decreasing = False)
gbm_gridperf



# Identify the best model generated with least error
best_gbm_model = gbm_gridperf.models[0]
best_gbm_model


### 2. Random Forest Algorithm



# Prepare the hyperparameters
nfolds = 5
rf_params = {
                'max_depth': [3, 4,5],
                'sample_rate': [0.8, 1.0],               # Row sample rate
                'mtries' : [2,4,3]
                }



# Search criteria for parameter space
search_criteria = {'strategy': "RandomDiscrete",
                   "seed": 1,
                   'stopping_metric': "AUTO",
                   'stopping_tolerance': 0.0005
                   }



# Prepare the grid object
rf_grid = H2OGridSearch(model=H2ORandomForestEstimator,   # Model to be trained
                          grid_id='rf_grid',                  # Grid Search ID
                          hyper_params=rf_params,              # Dictionary of parameters
                          search_criteria=search_criteria,   # RandomDiscrete
                          )



# Train the Model
start = time.time() 
rf_grid.train(x=X,y=y, 
                training_frame=train,
                validation_frame=valid,
                ntrees=100,      
                score_each_iteration=True,
                nfolds = nfolds,
                fold_assignment= "Modulo",
                seed=1
                )

end = time.time()
(end - start)/60



# Find the Model performance 
rf_gridperf = rf_grid.get_grid(sort_by='RMSE',decreasing = False)
rf_gridperf



# Identify the best model generated with least error
best_rf_model = rf_gridperf.models[0]
best_rf_model


### 3. Deep Learning Algorithm



activation_opt = ["RectifierWithDropout",
                  "TanhWithDropout"]
#L1 & L2 regularization
l1_opt = [0, 0.00001,
          0.0001,
          0.001,
          0.01,
          0.1]

l2_opt = [0, 0.00001,
          0.0001,
          0.001,
          0.01,
          0.1]



# Create the Hyperparameters
dl_params = {
             'activation': activation_opt,
             "input_dropout_ratio" : [0,0.05, 0.1],  # input layer dropout ratio to improve generalization. Suggested values are 0.1 or 0.2.
             'l1': l1_opt,
             'l2': l2_opt,
             'hidden_dropout_ratios':[[0.1,0.2,0.3], # hidden layer dropout ratio to improve generalization: one value per hidden layer.
                                      [0.1,0.5,0.5],
                                      [0.5,0.5,0.5]]
             }



search_criteria = {
                   'strategy': 'RandomDiscrete',
                   'max_runtime_secs': 1000,
                   'seed':1
                   }



# Prepare the grid object
dl_grid = H2OGridSearch(model=H2ODeepLearningEstimator(
                                                    epochs = 1000,   ## hopefully converges earlier...
                                                    adaptive_rate = True,  # http://cs231n.github.io/neural-networks-3/#sgd
                                                    stopping_metric="AUTO",
                                                    stopping_tolerance=1e-2,    ## stop when misclassification does not improve by >=1% for 2 scoring events
                                                    stopping_rounds=3,
                                                    hidden=[128,128,128],      ## more hidden layers -> more complex interactions
                                                    balance_classes= False,
                                                    standardize = True,  # If enabled, automatically standardize the data (mean 0, variance 1). If disabled, the user must provide properly scaled input data.
                                                    loss = "quantile"  # quantile for regression
                                                    ),
                        grid_id='dl_grid',
                        hyper_params=dl_params,
                        search_criteria=search_criteria)



# Train the Model
start = time.time() 
dl_grid.train(x=X,y=y, 
                training_frame=train,
                validation_frame=valid,
                stopping_rounds=2,
                stopping_tolerance=0.0005,
                seed=1
                )

end = time.time()
(end - start)/60



# Find the Model performance 
dl_gridperf = dl_grid.get_grid(sort_by='RMSE',decreasing = False)
dl_gridperf



# Identify the best model generated with least error
best_dl_model = dl_gridperf.models[0]
best_dl_model


## Compare Model Performances


best_gbm_perf= best_gbm_model.model_performance(test)  # GBM Model
best_rf_perf = best_rf_model.model_performance(test)   # Random Forest Model
best_dl_perf = best_dl_model.model_performance(test)   #deep Learning Model

### Retreive test set AUC
print(best_gbm_perf.gini)
print(best_rf_perf.gini)
print(best_dl_perf.gini)

## Prediction of Model
gbm_pred= best_gbm_model.predict(test_original).as_data_frame()
rf_pred = best_rf_model.predict(test_original).as_data_frame()
dl_pred = best_dl_model.predict(test_original).as_data_frame()



(3865, 80)
Index(['galactic year', 'galaxy', 'existence expectancy index',
       'existence expectancy at birth', 'Gross income per capita',
       'Income Index', 'Expected years of education (galactic years)',
       'Mean years of education (galactic years)',
       'Intergalactic Development Index (IDI)', 'Education Index',
       'Intergalactic Development Index (IDI), Rank',
       'Population using at least basic drinking-water services (%)',
       'Population using at least basic sanitation services (%)',
       'Gross capital formation (% of GGP)', 'Population, total (millions)',
       'Population, urban (%)',
       'Mortality rate, under-five (per 1,000 live births)',
       'Mortality rate, infant (per 1,000 live births)',
       'Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))',
       'Population, ages 15–64 (millions)',
       'Population, ages 65 and older (millions)',
       'Life expectancy at birth, male (galactic years)',
       '

 'y']
Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,3 mins 16 secs
H2O cluster timezone:,Africa/Cairo
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,5 months and 24 days !!!
H2O cluster name:,H2O_from_python_tahahussein_pvpi28
H2O cluster total nodes:,1
H2O cluster free memory:,3.518 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
drf Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Grid Build progress: |███████████████████████████████████████| 100%

ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 0.0005600683323488388
RMSE: 0.023665762872741684
MAE: 0.014006980175767264
RMSLE: 0.020388988080713443
Mean Residual Deviance: 0.0005600683323488388
<bound method MetricsBase.gini of >

ModelMetricsRegression: drf
** Reported on test data. **

MSE: 0.0008698502376937746
RMSE: 0.029493223589390404
MAE: 0.019609488448939203
RMSLE: 0.025779472149274237
Mean Residual Deviance: 0.0008698502376937746
<bound method MetricsBase.gini of >

ModelMetricsReg

In [110]:
sub = pd.DataFrame()
sub['Index'] = gbm_pred.index 
sub['pred'] = gbm_pred
sub.head()


Unnamed: 0,Index,pred
0,0,0.07422
1,1,0.036943
2,2,0.036199
3,3,0.036199
4,4,0.036129


## Optimization part

- Ideally giving 100 to top 500 samples with highest p^2 values should optimize the likely increase.
- However, as the predictions can be faulty, this approach would result with lower Leaderboard Score.

E.g: If the original p^2 value is higher than the predicted p^2, it will increase the error as we are directly giving it 0.

- That's why, I believe its better to spread the risk for the samples in the bordering regions (400< [rank of p^2] <600).
- I assign 100 energy to top 400 samples and 50 energy to the remaining top 200 samples.

In [111]:
index = gbm_pred
pot_inc = -np.log(index+0.01)+3

In [112]:
p2= pot_inc**2

In [113]:
sub['opt_pred'] = 0

In [114]:
sub['eei'] = h2o.as_list(test_original['existence expectancy index'])

In [115]:
sub = pd.DataFrame(sub)

In [116]:
sub.loc[p2.nlargest(400, ['predict']).index, 'opt_pred']=100
sub=sub.sort_values('pred')
sub.iloc[400:600].opt_pred = 50
sub=sub.sort_index()

In [117]:
increase = (sub['opt_pred']*p2)/1000

In [118]:
print(sub.loc[sub.eei < 0.7, 'opt_pred'].sum(), sub['opt_pred'].sum())

6600 50000


In [119]:
sub[['Index', 'pred', 'opt_pred']]

Unnamed: 0,Index,pred,opt_pred
0,0,0.074220,0
1,1,0.036943,100
2,2,0.036199,100
3,3,0.036199,100
4,4,0.036129,100
...,...,...,...
886,886,0.044865,100
887,887,0.044865,100
888,888,0.044865,100
889,889,0.044865,100


In [121]:
sub[['Index', 'pred', 'opt_pred']].to_csv('h2o_submission.csv', index=False)