In [1]:
import pandas as pd
import numpy as np
import h2o
import lime
from lime import lime_tabular

from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA  # Principal component
from sklearn.linear_model import LinearRegression
from h2o.automl import H2OAutoML as ml

from functions.datacleaning import MoscowHousing as mh
from functions.distance import get_distance_coordinates, norm_features, PCA_plot

import matplotlib.pylab as plt
import seaborn as sns
# Apply the 'ggplot' matplotlib style to every figure drawn after this point.
plt.style.use('ggplot')

In [2]:
# Connect to the H2O cluster. Called without arguments, so the default local
# instance is used (http://localhost:54321 in the recorded output below).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,24 mins 45 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.35.0.2
H2O_cluster_version_age:,1 month and 5 days
H2O_cluster_name:,H2O_from_python_vanjafalck_wrgfdg
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.810 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
# Training data containing all variables (one-hot encoded, preparation step 4),
# parsed by H2O from CSV.
data = h2o.import_file("../prepared_data/hot_encoded_training_data_step_4.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [4]:
# Test data matching `data` (one-hot encoded, preparation step 4); used for the
# final predictions.
data_test = h2o.import_file("../prepared_data/hot_encoded_testing_data_step_4.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [5]:

# Training data from the "high_pca" variant of the step-4 preparation.
# NOTE(review): presumably restricted to the columns ranked highest by PCA — confirm.
data_pca = h2o.import_file("../prepared_data/hot_encoded_training_data_step_4_high_pca.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [6]:
# Test data from the same "high_pca" variant of the step-4 preparation.
data_pca_test = h2o.import_file("../prepared_data/hot_encoded_testing_data_step_4_high_pca.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [7]:
# Full predictor set.
# Recorded Kaggle score for the best model trained on these columns: 1.14745
#   training frame: data_pca      ("../prepared_data/hot_encoded_training_data_step_4_high_pca.csv")
#   prediction on:  data_pca_test ("../prepared_data/hot_encoded_testing_data_step_4_high_pca.csv")
features = (
    # Plain (non one-hot) columns
    [
        "area_total",
        "rooms",
        "floor",
        "distance",
        "distance_metro",
        "ceiling",
        "district",
        "stories",
    ]
    # One-hot encoded `material` levels
    + [f"material_{level}" for level in ("0.0", "1.0", "2.0", "3.0", "4.0", "5.0", "6.0", "nan")]
    # One-hot encoded `heating` levels
    + [f"heating_{level}" for level in ("0.0", "1.0", "2.0", "3.0", "nan")]
)

# Name of the response column the models are trained to predict.
output = "price"

# Reduced predictor set (original note: "Only PCA scores").
# NOTE(review): presumably the columns that scored highest in the PCA analysis — confirm.
features_light = [
    "rooms",
    "floor",
    "distance_metro",
    "ceiling",
    "district",
    "stories",
    "material_2.0",
    "heating_nan",
]

In [12]:
# Hold out 20% of the training frame. The split is seeded so that a fresh
# "Restart & Run All" produces the same partition; without a seed split_frame
# draws a new random split on every run, which makes the metrics below drift.
train_light, test_light = data.split_frame(ratios=[0.8], seed=1)
# NOTE(review): test_light is not used by any later cell shown in this notebook.

# AutoML on the reduced feature set, capped at 30 models and 300 seconds of
# runtime. No validation/leaderboard frame is passed, so the reported metrics
# come from AutoML's own cross-validation on train_light.
# NOTE(review): because a wall-clock limit is set, the set of trained models can
# still vary between runs/machines even with a fixed seed.
aml_light = ml(max_models=30, max_runtime_secs=300, seed=1)
aml_light.train(x=features_light, y=output, training_frame=train_light)

AutoML progress: |
10:57:17.987: AutoML: XGBoost is not available; skipping it.
10:57:17.988: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
10:57:17.988: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_4_AutoML_3_20211112_105717


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,58.0,58.0,106393.0,10.0,10.0,10.0,37.0,327.0,141.15517




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.2783323000776709
RMSE: 0.5275720804569466
MAE: 0.12272181611100959
RMSLE: 0.1464800326440427
Mean Residual Deviance: 0.2783323000776709

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 0.46866426425913593
RMSE: 0.6845905814858513
MAE: 0.1526455207559901
RMSLE: 0.18338639560578354
Mean Residual Deviance: 0.46866426425913593

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.152033,0.011372,0.146757,0.163199,0.148163,0.164209,0.137836
1,mean_residual_deviance,0.468789,0.416013,0.189421,0.653195,0.219588,1.119875,0.161866
2,mse,0.468789,0.416013,0.189421,0.653195,0.219588,1.119875,0.161866
3,r2,0.556639,0.172107,0.604342,0.468982,0.654175,0.307219,0.748474
4,residual_deviance,0.468789,0.416013,0.189421,0.653195,0.219588,1.119875,0.161866
5,rmse,0.63452,0.287605,0.435226,0.808205,0.468602,1.058242,0.402326
6,rmsle,0.183398,0.005667,0.182498,0.187804,0.178883,0.190511,0.177293



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-11-12 10:57:38,1.602 sec,0.0,0.959711,0.389977,0.921045
1,,2021-11-12 10:57:38,1.643 sec,5.0,0.788052,0.266375,0.621026
2,,2021-11-12 10:57:38,1.678 sec,10.0,0.698565,0.208176,0.487994
3,,2021-11-12 10:57:38,1.713 sec,15.0,0.645659,0.170895,0.416876
4,,2021-11-12 10:57:38,1.749 sec,20.0,0.613138,0.151917,0.375939
5,,2021-11-12 10:57:38,1.780 sec,25.0,0.59256,0.141139,0.351127
6,,2021-11-12 10:57:38,1.813 sec,30.0,0.576763,0.134902,0.332655
7,,2021-11-12 10:57:38,1.845 sec,35.0,0.564277,0.130772,0.318409
8,,2021-11-12 10:57:38,1.874 sec,40.0,0.554387,0.128263,0.307345
9,,2021-11-12 10:57:38,1.904 sec,45.0,0.546068,0.126242,0.29819



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,rooms,21784.601562,1.0,0.363321
1,distance_metro,9547.047852,0.438248,0.159225
2,district,9362.850586,0.429792,0.156153
3,floor,7425.471191,0.340859,0.123841
4,stories,7244.120605,0.332534,0.120817
5,ceiling,2036.950562,0.093504,0.033972
6,material_2.0,1487.108276,0.068264,0.024802
7,heating_nan,1071.453369,0.049184,0.01787




In [13]:
# Leaderboard of all models produced by the AutoML run; head() previews the
# top-ranked rows (ordered by mean_residual_deviance in the recorded output).
lb_light = aml_light.leaderboard
lb_light.head()

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_4_AutoML_3_20211112_105717,0.468664,0.684591,0.468664,0.152646,0.183386
DRF_1_AutoML_3_20211112_105717,0.469268,0.685031,0.469268,0.129127,0.159578
StackedEnsemble_AllModels_3_AutoML_3_20211112_105717,0.472526,0.687405,0.472526,0.136139,0.163681
GBM_grid_1_AutoML_3_20211112_105717_model_5,0.473512,0.688122,0.473512,0.145028,
GBM_grid_1_AutoML_3_20211112_105717_model_1,0.473762,0.688303,0.473762,0.164989,0.195967
StackedEnsemble_BestOfFamily_2_AutoML_3_20211112_105717,0.476272,0.690125,0.476272,0.136852,0.166074
StackedEnsemble_BestOfFamily_3_AutoML_3_20211112_105717,0.478249,0.691555,0.478249,0.137658,0.166516
GBM_grid_1_AutoML_3_20211112_105717_model_8,0.478981,0.692084,0.478981,0.135838,0.165977
GBM_3_AutoML_3_20211112_105717,0.480742,0.693356,0.480742,0.161859,0.19022
GBM_2_AutoML_3_20211112_105717,0.482427,0.69457,0.482427,0.166291,0.195402




In [14]:
# Predict on the test frame with the AutoML object (the recorded progress bar
# shows a GBM doing the scoring), convert to a local data frame and save to CSV.
pred_light = aml_light.predict(data_test)
df_light = pred_light.as_data_frame()
# NOTE(review): to_csv also writes the row index as an unnamed first column, and
# the recorded predictions include negative values, i.e. they look like a scaled
# price rather than a raw one — confirm whether an id column and an inverse
# transform are needed before this file is submitted.
df_light.to_csv("automl_light_prediction.csv")

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [15]:
# Preview the first few predictions.
df_light.head()

Unnamed: 0,predict
0,1.589714
1,-0.325993
2,-0.136178
3,-0.159252
4,-0.173778


In [15]:
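# Disabled experiment: AutoML on the full `features` list using the `data_pca`
# training frame (longer 600 s budget). Kept commented out for reference only.
#train, test = data_pca.split_frame(ratios=[0.8])
#aml = ml(max_models = 30, max_runtime_secs=600, seed=1)
#aml.train(x=features, y=output, training_frame=train)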

In [16]:
#lb = aml.leaderboard
#lb.head()

In [17]:
#preds = aml.predict(test)

In [None]:
#print(preds)

In [None]:
#df = preds.as_data_frame()
#print(df.shape)
#df.head()

In [None]:
#df.to_csv("automl_y.csv")

In [None]:
#lb.head(rows=lb.nrows)