In [1]:
import pandas as pd
import numpy as np
import h2o
import lime
from lime import lime_tabular

from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA  # Principal component
from sklearn.linear_model import LinearRegression
from h2o.automl import H2OAutoML as ml
from h2o.estimators import H2OXGBoostEstimator

from functions.datacleaning import MoscowHousing as mh
from functions.distance import get_distance_coordinates, norm_features, PCA_plot

import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')

# Using Prepared Data from Category_1

In [2]:
# Importing prepared_category_1 datasets
XTrain = pd.read_csv("../data/train_prepared_category_1.csv")
XTest = pd.read_csv("../data/test_prepared_category_1.csv")

# Make copies to avoid new imports
X_train = XTrain.copy()
X_test = XTest.copy()
y_train_real = XTrain["price"].copy()          # Prices actual values
y_train_zscore = XTrain["price"].copy() 
y_train_zscore = norm_features(y_train_zscore) # Prices as z-scores (normalized)

In [3]:
# Features with nan (not hot encoded) in this dataset (drop or prepare)
features_nan = ["area_kitchen",
                "area_living",
                "bathrooms_shared",
                "bathrooms_private",
                "windows_court",
                "windows_street",
                "balconies",
                "loggias",
                "phones",
                "constructed",
                "parking"]

features_drop = ["street", "address",
                 "latitude", "longitude"]

# Hot-encoded-nans (drop or leave in)
features_nan_hot = ["seller_nan", 
                    "layout_nan",
                    "condition_nan", 
                    "new_nan", 
                    "material_nan", 
                    "garbage_chute_nan", 
                    "heating_nan"]

# Features WITH hot-encoded-nans in dataset Category-1
features = ["area_total",
            "rooms",
            "floor",
            "distance",
            "distance_metro",
            "ceiling", 
            "district", 
            "stories", 
            "material_0.0", "material_1.0", "material_2.0", 
            "material_3.0", "material_4.0", "material_5.0",
            "material_6.0", "material_nan", 
            "heating_0.0", "heating_1.0", "heating_2.0", 
            "heating_3.0", "heating_nan",
            "elevator", "elevator_no"]

output = "price"

In [4]:
X_train = X_train.drop(features_nan, axis=1)
X_test = X_test.drop(features_nan, axis=1)
X_train = X_train.drop(features_drop, axis=1)
X_test = X_test.drop(features_drop, axis=1)

In [5]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 day 0 hours 38 mins
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.35.0.2
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,H2O_from_python_vanjafalck_wrgfdg
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.147 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [6]:
X_train["price"] = norm_features(X_train["price"])
ids = ["id", "building_id"]
X_train = X_train.drop(ids, axis=1)
X_test = X_test.drop(ids, axis=1)

data_train = h2o.H2OFrame(X_train)
data_test = h2o.H2OFrame(X_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


# Data Analysis

In [7]:
#data_train = h2o.import_file("../data/train_prepared_category_1.csv")

In [8]:
#data_test = h2o.import_file("../data/test_prepared_category_1.csv")

In [9]:
is_xgboost_available = H2OXGBoostEstimator.available()

Cannot build an XGBoost model - no backend found.


In [10]:
# Settings for "LightGBM" emuation mode options
tree_method="hist"
grow_policy="lossguide"

In [11]:
train, test = data_train.split_frame(ratios=[0.8])
aml = ml(max_models=30, max_runtime_secs=300, seed=1)
aml.train(x=features, y=output, training_frame=train)

AutoML progress: |
10:58:08.780: AutoML: XGBoost is not available; skipping it.
10:58:08.781: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
10:58:08.781: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_grid_1_AutoML_8_20211113_105808_model_1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,101.0,101.0,423514.0,0.0,16.0,14.118812,1.0,754.0,328.38614




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.11047594737719557
RMSE: 0.3323792222404938
MAE: 0.08890771617834269
RMSLE: 0.10748088725837665
Mean Residual Deviance: 0.11047594737719557

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 0.27412124452843567
RMSE: 0.5235658932058463
MAE: 0.14086714019137958
RMSLE: 0.16204247417686993
Mean Residual Deviance: 0.27412124452843567

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.140935,0.008716,0.146979,0.145158,0.131093,0.149457,0.131989
1,mean_residual_deviance,0.274653,0.207147,0.568928,0.17276,0.111525,0.414332,0.105718
2,mse,0.274653,0.207147,0.568928,0.17276,0.111525,0.414332,0.105718
3,r2,0.726037,0.119821,0.559137,0.786348,0.809082,0.640955,0.834662
4,residual_deviance,0.274653,0.207147,0.568928,0.17276,0.111525,0.414332,0.105718
5,rmse,0.49454,0.193916,0.754273,0.415644,0.333953,0.643687,0.325144
6,rmsle,0.162123,0.002257,0.163824,0.164838,0.159764,0.159994,0.162194



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-11-13 10:59:16,4.798 sec,0.0,0.946249,0.390523,0.895388
1,,2021-11-13 10:59:16,4.862 sec,5.0,0.811537,0.304742,0.658592
2,,2021-11-13 10:59:16,4.937 sec,10.0,0.681264,0.227715,0.46412
3,,2021-11-13 10:59:16,5.018 sec,15.0,0.589104,0.173382,0.347043
4,,2021-11-13 10:59:17,5.079 sec,20.0,0.558139,0.154088,0.311519
5,,2021-11-13 10:59:17,5.139 sec,25.0,0.51805,0.136163,0.268376
6,,2021-11-13 10:59:17,5.210 sec,30.0,0.472047,0.12134,0.222829
7,,2021-11-13 10:59:17,5.272 sec,35.0,0.436293,0.11328,0.190351
8,,2021-11-13 10:59:17,5.333 sec,40.0,0.419265,0.109315,0.175783
9,,2021-11-13 10:59:17,5.391 sec,45.0,0.404455,0.105749,0.163584



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,area_total,34187.1875,1.0,0.50981
1,rooms,7838.583984,0.229284,0.116891
2,distance,5226.694824,0.152885,0.077942
3,distance_metro,4249.724609,0.124308,0.063373
4,stories,3242.270996,0.094839,0.04835
5,district,2766.906494,0.080934,0.041261
6,floor,2447.642334,0.071595,0.0365
7,ceiling,1753.568359,0.051293,0.02615
8,heating_0.0,1211.55603,0.035439,0.018067
9,material_2.0,1048.327759,0.030664,0.015633



See the whole table with table.as_data_frame()




In [12]:
lb = aml.leaderboard
lb.head(30)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_grid_1_AutoML_8_20211113_105808_model_1,0.274121,0.523566,0.274121,0.140867,0.162042
StackedEnsemble_AllModels_4_AutoML_8_20211113_105808,0.279008,0.528213,0.279008,0.153558,
GBM_4_AutoML_8_20211113_105808,0.281849,0.530894,0.281849,0.130321,0.148075
StackedEnsemble_AllModels_3_AutoML_8_20211113_105808,0.284682,0.533556,0.284682,0.155239,
GBM_3_AutoML_8_20211113_105808,0.284867,0.533729,0.284867,0.13507,0.151531
GLM_1_AutoML_8_20211113_105808,0.287093,0.535811,0.287093,0.213121,
StackedEnsemble_BestOfFamily_4_AutoML_8_20211113_105808,0.289597,0.538142,0.289597,0.159896,0.178888
StackedEnsemble_AllModels_1_AutoML_8_20211113_105808,0.292136,0.540496,0.292136,0.160295,0.184263
GBM_2_AutoML_8_20211113_105808,0.293216,0.541494,0.293216,0.13956,0.156072
XRT_1_AutoML_8_20211113_105808,0.293218,0.541496,0.293218,0.130999,0.14635




In [13]:
predictions = aml.predict(test)

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [14]:
print(predictions)

predict
-0.239485
-0.239485
-0.410277
-0.177293
-0.180116
-0.29303
-0.183583
0.0785243
-0.391436
-0.18544





In [15]:
test.describe()

Rows:4580
Cols:45




Unnamed: 0.1,Unnamed: 0,price,area_total,floor,rooms,ceiling,district,stories,distance,distance_metro,seller_0.0,seller_1.0,seller_2.0,seller_3.0,seller_nan,layout_0.0,layout_1.0,layout_2.0,layout_nan,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_nan,new_0.0,new_1.0,new_nan,material_0.0,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,material_nan,garbage_chute_0.0,garbage_chute_1.0,garbage_chute_nan,heating_0.0,heating_1.0,heating_2.0,heating_3.0,heating_nan,elevator,elevator_no
type,int,real,real,int,int,real,int,int,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,enum,enum
mins,5.0,-0.4303663197876889,-1.0985084579715496,1.0,1.0,-0.3025507650198925,0.0,2.0,-1.529550528708958,-0.4543685764945318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
mean,11633.373144104808,0.010325320963273767,0.018441564554375497,8.896506550218348,2.192358078602628,0.009689291441841033,4.634497816593897,17.513537117903958,-0.020835000332402504,-0.01694946732204868,0.0700873362445415,0.21244541484716156,0.07903930131004366,0.262882096069869,0.37554585152838427,0.012008733624454149,0.21419213973799126,0.01943231441048035,0.7543668122270742,0.16026200873362445,0.1980349344978166,0.14192139737991266,0.08777292576419214,0.41200873362445417,0.6563318777292576,0.33056768558951966,0.013100436681222707,0.0927947598253275,0.0,0.4427947598253275,0.24126637554585154,0.051746724890829696,0.008296943231441048,0.0,0.1631004366812227,0.11572052401746726,0.5028384279475983,0.3814410480349345,0.5812227074235807,0.053056768558951965,0.0015283842794759825,0.02096069868995633,0.34323144104803494,,
maxs,23277.0,48.94094976711498,35.90420501572241,95.0,6.0,29.31959740948717,11.0,95.0,7.005125865324459,12.32729110856141,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,
sigma,6708.217668933902,1.1946559279511015,1.1819475499861776,8.063800844434567,1.0784405252642506,0.9074909734629426,3.160210328052831,10.899430679894042,0.9798277119265999,0.9515225882269536,0.2553220222032336,0.4090829984472969,0.26982955193766006,0.44024699641142967,0.4843164042457456,0.1089362887854102,0.41030552615190385,0.1380538333059648,0.43050899136700943,0.3668889308379543,0.39856214447897514,0.34900760680786386,0.2829952744556468,0.4922503867463638,0.4749837931387273,0.47046893475214663,0.11371736341426797,0.2901762863346311,0.0,0.496771016573706,0.4278982227861139,0.22153942639949092,0.0907188011717865,0.0,0.36949762374128137,0.3199244158492168,0.5000465364752565,0.48579347697094394,0.493412634755946,0.22417118476436246,0.03906893384852219,0.14326838249428883,0.47483981383596696,,
zeros,0,0,0,0,0,2187,493,0,0,0,4259,3607,4218,3376,2860,4525,3599,4491,1125,3846,3673,3930,4178,2693,1574,3066,4520,4155,4580,2552,3475,4343,4542,4580,3833,4050,2277,2833,1918,4337,4573,4484,3008,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,36.0,-0.24800910596972667,0.2309303494665568,18.0,3.0,0.0,4.0,25.0,-0.3175427808619569,-0.2329279531740411,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,True,False
1,37.0,-0.24800910596972667,0.2309303494665568,18.0,3.0,0.0,4.0,25.0,-0.3175427808619569,-0.2329279531740411,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,True,False
2,197.0,-0.31259395253025496,-0.6127519706383954,17.0,1.0,0.0,11.0,17.0,2.2602062148591178,1.495375742149924,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,True,False


# Runing PCA on h2o

In [16]:
from h2o.estimators import H2OPrincipalComponentAnalysisEstimator

In [17]:
# Complete training dataset
data_complete = h2o.H2OFrame(X_train)

pca_model = H2OPrincipalComponentAnalysisEstimator(k = 5,
                                                   use_all_factor_levels = True,
                                                   pca_method = "glrm",
                                                   transform = None,
                                                   impute_missing = False)

pca_model.train(training_frame=data_complete, x=features)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
pca Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Model Details
H2OPrincipalComponentAnalysisEstimator :  Principal Components Analysis
Model Key:  PCA_model_python_1636708750492_7


Importance of components: 




Unnamed: 0,Unnamed: 1,pc1,pc2,pc3,pc4,pc5
0,Standard deviation,24.103245,5.476256,4.06914,1.529021,1.168189
1,Proportion of Variance,0.920392,0.04751,0.026232,0.003704,0.002162
2,Cumulative Proportion,0.920392,0.967902,0.994134,0.997838,1.0




ModelMetricsPCA: pca
** Reported on train data. **

MSE: NaN
RMSE: NaN

Scoring history from GLRM: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iterations,step_size,objective
0,,2021-11-13 11:03:14,0.303 sec,0.0,0.666667,6830808.0
1,,2021-11-13 11:03:14,0.353 sec,1.0,0.444444,6830808.0
2,,2021-11-13 11:03:14,0.401 sec,2.0,0.222222,6830808.0
3,,2021-11-13 11:03:14,0.452 sec,3.0,0.074074,6830808.0
4,,2021-11-13 11:03:14,0.503 sec,4.0,0.018519,6830808.0
5,,2021-11-13 11:03:14,0.549 sec,5.0,0.003704,6830808.0
6,,2021-11-13 11:03:14,0.598 sec,6.0,0.003889,3347219.0
7,,2021-11-13 11:03:14,0.642 sec,7.0,0.004083,1482439.0
8,,2021-11-13 11:03:14,0.686 sec,8.0,0.004288,1233693.0
9,,2021-11-13 11:03:14,0.727 sec,9.0,0.004502,1096042.0



See the whole table with table.as_data_frame()




In [18]:
pca_model.summary()


Importance of components: 


Unnamed: 0,Unnamed: 1,pc1,pc2,pc3,pc4,pc5
0,Standard deviation,24.103245,5.476256,4.06914,1.529021,1.168189
1,Proportion of Variance,0.920392,0.04751,0.026232,0.003704,0.002162
2,Cumulative Proportion,0.920392,0.967902,0.994134,0.997838,1.0




In [19]:
pca_model.varimp()

[('Standard deviation',
  24.103245305721835,
  5.476256025298006,
  4.0691401063018136,
  1.529020540417895,
  1.1681894201660319),
 ('Proportion of Variance',
  0.920392022957473,
  0.047510466263842886,
  0.02623173954093144,
  0.00370380781577723,
  0.0021619634219754652),
 ('Cumulative Proportion',
  0.920392022957473,
  0.9679024892213159,
  0.9941342287622473,
  0.9978380365780246,
  1.0)]

In [29]:
pca_ = pca_model.predict(data_complete)

pca prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [30]:
combined_ = pca_.cbind(data_complete[output])

In [31]:
combined_.head()

PC1,PC2,PC3,PC4,PC5,price
9.42643,-2.80388,1.02684,0.819133,-0.51064,-0.311843
30.705,4.35121,-0.196563,0.247376,-0.00852853,-0.248009
19.402,3.24011,1.69425,1.00489,-0.159527,-0.276129
30.705,4.35121,-0.196563,0.247376,-0.00852853,-0.248009
17.0344,-1.10883,0.912159,0.511622,-0.547518,-0.183424
8.95302,-3.67202,0.837698,0.797245,-0.865759,-0.343655
24.3984,5.42719,8.36584,-3.03254,0.759123,-0.312594
22.8006,6.02333,1.30775,-0.954746,-1.02733,-0.334439
28.3444,9.08375,-0.768326,0.847478,0.435624,-0.282201
21.1311,-2.99037,-1.29395,0.102361,-0.417961,-0.238701




In [32]:
# Keep only the three PCs that explain about 99%
pca_training_df = combined_.drop(["PC4", "PC5"])

# Run model fitting on the PCA dataset

In [34]:
data_train = pca_training_df

train2, test2 = data_train.split_frame(ratios=[0.8])
aml2 = ml(max_models=30, max_runtime_secs=300, seed=1)
aml2.train(y=output, training_frame=train2)

AutoML progress: |
11:43:56.556: AutoML: XGBoost is not available; skipping it.
11:43:56.557: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
11:43:56.557: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_4_AutoML_9_20211113_114356

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.07014616972856716
RMSE: 0.26485122187478605
MAE: 0.13474425150884298
RMSLE: 0.1647537138984076
R^2: 0.9459433243383639
Mean Residual Deviance: 0.07014616972856716
Null degrees of freedom: 10047
Residual degrees of freedom: 10043
Null deviance: 13040.722789617392
Residual deviance: 704.8287134326429
AIC: 1827.703656824978

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **




In [35]:
lb2 = aml.leaderboard
lb2.head(30)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_grid_1_AutoML_8_20211113_105808_model_1,0.274121,0.523566,0.274121,0.140867,0.162042
StackedEnsemble_AllModels_4_AutoML_8_20211113_105808,0.279008,0.528213,0.279008,0.153558,
GBM_4_AutoML_8_20211113_105808,0.281849,0.530894,0.281849,0.130321,0.148075
StackedEnsemble_AllModels_3_AutoML_8_20211113_105808,0.284682,0.533556,0.284682,0.155239,
GBM_3_AutoML_8_20211113_105808,0.284867,0.533729,0.284867,0.13507,0.151531
GLM_1_AutoML_8_20211113_105808,0.287093,0.535811,0.287093,0.213121,
StackedEnsemble_BestOfFamily_4_AutoML_8_20211113_105808,0.289597,0.538142,0.289597,0.159896,0.178888
StackedEnsemble_AllModels_1_AutoML_8_20211113_105808,0.292136,0.540496,0.292136,0.160295,0.184263
GBM_2_AutoML_8_20211113_105808,0.293216,0.541494,0.293216,0.13956,0.156072
XRT_1_AutoML_8_20211113_105808,0.293218,0.541496,0.293218,0.130999,0.14635




In [37]:
predictions2 = aml2.predict(test2)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


# Prepare the test dataset for predictions

In [102]:
validate = h2o.H2OFrame(X_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [103]:
validate.head()

Unnamed: 0,area_total,floor,rooms,ceiling,district,stories,distance,distance_metro,seller_0.0,seller_1.0,seller_2.0,seller_3.0,seller_nan,layout_0.0,layout_1.0,layout_2.0,layout_nan,condition_0.0,condition_1.0,condition_2.0,condition_3.0,condition_nan,new_0.0,new_1.0,new_nan,material_0.0,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,material_nan,garbage_chute_0.0,garbage_chute_1.0,garbage_chute_nan,heating_0.0,heating_1.0,heating_2.0,heating_3.0,heating_nan,elevator,elevator_no
0,-0.0664348,2,1,0.0,3,20,-0.304959,-0.397837,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,True,False
12,-0.375168,4,1,-0.0456097,3,15,-0.59529,-0.354351,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,True,False
15,-0.63422,10,1,-0.0676013,2,9,0.227137,0.386589,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,True,False
76,-0.252739,5,2,0.0,2,17,-0.55682,-0.355316,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,False,True
78,-0.730034,14,1,-0.061318,0,17,-1.35323,-0.299657,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,True,False
79,-0.623574,15,1,0.0,5,17,-0.979164,-0.276416,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,True,False
80,-0.250965,3,3,-0.0676013,5,9,-0.979164,-0.276416,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,True,False
128,-0.268708,21,2,-0.0581763,5,25,-0.979164,-0.276416,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,True,False
129,-0.659061,17,1,0.0,5,17,-0.979164,-0.276416,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,True,False
131,-0.920065,11,1,0.0,5,15,-0.979164,-0.276416,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,True,False




In [104]:
validate_pca = pca_model.predict(validate)

pca prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [105]:
# Use this as input in the model that scores high
# with the training datas pca_model
validate_pca = validate_pca.drop(["PC4", "PC5"])
validate_pca.head()

PC1,PC2,PC3
18.7451,-7.33955,-1.96603
15.4043,-3.49746,-0.4865
12.8738,4.47813,0.941619
17.1447,-3.13131,-1.6956
21.2289,5.09225,-2.77524
22.5039,4.94466,2.07239
10.3042,-2.38735,3.11124
32.165,6.85093,0.810653
23.4269,6.70417,2.29465
18.9437,2.25944,2.13228




In [106]:
validate_pca.head()

PC1,PC2,PC3
18.7451,-7.33955,-1.96603
15.4043,-3.49746,-0.4865
12.8738,4.47813,0.941619
17.1447,-3.13131,-1.6956
21.2289,5.09225,-2.77524
22.5039,4.94466,2.07239
10.3042,-2.38735,3.11124
32.165,6.85093,0.810653
23.4269,6.70417,2.29465
18.9437,2.25944,2.13228




In [107]:
prediction_final = aml2.predict(validate_pca)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [108]:
# Revert from z-scores to actual price
mean_test = XTrain["price"].mean()
sd_test = XTrain["price"].std()
# Just be sure actual prices are here
XTrain["price"].head()

0     7139520.0
1    10500000.0
2     9019650.0
3    10500000.0
4    13900000.0
Name: price, dtype: float64

In [109]:
df = mean_test + sd_test * prediction_final.as_data_frame()
df.head()

Unnamed: 0,predict
0,16675680.0
1,10823080.0
2,20929090.0
3,13064420.0
4,10134250.0


In [110]:
# Write to file

df["id"] = XTest["id"]
df["price_prediction"] = df.iloc[:,0]
# Skip first column
df = df.iloc[:,1:]

pd.DataFrame(df).to_csv("pca_prediction.csv", index=False)

In [111]:
df.head()

"""
Kaggle score: 0.87591 (Step 6, category_1)

0 	23285 	1.667568e+07
1 	23286 	1.082308e+07
2 	23287 	2.092909e+07
3 	23288 	1.306442e+07
4 	23289 	1.013425e+07
"""

Unnamed: 0,id,price_prediction
0,23285,16675680.0
1,23286,10823080.0
2,23287,20929090.0
3,23288,13064420.0
4,23289,10134250.0


# Analysis of dataset category_2

In [None]:
# Importing prepared_category_2 datasets
# New one-hot-encoding of balconies, loggias, 

bathrooms_shared, bathrooms_private
XTrain2 = pd.read_csv("../data/train_prepared_category_2.csv")
XTest2 = pd.read_csv("../data/test_prepared_category_2.csv")

# Make copies to avoid new imports
X_train = XTrain2.copy()
X_test = XTest2.copy()
y_train_real = XTrain2["price"].copy()          # Prices actual values
y_train_zscore = XTrain2["price"].copy() 
y_train_zscore = norm_features(y_train_zscore) # Prices as z-scores (normalized)