In [1]:
import pickle
import numpy as np
import os
import pathlib
import pandas as pd
import multiprocessing
import datetime

In [2]:
!pip install catboost --quiet
!pip install plotly --quiet
!pip install ipywidgets --quiet
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/root/.jupyter/nbconfig/notebook.json


In [3]:
DATA_PATH = os.path.join(pathlib.Path.cwd(), 'fetch_data', 'data_final.parquet')

data = pd.read_parquet(DATA_PATH)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.svm import LinearSVR
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge
from catboost import CatBoostRegressor

In [5]:
# Separação Treinamento - Teste

X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice']

print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

index
0       5.332438
1       5.021189
2       5.235528
3       5.387390
4       5.278525
          ...   
2925    5.153815
2926    5.117271
2927    5.120574
2928    5.230449
2929    5.274158
Name: SalePrice, Length: 2901, dtype: float64


In [6]:
# Modelo de Random Forest

rf = RandomForestRegressor(n_estimators=5000, random_state=42)
rf.fit(X_train, y_train)
ypred = rf.predict(X_test)
RMSE_RF = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_rf = 100 * (10**RMSE_RF - 1)

In [7]:
error_percent_rf

13.59259849281369

In [8]:
# Modelo de Extreme Gradient Boosting

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_model.fit(X_train, y_train)
ypred = xgb_model.predict(X_test)
RMSE_XGB = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_xgb = 100 * (10**RMSE_XGB - 1)

In [9]:
error_percent_xgb

14.232087713608621

In [10]:
# Support Vector Regressor Linear

svr = LinearSVR(random_state=42, max_iter=10000)
svr.fit(X_train, y_train)
ypred = svr.predict(X_test)
RMSE_SVR = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_svr = 100 * (10**RMSE_SVR - 1)



In [11]:
error_percent_svr

14.690271180184755

In [12]:
# Linear Regression

lr = LinearRegression()
lr.fit(X_train, y_train)
ypred = lr.predict(X_test)
RMSE_LR = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_lr = 100 * (10**RMSE_LR - 1)

In [13]:
error_percent_lr

14.381012071576006

In [14]:
# Ridge, Lasso, Elastic Net e Bayesian Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)
ypred = ridge.predict(X_test)
RMSE_RIDGE = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_ridge = 100 * (10**RMSE_RIDGE - 1)

lasso = Lasso()
lasso.fit(X_train, y_train)
ypred = lasso.predict(X_test)
RMSE_LASSO = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_lasso = 100 * (10**RMSE_LASSO - 1)

elastic = ElasticNet()
elastic.fit(X_train, y_train)
ypred = elastic.predict(X_test)
RMSE_ELASTIC = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_elastic = 100 * (10**RMSE_ELASTIC - 1)

bayesian = BayesianRidge()
bayesian.fit(X_train, y_train)
ypred = bayesian.predict(X_test)
RMSE_BAYESIAN = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_bayesian = 100 * (10**RMSE_BAYESIAN - 1)

In [15]:
print(error_percent_ridge)
print(error_percent_lasso)
print(error_percent_elastic)
print(error_percent_bayesian)

14.381242114704174
46.74117821569086
46.74117821569086
14.394598072769348


In [16]:
best_params = {
    'iterations': 200000,
}

    #'max_leaves': 8,
    #'depth': 3,
    #'od_wait': 200,
    #'l2_leaf_reg': 3,
    #'model_size_reg': 0.7,
    #'learning_rate': 0.05,
    #'random_seed': 42


time_zero = datetime.datetime.now()

model = CatBoostRegressor(**best_params, verbose=1000, task_type='GPU')
model.fit(X_train, y_train)

time_end = datetime.datetime.now()

total_time = time_end - time_zero

print(total_time)

ypred = model.predict(X_test)
RMSE_CAT = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_cat = 100 * (10**RMSE_CAT - 1)

0:	learn: 0.1683998	total: 7.79ms	remaining: 25m 57s
1000:	learn: 0.0434531	total: 10.9s	remaining: 36m 14s
2000:	learn: 0.0397891	total: 30s	remaining: 49m 24s
3000:	learn: 0.0378808	total: 36.6s	remaining: 40m
4000:	learn: 0.0371654	total: 43.8s	remaining: 35m 44s
5000:	learn: 0.0361241	total: 56.2s	remaining: 36m 29s
6000:	learn: 0.0353293	total: 1m 1s	remaining: 33m
7000:	learn: 0.0345399	total: 1m 8s	remaining: 31m 30s
8000:	learn: 0.0337997	total: 1m 16s	remaining: 30m 34s
9000:	learn: 0.0331752	total: 1m 23s	remaining: 29m 39s
10000:	learn: 0.0325696	total: 1m 35s	remaining: 30m 8s
11000:	learn: 0.0320626	total: 1m 41s	remaining: 28m 56s
12000:	learn: 0.0315907	total: 1m 48s	remaining: 28m 17s
13000:	learn: 0.0311604	total: 1m 54s	remaining: 27m 23s
14000:	learn: 0.0307411	total: 2m 1s	remaining: 26m 54s
15000:	learn: 0.0303116	total: 2m 6s	remaining: 26m 5s
16000:	learn: 0.0300041	total: 2m 12s	remaining: 25m 27s
17000:	learn: 0.0297119	total: 2m 17s	remaining: 24m 37s
18000:	l

In [17]:
# Category Boosting Regressor

time_zero = datetime.datetime.now()

cat = CatBoostRegressor(random_state=42, verbose=1000, iterations=20000)
cat.fit(X_train, y_train)

time_end = datetime.datetime.now()

total_time = time_end - time_zero

print(total_time)

ypred = cat.predict(X_test)
RMSE_CAT_2 = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_cat2 = 100 * (10**RMSE_CAT_2 - 1)

Learning rate set to 0.004094
0:	learn: 0.1734479	total: 12.7ms	remaining: 4m 14s
1000:	learn: 0.0523616	total: 5.03s	remaining: 1m 35s
2000:	learn: 0.0446934	total: 10.6s	remaining: 1m 35s
3000:	learn: 0.0398730	total: 15.4s	remaining: 1m 27s
4000:	learn: 0.0359755	total: 20.9s	remaining: 1m 23s
5000:	learn: 0.0331252	total: 25.7s	remaining: 1m 17s
6000:	learn: 0.0306030	total: 30.6s	remaining: 1m 11s
7000:	learn: 0.0283062	total: 36.2s	remaining: 1m 7s
8000:	learn: 0.0262937	total: 40.9s	remaining: 1m 1s
9000:	learn: 0.0246047	total: 46.5s	remaining: 56.9s
10000:	learn: 0.0231265	total: 51.3s	remaining: 51.3s
11000:	learn: 0.0217722	total: 57s	remaining: 46.7s
12000:	learn: 0.0205492	total: 1m 1s	remaining: 41.2s
13000:	learn: 0.0194221	total: 1m 7s	remaining: 36.3s
14000:	learn: 0.0183998	total: 1m 12s	remaining: 31s
15000:	learn: 0.0175228	total: 1m 17s	remaining: 26s
16000:	learn: 0.0167110	total: 1m 22s	remaining: 20.7s
17000:	learn: 0.0159634	total: 1m 27s	remaining: 15.5s
18000

In [18]:
print(error_percent_cat)
print(error_percent_cat2)

14.694751969191543
12.149510285666487


In [19]:
!pip install tensorflow_decision_forests --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.8/489.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.7/440.7 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
import tensorflow_decision_forests as tfdf

In [29]:
train_dataset = pd.concat([X_train, y_train], axis=1)
test_dataset = pd.concat([X_test, y_test], axis=1)

In [30]:
train_dataset = train_dataset.astype('float64')
test_dataset = test_dataset.astype('float64')

In [31]:
train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(train_dataset, label="SalePrice", task=tfdf.keras.Task.REGRESSION)
test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(test_dataset, label="SalePrice", task=tfdf.keras.Task.REGRESSION)
serving_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(X_test)

In [None]:
rf_tfdf = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, num_trees=5000, max_depth=10, num_candidate_splits=5000, max_num_nodes=1000000, min_examples_to_split=1000, min_examples_in_leaf=100, learning_rate=0.01, split_axis="SPARSE_OBLIQUE", categorical_algorithm="RANDOM", seed=42)
rf_tfdf.compile(metrics=["mse"])

time_zero = datetime.datetime.now()

rf_tfdf.fit(train_dataset, verbose=10)

time_end = datetime.datetime.now()

total_time = time_end - time_zero
print(total_time)

In [None]:
rf_tfdf.summary()

In [None]:
rf_tfdf.evaluate(test_dataset, return_dict=True)

In [None]:
ypred = rf_tfdf.predict(serving_dataset)
RMSE_TFDF = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_tfdf = 100 * (10**RMSE_TFDF - 1)

print(error_percent_tfdf)

In [32]:
tuner = tfdf.tuner.RandomSearch(num_trials=50, use_predefined_hps=True)
tuned_model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, tuner=tuner)

Use /tmp/tmp8scpfyno as temporary training directory


In [None]:
tuned_model.fit(train_dataset, verbose=10)

Reading training dataset...
Training tensor examples:
Features: {'Overall.Qual': <tf.Tensor 'data:0' shape=(None,) dtype=float64>, 'Gr.Liv.Area': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'Ttl.Home.Qual': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'Total.Bsmt.SF': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'Garage.Area': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'X1st.Flr.SF': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'Ttl.Bath': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'Age': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'MS.SubClass': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'Sq.Ft.PerRoom': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>, 'Age.Remod': <tf.Tensor 'data_10:0' shape=(None,) dtype=float64>, 'Lot.Frontage': <tf.Tensor 'data_11:0' shape=(None,) dtype=float64>, 'Exter.Qual_TA': <tf.Tensor 'data_12:0' shape=(None,) dtype=float64>, 'Full.Bath': <tf.Tensor 'data_13:0' shape=(None,) dtype=float64>, 

[INFO 23-10-22 16:30:32.9329 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-10-22 16:30:32.9330 UTC kernel.cc:774] Collect training examples
[INFO 23-10-22 16:30:32.9330 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-10-22 16:30:32.9337 UTC kernel.cc:393] Number of batches: 3
[INFO 23-10-22 16:30:32.9337 UTC kernel.cc:394] Number of examples: 2320
[INFO 23-10-22 16:30:32.9372 UTC kernel.cc:794] Training dataset:
Number of records: 2320
Number of columns: 51

Number of columns by type:
	NUMERICAL: 51 (100%)

Columns:

NUMERICAL: 51 (100%)
	0: "Age" NUMERICAL mean:-0.297229 min:-2.12495 max:0.783774 sd:0.780547
	1: "Age.Remod" NUMERICAL mean:-0.116959 min:-1.64332 max:0.641775 sd:0.651728
	2: "Bsmt.E

In [None]:
evaluation = tuned_model.evaluate(test_dataset, return_dict=True)

for name, value in evaluation.items():
  print(f'{name}: {value:.4f}')

In [None]:
ypred = tuned_model.predict(serving_dataset, verbose=0)

RMSE_TFDF_TUNED = np.sqrt(mean_squared_error(y_test, ypred))
error_percent_tfdf_tuned = 100 * (10**RMSE_TFDF_TUNED - 1)

print(error_percent_tfdf_tuned)