In [1]:
import pandas as pd
# Load the dataset prepared for modelling; the path is relative to this notebook.
CSV_PATH = '../manipulacao-de-dados/dados_tratados_ml.csv'
dados = pd.read_csv(CSV_PATH)

In [2]:
# Preview the first rows to sanity-check the loaded columns.
dados.head()

Unnamed: 0,aluguel,zona,area,quarto
0,21000.0,sul,265.0,3.0
1,2000.0,sul,45.0,1.0
2,1499.0,sul,45.0,1.0
3,4500.0,sul,112.0,3.0
4,4500.0,sul,125.0,3.0


In [3]:
# Show column dtypes and non-null counts for the loaded frame.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132 entries, 0 to 1131
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   aluguel  1132 non-null   float64
 1   zona     1132 non-null   object 
 2   area     1132 non-null   float64
 3   quarto   1132 non-null   float64
dtypes: float64(3), object(1)
memory usage: 35.5+ KB


In [4]:
# Feature matrix: every column except the `aluguel` target.
X = dados.drop('aluguel', axis='columns')

In [5]:
# Count missing values per feature column.
X.isna().sum()

zona      0
area      0
quarto    0
dtype: int64

In [6]:
# One-hot encode the categorical column(s) of X. The dtype is pinned to int so
# the indicator columns are 0/1 regardless of the installed pandas version
# (the default indicator dtype differs between pandas releases).
X = pd.get_dummies(X, dtype=int)

In [7]:
# Inspect the encoded feature matrix.
X.head()

Unnamed: 0,area,quarto,zona_leste,zona_norte,zona_oeste,zona_sul
0,265.0,3.0,0,0,0,1
1,45.0,1.0,0,0,0,1
2,45.0,1.0,0,0,0,1
3,112.0,3.0,0,0,0,1
4,125.0,3.0,0,0,0,1


In [8]:
# Regression target: the `aluguel` column.
y = dados['aluguel']

In [9]:
# Count missing values in the target.
y.isna().sum()

0

In [10]:
from sklearn.model_selection import train_test_split

SEED = 42

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)

# Modelo "Baseline" (Ponto de Partida)

In [11]:
from sklearn.linear_model import LinearRegression

# First reference model: a plain linear regression fitted on the training
# split; the cell displays its score on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

-0.003809830571262829

In [12]:
from sklearn.dummy import DummyRegressor

# Naive reference using the 'mean' strategy; any useful model should beat
# the score this cell displays for the held-out split.
reg = DummyRegressor(strategy='mean').fit(X_train, y_train)
reg.score(X_test, y_test)

-0.005107788952471326

# Fazer triagem entre diferentes estimadores

In [17]:
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LassoLars, HuberRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [18]:
# Candidate regressors for the screening loop. Every estimator that draws
# random numbers is seeded with SEED so repeated runs report the same scores.
reg_list = [
    RidgeCV(),
    LGBMRegressor(random_state=SEED),
    XGBRegressor(objective='reg:squarederror', random_state=SEED),
    SVR(),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=SEED),
    AdaBoostRegressor(random_state=SEED),
    GradientBoostingRegressor(random_state=SEED),
    MLPRegressor(random_state=SEED),
]

In [21]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Screening pass: fit each candidate on the training split, then report its
# score on the training data, under cross-validation on the training data,
# and on the held-out split.
for reg in reg_list:
    print(f'Treinando Modelo {reg.__class__.__name__}')
    reg.fit(X_train, y_train)

    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)

    # Emit the whole per-model report in one call, one entry per line.
    report = (
        f"R2 Score Train: {train_score}",
        f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}",
        f"R2 Score Test: {test_score}",
        '='*80,
    )
    print(*report, sep='\n')

Treinando Modelo RidgeCV
R2 Score Train: 0.38476237575777206
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.0037795041016013275
Treinando Modelo LGBMRegressor
R2 Score Train: 0.5384384436386846
R2 Score Valid: 0.33 +- 0.09
R2 Score Test: -0.0024979553447663783
Treinando Modelo XGBRegressor
R2 Score Train: 0.8486142039346978
R2 Score Valid: 0.05 +- 0.21
R2 Score Test: -0.002254966901589217
Treinando Modelo SVR
R2 Score Train: -0.037486119821007025
R2 Score Valid: -0.05 +- 0.03
R2 Score Test: -0.012193798060607275
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.46407170409611276
R2 Score Valid: 0.22 +- 0.06
R2 Score Test: -0.007070770505739121
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.7808295006433801
R2 Score Valid: 0.23 +- 0.13
R2 Score Test: -0.000989063316143124
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.48637312651872744
R2 Score Valid: 0.22 +- 0.06
R2 Score Test: 0.0007393841745568785
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.68385



R2 Score Train: 0.20629801036097795
R2 Score Valid: 0.19 +- 0.09
R2 Score Test: -0.008406292465828358




In [22]:
# Bonus: try every regressor that scikit-learn exposes
from sklearn.utils import all_estimators

estimators = all_estimators(type_filter='regressor')

# Column-oriented results table: one list per column, filled by the
# screening loop further down.
relatorio = {
    coluna: []
    for coluna in ('nome', 'train_score', 'cv_scores_mean', 'test_score', 'estimador')
}

# Estimator names the screening loop skips.
# NOTE(review): presumably these need constructor arguments or a different
# target shape, or are unsuitable for this data - confirm before pruning.
ignore_list = [
    'IsotonicRegression',
    'MultiOutputRegressor',
    'ElasticNet',
    'MultiTaskElasticNet',
    'MultiTaskElasticNetCV',
    'MultiTaskLasso',
    'MultiTaskLassoCV',
    'RadiusNeighborsRegressor',
    'RegressorChain',
    'StackingRegressor',
    'VotingRegressor',
]

In [23]:
# Add the LightGBM and XGBoost regressors to the (name, class) list.
# Only names not already present are appended, so re-running this cell does
# not create duplicate entries (and duplicate rows in the report).
extra_estimators = [('LGBMRegressor', LGBMRegressor),
                    ('XGBRegressor', XGBRegressor)]
known_names = {name for name, _ in estimators}
estimators.extend(
    (name, cls) for name, cls in extra_estimators if name not in known_names
)

In [24]:
# Start from empty result columns on every run so that re-executing this cell
# neither duplicates rows nor fails when `relatorio` was already converted to
# a DataFrame by a later cell.
relatorio = {
    'nome': [],
    'train_score': [],
    'cv_scores_mean': [],
    'test_score': [],
    'estimador': [],
}

# Fit every regressor with its default parameters, print its scores and
# record them (together with the fitted estimator) in `relatorio`.
for name, RegressorClass in estimators:
    if name in ignore_list:
        continue

    print(f'Treinando Modelo {name}')
    reg = RegressorClass()
    reg.fit(X_train, y_train)

    train_score = reg.score(X_train, y_train)
    cv_scores = cross_val_score(reg, X_train, y_train)
    test_score = reg.score(X_test, y_test)
    cv_mean = np.mean(cv_scores)  # computed once; used for printing and the report

    print(f"R2 Score Train: {train_score}")
    print(f"R2 Score Valid: {cv_mean:.2f} +- {np.std(cv_scores):.2f}")
    print(f"R2 Score Test: {test_score}")
    print('='*80)

    relatorio['nome'].append(name)
    relatorio['train_score'].append(train_score)
    relatorio['cv_scores_mean'].append(cv_mean)
    relatorio['test_score'].append(test_score)
    relatorio['estimador'].append(reg)

Treinando Modelo ARDRegression
R2 Score Train: 0.3837395568162053
R2 Score Valid: 0.36 +- 0.08
R2 Score Test: -0.003118757051941312
Treinando Modelo AdaBoostRegressor
R2 Score Train: 0.4917543236555235
R2 Score Valid: 0.24 +- 0.07
R2 Score Test: 0.002487555314927281
Treinando Modelo BaggingRegressor
R2 Score Train: 0.7800794609511247
R2 Score Valid: 0.19 +- 0.11
R2 Score Test: -0.0016988409386848868
Treinando Modelo BayesianRidge
R2 Score Train: 0.38463987462426397
R2 Score Valid: 0.37 +- 0.07
R2 Score Test: -0.003598954255709108
Treinando Modelo CCA
R2 Score Train: 0.21729569023085238
R2 Score Valid: 0.17 +- 0.16
R2 Score Test: -0.017214914511364032
Treinando Modelo DecisionTreeRegressor
R2 Score Train: 0.8560440706474008




R2 Score Valid: -0.32 +- 0.43
R2 Score Test: -0.003239308801244345
Treinando Modelo DummyRegressor
R2 Score Train: 0.0
R2 Score Valid: -0.01 +- 0.01
R2 Score Test: -0.005107788952471326
Treinando Modelo ElasticNetCV
R2 Score Train: 0.2768701729583485
R2 Score Valid: 0.27 +- 0.05
R2 Score Test: -0.00019238746033667375
Treinando Modelo ExtraTreeRegressor
R2 Score Train: 0.8560440706474008
R2 Score Valid: -0.03 +- 0.22
R2 Score Test: -0.0040777258132875804
Treinando Modelo ExtraTreesRegressor
R2 Score Train: 0.8560440571248296
R2 Score Valid: 0.11 +- 0.19
R2 Score Test: -0.002086307830163836
Treinando Modelo GammaRegressor
R2 Score Train: 0.39089295253613965
R2 Score Valid: 0.38 +- 0.07
R2 Score Test: -0.12427809111042709
Treinando Modelo GaussianProcessRegressor


  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)
  return np.power(y_pred, self.power)


R2 Score Train: 0.8560440706423424
R2 Score Valid: -0.21 +- 0.06
R2 Score Test: -0.023165704572007062
Treinando Modelo GradientBoostingRegressor
R2 Score Train: 0.6838539795868791
R2 Score Valid: 0.27 +- 0.14
R2 Score Test: -0.0011171962336287145
Treinando Modelo HistGradientBoostingRegressor
R2 Score Train: 0.542000552588427
R2 Score Valid: 0.34 +- 0.09
R2 Score Test: -0.002833151783537824
Treinando Modelo HuberRegressor
R2 Score Train: 0.3442860949745056
R2 Score Valid: 0.34 +- 0.06
R2 Score Test: -0.004449777536515542
Treinando Modelo KNeighborsRegressor
R2 Score Train: 0.46407170409611276
R2 Score Valid: 0.22 +- 0.06
R2 Score Test: -0.007070770505739121
Treinando Modelo KernelRidge
R2 Score Train: 0.38468832323614766
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.0038388256420831546
Treinando Modelo Lars
R2 Score Train: 0.384764894346064
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.003809830571263495
Treinando Modelo LarsCV




R2 Score Train: 0.384764894346064
R2 Score Valid: 0.36 +- 0.07
R2 Score Test: -0.003809830571263495
Treinando Modelo Lasso
R2 Score Train: 0.3847580538454931
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.0037618124308949064
Treinando Modelo LassoCV
R2 Score Train: 0.3748273512165764
R2 Score Valid: 0.36 +- 0.07
R2 Score Test: -0.00225267843360899
Treinando Modelo LassoLars
R2 Score Train: 0.3828006014388622
R2 Score Valid: 0.37 +- 0.07
R2 Score Test: -0.0029439726663484933
Treinando Modelo LassoLarsCV
R2 Score Train: 0.384764894346064
R2 Score Valid: 0.36 +- 0.07
R2 Score Test: -0.003809830571263495
Treinando Modelo LassoLarsIC
R2 Score Train: 0.38319149413629006
R2 Score Valid: 0.36 +- 0.08
R2 Score Test: -0.0030564341821499585
Treinando Modelo LinearRegression
R2 Score Train: 0.384764894346064
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.003809830571262829
Treinando Modelo LinearSVR
R2 Score Train: 0.23248149395857365
R2 Score Valid: 0.22 +- 0.08
R2 Score Test: -0.0078238451435315



R2 Score Train: 0.20681728867327887
R2 Score Valid: 0.19 +- 0.09
R2 Score Test: -0.008426525386655559
Treinando Modelo NuSVR
R2 Score Train: 0.0035537186164757983
R2 Score Valid: -0.01 +- 0.03
R2 Score Test: -0.008687236826386613
Treinando Modelo OrthogonalMatchingPursuit
R2 Score Train: 0.2760432746809638
R2 Score Valid: 0.27 +- 0.05
R2 Score Test: -0.0003084358349643157
Treinando Modelo OrthogonalMatchingPursuitCV
R2 Score Train: 0.384764894346064
R2 Score Valid: 0.36 +- 0.07
R2 Score Test: -0.003809830571263495
Treinando Modelo PLSCanonical
R2 Score Train: -0.07359477106318946
R2 Score Valid: -0.15 +- 0.24
R2 Score Test: -0.02448366225550469
Treinando Modelo PLSRegression
R2 Score Train: 0.3761570814917763
R2 Score Valid: 0.35 +- 0.08
R2 Score Test: -0.003980219562063292
Treinando Modelo PassiveAggressiveRegressor
R2 Score Train: -0.031996904410201754
R2 Score Valid: 0.09 +- 0.17
R2 Score Test: -0.02212373666329337
Treinando Modelo PoissonRegressor


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


R2 Score Train: 0.46298066248195235
R2 Score Valid: 0.44 +- 0.11
R2 Score Test: -0.08090316471313286
Treinando Modelo RANSACRegressor
R2 Score Train: 0.19404362172277456
R2 Score Valid: 0.13 +- 0.06
R2 Score Test: -0.010040869830625132
Treinando Modelo RandomForestRegressor
R2 Score Train: 0.7785304850782545
R2 Score Valid: 0.21 +- 0.15
R2 Score Test: -0.0014899800751815295
Treinando Modelo Ridge
R2 Score Train: 0.38476237575777217
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.0037795041038273247
Treinando Modelo RidgeCV
R2 Score Train: 0.38476237575777206
R2 Score Valid: 0.37 +- 0.08
R2 Score Test: -0.0037795041016013275
Treinando Modelo SGDRegressor
R2 Score Train: -2.828587121009443e+18
R2 Score Valid: -3648881526510548480.00 +- 4305193059443459584.00
R2 Score Test: -8.465134082531642e+16
Treinando Modelo SVR
R2 Score Train: -0.037486119821007025
R2 Score Valid: -0.05 +- 0.03
R2 Score Test: -0.012193798060607275
Treinando Modelo TheilSenRegressor
R2 Score Train: 0.36139652513348497

In [25]:
# Turn the collected results into a table ranked by mean cross-validation
# score (best first) and show the ten highest-ranked rows.
relatorio = pd.DataFrame(relatorio)
relatorio = relatorio.sort_values('cv_scores_mean', ascending=False)
relatorio.head(10)

Unnamed: 0,nome,train_score,cv_scores_mean,test_score,estimador
33,PoissonRegressor,0.462981,0.439753,-0.080903,PoissonRegressor()
10,GammaRegressor,0.390893,0.383492,-0.124278,GammaRegressor()
3,BayesianRidge,0.38464,0.366188,-0.003599,BayesianRidge()
36,Ridge,0.384762,0.365943,-0.00378,Ridge()
19,Lasso,0.384758,0.365879,-0.003762,Lasso()
16,KernelRidge,0.384688,0.36582,-0.003839,KernelRidge()
41,TransformedTargetRegressor,0.384765,0.365813,-0.00381,TransformedTargetRegressor()
24,LinearRegression,0.384765,0.365813,-0.00381,LinearRegression()
17,Lars,0.384765,0.365813,-0.00381,Lars()
21,LassoLars,0.382801,0.365806,-0.002944,LassoLars()


# TODO: Calibrar melhores estimadores usando GridSearchCV

In [26]:
## Use GridSearchCV to tune the best estimators
from sklearn.model_selection import GridSearchCV

# Combinar os melhores estimadores usando Stacking

In [27]:
from sklearn.ensemble import StackingRegressor

# Take the three best-ranked rows of the report. StackingRegressor documents
# `estimators` as a list of (name, estimator) tuples, so build exactly that
# instead of handing it a slice of a 2-D NumPy object array.
top_3_regs = list(zip(relatorio['nome'].head(3), relatorio['estimador'].head(3)))

reg = StackingRegressor(
    estimators=top_3_regs
)

reg.fit(X_train, y_train)

# Same three scores as in the screening loops: training data,
# cross-validation on the training data, and the held-out split.
train_score = reg.score(X_train, y_train)
cv_scores = cross_val_score(reg, X_train, y_train)
test_score = reg.score(X_test, y_test)

print(f"R2 Score Train: {train_score}")
print(f"R2 Score Valid: {np.mean(cv_scores):.2f} +- {np.std(cv_scores):.2f}")
print(f"R2 Score Test: {test_score}")
print('='*80)

  return np.power(y_pred, self.power)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs",

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
  return np.power(y_pred, self.power)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs",

R2 Score Train: 0.39270797103981814
R2 Score Valid: 0.35 +- 0.08
R2 Score Test: -0.00326370342844462


================================================================================

Treinando Modelo XGBRegressor

R2 Score Train: 0.8486142039346978

R2 Score Valid: 0.05 +- 0.21

R2 Score Test: -0.002254966901589217

================================================================================