## Treinando Modelos

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pickle
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

### Pegando os dados da camada Refined

In [2]:
df = pd.read_csv('../data/refined/imoveis.csv')
df['crawler'].value_counts()

liberdade     1078
saúde         1077
bela_vista    1075
ipiranga      1069
brooklin      1012
broklin       1012
Name: crawler, dtype: int64

### Separando os dados com maior correlação positiva ou negativa e a variavel alvo

In [3]:
df[["area_limpo", "Banheiro", "Quarto", "condominio","preço"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_limpo  6323 non-null   int64  
 1   Banheiro    6323 non-null   int64  
 2   Quarto      6323 non-null   int64  
 3   condominio  6063 non-null   float64
 4   preço       6323 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 247.1 KB


In [4]:
# Substituir valores nulos por 0 no DataFrame df
df.fillna(0, inplace=True)

In [5]:
x = X = df[["area_limpo", "Banheiro", "Quarto", "condominio"]]
y = df["preço"]

### Separando dados de Treino e teste

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42, shuffle=True)

In [7]:
X.shape

(6323, 4)

In [8]:
x_test.shape

(1265, 4)

In [9]:
y_train

4567    1500000
5842    3500000
5113     604000
3333     890000
3382     949000
         ...   
3772     780000
5191    1000000
5226    1200000
5390    1040500
860     1059980
Name: preço, Length: 5058, dtype: int64

### Iniciando treinamento do Modelo

#### Modelo de regressão linear simples

In [10]:
#fit_intercept Ele controla se o modelo de regressão linear deve calcular ou não o intercepto (também conhecido como viés ou coeficiente linear)
linear1 = LinearRegression(fit_intercept=True)
linear2 = LinearRegression(fit_intercept=False)

In [11]:
linear1.fit(x_train,y_train)
linear2.fit(x_train,y_train)

LinearRegression(fit_intercept=False)

In [12]:
y_pred_train1 = linear1.predict(x_train)
y_pred_test1 = linear1.predict(x_test)
y_pred_train2 = linear2.predict(x_train)
y_pred_test2 = linear2.predict(x_test)

Calculando o MAPE (Mean Absolute Percentage Error)

In [13]:
print(f'''
Mape Train:
{np.mean(np.abs(y_train - y_pred_train1)/y_train)} , {np.mean(np.abs(y_train - y_pred_train2)/y_train)}
Mape Teste:
{np.mean(np.abs(y_test - y_pred_test1)/y_test)} , {np.mean(np.abs(y_test - y_pred_test2)/y_test)}
''')


Mape Train:
0.31305991721100956 , 0.3272471980359713
Mape Teste:
0.3090014711185113 , 0.3207849469867356



Calculando o MSE (Mean Squared Error)

In [14]:
mean_squared_error(y_test, y_pred_test1)

174658132962.8032

In [15]:
mean_squared_error(y_test, y_pred_test2)

175790169283.3162

Calulando o R²

In [16]:
r2_linear1 = r2_score(y_test, y_pred_test1)
r2_score(y_test, y_pred_test1)

0.7424004437105677

In [17]:
r2_linear1 = r2_score(y_test, y_pred_test2)
r2_score(y_test, y_pred_test2)

0.7407308274784408

## Treinando outros modelos

Treinamento

In [25]:
ridge = Ridge()
knn = KNeighborsRegressor()
tree = DecisionTreeRegressor()
extratree = ExtraTreeRegressor()
randomforest = RandomForestRegressor()
xgb = XGBRegressor()
cat = CatBoostRegressor()

In [26]:
ridge.fit(x_train,y_train)
y_pred = ridge.predict(x_test)
mape_ridge = np.mean(np.abs(y_test - y_pred)/y_test)
mse_ridge = mean_squared_error(y_test, y_pred)
r2_ridge = r2_score(y_test, y_pred)

In [27]:
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
mape_knn = np.mean(np.abs(y_test - y_pred)/y_test)
mse_knn = mean_squared_error(y_test, y_pred)
r2_knn = r2_score(y_test, y_pred)

In [28]:
tree.fit(x_train,y_train)
y_pred = tree.predict(x_test)
mape_tree = np.mean(np.abs(y_test - y_pred)/y_test)
mse_tree = mean_squared_error(y_test, y_pred)
r2_tree = r2_score(y_test, y_pred)

In [29]:
extratree.fit(x_train,y_train)
y_pred = extratree.predict(x_test)
mape_ext = np.mean(np.abs(y_test - y_pred)/y_test)
mse_ext = mean_squared_error(y_test, y_pred)
r2_ext = r2_score(y_test, y_pred)

In [30]:
randomforest.fit(x_train,y_train)
y_pred = randomforest.predict(x_test)
mape_rf = np.mean(np.abs(y_test - y_pred)/y_test)
mse_rf = mean_squared_error(y_test, y_pred)
r2_rf = r2_score(y_test, y_pred)

In [31]:
xgb.fit(x_train,y_train)
y_pred = xgb.predict(x_test)
mape_xgb = np.mean(np.abs(y_test - y_pred)/y_test)
mse_xgb = mean_squared_error(y_test, y_pred)
r2_xgb = r2_score(y_test, y_pred)

In [32]:
cat.fit(x_train,y_train)
y_pred = cat.predict(x_test)
mape_cat = np.mean(np.abs(y_test - y_pred)/y_test)
mse_cat = mean_squared_error(y_test, y_pred)
r2_cat = r2_score(y_test, y_pred)

Learning rate set to 0.052894
0:	learn: 798740.2139529	total: 49.2ms	remaining: 49.1s
1:	learn: 770875.6268868	total: 50.4ms	remaining: 25.2s
2:	learn: 744189.5953798	total: 51.4ms	remaining: 17.1s
3:	learn: 718274.9825499	total: 52.2ms	remaining: 13s
4:	learn: 695630.6448569	total: 53.1ms	remaining: 10.6s
5:	learn: 673664.4752119	total: 54ms	remaining: 8.95s
6:	learn: 653597.2612507	total: 54.8ms	remaining: 7.77s
7:	learn: 634980.7758254	total: 55.6ms	remaining: 6.89s
8:	learn: 617722.5325417	total: 56.3ms	remaining: 6.2s
9:	learn: 601422.1334900	total: 57.1ms	remaining: 5.65s
10:	learn: 585285.4041698	total: 57.9ms	remaining: 5.2s
11:	learn: 570599.7898326	total: 58.7ms	remaining: 4.83s
12:	learn: 556241.0109455	total: 59.4ms	remaining: 4.51s
13:	learn: 542854.8859307	total: 60.2ms	remaining: 4.24s
14:	learn: 530642.2486683	total: 61ms	remaining: 4.01s
15:	learn: 519589.0202684	total: 71.8ms	remaining: 4.41s
16:	learn: 509587.0552728	total: 72.7ms	remaining: 4.2s
17:	learn: 500361.57

In [42]:
print(
    f'''
MAPE ridge, {mape_ridge} 
MAPE knn, {mape_knn} 
MAPE tree, {mape_tree}
MAPE ext, {mape_ext}
MAPE rf, {mape_rf}
MAPE xgb, {mape_xgb}
MAPE cat, {mape_cat}
'''
)


MAPE ridge, 0.3089978964660717 
MAPE knn, 0.2664997873718837 
MAPE tree, 0.20762074416498374
MAPE ext, 0.20506518012087233
MAPE rf, 0.19851163530326968
MAPE xgb, 0.21523184757618696
MAPE cat, 0.2338072868715645



In [43]:
print(
    f'''
MSE ridge, {mse_ridge} 
MSE knn, {mse_knn} 
MSE tree, {mse_tree}
MSE ext, {mse_ext}
MSE rf, {mse_rf}
MSE xgb, {mse_xgb}
MSE cat, {mse_cat}
'''
)


MSE ridge, 174656232781.61212 
MSE knn, 187541172495.08008 
MSE tree, 155880954817.57477
MSE ext, 153249784643.80124
MSE rf, 115086394441.5453
MSE xgb, 110402999942.39896
MSE cat, 121909025927.54237



In [44]:
print(
    f'''
RMSE ridge, {np.sqrt(mse_ridge)} 
RMSE knn, {np.sqrt(mse_knn)} 
RMSE tree, {np.sqrt(mse_tree)}
RMSE ext, {np.sqrt(mse_ext)}
RMSE rf, {np.sqrt(mse_rf)}
RMSE xgb, {np.sqrt(mse_xgb)}
RMSE cat, {np.sqrt(mse_cat)}
'''
)


RMSE ridge, 417918.93087249843 
RMSE knn, 433060.2411848496 
RMSE tree, 394817.62222268496
RMSE ext, 391471.3075613604
RMSE rf, 339243.85689581075
RMSE xgb, 332269.46886886697
RMSE cat, 349154.730638928



In [45]:
print(
    f'''
R2 ridge, {r2_ridge} 
R2 tree, {r2_tree}
R2 ext, {r2_ext}
R2 rf, {r2_rf}
R2 xgb, {r2_xgb}
R2 cat, {r2_cat}
'''
)


R2 ridge, 0.7424032462472911 
R2 tree, 0.7700945033946283
R2 ext, 0.7739751601827688
R2 rf, 0.8302615306817325
R2 xgb, 0.8371689693703471
R2 cat, 0.8201989769734923

