## Treinando Modelos

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pickle
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline

### Pegando os dados da camada Refined

In [2]:
df = pd.read_csv('../data/refined/imoveis.csv')
df['crawler'].value_counts()

liberdade     1078
saúde         1077
bela_vista    1075
ipiranga      1069
brooklin      1012
broklin       1012
Name: crawler, dtype: int64

### Separando os dados com maior correlação positiva ou negativa e a variavel alvo

In [3]:
df[["area_limpo", "Banheiro", "Quarto", "condominio","preço"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_limpo  6323 non-null   int64  
 1   Banheiro    6323 non-null   int64  
 2   Quarto      6323 non-null   int64  
 3   condominio  6063 non-null   float64
 4   preço       6323 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 247.1 KB


In [4]:
# Substituir valores nulos por 0 no DataFrame df
df.fillna(0, inplace=True)

In [5]:
x = X = df[["area_limpo", "Banheiro", "Quarto", "condominio"]]
y = df["preço"]

### Separando dados de Treino e teste

In [6]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42, shuffle=True)

In [7]:
X.shape

(6323, 4)

In [8]:
x_test.shape

(1265, 4)

In [9]:
y_train

4567    1500000
5842    3500000
5113     604000
3333     890000
3382     949000
         ...   
3772     780000
5191    1000000
5226    1200000
5390    1040500
860     1059980
Name: preço, Length: 5058, dtype: int64

### Iniciando treinamento do Modelo

#### Modelo de regressão linear simples

In [10]:
#fit_intercept Ele controla se o modelo de regressão linear deve calcular ou não o intercepto (também conhecido como viés ou coeficiente linear)
linear1 = LinearRegression(fit_intercept=True)
linear2 = LinearRegression(fit_intercept=False)

In [11]:
linear1.fit(x_train,y_train)
linear2.fit(x_train,y_train)

LinearRegression(fit_intercept=False)

In [12]:
y_pred_train1 = linear1.predict(x_train)
y_pred_test1 = linear1.predict(x_test)
y_pred_train2 = linear2.predict(x_train)
y_pred_test2 = linear2.predict(x_test)

Calculando o MAPE (Mean Absolute Percentage Error)

In [13]:
print(f'''
Mape Train:
{np.mean(np.abs(y_train - y_pred_train1)/y_train)} , {np.mean(np.abs(y_train - y_pred_train2)/y_train)}
Mape Teste:
{np.mean(np.abs(y_test - y_pred_test1)/y_test)} , {np.mean(np.abs(y_test - y_pred_test2)/y_test)}
''')


Mape Train:
0.31305991721100956 , 0.3272471980359713
Mape Teste:
0.3090014711185113 , 0.3207849469867356



Calculando o MSE (Mean Squared Error)

In [14]:
mean_squared_error(y_test, y_pred_test1)

174658132962.8032

In [15]:
mean_squared_error(y_test, y_pred_test2)

175790169283.3162

Calulando o R²

In [16]:
r2_linear1 = r2_score(y_test, y_pred_test1)
r2_score(y_test, y_pred_test1)

0.7424004437105677

In [17]:
r2_linear1 = r2_score(y_test, y_pred_test2)
r2_score(y_test, y_pred_test2)

0.7407308274784408

## Treinando outros modelos

Treinamento

In [18]:
models = {
    'Ridge': Ridge(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'ExtraTreeRegressor': ExtraTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'CatBoostRegressor': CatBoostRegressor(verbose=False)
}

for model_name, model_instance in models.items():
    # Treine o modelo
    model_instance.fit(x_train, y_train)
    
    # Faça previsões
    y_pred = model_instance.predict(x_test)
    
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f'R-squared for {model_name}: {r2:.4f}')
    print(f'Mean Squared Error for {model_name}: {mse:.4f}')
    print(f'Mean Absolute Error for {model_name}: {mae:.4f}')
    print(f'Mena Absolute Percentage Error {model_name}: {mape:.2f}\n')

R-squared for Ridge: 0.7424
Mean Squared Error for Ridge: 174656232781.6121
Mean Absolute Error for Ridge: 286079.4087
Mena Absolute Percentage Error Ridge: 0.31

R-squared for KNeighborsRegressor: 0.7234
Mean Squared Error for KNeighborsRegressor: 187541172495.0801
Mean Absolute Error for KNeighborsRegressor: 257811.7477
Mena Absolute Percentage Error KNeighborsRegressor: 0.27

R-squared for DecisionTreeRegressor: 0.7717
Mean Squared Error for DecisionTreeRegressor: 154763050787.1259
Mean Absolute Error for DecisionTreeRegressor: 195837.9878
Mena Absolute Percentage Error DecisionTreeRegressor: 0.21

R-squared for ExtraTreeRegressor: 0.8025
Mean Squared Error for ExtraTreeRegressor: 133883825087.5450
Mean Absolute Error for ExtraTreeRegressor: 188153.7517
Mena Absolute Percentage Error ExtraTreeRegressor: 0.21



R-squared for RandomForestRegressor: 0.8306
Mean Squared Error for RandomForestRegressor: 114851913807.2769
Mean Absolute Error for RandomForestRegressor: 186373.2914
Mena Absolute Percentage Error RandomForestRegressor: 0.20

R-squared for XGBRegressor: 0.8372
Mean Squared Error for XGBRegressor: 110402999942.3990
Mean Absolute Error for XGBRegressor: 197373.7704
Mena Absolute Percentage Error XGBRegressor: 0.22

R-squared for CatBoostRegressor: 0.8202
Mean Squared Error for CatBoostRegressor: 121909025927.5424
Mean Absolute Error for CatBoostRegressor: 217304.9230
Mena Absolute Percentage Error CatBoostRegressor: 0.23



In [29]:
linear_reg_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('linear_reg', LinearRegression())  # Modelo de regressão linear
])

ridge_reg_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('ridge_reg', Ridge())  # Modelo de regressão Ridge
])

knn_pipeline = Pipeline([ 
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('knn', KNeighborsRegressor())  # Modelo K-Nearest Neighbors
])

decision_tree = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('decision_tree', DecisionTreeRegressor())  # Modelo Decision Tree
])

extra_tree = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('extra_tree', ExtraTreeRegressor())  # Modelo Extra Tree
])

random_forest = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('random_forest', RandomForestRegressor())  # Modelo Random Forest
])

xgboost = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('xgboost', XGBRegressor())  # Modelo XGBoost
])

catboost = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('catboost', CatBoostRegressor(verbose=False))  # Modelo CatBoost
])

param_grid_linear_reg = {
}
    
param_grid_ridge_reg = {
    'ridge_reg__alpha': [0.1, 1.0, 10.0]
}

param_grid_knn = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance']
}

# Dicionários de hiperparâmetros para modelos adicionais
param_grid_decision_tree = {
    'decision_tree__max_depth': [None, 10, 20],
    'decision_tree__min_samples_split': [2, 5, 10]
}

param_grid_extra_tree = {
    'extra_tree__max_depth': [None, 10, 20],
    'extra_tree__min_samples_split': [2, 5, 10]
}

param_grid_random_forest = {
    'random_forest__n_estimators': [50, 100, 200],
    'random_forest__max_depth': [None, 10, 20],
    'random_forest__min_samples_split': [2, 5, 10]
}

param_grid_xgboost = {
    'xgboost__n_estimators': [50, 100, 200],
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__learning_rate': [0.01, 0.1, 0.2]
}

param_grid_catboost = {
    'catboost__n_estimators': [50, 100, 200],
    'catboost__depth': [6, 8, 10],
    'catboost__learning_rate': [0.01, 0.1, 0.2]
}

# Adicione mais modelos e dicionários de hiperparâmetros conforme necessário

models_and_params = [
    ('Linear Regression', linear_reg_pipeline, param_grid_linear_reg),
    ('Ridge Regression', ridge_reg_pipeline, param_grid_ridge_reg),
    ('KNeighborsRegressor', knn_pipeline, param_grid_knn),
    ('Decision Tree', decision_tree, param_grid_decision_tree),
    ('Extra Tree', extra_tree, param_grid_extra_tree),
    ('Random Forest', random_forest, param_grid_random_forest),
    ('XGBoost', xgboost, param_grid_xgboost),
    ('CatBoost', catboost, param_grid_catboost)
]

# Use GridSearchCV para otimizar os modelos
for model_name, model, param_grid in models_and_params:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Exiba os melhores hiperparâmetros e métricas de desempenho
    print(f"Best Parameters for {model_name}:")
    print(grid_search.best_params_)
    print(f"Best Mean Squared Error for {model_name}: {abs(grid_search.best_score_):.2f}")
    
    # Avalie o modelo otimizado no conjunto de teste
    y_pred = grid_search.best_estimator_.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f'R-squared for {model_name}: {r2:.2f}')
    print(f'Mean Squared Error for {model_name}: {mse:.2f}')
    print(f'Mean Absolute Error for {model_name}: {mae:.2f}')
    print(f'Mena Absolute Percentage Error {model_name}: {mape:.2f}\n')

Best Parameters for Linear Regression:
{}
Best Mean Squared Error for Linear Regression: 213424474001.6755
R-squared for Linear Regression: 0.74
Mean Squared Error for Linear Regression: 174658132962.81
Mean Absolute Error for Linear Regression: 286083.63
Mena Absolute Percentage Error Linear Regression: 0.31

Best Parameters for Ridge Regression:
{'ridge_reg__alpha': 0.1}
Best Mean Squared Error for Ridge Regression: 213142389685.0551
R-squared for Ridge Regression: 0.74
Mean Squared Error for Ridge Regression: 175272974300.11
Mean Absolute Error for Ridge Regression: 286565.89
Mena Absolute Percentage Error Ridge Regression: 0.31

Best Parameters for KNeighborsRegressor:
{'knn__n_neighbors': 7, 'knn__weights': 'distance'}
Best Mean Squared Error for KNeighborsRegressor: 111306960797.7127
R-squared for KNeighborsRegressor: 0.82
Mean Squared Error for KNeighborsRegressor: 123246670310.29
Mean Absolute Error for KNeighborsRegressor: 177415.84
Mena Absolute Percentage Error KNeighborsReg