## Treinando Modelos

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pickle
import joblib
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline

### Pegando os dados da camada Refined

In [2]:
df = pd.read_csv('../data/refined/imoveis.csv')
df['crawler'].value_counts()

liberdade     1078
saúde         1077
bela_vista    1075
ipiranga      1069
brooklin      1012
broklin       1012
Name: crawler, dtype: int64

### Separando os dados com maior correlação positiva ou negativa e a variavel alvo

In [3]:
df[["crawler","area_limpo", "Banheiro", "Quarto", "condominio","preço"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6323 entries, 0 to 6322
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   crawler     6323 non-null   object 
 1   area_limpo  6323 non-null   int64  
 2   Banheiro    6323 non-null   int64  
 3   Quarto      6323 non-null   int64  
 4   condominio  6063 non-null   float64
 5   preço       6323 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 296.5+ KB


In [4]:
label_encoder = LabelEncoder()
df['crawler_codificado'] = label_encoder.fit_transform(df['crawler'])+1

In [28]:
valores_unicos = df[['crawler', 'crawler_codificado']].drop_duplicates()
print(valores_unicos)

         crawler  crawler_codificado
0     bela_vista                   1
1075   liberdade                   5
2153    brooklin                   3
3165       saúde                   6
4242    ipiranga                   4
5311     broklin                   2


In [6]:
# Substituir valores nulos por 0 no DataFrame df
df.fillna(0, inplace=True)

In [7]:
x = X = df[["crawler_codificado","area_limpo", "Banheiro", "Quarto", "condominio"]]
y = df["preço"]

### Separando dados de Treino e teste

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=42, shuffle=True)

In [9]:
X.shape

(6323, 5)

In [10]:
x_test.shape

(1265, 5)

In [11]:
y_train

4567    1500000
5842    3500000
5113     604000
3333     890000
3382     949000
         ...   
3772     780000
5191    1000000
5226    1200000
5390    1040500
860     1059980
Name: preço, Length: 5058, dtype: int64

### Iniciando treinamento do Modelo

#### Modelo de regressão linear simples

In [12]:
#fit_intercept Ele controla se o modelo de regressão linear deve calcular ou não o intercepto (também conhecido como viés ou coeficiente linear)
linear1 = LinearRegression(fit_intercept=True)
linear2 = LinearRegression(fit_intercept=False)

In [13]:
linear1.fit(x_train,y_train)
linear2.fit(x_train,y_train)

LinearRegression(fit_intercept=False)

In [14]:
y_pred_train1 = linear1.predict(x_train)
y_pred_test1 = linear1.predict(x_test)
y_pred_train2 = linear2.predict(x_train)
y_pred_test2 = linear2.predict(x_test)

Calculando o MAPE (Mean Absolute Percentage Error)

In [15]:
print(f'''
Mape Train:
{np.mean(np.abs(y_train - y_pred_train1)/y_train)} , {np.mean(np.abs(y_train - y_pred_train2)/y_train)}
Mape Teste:
{np.mean(np.abs(y_test - y_pred_test1)/y_test)} , {np.mean(np.abs(y_test - y_pred_test2)/y_test)}
''')


Mape Train:
0.2896454939185862 , 0.2820355892024813
Mape Teste:
0.2910258277372013 , 0.2815927282506992



Calculando o MSE (Mean Squared Error)

In [16]:
mean_squared_error(y_test, y_pred_test1)

168656732944.5689

In [17]:
mean_squared_error(y_test, y_pred_test2)

168342035448.98294

Calulando o R²

In [18]:
r2_linear1 = r2_score(y_test, y_pred_test1)
r2_score(y_test, y_pred_test1)

0.7512517806370982

In [19]:
r2_linear1 = r2_score(y_test, y_pred_test2)
r2_score(y_test, y_pred_test2)

0.751715921263436

## Treinando outros modelos

Treinamento

In [20]:
models = {
    'Ridge': Ridge(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'ExtraTreeRegressor': ExtraTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'XGBRegressor': XGBRegressor(),
    'CatBoostRegressor': CatBoostRegressor(verbose=False)
}

for model_name, model_instance in models.items():
    # Treine o modelo
    model_instance.fit(x_train, y_train)
    
    # Faça previsões
    y_pred = model_instance.predict(x_test)
    
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f'R-squared for {model_name}: {r2:.4f}')
    print(f'Mean Squared Error for {model_name}: {mse:.4f}')
    print(f'Mean Absolute Error for {model_name}: {mae:.4f}')
    print(f'Mena Absolute Percentage Error {model_name}: {mape:.2f}\n')

R-squared for Ridge: 0.7513
Mean Squared Error for Ridge: 168655369768.0363
Mean Absolute Error for Ridge: 277220.6863
Mena Absolute Percentage Error Ridge: 0.29

R-squared for KNeighborsRegressor: 0.7237
Mean Squared Error for KNeighborsRegressor: 187340864828.1256
Mean Absolute Error for KNeighborsRegressor: 256927.8383
Mena Absolute Percentage Error KNeighborsRegressor: 0.26

R-squared for DecisionTreeRegressor: 0.8285
Mean Squared Error for DecisionTreeRegressor: 116308591954.2128
Mean Absolute Error for DecisionTreeRegressor: 168381.9752
Mena Absolute Percentage Error DecisionTreeRegressor: 0.17

R-squared for ExtraTreeRegressor: 0.7785
Mean Squared Error for ExtraTreeRegressor: 150170356982.1560
Mean Absolute Error for ExtraTreeRegressor: 193639.2201
Mena Absolute Percentage Error ExtraTreeRegressor: 0.19



R-squared for RandomForestRegressor: 0.8851
Mean Squared Error for RandomForestRegressor: 77899852965.7570
Mean Absolute Error for RandomForestRegressor: 152370.1405
Mena Absolute Percentage Error RandomForestRegressor: 0.15

R-squared for XGBRegressor: 0.8871
Mean Squared Error for XGBRegressor: 76578395957.2856
Mean Absolute Error for XGBRegressor: 163232.5834
Mena Absolute Percentage Error XGBRegressor: 0.17

R-squared for CatBoostRegressor: 0.8805
Mean Squared Error for CatBoostRegressor: 81022853093.5424
Mean Absolute Error for CatBoostRegressor: 177857.0540
Mena Absolute Percentage Error CatBoostRegressor: 0.19



In [21]:
linear_reg_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('linear_reg', LinearRegression())  # Modelo de regressão linear
])

ridge_reg_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('ridge_reg', Ridge())  # Modelo de regressão Ridge
])

knn_pipeline = Pipeline([ 
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('knn', KNeighborsRegressor())  # Modelo K-Nearest Neighbors
])

decision_tree = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('decision_tree', DecisionTreeRegressor())  # Modelo Decision Tree
])

extra_tree = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('extra_tree', ExtraTreeRegressor())  # Modelo Extra Tree
])

random_forest = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('random_forest', RandomForestRegressor())  # Modelo Random Forest
])

xgboost = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('xgboost', XGBRegressor())  # Modelo XGBoost
])

catboost = Pipeline([
    ('scaler', MinMaxScaler()),  # Aplica o MinMaxScaler
    ('catboost', CatBoostRegressor(verbose=False))  # Modelo CatBoost
])

param_grid_linear_reg = {
}
    
param_grid_ridge_reg = {
    'ridge_reg__alpha': [0.1, 1.0, 10.0]
}

param_grid_knn = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance']
}

# Dicionários de hiperparâmetros para modelos adicionais
param_grid_decision_tree = {
    'decision_tree__max_depth': [None, 10, 20],
    'decision_tree__min_samples_split': [2, 5, 10]
}

param_grid_extra_tree = {
    'extra_tree__max_depth': [None, 10, 20],
    'extra_tree__min_samples_split': [2, 5, 10]
}

param_grid_random_forest = {
    'random_forest__n_estimators': [50, 100, 200],
    'random_forest__max_depth': [None, 10, 20],
    'random_forest__min_samples_split': [2, 5, 10]
}

param_grid_xgboost = {
    'xgboost__n_estimators': [50, 100, 200],
    'xgboost__max_depth': [3, 5, 7],
    'xgboost__learning_rate': [0.01, 0.1, 0.2]
}

param_grid_catboost = {
    'catboost__n_estimators': [50, 100, 200],
    'catboost__depth': [6, 8, 10],
    'catboost__learning_rate': [0.01, 0.1, 0.2]
}

# Adicione mais modelos e dicionários de hiperparâmetros conforme necessário

models_and_params = [
    ('Linear Regression', linear_reg_pipeline, param_grid_linear_reg),
    ('Ridge Regression', ridge_reg_pipeline, param_grid_ridge_reg),
    ('KNeighborsRegressor', knn_pipeline, param_grid_knn),
    ('Decision Tree', decision_tree, param_grid_decision_tree),
    ('Extra Tree', extra_tree, param_grid_extra_tree),
    ('Random Forest', random_forest, param_grid_random_forest),
    ('XGBoost', xgboost, param_grid_xgboost),
    ('CatBoost', catboost, param_grid_catboost)
]

# Use GridSearchCV para otimizar os modelos
for model_name, model, param_grid in models_and_params:
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(x_train, y_train)

    # Exiba os melhores hiperparâmetros e métricas de desempenho
    print(f"Best Parameters for {model_name}:")
    print(grid_search.best_params_)
    print(f"Best Mean Squared Error for {model_name}: {abs(grid_search.best_score_):.2f}")
    
    # Avalie o modelo otimizado no conjunto de teste
    y_pred = grid_search.best_estimator_.predict(x_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f'R-squared for {model_name}: {r2:.2f}')
    print(f'Mean Squared Error for {model_name}: {mse:.2f}')
    print(f'Mean Absolute Error for {model_name}: {mae:.2f}')
    print(f'Mena Absolute Percentage Error {model_name}: {mape:.2f}\n')

Best Parameters for Linear Regression:
{}
Best Mean Squared Error for Linear Regression: 199440431443.52
R-squared for Linear Regression: 0.75
Mean Squared Error for Linear Regression: 168656732944.56
Mean Absolute Error for Linear Regression: 277223.57
Mena Absolute Percentage Error Linear Regression: 0.29

Best Parameters for Ridge Regression:
{'ridge_reg__alpha': 0.1}
Best Mean Squared Error for Ridge Regression: 199182425147.49
R-squared for Ridge Regression: 0.75
Mean Squared Error for Ridge Regression: 169172403773.23
Mean Absolute Error for Ridge Regression: 277664.63
Mena Absolute Percentage Error Ridge Regression: 0.29

Best Parameters for KNeighborsRegressor:
{'knn__n_neighbors': 7, 'knn__weights': 'distance'}
Best Mean Squared Error for KNeighborsRegressor: 93638482092.56
R-squared for KNeighborsRegressor: 0.84
Mean Squared Error for KNeighborsRegressor: 108124703072.07
Mean Absolute Error for KNeighborsRegressor: 175982.74
Mena Absolute Percentage Error KNeighborsRegressor:

In [33]:
# Criar e treinar o modelo
modelo_random_forest = RandomForestRegressor(max_depth=20, min_samples_split=2, n_estimators=200)
modelo_random_forest.fit(x_train, y_train)  # Substitua x_train e y_train pelos seus dados de treinamento

joblib.dump(modelo_random_forest, '../modelo/modelo_random_forest.pkl')
modelo_carregado_rf = joblib.load('../modelo/modelo_random_forest.pkl')

In [34]:
novos_dados = pd.DataFrame({
    'crawler': ['broklin'],  # Substitua 'Valor_Do_Crawler' pelo valor real
    'area_limpo': [40],  # Substitua 'Valor_Da_Area_Limpo' pelo valor real
    'Banheiro': [1],  # Substitua 'Quantidade_De_Banheiros' pelo valor real
    'Quarto': [2],  # Substitua 'Quantidade_De_Quartos' pelo valor real
    'condominio': [700]  # Substitua 'Valor_Do_Condominio' pelo valor real
})

mapeamento_crawler = {
    'belavista': 1,
    'liberdade': 5,
    'brooklin': 3,
    'saude': 6,
    'ipiranga': 4,
    'broklin': 2  # Adicione os outros mapeamentos conforme necessário
}

# Substituir valores da coluna 'crawler' pelos códigos numéricos correspondentes
novos_dados['crawler'] = novos_dados['crawler'].map(mapeamento_crawler)

# Resultado
print(novos_dados)

   crawler  area_limpo  Banheiro  Quarto  condominio
0        2          40         1       2         700


In [35]:

previsoes_rf = modelo_carregado_rf.predict(novos_dados)  # Substitua novos_dados pelos seus novos dados
print("Previsões:")
print(previsoes_rf)

Previsões:
[410015.5]
