In [223]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn import neighbors
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.tree import DecisionTreeRegressor

In [293]:
df = pd.read_csv('casas_ponta_grossa.csv')

In [294]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3956 entries, 0 to 3955
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Quartos           3552 non-null   float64
 1   Banheiros         3313 non-null   float64
 2   Vagas de Garagem  2624 non-null   float64
 3   Área Total        2968 non-null   float64
 4   Valor Venda       3952 non-null   float64
 5   Bairro            3576 non-null   object 
 6   Referência        3956 non-null   object 
 7   Suítes            3956 non-null   float64
dtypes: float64(6), object(2)
memory usage: 247.4+ KB


In [295]:
df.dropna(inplace=True)

In [296]:
df.reset_index(drop=True, inplace=True)

In [297]:
from sklearn.preprocessing import LabelEncoder

In [298]:
label_encoder = LabelEncoder()

In [299]:
df['Bairro'] = label_encoder.fit_transform(df['Bairro'])

In [300]:
y = df['Valor Venda']
df.drop(columns='Valor Venda', inplace=True)
df.drop(columns='Referência', inplace=True)

In [301]:
df

Unnamed: 0,Quartos,Banheiros,Vagas de Garagem,Área Total,Bairro,Suítes
0,3.0,2.0,2.0,164.0,14,0.0
1,3.0,2.0,2.0,80.0,14,1.0
2,1.0,1.0,2.0,360.0,0,0.0
3,2.0,1.0,1.0,40.0,14,0.0
4,2.0,1.0,1.0,45.0,5,0.0
...,...,...,...,...,...,...
1676,3.0,3.0,2.0,300.0,8,1.0
1677,4.0,4.0,6.0,640.0,0,1.0
1678,3.0,5.0,2.0,314.0,14,2.0
1679,4.0,3.0,2.0,400.0,9,1.0


In [302]:
y 

0       430000.0
1       169900.0
2       150000.0
3       130000.0
4       126900.0
          ...   
1676    730000.0
1677    700000.0
1678    700000.0
1679    700000.0
1680    700000.0
Name: Valor Venda, Length: 1681, dtype: float64

In [303]:
x = df.columns
x = x.tolist()

In [304]:
x = ['Bairro']

In [305]:
for name_columns in x:
    print('Processando {}'.format(name_columns))
    mean = df[name_columns].mean()
    std = df[name_columns].std()
    df[name_columns] = (df[name_columns] - mean) / std

Processando Bairro


In [306]:
df

Unnamed: 0,Quartos,Banheiros,Vagas de Garagem,Área Total,Bairro,Suítes
0,3.0,2.0,2.0,164.0,1.331530,0.0
1,3.0,2.0,2.0,80.0,1.331530,1.0
2,1.0,1.0,2.0,360.0,-1.940158,0.0
3,2.0,1.0,1.0,40.0,1.331530,0.0
4,2.0,1.0,1.0,45.0,-0.771698,0.0
...,...,...,...,...,...,...
1676,3.0,3.0,2.0,300.0,-0.070622,1.0
1677,4.0,4.0,6.0,640.0,-1.940158,1.0
1678,3.0,5.0,2.0,314.0,1.331530,2.0
1679,4.0,3.0,2.0,400.0,0.163070,1.0


In [329]:
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(df, y, test_size=0.5)

In [None]:
X_teste

In [308]:

# Modelo linear
linearRegression = LinearRegression()

# Função para visualização dos coeficientes encontrados
# print(linearRegression.coef_)

# Modelo linear utilizando Lasso
regressionLasso = linear_model.Lasso(alpha=0.9)

# Modelo usando Decission Tree mudando parâmetros de profundidade
dt_1 = DecisionTreeRegressor(max_depth=5)

# Modelo usando SVM
svm_reg = svm.SVR(C=0.5,cache_size=2000,kernel='poly')

# Modelo KNN
knn = neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance')

# Modelo AdaBoost
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),random_state=5, n_estimators=1000)

# Modelo Random Forest
rf = RandomForestRegressor(n_estimators=100,max_depth=1000, random_state=20)

# Modelo  HistGradientBoostingRegressor HGBR
est = HistGradientBoostingRegressor()

# Voting
ereg = VotingRegressor(estimators=[('RF', rf), ('ada', ada), ('HGBR',linearRegression )])


titles = ['Linear Regression', 'Regression Lasso', 'Decission Tree D=5', 'SVM', 'KNN', 'AdaBoost', 'Random Forest', 'HGBR', 'Voting']
methods = [linearRegression, regressionLasso,  dt_1, svm_reg, knn, ada, rf, est, ereg ]


In [331]:
# Treinamento dos classificadores

scores = []
for method, name in zip(methods, titles):
    method.fit(X_treinamento, y_treinamento)
    scores.append(method.score(X_teste, y_teste))
    print("Acurácia do classificador: {} = {:.4}".format(name, method.score(X_teste, y_teste)))

Acurácia do classificador: Linear Regression = 0.3879
Acurácia do classificador: Regression Lasso = 0.3879
Acurácia do classificador: Decission Tree D=5 = 0.3509
Acurácia do classificador: SVM = -0.1049
Acurácia do classificador: KNN = 0.2733
Acurácia do classificador: AdaBoost = 0.01577
Acurácia do classificador: Random Forest = 0.2944
Acurácia do classificador: HGBR = 0.4593
Acurácia do classificador: Voting = 0.3766


In [332]:
p = rf.predict(X_teste)

In [333]:
num = 365
p[num].round(), y[num]

(169770.0, 310000.0)

In [254]:
y.to_numpy()

array([430000, 350000, 300000, ..., 700000, 700000, 700000], dtype=int64)

In [180]:
# Modelo Random Forest
rf2 = RandomForestRegressor(n_estimators=10000, random_state=0, min_samples_leaf=1,  n_jobs=6)

In [181]:
rf2.fit(X_treinamento, y_treinamento)
rf2.score(X_teste, y_teste)

0.6166011430837992

In [243]:
from sklearn.metrics import mean_squared_error

In [334]:
mse = mean_squared_error(y_teste, p)
mse

231987289414.94568

In [335]:
from sklearn.metrics import mean_squared_log_error

In [336]:
msel = mean_squared_log_error(y_teste, p)
msel

0.29257635896310474

In [337]:
from sklearn.metrics import mean_absolute_error

In [338]:
mae = mean_absolute_error(y_teste, p)
mae

263421.3445069431

In [339]:
from sklearn.metrics import median_absolute_error

In [340]:
mae_z = median_absolute_error(y_teste, p)
mae_z

108630.0

In [341]:
def mape(y_true, y_pred):
    return np.mean( np.abs(y_true - y_pred) / y_true)

mape = mape(y_teste,p)
mape

0.510625078456916

In [318]:
from sklearn.decomposition import PCA

In [326]:
pca = PCA(0.95) 
z = pca.fit(df)

In [327]:
pca.explained_variance_ratio_

array([0.99988919])

In [328]:
df = pca.transform(df)