In [None]:
#All imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import matplotlib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

In [None]:
# Load the dataset; ',' is parsed as the decimal separator.
df = pd.read_csv(
    './bases/Real estate.csv',
    decimal=',',
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Checking some summary statistics of the columns.
df.describe()

In [None]:

# Remove the 'No' column because it will not be useful.
# Rebinding with errors='ignore' keeps this cell idempotent: re-running it
# after the column is already gone no longer raises a KeyError.
df = df.drop(columns='No', errors='ignore')

In [None]:
# Rename the columns to short snake_case identifiers.
df.rename(
    columns={
        "X1 transaction date": "transaction_date",
        "X2 house age": "house_age",
        "X3 distance to the nearest MRT station": "distance_to_nearest_station",
        "X4 number of convenience stores": "number_of_convenience_stores",
        "X5 latitude": "latitude",
        "X6 longitude": "longitude",
        "Y house price of unit area": "house_price",
    },
    inplace=True,
)

In [None]:
# Print the frame summary (column names, non-null counts, dtypes).
df.info()

In [None]:
# Count the missing values in each column.
hasNan = df.isna().sum()
print(hasNan)

In [None]:
# Bar chart of how often each distinct 'house_price' value occurs.
price_counts = df['house_price'].value_counts()
price_counts.plot.bar()

In [None]:
# Visualize the distribution of house prices.
# Drawn with pandas/matplotlib, which this notebook imports; seaborn (`sns`)
# is never imported here, so calling it raises a NameError on a fresh kernel.
ax = df['house_price'].plot.hist(bins=200)
ax.set(xlabel='house_price', ylabel='Count', title='Distribution of house_price')
plt.show()

In [None]:
# Separate the target column from the input features.
y_tr = df['house_price']
x_tr = df.drop(columns=['house_price'])

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x_tr,
    y_tr,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Inspect the dimensions of the training features.
X_train.shape

In [None]:
# Inspect the dimensions of the training target.
y_train.shape

In [None]:
# Turn the targets into column vectors, the 2-D layout StandardScaler expects.
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell does not fail the way `.values` does once the names hold ndarrays.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Standardize the features: statistics are learned on the training rows only
# and then applied to both splits.
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)

# Same procedure for the target.
scalery = StandardScaler().fit(y_train)
y_train = scalery.transform(y_train)
y_test = scalery.transform(y_test)

In [None]:
# List the column labels of the frame.
df.columns

Classe RunRegression

In [None]:
from sklearn.model_selection import KFold
from sklearn import metrics

class runregression:
    """Cross-validate a regressor with shuffled k-fold and collect error metrics.

    Parameters
    ----------
    X : pandas DataFrame or array-like
        Input attributes, one row per sample.
    y : pandas Series/DataFrame or array-like
        Target attribute, aligned row by row with ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place once per fold.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to ``KFold``. ``None`` keeps the previous behaviour
        (a different shuffle on every run); pass an int for reproducible folds.

    The metrics of every fold (MAE, MSE, RMSE, R2) are kept in ``resultados``;
    ``mostraresultadomedio`` prints their mean and standard deviation.
    """

    def __init__(self, X, y, model, cv=5, random_state=None):
        # One list per metric; each fold appends one value.
        self.__resultados = {
            'MAE': [],
            'MSE': [],
            'RMSE': [],
            'R2': []
        }

        print(f'validação cruzada com {cv} folds')
        self.__avaliamodelo_cv(X, y, model, cv, random_state)

    @property
    def resultados(self):
        """Dictionary mapping each metric name to its list of per-fold values."""
        return self.__resultados

    @staticmethod
    def __take_rows(data, positions):
        """Select rows by integer position from pandas objects or array-likes."""
        if hasattr(data, 'iloc'):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def __avaliamodelo_cv(self, X, y, model, cv, random_state=None):
        """Fit and score ``model`` on every fold, appending to the results."""
        # Shuffled folds, so repeated runs can produce different partitions
        # unless random_state is fixed.
        kf = KFold(shuffle=True, n_splits=cv, random_state=random_state)
        for train_index, test_index in kf.split(X, y):
            # KFold yields positions, not labels: every lookup must be
            # positional, otherwise a non-default index on y would score the
            # predictions against the wrong rows.
            model.fit(self.__take_rows(X, train_index),
                      self.__take_rows(y, train_index))
            pred = model.predict(self.__take_rows(X, test_index))
            y_true = self.__take_rows(y, test_index)

            mse = metrics.mean_squared_error(y_true, pred)
            self.__resultados['MAE'].append(metrics.mean_absolute_error(y_true, pred))
            self.__resultados['MSE'].append(mse)
            self.__resultados['RMSE'].append(np.sqrt(mse))
            self.__resultados['R2'].append(metrics.r2_score(y_true, pred))

    def mostraresultadomedio(self):
        """Print mean +- standard deviation over the folds for each metric."""
        for nome in ('MAE', 'MSE', 'RMSE', 'R2'):
            valores = self.__resultados[nome]
            print(f"{nome}: {np.mean(valores):.2f} +- {np.std(valores):.2f}")

In [None]:
# Re-split the data, holding out 33% of the rows for testing.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['house_price']),
    df['house_price'],
    test_size=0.33,
    random_state=42,
)

In [None]:
# Inspect the dimensions of the training target.
y_train.shape

In [None]:
# Inspect the dimensions of the training features.
X_train.shape

In [None]:
# Turn the targets into column vectors, the 2-D layout StandardScaler expects.
# np.asarray accepts both a pandas Series and an ndarray, so re-running this
# cell does not fail the way `.values` does once the names hold ndarrays.
y_train = np.asarray(y_train).reshape(-1, 1)
y_test = np.asarray(y_test).reshape(-1, 1)

In [None]:
# Standardize the features: statistics are learned on the training rows only
# and then applied to both splits.
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)

# Same procedure for the target.
scalery = StandardScaler().fit(y_train)
y_train = scalery.transform(y_train)
y_test = scalery.transform(y_test)

Regressão Linear

In [None]:
# 5-fold cross-validation of a linear regression on the full (unscaled) frame.
cross_validation_RL = runregression(
    df.drop(columns=['house_price']),
    df['house_price'],
    LinearRegression(),
    cv=5,
)
cross_validation_RL.mostraresultadomedio()

KNN

In [None]:
# Hyper-parameters to be tried for the KNN regressor.
param_grid_knn = {
    'n_neighbors': range(1, 40, 2),
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3],
}
# Build the grid-search object (5-fold CV for every combination).
gridknn = GridSearchCV(
    KNeighborsRegressor(),
    param_grid_knn,
    cv=5,
    verbose=1,
)
# Run the search on the training split.
gridknn.fit(X_train, y_train)

In [None]:
# Show the parameter combination selected by the grid search.
gridknn.best_params_

Testando agora com os melhores parâmetros avaliados

In [None]:
# Score the tuned KNN on the held-out test split.
# `pred` has to be computed in this cell: no earlier cell defines it at
# notebook scope, so the metrics below raised a NameError on a fresh kernel.
# GridSearchCV.predict delegates to the refitted best estimator.
pred = gridknn.predict(X_test)

mse = metrics.mean_squared_error(y_test, pred)
print('MAE:', metrics.mean_absolute_error(y_test, pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('R2:', metrics.r2_score(y_test, pred))

In [None]:
# Take the best model found by the grid search and cross-validate it
# with 10 folds on the full (unscaled) frame.
knn = gridknn.best_estimator_
cross_validation_knn = runregression(
    df.drop(columns=['house_price']),
    df['house_price'],
    knn,
    cv=10,
)
cross_validation_knn.mostraresultadomedio()

SVM

In [None]:
# Hyper-parameters to be tried for the support-vector regressor.
param_grid_svm = {
    'C': [1, 10, 100],
    'gamma': ['auto', 'scale'],
    'kernel': ['rbf', 'sigmoid']
}
# cv=5 is GridSearchCV's default, spelled out to match the KNN search.
gridsvm = GridSearchCV(SVR(), param_grid_svm, cv=5, verbose=3)
# SVR expects a 1-D target; flattening the (n, 1) column vector avoids a
# DataConversionWarning on every single fit of the search.
gridsvm.fit(X_train, np.asarray(y_train).ravel())