In [1]:
import plotly.express as px # gráficos dinâmicos
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Linear Regression
from sklearn.linear_model import LinearRegression

# Mostrar Residuos de Regressao
from yellowbrick.regressor import ResidualsPlot

# Pré-processamento
from sklearn.model_selection import train_test_split

# Metricas 
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Graphic objects do Plotly
import plotly.graph_objects as go

In [2]:
# Load the house-price data and discard the 'date' column, which is not used below.
# Chaining .drop() instead of inplace=True builds the frame in a single expression,
# so the cell never mutates an already-bound DataFrame.
db_casas = pd.read_csv('house_prices.csv').drop(columns='date')

In [3]:
# Preview the first two rows to check the columns left after dropping 'date'.
db_casas.head(2)

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639


Na regressão múltipla usaremos como atributos previsores as colunas de *bedrooms* até *long* (longitude). Desprezaremos *sqft_living15* e *sqft_lot15*.

In [4]:
# Positional slice 1:18 (end exclusive): per the output below, 'price' through 'long'.
db_casas.iloc[:, 1:18].head(4)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393


In [5]:
# Same column slice over every row, to see the target plus candidate predictors together.
db_casas.iloc[:,1:18]

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,221900.0,3,1.00,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.7210,-122.319
2,180000.0,2,1.00,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.00,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.00,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,360000.0,3,2.50,1530,1131,3.0,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346
21609,400000.0,4,2.50,2310,5813,2.0,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362
21610,402101.0,2,0.75,1020,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299
21611,400000.0,3,2.50,1600,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069


In [6]:
# Predictors: every column from 'bedrooms' through 'long' (label slices include both ends).
# Selecting by name instead of by position keeps the split correct even if the CSV
# gains, loses, or reorders columns before 'bedrooms'.
X_casas = db_casas.loc[:, 'bedrooms':'long'].values
# Target: the sale price.
y_casas = db_casas['price'].values

In [7]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_casas,
    y_casas,
    test_size=0.3,
    random_state=0,
)

In [8]:
# Sanity check: (rows, predictors) for the train and test partitions.
X_train.shape, X_test.shape

((15129, 16), (6484, 16))

In [9]:
# Multiple linear regression: create the estimator, then fit it on the training split only.
mult_lr_casas = LinearRegression()
mult_lr_casas.fit(X_train, y_train)

In [10]:
# Intercept term b0 of the fitted model.
mult_lr_casas.intercept_

5736222.703406082

Agora teremos uma lista de coeficientes $b_i$, um para cada atributo previsor.

In [11]:
# Fitted coefficients b_i, one per predictor column, in the same order as the columns of X_casas.
mult_lr_casas.coef_

array([-3.08423090e+04,  3.66540816e+04,  1.12179158e+02,  8.00604121e-03,
        9.60355724e+03,  5.85441638e+05,  5.60621840e+04,  2.54795004e+04,
        1.01092121e+05,  6.96565321e+01,  4.25226267e+01, -2.60724691e+03,
        1.40070753e+01, -5.53557431e+02,  6.11778251e+05, -1.95564688e+05])

In [12]:
# Coefficient of determination (R^2, what `score` returns for scikit-learn regressors) on the training split.
mult_lr_casas.score(X_train, y_train)

0.702988808595501

In [13]:

# R^2 on the held-out test split; compare with the training score to spot overfitting.
mult_lr_casas.score(X_test, y_test)

0.6885414149062226

O $R^2$ de teste (≈ 0,689) fica próximo do de treino (≈ 0,703) e representa uma melhoria considerável em relação à regressão linear simples.

In [14]:
# Predicted prices for the held-out rows; the bare name on the last line displays the array.
previsoes = mult_lr_casas.predict(X_test)
previsoes

array([ 383751.77768686, 1514216.17517464,  546921.96179099, ...,
        314968.57623292,  219405.55573554,  148128.8346704 ])

In [15]:
# Actual prices of the same held-out rows, for visual comparison with the predictions.
y_test

array([ 297000., 1578000.,  562100., ...,  380000.,  268000.,  206000.])

In [16]:
# Mean absolute error between the actual test prices and the predictions (same unit as price).
mean_absolute_error(y_test, previsoes)

123888.44377484522

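O erro absoluto médio (MAE) indica que, em média, o preço previsto pelo modelo difere do preço real em cerca de $123888.44$. Ou seja, uma previsão $\hat{y}$ erra tipicamente por $\pm 123888.44$; trata-se de um erro médio, não de um limite garantido para cada previsão.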