In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from scipy import stats
import matplotlib.pyplot as plt
import pickle
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Have pandas render every float through the built-in str() when displaying frames.
pd.options.display.float_format = str

In [3]:
# Load the cleaned housing data; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an `Unnamed: 0` column -- presumably an index
# that was written out with the CSV. Consider `index_col=0` if no later cell relies on it.
DATA_PATH = 'data/kc_house_data_cleaned.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,2,7,1180,0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,2,7,2170,400,1951,1991.0,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,2,6,770,0,1933,0.0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,4,7,1050,910,1965,0.0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,2,8,1680,0,1987,0.0,98074,47.6168,-122.045,1800,7503


# Baseline Model

In [12]:
# Columns for the baseline model: the target (`price`) followed by five predictors.
col_selector = [
    'price',
    'sqft_living',
    'grade',
    'view',
    'bedrooms',
    'bathrooms',
]
df_base = df[col_selector]

In [13]:
# Separate the regression target from the predictor columns.
y = df_base['price']
X = df_base.drop(columns='price')

In [14]:
# Hold out 30% of the rows for testing. The seed is fixed so that the split -- and
# therefore every coefficient and metric computed from it -- is identical on each
# Restart & Run All; without it the printed regression summary cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [16]:
# Learn the scaling statistics from the training rows only, then apply that same
# fitted scaler to both splits so the test rows never influence the fit.
ss = StandardScaler().fit(X_train)
X_trans_train = ss.transform(X_train)
X_trans_test = ss.transform(X_test)

In [17]:
# StandardScaler returns a bare ndarray, so fitting on it directly makes the summary
# (and `results.params`) label the predictors x1..x5. Re-wrap the scaled values with
# the original column names so each coefficient is identifiable. The training index
# is reused as well, because statsmodels checks that the endog and exog indices match
# when both are pandas objects.
X_train_design = sm.add_constant(
    pd.DataFrame(X_trans_train, columns=X_train.columns, index=X_train.index)
)
model = sm.OLS(y_train, X_train_design)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.576
Model:                            OLS   Adj. R-squared:                  0.576
Method:                 Least Squares   F-statistic:                     4106.
Date:                Tue, 13 Sep 2022   Prob (F-statistic):               0.00
Time:                        15:21:22   Log-Likelihood:            -2.0887e+05
No. Observations:               15117   AIC:                         4.178e+05
Df Residuals:                   15111   BIC:                         4.178e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.397e+05   1971.010    273.810      0.0

In [None]:
# One scatter panel per candidate predictor against price, on a shared y-axis so the
# three panels can be compared directly.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(15, 5), sharey=True)
ax1.set_ylabel("price")

for feature, axis in zip(("bathrooms", "sqft_living", "grade"), (ax1, ax2, ax3)):
    df_base.plot.scatter(x=feature, y="price", ax=axis)

# POSSIBLE FEATURES

* grade: the relationship with price follows a curved (non-linear) line
* old house: does not affect price much, but the relationship has a bowl shape
* grade + year built