# King County Housing
#### House Price Estimate

**Authors:** Hatice Kastan, Czarina Luna, Ross McKim, Weston Shuken

##### January 2022

***

![image](Images/daria-nepriakhina-LZkbXfzJK4M-unsplash.jpg)

## Overview

    Overview of our project.

## Business Problem

    Stakeholder is a real estate company.
    Business Problem is predicting price and building a house price calculator.

## Data Understanding
    Describe the data being used for this project.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pandas.api.types import is_numeric_dtype

In [2]:
data = pd.read_csv('Data/kc_house_data.csv')

In [3]:
data.dropna(inplace=True)

##### Baseline Model

In [4]:
numerical = ['bedrooms', 'bathrooms', 'sqft_living',
             'sqft_lot', 'floors', 'sqft_above',
             'yr_built', 'yr_renovated', 'zipcode',
             'lat', 'long', 'sqft_living15', 'sqft_lot15']

y = data.price
X = data[numerical]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

baseline_model = sm.OLS(y_test, sm.add_constant(X_test)).fit()
baseline_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.627
Model:,OLS,Adj. R-squared:,0.626
Method:,Least Squares,F-statistic:,406.7
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,12:48:26,Log-Likelihood:,-43158.0
No. Observations:,3153,AIC:,86340.0
Df Residuals:,3139,BIC:,86430.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.42e+07,8e+06,-3.024,0.003,-3.99e+07,-8.51e+06
bedrooms,-6.582e+04,5340.167,-12.326,0.000,-7.63e+04,-5.54e+04
bathrooms,6.702e+04,8862.638,7.562,0.000,4.96e+04,8.44e+04
sqft_living,208.6041,11.891,17.544,0.000,185.290,231.918
sqft_lot,0.1674,0.113,1.476,0.140,-0.055,0.390
floors,2.899e+04,9971.665,2.907,0.004,9439.634,4.85e+04
sqft_above,27.4506,12.038,2.280,0.023,3.847,51.054
yr_built,-2293.6297,184.111,-12.458,0.000,-2654.620,-1932.640
yr_renovated,13.7637,9.679,1.422,0.155,-5.215,32.742

0,1,2,3
Omnibus:,2200.751,Durbin-Watson:,2.031
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60857.961
Skew:,2.947,Prob(JB):,0.0
Kurtosis:,23.7,Cond. No.,212000000.0


In [5]:
baseline_rsquared = baseline_model.rsquared

## Data Cleaning
    Describe and justify the process for preparing the data for analysis.

In [6]:
# Data prep and cleaning
raw_data = pd.read_csv('Data/kc_house_data.csv')

# Change to datetime and add month column
raw_data['date'] = pd.to_datetime(raw_data['date'])
raw_data['month'] = pd.DatetimeIndex(raw_data['date']).month

# Change waterfront missing value to No
raw_data.loc[raw_data.waterfront.isnull(), 'waterfront'] = "NO"

# Change view missing value to None
raw_data.loc[raw_data.view.isnull(), 'view'] = "NONE"
view_dict = {'NONE':0, 'FAIR':1, 'AVERAGE':2, 'GOOD':3, 'EXCELLENT':4}
raw_data['view'].replace(view_dict, inplace=True)

# Change condition to numerical value
cond_dict = {'Poor':0, 'Fair':1, 'Average':2, 'Good':3, 'Very Good':4}
raw_data['condition'].replace(cond_dict, inplace=True)

# Change grade to numerical value
raw_data['grade'] = raw_data['grade'].map(lambda x: int(x.split(' ')[0]))

# Add has_basement column
raw_data['sqft_basement'] = raw_data['sqft_basement'].replace('?', '0').astype('float')
raw_data['basement'] = raw_data['sqft_basement'].apply(lambda x: 0 if x == 0 else 1)

# Change some yr_renovated missing value to 0 and add renovated column
raw_data.loc[raw_data.yr_renovated.isnull(), 'yr_renovated'] = 0
raw_data['renovated'] = raw_data['yr_renovated'].apply(lambda x: 0 if x == 0 else 1)

# Add house_age column
raw_data['age'] = raw_data['date'].dt.year - raw_data['yr_built']

In [7]:
clean_numerical = ['bedrooms', 'bathrooms', 'sqft_living',
             'sqft_lot', 'floors', 'condition', 'sqft_above', 'sqft_basement',
             'age', 'yr_renovated', 'zipcode', 'view',
             'lat', 'long', 'sqft_living15', 'sqft_lot15', 'month']

In [8]:
# Run linear regression to cleaned data

y = raw_data.price
X = raw_data[clean_numerical]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clean_model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
clean_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.659
Model:,OLS,Adj. R-squared:,0.658
Method:,Least Squares,F-statistic:,1959.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,12:48:27,Log-Likelihood:,-236690.0
No. Observations:,17277,AIC:,473400.0
Df Residuals:,17259,BIC:,473600.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.636e+06,3.45e+06,-1.926,0.054,-1.34e+07,1.18e+05
bedrooms,-5.427e+04,2341.499,-23.178,0.000,-5.89e+04,-4.97e+04
bathrooms,5.364e+04,3922.372,13.676,0.000,4.6e+04,6.13e+04
sqft_living,142.6310,22.996,6.202,0.000,97.556,187.706
sqft_lot,0.1167,0.058,1.997,0.046,0.002,0.231
floors,2.525e+04,4295.499,5.878,0.000,1.68e+04,3.37e+04
condition,3.222e+04,2809.689,11.469,0.000,2.67e+04,3.77e+04
sqft_above,107.8018,22.951,4.697,0.000,62.816,152.788
sqft_basement,49.7231,22.779,2.183,0.029,5.073,94.373

0,1,2,3
Omnibus:,13551.563,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,878990.5
Skew:,3.262,Prob(JB):,0.0
Kurtosis:,37.329,Cond. No.,211000000.0


In [9]:
def corr_check(df, threshold):
    '''
    Enter dataframe and threshold for correlation
    Returns table of the highly correlated pairs
    '''
    corr_df = df.corr().abs().stack().reset_index().sort_values(0, ascending=False)
    corr_df['pairs'] = list(zip(corr_df.level_0, corr_df.level_1))
    corr_df.set_index(['pairs'], inplace = True)
    corr_df.drop(columns=['level_1', 'level_0'], inplace = True)
    corr_df.columns = ['cc']
    corr_df = corr_df.drop_duplicates()
    corr_df = corr_df[(corr_df['cc'] > threshold) & (corr_df['cc'] < 1)]
    return corr_df

corr_check(raw_data, .7)

Unnamed: 0_level_0,cc
pairs,Unnamed: 1_level_1
"(renovated, yr_renovated)",0.999968
"(yr_built, age)",0.999873
"(sqft_living, sqft_above)",0.876448
"(basement, sqft_basement)",0.820893
"(grade, sqft_living)",0.762779
"(sqft_living, sqft_living15)",0.756402
"(sqft_above, grade)",0.756073
"(sqft_living, bathrooms)",0.755758
"(sqft_living15, sqft_above)",0.731767
"(sqft_lot, sqft_lot15)",0.718204


In [10]:
# Drop columns
raw_data.drop(columns=['id', 'date', 'yr_renovated', 'sqft_above', 'sqft_basement',
                      'yr_built', 'yr_renovated'], inplace=True)

In [11]:
raw_data.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age
0,221900.0,3,1.0,1180,5650,1.0,NO,0,2,7,98178,47.5112,-122.257,1340,5650,10,0,0,59
1,538000.0,3,2.25,2570,7242,2.0,NO,0,2,7,98125,47.721,-122.319,1690,7639,12,1,1,63


## Data Exploration
    Generate insights and visualizations about price and its relationships with variables.

In [12]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21597 non-null  float64
 1   bedrooms       21597 non-null  int64  
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  int64  
 4   sqft_lot       21597 non-null  int64  
 5   floors         21597 non-null  float64
 6   waterfront     21597 non-null  object 
 7   view           21597 non-null  int64  
 8   condition      21597 non-null  int64  
 9   grade          21597 non-null  int64  
 10  zipcode        21597 non-null  int64  
 11  lat            21597 non-null  float64
 12  long           21597 non-null  float64
 13  sqft_living15  21597 non-null  int64  
 14  sqft_lot15     21597 non-null  int64  
 15  month          21597 non-null  int64  
 16  basement       21597 non-null  int64  
 17  renovated      21597 non-null  int64  
 18  age   

In [13]:
raw_data.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,zipcode,lat,long,sqft_living15,sqft_lot15,month,basement,renovated,age
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,540296.6,3.3732,2.115826,2080.32185,15099.41,1.494096,0.233181,2.409825,7.657915,98077.951845,47.560093,-122.213982,1986.620318,12758.283512,6.573969,0.3851,0.034449,43.323286
std,367368.1,0.926299,0.768984,918.106125,41412.64,0.539683,0.764673,0.650546,1.1732,53.513072,0.138552,0.140724,685.230472,27274.44195,3.115061,0.48663,0.182384,29.377285
min,78000.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,3.0,98001.0,47.1559,-122.519,399.0,651.0,1.0,0.0,0.0,-1.0
25%,322000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,2.0,7.0,98033.0,47.4711,-122.328,1490.0,5100.0,4.0,0.0,0.0,18.0
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,2.0,7.0,98065.0,47.5718,-122.231,1840.0,7620.0,6.0,0.0,0.0,40.0
75%,645000.0,4.0,2.5,2550.0,10685.0,2.0,0.0,3.0,8.0,98118.0,47.678,-122.125,2360.0,10083.0,9.0,1.0,0.0,63.0
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,4.0,4.0,13.0,98199.0,47.7776,-121.315,6210.0,871200.0,12.0,1.0,1.0,115.0


### Baseline Model
    Run simple linear regression on feature highest correlated with price.

In [14]:
# ols

##### Model Metrics Table
    Create table of metrics we care about, and update with every additional model after.

In [15]:
# metric_df

## Feature Engineering
    Create new variables to predict the price.

In [16]:
from haversine import haversine, Unit

seattle = (47.608013, -122.335167)
redmond = (47.673988, -122.121513)

def get_dist(center, lat, long):
    house = (lat, long)
    return haversine(seattle, house, unit=Unit.MILES)

# Add distance_from_seattle column
raw_data['distance'] = raw_data.apply(lambda x: get_dist(seattle, x.lat, x.long), axis=1)

# Add distance_from_redmond column
raw_data['distance_r'] = raw_data.apply(lambda x: get_dist(redmond, x.lat, x.long), axis=1)

In [17]:
raw_data.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'month', 'basement', 'renovated', 'age',
       'distance', 'distance_r'],
      dtype='object')

In [18]:
# Run linear regression to additional features
add_numerical = ['bedrooms', 'bathrooms', 'sqft_living',
             'sqft_lot', 'floors', 'condition',
             'age', 'zipcode', 'view',
             'lat', 'long', 'sqft_living15', 'sqft_lot15', 'month', 'distance', 'distance_r']

y = raw_data.price
X = raw_data[add_numerical]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clean_model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
clean_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.689
Model:,OLS,Adj. R-squared:,0.689
Method:,Least Squares,F-statistic:,2551.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,12:48:29,Log-Likelihood:,-235890.0
No. Observations:,17277,AIC:,471800.0
Df Residuals:,17261,BIC:,471900.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.698e+07,3.95e+06,24.567,0.000,8.92e+07,1.05e+08
bedrooms,-5.351e+04,2232.355,-23.971,0.000,-5.79e+04,-4.91e+04
bathrooms,3.928e+04,3678.852,10.677,0.000,3.21e+04,4.65e+04
sqft_living,232.3602,3.607,64.415,0.000,225.290,239.431
sqft_lot,0.2355,0.056,4.221,0.000,0.126,0.345
floors,2.983e+04,3645.943,8.180,0.000,2.27e+04,3.7e+04
condition,2.661e+04,2640.964,10.075,0.000,2.14e+04,3.18e+04
age,1386.4619,78.924,17.567,0.000,1231.763,1541.161
zipcode,-818.4563,37.855,-21.621,0.000,-892.656,-744.257

0,1,2,3
Omnibus:,14478.478,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1191667.292
Skew:,3.549,Prob(JB):,0.0
Kurtosis:,43.062,Cond. No.,7.27e+19


### Feature Scaling
    Perform log transformation and standardization.

In [19]:
# Log Transform
to_transform = ['sqft_living', 'distance', 'distance_r']
log_data = raw_data.copy()

for col in to_transform:
    log_data[col] = np.log(log_data[col])

In [20]:
# Run linear regression to log transformed
log_numerical = ['bedrooms', 'bathrooms', 'sqft_living',
             'sqft_lot', 'floors', 'condition',
             'age', 'zipcode', 'view',
             'lat', 'long', 'sqft_living15', 'sqft_lot15', 'month', 'distance', 'distance_r']

y = log_data.price
X = log_data[add_numerical]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

log_model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
log_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.646
Model:,OLS,Adj. R-squared:,0.646
Method:,Least Squares,F-statistic:,2100.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,12:48:29,Log-Likelihood:,-237010.0
No. Observations:,17277,AIC:,474000.0
Df Residuals:,17261,BIC:,474200.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,8.314e+07,3.95e+06,21.063,0.000,7.54e+07,9.09e+07
bedrooms,-3.268e+04,2480.891,-13.171,0.000,-3.75e+04,-2.78e+04
bathrooms,8.978e+04,3890.493,23.076,0.000,8.22e+04,9.74e+04
sqft_living,2.595e+05,8357.037,31.051,0.000,2.43e+05,2.76e+05
sqft_lot,0.3916,0.059,6.590,0.000,0.275,0.508
floors,4415.8315,3986.563,1.108,0.268,-3398.237,1.22e+04
condition,2.807e+04,2823.071,9.944,0.000,2.25e+04,3.36e+04
age,1173.0233,86.563,13.551,0.000,1003.350,1342.697
zipcode,-873.6092,40.564,-21.537,0.000,-953.119,-794.100

0,1,2,3
Omnibus:,17873.626,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3041860.14
Skew:,4.805,Prob(JB):,0.0
Kurtosis:,67.29,Cond. No.,1.2e+21


In [21]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()

X_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)

model = sm.OLS(y_train, sm.add_constant(X_scaled)).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.646
Model:,OLS,Adj. R-squared:,0.646
Method:,Least Squares,F-statistic:,2100.0
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,12:48:29,Log-Likelihood:,-237010.0
No. Observations:,17277,AIC:,474000.0
Df Residuals:,17261,BIC:,474200.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.415e+05,1670.848,324.098,0.000,5.38e+05,5.45e+05
bedrooms,-2.955e+04,2243.662,-13.171,0.000,-3.39e+04,-2.52e+04
bathrooms,6.892e+04,2986.569,23.076,0.000,6.31e+04,7.48e+04
sqft_living,1.099e+05,3539.582,31.051,0.000,1.03e+05,1.17e+05
sqft_lot,1.574e+04,2388.644,6.590,0.000,1.11e+04,2.04e+04
floors,2388.6500,2156.446,1.108,0.268,-1838.204,6615.504
condition,1.827e+04,1837.258,9.944,0.000,1.47e+04,2.19e+04
age,3.444e+04,2541.375,13.551,0.000,2.95e+04,3.94e+04
zipcode,-4.676e+04,2170.993,-21.537,0.000,-5.1e+04,-4.25e+04

0,1,2,3
Omnibus:,17873.626,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3041860.14
Skew:,4.805,Prob(JB):,0.0
Kurtosis:,67.29,Cond. No.,2.81e+16


### Feature Selection
    Feature ranking with recursive feature elimination.

In [22]:
# RFE

## Data Modeling
    Describe and justify the process for modeling the data.
    Run multiple linear regression on top ranking features.

In [23]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [24]:
lr = LinearRegression()

In [25]:
# Our model needs to have only numeric variables.
# Using this function, we can drop all columns without numeric varibales.
# We will input this function within our next function.
def only_numeric(data):
    '''returns a dataframe with only numeric values'''
    for column in data.columns:
        if is_numeric_dtype(data[column]) == False:
            data = data.drop(column, axis=1)
        else:
            continue
    return data

In [26]:
# This returns our y and X for any data frame. 
# Uses all the numeric columns, need to pass a string as a target variable.
def get_y_X(data, target):
    data = only_numeric(data) # Making data only columns with numeric values.
    y = data[target] 
    X = data.drop(target, axis=1)
    return y, X

In [27]:
# This function will return a train / test split variables for an X and y. 
def my_train_test(ys, Xs):
    X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=.2)

    return X_train, X_test, y_train, y_test

In [28]:
# Function to compare R2 values and RMSE values of the train and testing models
def train_test_compare(X_tr, X_te, y_tr, y_te):
    model = lr.fit(X_tr, y_tr) # fit the model
    
    #R2 Scores
    train_score = lr.score(X_tr, y_tr)
    test_score = lr.score(X_te, y_te)
    
    #RMSE
    y_hat_train = lr.predict(X_tr)
    y_hat_test = lr.predict(X_te)
    
    train_rmse = np.sqrt(mean_squared_error(y_tr, y_hat_train))
    test_rmse = np.sqrt(mean_squared_error(y_te, y_hat_test))
    
    #intercept / coef
    inter = model.intercept_
    stats = sm.OLS(y_tr, sm.add_constant(X_tr)).fit()
    summary = stats.summary()
    
    return print(f' training data R2: {train_score}\n testing data R2: {test_score} \
                    \n training data rmse: {train_rmse}\n testing data rmse: {test_rmse} \
                    \n {summary}') 

In [29]:
# Get dummies
zipcode_dummies = pd.get_dummies(raw_data['zipcode'], drop_first=True)
waterfront_dummies = pd.get_dummies(raw_data['waterfront'], drop_first=True)
view_dummies = pd.get_dummies(raw_data['view'], drop_first=True)
month_dummies = pd.get_dummies(raw_data['month'], drop_first=True)

df_clean_dumm = pd.concat([raw_data, waterfront_dummies, 
                           view_dummies, month_dummies, zipcode_dummies], axis=1)
df_clean_dumm.drop(columns=['zipcode', 'waterfront', 'view', 'month'], inplace=True)

In [30]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bedrooms       21597 non-null  int64  
 1   bathrooms      21597 non-null  float64
 2   sqft_living    21597 non-null  float64
 3   sqft_lot       21597 non-null  int64  
 4   floors         21597 non-null  float64
 5   condition      21597 non-null  int64  
 6   age            21597 non-null  int64  
 7   zipcode        21597 non-null  int64  
 8   view           21597 non-null  int64  
 9   lat            21597 non-null  float64
 10  long           21597 non-null  float64
 11  sqft_living15  21597 non-null  int64  
 12  sqft_lot15     21597 non-null  int64  
 13  month          21597 non-null  int64  
 14  distance       21597 non-null  float64
 15  distance_r     21597 non-null  float64
dtypes: float64(7), int64(9)
memory usage: 2.6 MB


In [31]:
y = df_clean_dumm.price
X = df_clean_dumm.drop('price', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.815
Model:,OLS,Adj. R-squared:,0.814
Method:,Least Squares,F-statistic:,756.1
Date:,"Tue, 04 Jan 2022",Prob (F-statistic):,0.0
Time:,12:48:30,Log-Likelihood:,-231410.0
No. Observations:,17277,AIC:,463000.0
Df Residuals:,17176,BIC:,463800.0
Df Model:,100,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.298e+07,9.72e+06,6.478,0.000,4.39e+07,8.2e+07
bedrooms,-2.933e+04,1773.856,-16.536,0.000,-3.28e+04,-2.59e+04
bathrooms,2.589e+04,2953.916,8.764,0.000,2.01e+04,3.17e+04
sqft_living,190.9401,3.118,61.234,0.000,184.828,197.052
sqft_lot,0.2343,0.043,5.398,0.000,0.149,0.319
floors,-3.891e+04,3373.847,-11.533,0.000,-4.55e+04,-3.23e+04
condition,2.58e+04,2126.329,12.132,0.000,2.16e+04,3e+04
grade,6.079e+04,1998.373,30.421,0.000,5.69e+04,6.47e+04
lat,-4.798e+04,7.38e+04,-0.650,0.516,-1.93e+05,9.67e+04

0,1,2,3
Omnibus:,16641.766,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3436224.996
Skew:,4.151,Prob(JB):,0.0
Kurtosis:,71.589,Cond. No.,4.77e+19


#### Check Assumptions of Linear Regression
    Linearity, independence, normality, homoescadicity

In [32]:
# code