## 1) Importing the Housing Data and looking at the size of the dataset.
The dataset has 5768 rows and 14 columns.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the master housing dataset from a CSV file in the current working directory.
dataset = pd.read_csv('Master_File_Housing_Data.csv')

# Show the (rows, columns) size of the loaded frame as the cell output.
dataset.shape

(5768, 14)

## 2) Visualizing the dataset columns

In [2]:
# Preview the first five rows to see the column names and typical values.
dataset.head()

Unnamed: 0,APN,FORMATTED_ADDRESS,COMMUNITY,ABOVEGROUNDAREA,BELOWGROUNDAREA,PARCEL_AREA_SQFT,YEARBUILT,BATHROOMS,BEDROOMS,NUM_STORIES,FIREPLACES,CONSTRUCTIONTYPE,Assessed Value,Sale Price
0,302924440099,2910 California St,NORTHEAST,,,10640,,,,,,,320000,352500
1,402824210259,3041 Holmes Ave #401,CALHOUN-ISLE,,,11425,,,,,,,460000,1359150
2,402824240090,3248 Holmes Ave,CALHOUN-ISLE,,,5450,,,,,,,520500,555350
3,602823110060,3117 42ND Ave S,LONGFELLOW,,,6750,,,,,,,350000,650000
4,802824430075,4433 Washburn Ave S,SOUTHWEST,,,5670,,,,,,,900000,1245000


## 3) Data Transformation (part 1)

Start by removing all rows with a blank value in the categorical field 'CONSTRUCTIONTYPE' (the only categorical field that was missing any values).  We were only missing 47 rows in a dataset of >5k instances, so this is an acceptable reduction in dataset size.

Additionally, we have a bunch of blank values in the below ground square footage column (likely because of no basement), but we need to put in 0 so that we can run our models.

In [3]:
# Drop the rows that have no construction type. `.copy()` makes `df` an
# independent frame, so the fill below modifies `df` itself rather than a slice
# of `dataset`. The previous chained `fillna(..., inplace=True)` raised
# SettingWithCopyWarning and does not update the frame under pandas copy-on-write.
df = dataset.dropna(how='any', subset=['CONSTRUCTIONTYPE']).copy()

# A blank below-ground area is treated as "no basement", i.e. 0 square feet.
df['BELOWGROUNDAREA'] = df['BELOWGROUNDAREA'].fillna(0)

# Features: every column from position 2 up to (but not including) the last one.
# Target: column 13, the last column of the 14-column frame (Sale Price).
X = df.iloc[:, 2:-1].values
y = df.iloc[:, 13].values

print(X.shape)

print(X)

(5684, 11)
[['CALHOUN-ISLE' 343.0 0.0 ... 0.0 'Wood Frame' 91500]
 ['CALHOUN-ISLE' 343.0 0.0 ... 0.0 'Wood Frame' 93000]
 ['CENTRAL' 344.0 0.0 ... 0.0 'Wood Frame' 80000]
 ...
 ['SOUTHWEST' 6565.0 3171.0 ... 2.0 'Concrete' 1400000]
 ['CALHOUN-ISLE' 6857.0 2733.0 ... 5.0 'Wood Frame' 2319500]
 ['CALHOUN-ISLE' 6972.0 3399.0 ... 2.0 'Wood Frame' 2992500]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


## 4) Dealing with missing values (part 2)
 ### Impute the numeric columns.

Note: we leave out the below-ground square footage because we already replaced its blanks with 0, so we do not want to impute this column.

In [4]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22; `SimpleImputer` is its replacement for column-wise imputation.
from sklearn.impute import SimpleImputer

# Positions in X of the numeric columns to impute. Column 2 (below-ground area)
# is skipped on purpose because its blanks were already replaced with 0.
# NOTE(review): column 4 is skipped as well -- confirm it has no missing values.
numeric_cols = [1, 3, 5, 6, 7, 8, 10]

# Replace every NaN in those columns with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])
print(X[0])

['CALHOUN-ISLE' 343.0 0.0 11634.0 1963.0 1.0 0.0 1.0 0.0 'Wood Frame'
 91500.0]




## 5) Dealing with categorical variables
Using one-hot encoding on the attributes 'Community' & 'ConstructionType'

Display: test row to verify all categorical variables are removed

In [6]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Turn the two string-valued columns of X (positions 0 and 9) into integer codes.
# The same encoder object is refitted for each column in turn, so after the loop
# it only remembers the categories of column 9.
labelencoder = LabelEncoder()
for categorical_col in (0, 9):
    X[:, categorical_col] = labelencoder.fit_transform(X[:, categorical_col])

# Show the first row to confirm that no string values remain.
print(X[0])

[0 343.0 0.0 11634.0 1963.0 1.0 0.0 1.0 0.0 5 91500.0]


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22. ColumnTransformer is the replacement: it one-hot encodes
# columns 0 and 9 and passes the remaining columns through unchanged, with the
# encoded columns placed first -- the same layout the legacy encoder produced.
onehotencoder = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), [0, 9])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array (replaces `.toarray()`)
)
# X holds mixed types, so cast the stacked result to float for the regression.
data = np.asarray(onehotencoder.fit_transform(X), dtype=float)

# Keep every encoded column except 7 and 13.
# NOTE(review): presumably one dummy column from each one-hot group, dropped to
# avoid perfectly collinear dummies -- the positions are hard-coded, so confirm
# them if the set of categories ever changes.
NewData = data[:, [col for col in range(26) if col not in (7, 13)]]

# Start the regression design matrix with all 24 remaining columns.
X_sig = NewData[:, list(range(24))]
print(X_sig[0])

[1.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 1.0000e+00 3.4300e+02 0.0000e+00 1.1634e+04
 1.9630e+03 1.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 9.1500e+04]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
import statsmodels.api as sm

# Start of backward elimination: regress y on all 24 columns of NewData.
# NOTE(review): no constant column is added (the summary reports an uncentered
# R-squared), so the model is fitted without an intercept -- confirm that this
# is intended.
X_sig = NewData[:, list(range(24))]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,2845.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,10:05:21,Log-Likelihood:,-73863.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5660,BIC:,147900.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.845e+04,1.3e+04,1.415,0.157,-7110.452,4.4e+04
x2,6066.9591,1.25e+04,0.485,0.628,-1.85e+04,3.06e+04
x3,3.304e+04,1.35e+04,2.442,0.015,6517.294,5.96e+04
x4,1.345e+04,1.3e+04,1.037,0.300,-1.2e+04,3.89e+04
x5,1.053e+04,1.3e+04,0.810,0.418,-1.49e+04,3.6e+04
x6,1.152e+04,1.26e+04,0.913,0.361,-1.32e+04,3.63e+04
x7,1.701e+04,1.28e+04,1.329,0.184,-8087.090,4.21e+04
x8,1.154e+04,1.25e+04,0.920,0.357,-1.3e+04,3.61e+04
x9,2.803e+04,1.26e+04,2.217,0.027,3242.828,5.28e+04

0,1,2,3
Omnibus:,11366.823,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75308166.498
Skew:,15.764,Prob(JB):,0.0
Kurtosis:,566.015,Cond. No.,17500000.0


## 6) Remove a column from each of the OneHotEncoded variable groups with P>0.05 to find statistically significant variables

In [9]:
# E1: drop NewData column 1 (2nd entry of the starting selection) and refit.
# Previous selection (Start): all 24 NewData columns, 0 through 23.
X_sig = NewData[:, [col for col in range(24) if col != 1]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,2969.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,10:59:04,Log-Likelihood:,-73863.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5661,BIC:,147900.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.317e+04,7168.761,1.837,0.066,-881.922,2.72e+04
x2,2.794e+04,8496.202,3.289,0.001,1.13e+04,4.46e+04
x3,8047.2587,6626.453,1.214,0.225,-4943.128,2.1e+04
x4,5107.3226,6623.883,0.771,0.441,-7878.026,1.81e+04
x5,6064.9783,5684.178,1.067,0.286,-5078.188,1.72e+04
x6,1.158e+04,6204.650,1.867,0.062,-579.604,2.37e+04
x7,6181.5579,5923.126,1.044,0.297,-5430.038,1.78e+04
x8,2.263e+04,5986.041,3.781,0.000,1.09e+04,3.44e+04
x9,2.045e+04,8897.217,2.298,0.022,3004.809,3.79e+04

0,1,2,3
Omnibus:,11367.293,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75322755.714
Skew:,15.766,Prob(JB):,0.0
Kurtosis:,566.069,Cond. No.,17500000.0


In [10]:
# E2: drop NewData column 16 (16th entry of the E1 selection) and refit.
# Columns removed so far, in elimination order: 1, 16.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,3104.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:01:24,Log-Likelihood:,-73863.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5662,BIC:,147900.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.396e+04,6998.730,1.994,0.046,237.006,2.77e+04
x2,2.891e+04,8277.739,3.492,0.000,1.27e+04,4.51e+04
x3,8077.3675,6625.752,1.219,0.223,-4911.645,2.11e+04
x4,5036.7619,6621.985,0.761,0.447,-7944.866,1.8e+04
x5,6099.7073,5683.392,1.073,0.283,-5041.918,1.72e+04
x6,1.164e+04,6203.298,1.876,0.061,-522.064,2.38e+04
x7,6512.9457,5886.542,1.106,0.269,-5026.932,1.81e+04
x8,2.286e+04,5968.091,3.831,0.000,1.12e+04,3.46e+04
x9,2.118e+04,8776.928,2.414,0.016,3977.876,3.84e+04

0,1,2,3
Omnibus:,11367.493,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75311764.039
Skew:,15.766,Prob(JB):,0.0
Kurtosis:,566.028,Cond. No.,17500000.0


In [11]:
# E3: drop NewData column 13 (13th entry of the E2 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,3252.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:03:21,Log-Likelihood:,-73863.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5663,BIC:,147900.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.389e+04,6997.809,1.986,0.047,175.844,2.76e+04
x2,2.929e+04,8258.794,3.547,0.000,1.31e+04,4.55e+04
x3,8143.7276,6624.745,1.229,0.219,-4843.310,2.11e+04
x4,5085.8938,6621.294,0.768,0.442,-7894.379,1.81e+04
x5,6080.2674,5683.059,1.070,0.285,-5060.704,1.72e+04
x6,1.165e+04,6202.994,1.878,0.060,-511.716,2.38e+04
x7,6600.3948,5884.902,1.122,0.262,-4936.266,1.81e+04
x8,2.283e+04,5967.630,3.826,0.000,1.11e+04,3.45e+04
x9,2.058e+04,8732.124,2.356,0.018,3458.056,3.77e+04

0,1,2,3
Omnibus:,11367.833,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75332856.42
Skew:,15.767,Prob(JB):,0.0
Kurtosis:,566.107,Cond. No.,7990000.0


In [12]:
# E4: drop NewData column 18 (16th entry of the E3 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,3415.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:05:13,Log-Likelihood:,-73864.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5664,BIC:,147900.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.328e+04,6909.343,1.922,0.055,-262.067,2.68e+04
x2,2.771e+04,7747.944,3.577,0.000,1.25e+04,4.29e+04
x3,7683.4287,6571.742,1.169,0.242,-5199.702,2.06e+04
x4,4731.1011,6589.674,0.718,0.473,-8187.183,1.76e+04
x5,5576.1868,5608.993,0.994,0.320,-5419.588,1.66e+04
x6,1.117e+04,6140.969,1.818,0.069,-872.077,2.32e+04
x7,6133.5089,5823.550,1.053,0.292,-5282.880,1.75e+04
x8,2.237e+04,5907.336,3.786,0.000,1.08e+04,3.39e+04
x9,1.927e+04,8407.082,2.293,0.022,2792.283,3.58e+04

0,1,2,3
Omnibus:,11369.701,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75379078.167
Skew:,15.773,Prob(JB):,0.0
Kurtosis:,566.28,Cond. No.,4550000.0


In [13]:
# E5: drop NewData column 11 (11th entry of the E4 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18, 11.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18, 11)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,3596.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:07:24,Log-Likelihood:,-73864.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5665,BIC:,147900.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.338e+04,6569.417,2.037,0.042,503.251,2.63e+04
x2,2.788e+04,6815.204,4.091,0.000,1.45e+04,4.12e+04
x3,7755.7419,6382.539,1.215,0.224,-4756.479,2.03e+04
x4,4780.6879,6501.326,0.735,0.462,-7964.400,1.75e+04
x5,5657.3757,5326.845,1.062,0.288,-4785.280,1.61e+04
x6,1.124e+04,5926.645,1.897,0.058,-377.615,2.29e+04
x7,6207.0593,5601.792,1.108,0.268,-4774.597,1.72e+04
x8,2.245e+04,5621.789,3.993,0.000,1.14e+04,3.35e+04
x9,1.939e+04,7997.918,2.425,0.015,3714.147,3.51e+04

0,1,2,3
Omnibus:,11369.49,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75370244.385
Skew:,15.773,Prob(JB):,0.0
Kurtosis:,566.246,Cond. No.,3630000.0


In [14]:
# E6: drop NewData column 14 (12th entry of the E5 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18, 11, 14.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18, 11, 14)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,3796.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:09:03,Log-Likelihood:,-73864.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5666,BIC:,147900.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.426e+04,6393.987,2.231,0.026,1728.804,2.68e+04
x2,2.778e+04,6812.441,4.077,0.000,1.44e+04,4.11e+04
x3,8744.7719,6154.385,1.421,0.155,-3320.178,2.08e+04
x4,5242.3687,6452.913,0.812,0.417,-7407.810,1.79e+04
x5,6753.3604,4986.516,1.354,0.176,-3022.120,1.65e+04
x6,1.211e+04,5739.195,2.109,0.035,854.536,2.34e+04
x7,7092.6845,5393.209,1.315,0.189,-3480.070,1.77e+04
x8,2.333e+04,5416.650,4.307,0.000,1.27e+04,3.39e+04
x9,1.95e+04,7995.451,2.439,0.015,3823.725,3.52e+04

0,1,2,3
Omnibus:,11367.222,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75293978.924
Skew:,15.765,Prob(JB):,0.0
Kurtosis:,565.961,Cond. No.,3460000.0


In [15]:
# E7: drop NewData column 4 (4th entry of the E6 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18, 11, 14, 4.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18, 11, 14, 4)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,4020.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:10:34,Log-Likelihood:,-73864.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5667,BIC:,147900.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.316e+04,6247.966,2.106,0.035,911.972,2.54e+04
x2,2.685e+04,6715.484,3.998,0.000,1.37e+04,4e+04
x3,7527.4650,5969.012,1.261,0.207,-4174.084,1.92e+04
x4,5625.7152,4789.293,1.175,0.240,-3763.131,1.5e+04
x5,1.085e+04,5525.596,1.963,0.050,13.571,2.17e+04
x6,5815.9043,5158.988,1.127,0.260,-4297.686,1.59e+04
x7,2.218e+04,5229.732,4.242,0.000,1.19e+04,3.24e+04
x8,1.837e+04,7873.948,2.333,0.020,2934.913,3.38e+04
x9,4.584e+04,1.3e+04,3.516,0.000,2.03e+04,7.14e+04

0,1,2,3
Omnibus:,11364.562,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75164889.459
Skew:,15.757,Prob(JB):,0.0
Kurtosis:,565.478,Cond. No.,3440000.0


In [16]:
# E8: drop NewData column 7 (6th entry of the E7 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18, 11, 14, 4, 7.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18, 11, 14, 4, 7)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,4270.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:11:52,Log-Likelihood:,-73865.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5668,BIC:,147900.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.143e+04,6056.958,1.887,0.059,-442.519,2.33e+04
x2,2.523e+04,6561.119,3.846,0.000,1.24e+04,3.81e+04
x3,5908.2715,5793.768,1.020,0.308,-5449.730,1.73e+04
x4,4168.3140,4611.621,0.904,0.366,-4872.227,1.32e+04
x5,9299.0780,5352.664,1.737,0.082,-1194.192,1.98e+04
x6,2.039e+04,4982.916,4.093,0.000,1.06e+04,3.02e+04
x7,1.665e+04,7725.417,2.156,0.031,1509.043,3.18e+04
x8,4.628e+04,1.3e+04,3.551,0.000,2.07e+04,7.18e+04
x9,1.052e+04,7503.092,1.402,0.161,-4190.380,2.52e+04

0,1,2,3
Omnibus:,11361.556,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75098419.014
Skew:,15.748,Prob(JB):,0.0
Kurtosis:,565.23,Cond. No.,3430000.0


In [17]:
# E9: drop NewData column 5 (4th entry of the E8 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18, 11, 14, 4, 7, 5.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,4555.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:14:16,Log-Likelihood:,-73865.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5669,BIC:,147900.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.007e+04,5867.629,1.717,0.086,-1429.144,2.16e+04
x2,2.423e+04,6466.674,3.747,0.000,1.16e+04,3.69e+04
x3,4753.6522,5651.100,0.841,0.400,-6324.665,1.58e+04
x4,8191.9998,5210.563,1.572,0.116,-2022.696,1.84e+04
x5,1.874e+04,4633.733,4.044,0.000,9653.661,2.78e+04
x6,1.548e+04,7614.958,2.033,0.042,549.647,3.04e+04
x7,4.641e+04,1.3e+04,3.561,0.000,2.09e+04,7.2e+04
x8,1.061e+04,7502.312,1.414,0.157,-4098.998,2.53e+04
x9,34.8972,6.268,5.568,0.000,22.610,47.184

0,1,2,3
Omnibus:,11360.07,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75157962.486
Skew:,15.742,Prob(JB):,0.0
Kurtosis:,565.454,Cond. No.,3420000.0


In [18]:
# E10: drop NewData column 3 (3rd entry of the E9 selection) and refit.
# Columns removed so far, in elimination order: 1, 16, 13, 18, 11, 14, 4, 7, 5, 3.
X_sig = NewData[:, [col for col in range(24) if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,4881.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:16:47,Log-Likelihood:,-73865.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5670,BIC:,147900.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,9325.0462,5799.599,1.608,0.108,-2044.386,2.07e+04
x2,2.36e+04,6422.955,3.674,0.000,1.1e+04,3.62e+04
x3,7566.6606,5157.126,1.467,0.142,-2543.279,1.77e+04
x4,1.792e+04,4529.570,3.955,0.000,9036.495,2.68e+04
x5,1.476e+04,7566.489,1.950,0.051,-75.462,2.96e+04
x6,4.643e+04,1.3e+04,3.563,0.000,2.09e+04,7.2e+04
x7,1.062e+04,7502.101,1.416,0.157,-4084.936,2.53e+04
x8,34.3468,6.233,5.510,0.000,22.127,46.566
x9,-0.1282,0.044,-2.943,0.003,-0.214,-0.043

0,1,2,3
Omnibus:,11358.899,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75144695.561
Skew:,15.738,Prob(JB):,0.0
Kurtosis:,565.404,Cond. No.,3420000.0


In [19]:
# E11: drop NewData column 20 (11th entry of the E10 selection) and refit.
# Columns removed so far, in elimination order:
# 1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20.
X_sig = NewData[:, [col for col in range(24)
                    if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,5256.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:18:17,Log-Likelihood:,-73866.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5671,BIC:,147800.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.011e+04,5754.487,1.757,0.079,-1171.087,2.14e+04
x2,2.453e+04,6365.439,3.854,0.000,1.21e+04,3.7e+04
x3,7041.0726,5134.450,1.371,0.170,-3024.413,1.71e+04
x4,1.754e+04,4516.596,3.884,0.000,8688.843,2.64e+04
x5,1.506e+04,7561.605,1.991,0.047,232.928,2.99e+04
x6,4.705e+04,1.3e+04,3.613,0.000,2.15e+04,7.26e+04
x7,1.076e+04,7501.187,1.434,0.152,-3947.964,2.55e+04
x8,31.4215,5.621,5.590,0.000,20.401,42.442
x9,-0.1239,0.043,-2.857,0.004,-0.209,-0.039

0,1,2,3
Omnibus:,11372.388,Durbin-Watson:,2.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75588712.574
Skew:,15.781,Prob(JB):,0.0
Kurtosis:,567.064,Cond. No.,3420000.0


In [20]:
# E12: drop NewData column 6 (3rd entry of the E11 selection) and refit.
# Columns removed so far, in elimination order:
# 1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6.
X_sig = NewData[:, [col for col in range(24)
                    if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,5693.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:19:34,Log-Likelihood:,-73867.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5672,BIC:,147800.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,9038.6711,5701.663,1.585,0.113,-2138.767,2.02e+04
x2,2.353e+04,6324.214,3.721,0.000,1.11e+04,3.59e+04
x3,1.65e+04,4452.520,3.706,0.000,7772.080,2.52e+04
x4,1.405e+04,7526.619,1.867,0.062,-703.094,2.88e+04
x5,4.738e+04,1.3e+04,3.640,0.000,2.19e+04,7.29e+04
x6,1.078e+04,7501.753,1.437,0.151,-3927.319,2.55e+04
x7,31.4335,5.622,5.591,0.000,20.413,42.454
x8,-0.1224,0.043,-2.822,0.005,-0.207,-0.037
x9,1.262e+04,2416.302,5.223,0.000,7883.074,1.74e+04

0,1,2,3
Omnibus:,11383.037,Durbin-Watson:,2.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,76188774.034
Skew:,15.813,Prob(JB):,0.0
Kurtosis:,569.302,Cond. No.,3420000.0


In [22]:
# E13: drop NewData column 12 (6th entry of the E12 selection) and refit.
# Columns removed so far, in elimination order:
# 1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12.
X_sig = NewData[:, [col for col in range(24)
                    if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,6209.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:22:25,Log-Likelihood:,-73868.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5673,BIC:,147800.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,9241.4331,5700.451,1.621,0.105,-1933.630,2.04e+04
x2,2.76e+04,5657.500,4.878,0.000,1.65e+04,3.87e+04
x3,1.647e+04,4452.890,3.699,0.000,7741.462,2.52e+04
x4,1.567e+04,7442.634,2.105,0.035,1079.274,3.03e+04
x5,4.41e+04,1.28e+04,3.440,0.001,1.9e+04,6.92e+04
x6,31.7851,5.617,5.659,0.000,20.774,42.797
x7,-0.1149,0.043,-2.669,0.008,-0.199,-0.031
x8,1.24e+04,2411.663,5.142,0.000,7671.955,1.71e+04
x9,-7872.7723,3509.648,-2.243,0.025,-1.48e+04,-992.521

0,1,2,3
Omnibus:,11376.941,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75971399.139
Skew:,15.794,Prob(JB):,0.0
Kurtosis:,568.493,Cond. No.,3310000.0


In [23]:
# E14: drop NewData column 0 (1st entry of the E13 selection) and refit.
# Columns removed so far, in elimination order:
# 1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12, 0.
X_sig = NewData[:, [col for col in range(24)
                    if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12, 0)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,6828.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:24:35,Log-Likelihood:,-73869.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5674,BIC:,147800.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,2.541e+04,5494.658,4.624,0.000,1.46e+04,3.62e+04
x2,1.46e+04,4301.032,3.394,0.001,6166.259,2.3e+04
x3,1.395e+04,7367.464,1.893,0.058,-495.821,2.84e+04
x4,4.498e+04,1.28e+04,3.512,0.000,1.99e+04,7.01e+04
x5,31.2600,5.608,5.574,0.000,20.265,42.255
x6,-0.1045,0.043,-2.454,0.014,-0.188,-0.021
x7,1.234e+04,2411.722,5.116,0.000,7611.516,1.71e+04
x8,-7674.9854,3508.030,-2.188,0.029,-1.46e+04,-797.906
x9,-7139.7810,2724.740,-2.620,0.009,-1.25e+04,-1798.250

0,1,2,3
Omnibus:,11369.099,Durbin-Watson:,2.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75702568.934
Skew:,15.769,Prob(JB):,0.0
Kurtosis:,567.491,Cond. No.,3310000.0


In [24]:
# E15: drop NewData column 9 (3rd entry of the E14 selection) and refit.
# Columns removed so far, in elimination order:
# 1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12, 0, 9.
X_sig = NewData[:, [col for col in range(24)
                    if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12, 0, 9)]]
obj_OLS = sm.OLS(endog=y, exog=X_sig).fit()
obj_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.923
Model:,OLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,7583.0
Date:,"Wed, 06 May 2020",Prob (F-statistic):,0.0
Time:,11:25:52,Log-Likelihood:,-73871.0
No. Observations:,5684,AIC:,147800.0
Df Residuals:,5675,BIC:,147800.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,2.352e+04,5404.982,4.352,0.000,1.29e+04,3.41e+04
x2,1.364e+04,4272.437,3.194,0.001,5269.236,2.2e+04
x3,4.489e+04,1.28e+04,3.504,0.000,1.98e+04,7e+04
x4,30.5626,5.598,5.460,0.000,19.589,41.536
x5,-0.0885,0.042,-2.121,0.034,-0.170,-0.007
x6,1.232e+04,2412.254,5.108,0.000,7593.787,1.71e+04
x7,-7150.2822,3497.859,-2.044,0.041,-1.4e+04,-293.141
x8,-7406.8578,2721.704,-2.721,0.007,-1.27e+04,-2071.278
x9,0.8723,0.013,68.878,0.000,0.847,0.897

0,1,2,3
Omnibus:,11366.505,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,75494704.986
Skew:,15.762,Prob(JB):,0.0
Kurtosis:,566.714,Cond. No.,3310000.0


In [25]:
# End of elimination: the final design matrix keeps the nine NewData columns
# that were never removed, i.e. everything except the fifteen columns
# eliminated in steps E1-E15 (listed here in elimination order).
X_sig = NewData[:, [col for col in range(24)
                    if col not in (1, 16, 13, 18, 11, 14, 4, 7, 5, 3, 20, 6, 12, 0, 9)]]

### Statistically significant variables
2: community-Central

8: community-Southwest

10: constructiontype-brick

15: abovegroundarea

17: parcel_area_sqft

19: bathrooms

21: num_stories

22: fireplaces

23: assessed_price