In [27]:
import pandas as pd
import numpy as np

# For plotting
import matplotlib.pyplot as plt

# Stats tools
from scipy.stats import kurtosis, skew # For skew and kurtosis
import statsmodels.formula.api as sm # For OLS
import statsmodels.stats.outliers_influence as oi # for RESET 

In [5]:
all_data = pd.read_stata('housing.dta')

In [19]:
all_data.describe()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
count,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0,546.0
mean,68121.59375,5150.265625,2.965201,1.285714,1.807692,0.858974,0.177656,0.349817,0.045788,0.31685,0.692308,0.234432
std,26702.669922,2168.160156,0.737387,0.502159,0.868203,0.348369,0.382573,0.47735,0.209215,0.465676,0.861305,0.424033
min,25000.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49125.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,62000.0,4600.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,82000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
max,190000.0,16200.0,6.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0


In [7]:
list(all_data) # List of all the variables

['price',
 'lotsize',
 'bedrooms',
 'bathrms',
 'stories',
 'driveway',
 'recroom',
 'fullbase',
 'gashw',
 'airco',
 'garagepl',
 'prefarea']

In [14]:
first_model = sm.ols(formula="np.log(price) ~ np.log(lotsize) + bedrooms + bathrms + airco", data=all_data).fit()
print(first_model.params)

Intercept          7.093777
np.log(lotsize)    0.400422
bedrooms           0.077700
bathrms            0.215830
airco              0.211675
dtype: float64


In [15]:
print(first_model.summary())

                            OLS Regression Results                            
Dep. Variable:          np.log(price)   R-squared:                       0.567
Model:                            OLS   Adj. R-squared:                  0.564
Method:                 Least Squares   F-statistic:                     177.4
Date:                Wed, 28 Nov 2018   Prob (F-statistic):           5.56e-97
Time:                        15:02:39   Log-Likelihood:                -5.5286
No. Observations:                 546   AIC:                             21.06
Df Residuals:                     541   BIC:                             42.57
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           7.0938      0.232     

In [29]:
oi.reset_ramsey(first_model, degree = 4)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.56130513]]), p=0.640754122583765, df_denom=538, df_num=3>

In [26]:
oi.reset_ramsey(first_model, degree = 3)

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[0.56442]]), p=0.5690256377878806, df_denom=539, df_num=2>

In [57]:
reset_model = sm.ols(formula="price ~ lotsize + bedrooms + bathrms + airco + np.power(price,2) + np.power(price, 3)", data=all_data).fit()
print(reset_model.summary())

hypotheses = '(np.power(price,2) = np.power(price, 3) = 0)'
t_test = reset_model.t_test(hypotheses)
t_test


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.961
Model:                            OLS   Adj. R-squared:                  0.961
Method:                 Least Squares   F-statistic:                     6724.
Date:                Wed, 28 Nov 2018   Prob (F-statistic):               0.00
Time:                        15:34:37   Log-Likelihood:                -5452.4
No. Observations:                 546   AIC:                         1.091e+04
Df Residuals:                     543   BIC:                         1.092e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept           1422.8913     44

PatsyError: unrecognized token in constraint
    (np.power(price,2) = np.power(price, 3) = 0)
     ^

In [52]:
model2 = sm.ols(formula="np.log(price) ~ np.log(lotsize) + bedrooms + bathrms + airco + driveway + recroom + fullbase + gashw + garagepl + prefarea + stories", data=all_data).fit()
print(model2.summary())


                            OLS Regression Results                            
Dep. Variable:          np.log(price)   R-squared:                       0.687
Model:                            OLS   Adj. R-squared:                  0.680
Method:                 Least Squares   F-statistic:                     106.3
Date:                Wed, 28 Nov 2018   Prob (F-statistic):          9.24e-127
Time:                        15:32:09   Log-Likelihood:                 82.412
No. Observations:                 546   AIC:                            -140.8
Df Residuals:                     534   BIC:                            -89.19
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           7.7451      0.216     

In [62]:
hypotheses = 'driveway - recroom = 0, driveway - fullbase = 0'
t_test = model2.t_test(hypotheses)
t_test

<class 'statsmodels.stats.contrast.ContrastResults'>
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.0522      0.039      1.349      0.178      -0.024       0.128
c1             0.0057      0.036      0.158      0.875      -0.065       0.077

In [65]:
second_model = sm.ols(formula="np.log(price) ~ np.log(lotsize) + bedrooms + prefarea + recroom + garagepl", data=all_data).fit()
print(second_model.summary()) 
# note AIC and BIC higher than first model so we should use model a
# note the R^2 and adjusted R^2 are also larger in model a so model a is better



                            OLS Regression Results                            
Dep. Variable:          np.log(price)   R-squared:                       0.501
Model:                            OLS   Adj. R-squared:                  0.496
Method:                 Least Squares   F-statistic:                     108.2
Date:                Wed, 28 Nov 2018   Prob (F-statistic):           4.79e-79
Time:                        15:40:06   Log-Likelihood:                -44.792
No. Observations:                 546   AIC:                             101.6
Df Residuals:                     540   BIC:                             127.4
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           7.2736      0.262     

In [67]:
second_model.wald_test('prefarea = recroom = garagepl = 0')

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[29.69575977]]), p=8.778921221510035e-18, df_denom=540, df_num=3>

In [77]:
second_model.wald_test('prefarea = recroom,  recroom = garagepl,prefarea = 0').summary()

'<F test: F=array([[29.69575977]]), p=8.77892122151066e-18, df_denom=540, df_num=3>'

In [68]:
first_model.wald_test('bathrms = airco = 0')

<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[93.36980291]]), p=1.4633532700965718e-35, df_denom=541, df_num=2>