In [31]:
import pandas as pd
import matplotlib.pyplot as plt
from semopy import Model
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
# Plot theme applied to every matplotlib figure in this notebook.
plt.style.use('fivethirtyeight')

# Show every column when a wide DataFrame is displayed (no "..." elision).
pd.options.display.max_columns = None

# Data

In [3]:
# Load the pickled housing table from the working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you trust.
houses = pd.read_pickle('homeDataCSV.pkl')

# Preview the first five rows to check the columns.
houses.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Data Generation
(Section 4.2 Sharma, 2019)

In [4]:
# Regress price on all candidate predictors to rank them by p-value.
# NOTE(review): no constant column is added, so the fit has no intercept
# (the summary reports an "uncentered" R-squared) -- confirm this is intended.
model = sm.OLS(
    endog=houses[['price']],
    exog=houses[[
        'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above',
        'sqft_basement', 'yr_built', 'yr_renovated',
    ]],
).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.880
Model:                            OLS   Adj. R-squared (uncentered):              0.880
Method:                 Least Squares   F-statistic:                          1.319e+04
Date:                Sun, 04 Dec 2022   Prob (F-statistic):                        0.00
Time:                        18:31:41   Log-Likelihood:                     -2.9715e+05
No. Observations:               21613   AIC:                                  5.943e+05
Df Residuals:                   21601   BIC:                                  5.944e+05
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

### Identification of key variables in predicting price from p-value of the OLS

1. sqft living
2. grade
3. waterfront -- boolean
4. sqft_basement -- skewed right
5. condition
6. view -- boolean
7. sqft_above -- highly correlated with sqft living
8. bedrooms

## Creation of Data Generating Model (Model 5)

In [54]:
# getting a sense for our eta1
model = sm.OLS(houses[['grade']], houses[['sqft_living','condition']]).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                  grade   R-squared (uncentered):                   0.972
Model:                            OLS   Adj. R-squared (uncentered):              0.972
Method:                 Least Squares   F-statistic:                          3.750e+05
Date:                Sat, 10 Dec 2022   Prob (F-statistic):                        0.00
Time:                        11:24:39   Log-Likelihood:                         -36278.
No. Observations:               21613   AIC:                                  7.256e+04
Df Residuals:                   21611   BIC:                                  7.258e+04
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

From the above regression, `x1` and `x2` will be `condition` and `sqft_living`.

Now we must identify our x2 and x3 similarly.

In [55]:
# getting a sense for our eta2
model = sm.OLS(houses[['price']], houses[['sqft_living','waterfront','grade']]).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.852
Model:                            OLS   Adj. R-squared (uncentered):              0.852
Method:                 Least Squares   F-statistic:                          4.132e+04
Date:                Sat, 10 Dec 2022   Prob (F-statistic):                        0.00
Time:                        11:25:48   Log-Likelihood:                     -2.9944e+05
No. Observations:               21613   AIC:                                  5.989e+05
Df Residuals:                   21610   BIC:                                  5.989e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

from these results, we can identify that the model parameters for data generation will be:<br>
`x1` = ~~sqft_basement~~ **condition**<br>
`x2` = sqft_living<br>
`x3` = waterfront<br>
`eta1` = grade<br>
`eta2` = price<br>

We will now generate synthetic data by adjusting the input values of `x1`, `x2`, and `x3`, as well as adjusting the weights in the model, as described in **Section 4.2** of Sharma (2019).

### Parameter Selection + Definition

In [44]:
# Earlier parameter set, kept for reference only (inactive):
# x1_e1 = 0.0034
# x2_e1 = -0.0013
# se_x2_e1 = 9.19*(10**-6)
# x2_e2 = 244.1761
# x3_e2 = 8.463*(10**5)
# e1_e2 = 4627.9743

# Active path weights of the data-generating model.
# Naming is <source>_<target>: e.g. x1_e1 is the weight of x1 in the eta1 equation.
x1_e1, x2_e1 = 1.2967, 0.0015
x2_e2, x3_e2, e1_e2 = 244.1761, 8.463*(10**5), 4627.9743

# Step used to perturb x2_e1 (presumably the standard error of that weight -- verify).
se_x2_e1 = 8.66*(10**-6)

# Experimental grid
sample_sizes = [50, 100, 150, 200, 250, 500]  # directly from sharma methods
# x2_e1 shifted by -2, -1, 0, +1, +2 steps of se_x2_e1
x2_e1_mods = [x2_e1 + (shift * se_x2_e1) for shift in (-2, -1, 0, 1, 2)]
# mimic the ave cases evaluated by Sharma, 2019 (not using the r library that they used, as well as a boolean variable with x3)

# Model Definition + Data Generation

In [47]:
"""
These models are generated from Figure 1 of Sharma, 2019
"""
# Seeded generator so that Restart & Run All always draws the same subsamples;
# its state advances on every draw, so each (weight, size) cell gets its own sample.
sampler = np.random.RandomState(42)

# Source column -> exogenous variable of the data-generating model.
# x1 is `condition`: the notebook text defines x1 that way and the active x1_e1
# weight is on the scale of the grade ~ sqft_living + condition fit.
# NOTE(review): the previous version sampled `sqft_basement` here -- confirm.
exog_names = {'condition': 'x1', 'sqft_living': 'x2', 'waterfront': 'x3'}

# x2_e1_mod -> textual model description
data_models = {}
# (x2_e1_mod, sample_size) -> DataFrame with columns x1, x2, x3, eta1, eta2
generated_data = {}

for x2_e1_mod in x2_e1_mods:
    # Textual description of the structural equations with fixed weights
    # (presumably in semopy model syntax -- verify before passing it to semopy).
    data_model = f'''eta1 ~ {str(x1_e1)}*x1 +  {str(x2_e1_mod)}*x2 
            eta2 ~ {str(x2_e2)}*x2 + {str(x3_e2)}*x3 + {str(e1_e2)}*eta1'''
    data_models[x2_e1_mod] = data_model
    for sample_size in sample_sizes:
        # Draw the exogenous inputs from the real housing data.
        sample_data = (
            houses[list(exog_names)]
            .sample(n=sample_size, random_state=sampler)
            .rename(columns=exog_names)
        )
        # Evaluate the structural equations directly with the chosen weights.
        # The previous version called `model.predict`, i.e. whichever OLS fit was
        # last left in the kernel, on columns that fit was never trained on.
        # No noise term is added: these are the noise-free "ground truth" values.
        eta1 = x1_e1 * sample_data['x1'] + x2_e1_mod * sample_data['x2']
        eta2 = (
            x2_e2 * sample_data['x2']
            + x3_e2 * sample_data['x3']
            + e1_e2 * eta1
        )
        y_ground = sample_data.assign(eta1=eta1, eta2=eta2)
        # Keep every generated data set instead of overwriting it each iteration.
        generated_data[(x2_e1_mod, sample_size)] = y_ground

# this is how data will be generated! notebook selection by analysis methods is completed in `model_analysis.ipynb`