# Real estate price model workflow

Sam Maurer, Feb 2018; Paul Waddell, June 2018

Python 3.6, intended to be backward compatible with 2.7

In [1]:
from __future__ import print_function
import warnings;warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

In [2]:
# Standard to run UrbanSim from the root level of the project directory

import os; os.chdir('..')

In [3]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import OLSRegressionStep
import orca

In [4]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

## Explore the Orca registrations

In [5]:
orca.list_tables()

['parcels', 'buildings', 'rentals', 'units', 'households', 'persons', 'jobs']

In [10]:
for table_name in orca.list_tables():
    print(table_name.upper())
    print(orca.get_table(table_name).to_frame().columns.tolist())
    print()

PARCELS
['primary_id', 'development_type_id', 'land_value', 'acres', 'county_id', 'zone_id', 'proportion_undevelopable', 'tax_exempt_status', 'apn', 'parcel_id_local', 'geom_id', 'imputation_flag', 'x', 'y', 'shape_area', 'block_id', 'node_id']

BUILDINGS
['building_id', 'parcel_id', 'development_type_id', 'improvement_value', 'residential_units', 'residential_sqft', 'sqft_per_unit', 'non_residential_sqft', 'building_sqft', 'nonres_rent_per_sqft', 'res_price_per_sqft', 'stories', 'year_built', 'redfin_sale_price', 'redfin_sale_year', 'redfin_home_type', 'costar_property_type', 'costar_rent', 'building_type_id']

RENTALS
['neighborhood', 'price', 'bedrooms', 'date', 'sqft_per_unit', 'lon', 'lat', 'price_per_sqft']

UNITS
['Unnamed: 0', 'building_id', 'num_units', 'tenure', 'unit_num', 'unit_residential_price', 'unit_residential_rent']

HOUSEHOLDS
['household_id', 'serialno', 'persons', 'building_type', 'cars', 'income', 'race_of_head', 'hispanic_head', 'age_of_head', 'workers', 'state',

In [11]:
orca.list_broadcasts()

[('parcels', 'buildings'),
 ('buildings', 'units'),
 ('units', 'households'),
 ('households', 'persons')]

In [12]:
orca.list_steps()

['large-mnl-test',
 'model_one',
 'model_two',
 'ols-test',
 'small-mnl-test',
 'test_manual_registration',
 'initialize_network',
 'network_aggregations']

## Explore the data

Orca doesn't execute code to load the registered objects until it needs to.

In [13]:
orca.get_table('parcels').to_frame().describe()

Unnamed: 0,primary_id,development_type_id,land_value,acres,county_id,zone_id,proportion_undevelopable,tax_exempt_status,geom_id,x,y,shape_area,block_id,node_id
count,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0,1956207.0
mean,1018502.0,2.607759,234801.9,2.294288,53.93547,829.32,0.001633774,0.01256104,8795114000000.0,-122.1763,37.77351,9284.664,60542940000000.0,382156300.0
std,597283.3,5.310605,1610710.0,36.12166,38.08734,412.9778,0.04038695,0.11137,5076215000000.0,0.2991972,0.3621875,146179.3,380760000000.0,949091300.0
min,1.0,-1.0,0.0,4.52709e-07,1.0,1.0,0.0,0.0,17066880.0,-123.5266,36.89751,0.00183205,60014000000000.0,302878.0
25%,499973.5,1.0,39957.0,0.1084042,13.0,492.0,0.0,0.0,4405538000000.0,-122.4059,37.48103,438.6967,60133210000000.0,56159740.0
50%,1005872.0,1.0,100800.0,0.148965,75.0,844.0,0.0,0.0,8793873000000.0,-122.1419,37.7539,602.8407,60750330000000.0,65351370.0
75%,1544652.0,1.0,225434.5,0.2294452,85.0,1189.0,0.0,0.0,13187560000000.0,-121.9512,37.98583,928.5328,60855080000000.0,65625800.0
max,2054502.0,25.0,1393464000.0,23351.99,97.0,1454.0,1.0,1.0,17592170000000.0,-121.2109,38.86017,94502240.0,61130120000000.0,5458527000.0


In [14]:
orca.get_table('buildings').to_frame().describe()

Unnamed: 0,building_id,parcel_id,development_type_id,improvement_value,residential_units,residential_sqft,sqft_per_unit,non_residential_sqft,building_sqft,nonres_rent_per_sqft,res_price_per_sqft,stories,year_built,redfin_sale_price,redfin_sale_year,building_type_id
count,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1824229.0,1081100.0,1081100.0,1824229.0
mean,922379.0,1012861.0,1.835162,318687.8,1.534,2657.113,1669.294,1280.864,3784.462,0.9218572,292.7191,1.214429,1965.524,528730.7,2002.828,1.710299
std,531735.2,593806.5,3.44108,3571671.0,6.137263,30124.08,1063.476,23922.26,34945.68,4.053916,264.7712,0.5567798,25.01754,718812.3,7.143867,2.396463
min,1.0,26.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1790.0,0.0,1962.0,0.0
25%,462341.0,495850.0,1.0,82675.03,1.0,1230.0,1176.0,0.0,1320.0,0.0,151.8671,1.0,1952.0,239000.0,1998.0,1.0
50%,922473.0,1002691.0,1.0,167553.5,1.0,1754.0,1564.0,0.0,1858.0,0.0,254.4594,1.0,1966.0,400000.0,2004.0,1.0
75%,1382840.0,1532117.0,1.0,285342.3,1.0,2400.0,2029.424,0.0,2524.0,0.0,369.9121,1.0,1983.0,650000.0,2009.0,1.0
max,1843350.0,2054501.0,24.0,3355514000.0,1912.0,14580000.0,30000.0,16573100.0,14580000.0,128.9625,50161.09,59.0,2015.0,180000000.0,2014.0,16.0


## Generate accessibility measures for the price model

The network accessibility metrics are not stored on disk; for now we'll generate them using legacy code.

In [18]:
orca.run(['initialize_network'])

Running step 'initialize_network'
Time to execute step 'initialize_network': 5.81 s
Total time to execute iteration 1 with iteration value None: 5.81 s


In [19]:
orca.run(['network_aggregations'])

Running step 'network_aggregations'
Computing accessibility variables
Computing residential_units_500
Removed 76038 rows because they contain missing values
Computing residential_units_1500
Removed 76038 rows because they contain missing values
Computing population_1500
Removed 305829 rows because they contain missing values
Computing poor_1500
Removed 79602 rows because they contain missing values
Computing renters_1500
Removed 145852 rows because they contain missing values
Computing med_income_1500
Removed 305829 rows because they contain missing values
Computing job_1500
Removed 111044 rows because they contain missing values
Computing job_25km
Removed 111044 rows because they contain missing values
Computing ave_hhsize
Removed 305829 rows because they contain missing values
Computing med_rent_sqft_1500
Computing med_rent_sqft_7000
       residential_units_500  residential_units_1500  population_1500  \
count          415716.000000           415716.000000    415716.000000   
mean  

In [20]:
orca.list_tables()

['parcels',
 'buildings',
 'rentals',
 'units',
 'households',
 'persons',
 'jobs',
 'nodes']

In [21]:
print(orca.get_table('nodes').to_frame().columns.tolist())

['residential_units_500', 'residential_units_1500', 'population_1500', 'poor_1500', 'renters_1500', 'med_income_1500', 'job_1500', 'job_25km', 'ave_hhsize', 'med_rent_sqft_1500', 'med_rent_sqft_7000']


# Estimate a price model

The basic idea of the parcel template is that we create model steps by _passing arguments to classes_ rather than by writing Python functions and giving them Orca decorators, as we would for a fully custom model.

Much of the functionality for this is already built into UrbanSim and Orca, we'll just need to extend things here and there.

This demo uses a new RegressionStep() class that provides a full model development workflow: estimating a model, registering it with Orca, saving it for future use.

### Specify parameters and pass them to a model object

In [22]:
# Specify the model expression and names of tables to draw data from (the first table
# is the primary one; additional tables must be able to merge onto it unambiguously)

tables = ['buildings', 'parcels']

model_expression = (
    "np.log1p(res_price_per_sqft) ~ year_built "
)

# Give the prospective model step some tags, and a name if desired

name = None
tags = ['residential-price-hedonic', 'paul', '20180708']

# For prediction, specify destination column (if different from the dependent variable
# used for estimation), and how to reverse the left-hand-side transformation

out_column = 'fitted_price'
out_transform = np.exp

In [23]:
# Generate a new column to store the fitted prices

zeros = np.repeat(0.0, len(orca.get_table('buildings')))
orca.get_table('buildings').update_col('fitted_price', zeros)

In [24]:
# Create the model object

model = OLSRegressionStep(model_expression, tables, name=name, tags=tags,
                       out_column=out_column, out_transform=out_transform)

### Fit the model

In [27]:
m = OLSRegressionStep()
m.tables = ['buildings', 'parcels']
m.model_expression = 'np.log1p(res_price_per_sqft) ~ year_built'
m.fit()

                                 OLS Regression Results                                 
Dep. Variable:     np.log1p(res_price_per_sqft)   R-squared:                       0.008
Model:                                      OLS   Adj. R-squared:                  0.008
Method:                           Least Squares   F-statistic:                 1.429e+04
Date:                          Sun, 08 Jul 2018   Prob (F-statistic):               0.00
Time:                                  14:26:20   Log-Likelihood:            -3.2330e+06
No. Observations:                       1748191   AIC:                         6.466e+06
Df Residuals:                           1748189   BIC:                         6.466e+06
Df Model:                                     1                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [28]:
m = OLSRegressionStep(tables=['buildings','parcels'], 
                      model_expression='np.log1p(res_price_per_sqft) ~ year_built')
m.fit()

                                 OLS Regression Results                                 
Dep. Variable:     np.log1p(res_price_per_sqft)   R-squared:                       0.008
Model:                                      OLS   Adj. R-squared:                  0.008
Method:                           Least Squares   F-statistic:                 1.429e+04
Date:                          Sun, 08 Jul 2018   Prob (F-statistic):               0.00
Time:                                  14:33:47   Log-Likelihood:            -3.2330e+06
No. Observations:                       1748191   AIC:                         6.466e+06
Df Residuals:                           1748189   BIC:                         6.466e+06
Df Model:                                     1                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [25]:
model.fit()

UnboundLocalError: local variable 'expr_cols' referenced before assignment

### If we like it, register it as an Orca step

In [None]:
model.register()

In [None]:
orca.list_steps()

### Run the Orca step

In [None]:
orca.run(['RegressionStep-20180214-210159'])

### Check the fitted values

In [None]:
df = orca.get_table('buildings').to_frame(['res_price_per_sqft', 'fitted_price'])
df.loc[df.fitted_price > 0].describe()

### BONUS

Running "model.register()" also registered the step with the new ModelManager extention, which saves it to disk so that it can be automatically re-loaded in the future.

The "test" model steps here were estimated earlier and loaded from disk. They're fully functional: we can run them in Orca, inspect the estimation results, etc.

In [None]:
mm.list_steps()

In [None]:
rs = mm.get_step('test-1')
type(rs)

In [None]:
rs.model_expression

In [None]:
rs.model.report_fit()