# Real estate price model workflow

Sam Maurer, Feb 2018  
Python 3.6, intended to be backward compatible with 2.7

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd

import orca

In [2]:
# Import scripts from this repository.
# The imports below are resolved from the parent of the notebook's folder, so
# step up one level -- but only when we are not there already. An unconditional
# chdir('../') climbs one directory further every time this cell is re-run,
# after which the imports can no longer be found.
import os
if not os.path.isdir('extensions'):  # assumes 'extensions' is a folder in the target directory -- TODO confirm
    os.chdir('../')
from extensions import modelmanager as mm
from models import RegressionStep

## Bootstrap Orca with some legacy registrations

This exercise starts from a point where data is already registered in Orca. Eventually, the vision is that data will be loaded based on config files in the 'data' directory.

For now, the 'legacy' directory contains some code from Paul Sohn's [urbansim_parcels](https://github.com/urbansim/urbansim_parcels) project. Importing 'legacy_datasources.py' and 'legacy_models.py' registers a handful of Orca objects.

In [3]:
# Importing the legacy modules registers their Orca objects as a side effect.
# Move into 'legacy' first and stay there afterwards (presumably the legacy
# code resolves its files relative to the working directory -- TODO confirm).
# The guard keeps the cell re-runnable: a second chdir('legacy') from inside
# 'legacy' would raise because 'legacy/legacy' does not exist.
if os.path.basename(os.getcwd()) != 'legacy':
    os.chdir('legacy')
import legacy_datasources
import legacy_models

## Explore the Orca registrations

In [4]:
# Names of all tables currently registered with Orca
orca.list_tables()

['households', 'buildings', 'parcels', 'jobs']

In [5]:
# Columns registered separately from a table's own data, as (table, column) pairs
orca.list_columns()

[('households', 'node_id'), ('buildings', 'node_id'), ('jobs', 'node_id')]

In [6]:
# Registered broadcasts, i.e. the relational links between pairs of tables
orca.list_broadcasts()

[('parcels', 'buildings'),
 ('buildings', 'households'),
 ('buildings', 'jobs'),
 ('nodes', 'buildings')]

In [7]:
# Names of the registered injectables
orca.list_injectables()

['settings', 'store', 'net_store']

In [8]:
# Names of the registered model steps
orca.list_steps()

['build_networks', 'neighborhood_vars']

## Explore the data

Orca loads data lazily: the code that loads a registered object is not executed until the object is actually needed.

In [9]:
# Materialise the households table as a DataFrame and show its summary statistics
orca.get_table('households').to_frame().describe()

Unnamed: 0,building_id,tenure,persons,workers,age_of_head,income,children,race_id,cars,base_luz,segmentation_col,node_id
count,58671.0,58671.0,58671.0,58671.0,58671.0,58671.0,58671.0,58671.0,58671.0,58671.0,58671.0,10199.0
mean,370371.030339,2.406913,2.156057,1.156534,44.336742,64154.87,0.434099,2.257282,1.363859,92.911353,1.0,42205.252868
std,79639.958079,0.916539,1.299009,0.798054,16.097489,67859.93,0.876846,1.478598,0.865866,5.21015,0.0,4091.490536
min,5120.0,0.0,1.0,0.0,16.0,-9999.0,0.0,1.0,0.0,88.0,1.0,36360.0
25%,352274.5,1.0,1.0,1.0,31.0,24000.0,0.0,1.0,1.0,89.0,1.0,39026.0
50%,363553.0,3.0,2.0,1.0,41.0,45000.0,0.0,2.0,1.0,93.0,1.0,41731.0
75%,380838.5,3.0,3.0,2.0,55.0,82500.0,0.0,2.0,2.0,93.0,1.0,45043.0
max,679716.0,4.0,11.0,5.0,93.0,1125300.0,6.0,8.0,4.0,108.0,1.0,52488.0


In [10]:
# Two column counts for the households table, printed one per line
print(
    len(orca.get_table('households').local_columns),       # native columns only
    len(orca.get_table('households').to_frame().columns),  # native plus virtual
    sep='\n',
)

11
12


## Generate accessibility measures for the price model

The network accessibility metrics are not stored on disk; for now we'll generate them using legacy code.

In [11]:
# Execute the registered 'build_networks' step
orca.run(['build_networks'])

Running step 'build_networks'
Time to execute step 'build_networks': 0.30 s
Total time to execute iteration 1 with iteration value None: 0.30 s


In [12]:
%%capture
# Execute the registered 'neighborhood_vars' step; %%capture (which must stay on
# the first line of the cell) keeps the step's output out of the notebook
orca.run(['neighborhood_vars'])

In [13]:
# List the tables again: 'nodes' is now registered alongside the original four
orca.list_tables()

['households', 'buildings', 'parcels', 'jobs', 'nodes']

In [14]:
# Column names of the nodes table
print(orca.get_table('nodes').to_frame().columns.tolist())

['ave_parcel_size', 'jobs_1500m', 'jobs_800m', 'jobs_400m', 'ave_income', 'ave_age_of_head_1500m', 'ave_children_1500m', 'ave_year_built_1500m', 'population_400m', 'jobs_3000m', 'households_3000m', 'residential_units_3000m', 'residential_units_1500m', 'residential_units_800m']


## Display all the registered data columns

In [15]:
# For every registered table: its name in capitals, its column names, then a blank line
for table_name in orca.list_tables():
    table_frame = orca.get_table(table_name).to_frame()
    print(table_name.upper(), table_frame.columns.tolist(), '', sep='\n')

HOUSEHOLDS
['building_id', 'tenure', 'persons', 'workers', 'age_of_head', 'income', 'children', 'race_id', 'cars', 'base_luz', 'segmentation_col', 'node_id']

BUILDINGS
['parcel_id', 'development_type_id', 'improvement_value', 'residential_units', 'non_residential_sqft', 'stories', 'year_built', 'residential_sqft', 'note', 'res_price_per_sqft', 'node_id']

PARCELS
['land_value', 'tax_exempt', 'proportion_undevelopable', 'acres', 'county_id', 'zoning_id', 'y', 'development_type_id', 'taz_id', 'distance_to_school', 'parcel_acres', 'distance_to_freeway', 'distance_to_park', 'distance_to_coast', 'msa_id', 'luz_id', 'node_id', 'distance_to_transit', 'mgra_id', 'distance_to_onramp', 'x']

JOBS
['sector_id', 'building_id', 'node_id']

NODES
['ave_parcel_size', 'jobs_1500m', 'jobs_800m', 'jobs_400m', 'ave_income', 'ave_age_of_head_1500m', 'ave_children_1500m', 'ave_year_built_1500m', 'population_400m', 'jobs_3000m', 'households_3000m', 'residential_units_3000m', 'residential_units_1500m', 'res

In [16]:
# Broadcasts are the direct relational links between tables; each pair below
# names the two tables that one link connects

orca.list_broadcasts()

[('parcels', 'buildings'),
 ('buildings', 'households'),
 ('buildings', 'jobs'),
 ('nodes', 'buildings')]

## Estimate a price model

The basic idea of the parcel template is that we create model steps by _passing arguments to classes_ rather than by writing Python functions and giving them Orca decorators, as we would for a fully custom model.

Much of the functionality for this is already built into UrbanSim and Orca; we just need to extend things here and there.

This demo uses the new RegressionStep() class that adds some features to urbansim.models.RegressionModel(). 

In [47]:
# Every column named in the model expression comes from one of these tables
tables = ['buildings', 'parcels', 'nodes']

# Response on the left of '~'; the right-hand side is the '+'-joined list of terms
model_expression = "np.log1p(res_price_per_sqft) ~ " + " + ".join([
    "parcel_acres",
    "I(year_built < 1940)",
    "I(year_built > 2005)",
    "year_built",
    "ave_income",
    "distance_to_freeway",
    "population_400m",
    "jobs_3000m",
])

In [48]:
# Build the step from the model expression and the tables that supply its
# columns, then estimate it; the fit summary (R-squared and coefficient table)
# appears in the cell output
model = RegressionStep(model_expression, tables)
model.fit()

R-Squared: 0.518
Adj. R-Squared: 0.516

+------------------------------+-------------+------------+---------+
| Component                    | Coefficient | Std. Error | T-Score |
+------------------------------+-------------+------------+---------+
| Intercept                    |    55.314   |   2.454    |  22.537 |
| I(year_built < 1940)[T.True] |    -0.262   |   0.064    |  -4.105 |
| I(year_built > 2005)[T.True] |    1.299    |   0.100    |  12.941 |
| parcel_acres                 |    -1.283   |   0.063    | -20.481 |
| year_built                   |    -0.026   |   0.001    | -21.265 |
| ave_income                   |    0.177    |   0.057    |  3.093  |
| distance_to_freeway          |    0.000    |   0.000    |  1.359  |
| population_400m              |    -0.306   |   0.020    | -15.510 |
| jobs_3000m                   |    0.007    |   0.002    |  2.922  |
+------------------------------+-------------+------------+---------+


In [21]:
# TODO: remaining parts of this demo -- this cell contains no code yet:
#   - Add prediction components
#   - Register and save
#   - Delete
#   - Load from disk
