# Real estate rental hedonic model workflow

Paul Waddell, June 2018

Python 3.6, intended to be backward compatible with 2.7

In [1]:
from __future__ import print_function
import warnings;warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

In [2]:
from scipy.stats import norm
%matplotlib inline

import matplotlib.pyplot as plt, matplotlib.cm as cm, matplotlib.font_manager as fm
import matplotlib.mlab as mlab
import seaborn as sns
sns.set()

In [3]:
# Standard to run UrbanSim from the root level of the project directory
# NOTE: the path below is relative, so each re-execution of this cell moves
# the working directory up another two levels. Run it once per kernel session.

import os; os.chdir('../..')

In [4]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import OLSRegressionStep
import orca

In [5]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

## Explore the Orca registrations

In [None]:
orca.list_tables()

In [None]:
# For every table registered with Orca, print its name in upper case,
# then the list of its column names, then a blank separator line.
for table_name in orca.list_tables():
    print(table_name.upper())
    table_frame = orca.get_table(table_name).to_frame()
    print(table_frame.columns.tolist())
    print()

In [None]:
orca.list_broadcasts()

In [None]:
orca.list_steps()

## Explore the data

Orca doesn't execute code to load the registered objects until it needs to.

In [None]:
orca.get_table('parcels').to_frame().describe()

In [None]:
orca.get_table('buildings').to_frame().describe()

In [None]:
rentals = orca.get_table('craigslist').to_frame()
rentals.columns

In [None]:
rentals.describe()

In [None]:
# Collapse rows that share the same 'pid' value (pandas keeps the first
# occurrence by default), then summarise the de-duplicated listings.
rents_nodup = rentals.drop_duplicates(['pid'])
rents_nodup.describe()

In [None]:
rents_nodup.rent.quantile([0.005, 0.995])

In [None]:
rents_nodup.bedrooms.quantile([0.005, 0.995])

In [None]:
rents_nodup.sqft.quantile([0.005, 0.995])

In [None]:
# Outlier cut-offs at the 0.5th and 99.5th percentiles.
# Bedrooms only gets an upper bound.
rentlow, renthigh = rents_nodup.rent.quantile([0.005, 0.995])
bedshigh = rents_nodup.bedrooms.quantile(0.995)
sqftlow, sqfthigh = rents_nodup.sqft.quantile([0.005, 0.995])

In [None]:
# Boolean masks: strictly above the lower cut-off, at or below the upper one.
rent_mask = rents_nodup['rent'].gt(rentlow) & rents_nodup['rent'].le(renthigh)
beds_mask = rents_nodup['bedrooms'].le(bedshigh)
sqft_mask = rents_nodup['sqft'].gt(sqftlow) & rents_nodup['sqft'].le(sqfthigh)

# Keep only the de-duplicated listings that pass all three masks
keep_mask = rent_mask & beds_mask & sqft_mask
filtered_rents = pd.DataFrame(rents_nodup[keep_mask])
len(filtered_rents)

In [None]:
# Histogram of filtered rents (50 bins, no KDE) with a fitted normal curve
# overlaid for comparison.
plt.rcParams['figure.figsize']=10,10
%matplotlib inline

# NOTE(review): two set_style calls in a row; presumably the second ("ticks")
# is the one that takes effect -- confirm whether "white" is still needed.
sns.set_style("white")
sns.set_style("ticks")
ax = sns.distplot(filtered_rents.rent, bins=50, fit=norm,  kde=False)

In [None]:
# Same histogram as the previous cell, but of the natural log of rent, again
# with a fitted normal curve overlaid.
plt.rcParams['figure.figsize']=10,10
%matplotlib inline

# NOTE(review): two set_style calls in a row; presumably the second ("ticks")
# is the one that takes effect -- confirm whether "white" is still needed.
sns.set_style("white")
sns.set_style("ticks")
ax = sns.distplot(np.log(filtered_rents.rent), bins=50, fit=norm, kde=False)

In [None]:
filtered_rents.describe()

In [None]:
filtered_rents.to_csv('data/rental_listings_cleaned.csv')

## Generate accessibility measures for the price model

The network accessibility metrics are not stored on disk; for now we'll generate them using legacy code.

In [None]:
orca.run(['initialize_network_small'])

In [None]:
orca.run(['initialize_network_drive'])

In [6]:
orca.run(['initialize_network_walk'])

Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 58.50 s
Total time to execute iteration 1 with iteration value None: 58.50 s


In [None]:
orca.run(['network_aggregations_small'])

In [None]:
orca.run(['network_aggregations_drive'])

In [8]:
orca.run(['network_aggregations_walk'])

Running step 'network_aggregations_walk'
Computing accessibility variables
Computing units_500_walk
Computing pop_500_walk
Removed 191599 rows because they contain missing values
Computing poor_500_walk
Removed 53660 rows because they contain missing values
Computing renters_500_walk
Removed 103635 rows because they contain missing values
Computing med_income_500_walk
Removed 191599 rows because they contain missing values
Computing jobs_500_walk
Computing med_rent_500_walk
       units_500_walk   pop_500_walk  poor_500_walk  renters_500_walk  \
count   415716.000000  415716.000000  415716.000000     415716.000000   
mean       122.117058     276.371033      27.571995         52.772377   
std        571.182678    1215.510498     142.118103        217.522293   
min          0.000000       0.000000       0.000000          0.000000   
25%          2.000000       4.507194       0.122638          0.298244   
50%         60.863455     146.891197       8.577946         13.422408   
75%       

In [None]:
orca.list_tables()

In [None]:
print(orca.get_table('nodesdrive').to_frame().columns.tolist())

In [None]:
nodesdrive = orca.get_table('nodesdrive').to_frame()

In [None]:
nodesdrive.to_csv('data/nodesdrive.csv')

In [None]:
nodessmall = orca.get_table('nodessmall').to_frame()

In [None]:
nodessmall.to_csv('data/nodessmall.csv')

In [None]:
# NOTE: this rebinds `rentals`, which earlier in the notebook held the
# 'craigslist' table; from here on it refers to the Orca 'rentals' table.
rentals = orca.get_table('rentals').to_frame()

In [None]:
rentals.head()

In [None]:
rentals.node_id_drive.isnull().sum()

In [None]:
# Count nodes whose median income is zero or negative, as the printed message
# states. The previous test matched only the exact value -1, so it disagreed
# with the message.
# NOTE(review): if the intent was to count only a -1 "no data" sentinel,
# change the message rather than the condition.
nonpositive_income_count = (nodessmall.med_income_10000 <= 0).sum()
print('{} nodes have median income zero or below'.format(nonpositive_income_count))

print('Total nodes count {}'.format(len(nodessmall)))

# Estimate a rental hedonic model

The basic idea of the parcel template is that we create model steps by _passing arguments to classes_ rather than by writing Python functions and giving them Orca decorators, as we would for a fully custom model.

Much of the functionality for this is already built into UrbanSim and Orca; we'll just need to extend things here and there.

This demo uses the new OLSRegressionStep() class, which provides a full model development workflow: estimating a model, registering it with Orca, and saving it for future use.

### Specify parameters and pass them to a model object

In [None]:
# Create an unconfigured OLS regression step, then name the Orca tables it
# should use.
# NOTE(review): presumably the first table is the estimation table and the
# others are merged onto it via Orca broadcasts -- confirm against the
# urbansim_templates documentation.
m = OLSRegressionStep()
m.tables = ['rentals', 'nodesdrive', 'nodessmall']

In [None]:
# First specification: log(1 + rent per sqft) regressed on logged
# accessibility measures, then fitted.
# NOTE(review): jobs_25000 is logged without the +1 offset used for the other
# count variables, so a zero value would give -inf -- confirm it is always > 0.
# NOTE(review): 'job_1500' vs 'jobs_25000' -- confirm both column names exist.
m.model_expression = 'np.log1p(rent_sqft) ~ +  np.log(population_1500+1) + np.log(job_1500+1) + \
    np.log(med_income_10000+2) + np.log(jobs_25000)'
m.fit()

In [None]:
# Second specification (replaces the expression set on `m` before): log(1 + rent)
# on log(sqft), bedrooms as a categorical, and the logged accessibility measures.
# NOTE(review): np.log(sqft) and np.log(jobs_25000) have no offset, so zero
# values would give -inf -- confirm the inputs are strictly positive.
m.model_expression = 'np.log1p(rent) ~ + np.log(sqft) + C(bedrooms) + np.log(population_1500+1) + \
    np.log(med_income_10000+2) + np.log(job_1500+1) + np.log(jobs_25000)'
m.fit()

### If we like it, register it as an Orca step

In [None]:
# Register the fitted step held in `m`. The name `model` was never defined in
# this notebook, so the previous call raised a NameError.
m.register()

In [None]:
orca.list_steps()

### Run the Orca step

In [None]:
# Run the step that was just registered, looked up by its own name. The old
# hard-coded, timestamped name came from an earlier session and would not
# match a newly registered step.
# NOTE(review): assumes register() stores the step in Orca under `m.name`.
orca.run([m.name])

### Check the fitted values

In [None]:
# Pull the observed and fitted price columns from the buildings table and
# summarise only the rows with a positive fitted price.
price_columns = ['res_price_per_sqft', 'fitted_price']
df = orca.get_table('buildings').to_frame(price_columns)
df[df['fitted_price'] > 0].describe()

### BONUS

Calling "register()" on the model step also registered it with the new ModelManager extension, which saves it to disk so that it can be automatically re-loaded in the future.

The "test" model steps here were estimated earlier and loaded from disk. They're fully functional: we can run them in Orca, inspect the estimation results, etc.

In [None]:
mm.list_steps()

In [None]:
rs = mm.get_step('test-1')
type(rs)

In [None]:
rs.model_expression

In [None]:
rs.model.report_fit()