# Data exploration

Sam Maurer, May 2018 | Python 3.6

This notebook is for development and testing of the code to load base data tables into Orca. It also demonstrates ways to explore the registered data.

In [1]:
import os

# Move up one level so that the `scripts` package and the relative `data/`
# paths used later in this notebook resolve from the working directory.
# Guarded so that re-running this cell does not climb another directory level.
if not os.path.isdir('scripts'):
    os.chdir('../')

In [2]:
# Show the working directory after the chdir above.
# NOTE: the saved output is a machine-specific absolute path.
os.getcwd()

'/Users/maurer/Dropbox/Git-imac/ual/urbansim_parcel_bayarea'

In [3]:
import numpy as np
import pandas as pd

In [4]:
from urbansim_templates import modelmanager as mm
import orca

  from pandas.core import datetools


In [None]:
# Optional: run this cell to override the standard data directory, if needed.
# NOTE(review): '/home/data/' is a machine-specific absolute path — adjust it
# for your environment. This cell was not executed in the saved run.
orca.add_injectable('data_directory', '/home/data/')

In [5]:
# Load script-based Orca registrations
from scripts import datasources
from scripts import models

### Tables loaded by datasources.py

In [6]:
# For every registered Orca table, print its upper-cased name, then the list
# of columns in its materialized DataFrame, then a blank separator line.
for name in orca.list_tables():
    print(name.upper())
    column_names = orca.get_table(name).to_frame().columns.tolist()
    print(column_names, end='\n\n')

PARCELS
['development_type_id', 'land_value', 'acres', 'county_id', 'zone_id', 'proportion_undevelopable', 'tax_exempt_status', 'apn', 'parcel_id_local', 'geom_id', 'imputation_flag', 'x', 'y', 'shape_area', 'block_id', 'node_id']

BUILDINGS
['parcel_id', 'development_type_id', 'improvement_value', 'residential_units', 'residential_sqft', 'sqft_per_unit', 'non_residential_sqft', 'building_sqft', 'nonres_rent_per_sqft', 'res_price_per_sqft', 'stories', 'year_built', 'redfin_sale_price', 'redfin_sale_year', 'redfin_home_type', 'costar_property_type', 'costar_rent', 'building_type_id']

UNITS
['Unnamed: 0', 'building_id', 'num_units', 'tenure', 'unit_num', 'unit_residential_price', 'unit_residential_rent']

HOUSEHOLDS
['household_id', 'serialno', 'persons', 'building_type', 'cars', 'income', 'race_of_head', 'hispanic_head', 'age_of_head', 'workers', 'state', 'county', 'tract', 'block group', 'children', 'tenure', 'recent_mover', 'block_group_id', 'single_family', 'unit_id']

PERSONS
['Unn

In [6]:
# Largest parcel node_id before the network step is run.
# NOTE: the saved output is a quoted string, so node_id holds strings at this
# point and max() compares them lexicographically rather than numerically.
orca.get_table('parcels').to_frame().node_id.dropna().max()

'999546390'

In [7]:
# List registered injectables (empty in the saved run, before any step has executed)
orca.list_injectables()

[]

In [8]:
# List the Orca steps registered so far
orca.list_steps()

['ols-test',
 'small-mnl-test',
 'model_two',
 'model_one',
 'large-mnl-test',
 'test_manual_registration',
 'initialize_network',
 'network_aggregations']

### Build network

In [9]:
# Run the 'initialize_network' step on its own (about 9 s in the saved run)
orca.run(['initialize_network'])

Running step 'initialize_network'
Time to execute step 'initialize_network': 9.41 s
Total time to execute iteration 1 with iteration value None: 9.41 s


In [10]:
# Re-check injectables: the saved output now lists 'iter_var', 'iter_step' and 'net'
orca.list_injectables()

['iter_var', 'iter_step', 'net']

In [11]:
# The 'initialize_network' step also replaces the parcel node_ids with new ones:
# the saved output is now an unquoted integer rather than the quoted string
# seen before the step ran.
orca.get_table('parcels').to_frame().node_id.dropna().max()

5458526997

### Run network aggregations

In [12]:
# Run the 'network_aggregations' step; per the saved log it computes the
# accessibility variables and reports rows removed for missing values.
orca.run(['network_aggregations'])

Running step 'network_aggregations'
Computing accessibility variables
Computing sum_income_3000
Removed 189769 rows because they contain missing values
Computing residential_units_500
Removed 4 rows because they contain missing values
Computing residential_units_1500
Removed 4 rows because they contain missing values
Computing population
Removed 189769 rows because they contain missing values
Computing poor
Removed 53114 rows because they contain missing values
Computing renters
Removed 102597 rows because they contain missing values
Computing ave_income_500
Removed 189769 rows because they contain missing values
       sum_income_3000  residential_units_500  residential_units_1500  \
count     3.082600e+04           30826.000000            30826.000000   
mean      1.478443e+09               4.360008                6.175417   
std       1.927634e+09               2.391189                2.258848   
min       0.000000e+00               0.000000                0.000000   
25%       4.96

### Look at some data

In [13]:
# Materialize the registered 'households' table as a DataFrame for inspection
households = orca.get_table('households').to_frame()

In [14]:
# Number of rows in the households frame
print(len(households))

2679684


In [15]:
# Preview the first few household rows
households.head()

Unnamed: 0,household_id,serialno,persons,building_type,cars,income,race_of_head,hispanic_head,age_of_head,workers,...,county,tract,block group,children,tenure,recent_mover,block_group_id,single_family,unit_id,node_id
0,0,2010000487191,1,6.0,1.0,85000.0,1,no,47,1.0,...,85,500901,1,,2,0,60855009011,False,1711366,65468916
1,1,2013000554587,1,9.0,1.0,27000.0,6,no,52,1.0,...,85,500901,1,,2,0,60855009011,False,1711818,65468916
2,2,2011001140920,2,2.0,1.0,6000.0,6,no,60,1.0,...,85,500901,1,,2,0,60855009011,True,1711727,65468916
3,3,2012001376432,1,3.0,1.0,28000.0,8,yes,51,1.0,...,85,500901,1,,2,0,60855009011,True,1711548,4182147571
4,4,2013000186929,1,6.0,1.0,10000.0,1,no,64,,...,85,500901,1,,2,0,60855009011,False,1711121,4182147571


### Look into an H5 file manually

In [16]:
# Open the HDF5 file read-only to inspect its contents directly.
# NOTE(review): the store is not closed in the cells visible here — call
# hdf.close() once it is no longer needed to release the file handle.
hdf = pd.HDFStore('data/bayarea_ual.h5', 'r')

In [17]:
# List the keys (one per stored table) in the HDF5 file
hdf.keys()

['/buildings',
 '/establishments',
 '/households',
 '/parcels',
 '/persons',
 '/zones']

In [18]:
# Read the 'parcels' table straight from the same HDF5 file with pandas, bypassing Orca
parcels = pd.read_hdf('data/bayarea_ual.h5', 'parcels')

In [19]:
# Column names of the parcels table as stored on disk
parcels.columns.tolist()

['development_type_id',
 'land_value',
 'acres',
 'county_id',
 'zone_id',
 'proportion_undevelopable',
 'tax_exempt_status',
 'apn',
 'parcel_id_local',
 'geom_id',
 'imputation_flag',
 'x',
 'y',
 'shape_area',
 'block_id',
 'node_id']