# Data exploration

Sam Maurer, May 2018 | Python 3.6

This notebook is for development and testing of the base data tables being used for the lab model. It also demonstrates ways to explore data registered with Orca.

In [1]:
import os

# Move from the notebook's folder up to the project root so that the `scripts`
# package and the relative `data/` paths used further down resolve.
# The guard keeps this cell idempotent: a bare os.chdir('../') climbs one more
# directory level every time the cell is re-run in the same kernel.
if not os.path.isdir('scripts'):
    os.chdir('../')

In [2]:
import numpy as np
import pandas as pd

In [3]:
from urbansim_templates import modelmanager as mm
import orca

  from pandas.core import datetools


In [4]:
# Load script-based Orca registrations
from scripts import datasources
from scripts import models

### Tables loaded by datasources.py

In [5]:
# Dump the upper-cased name and the full column list of every table registered
# with Orca, with a blank line after each table.
for table_name in orca.list_tables():
    column_names = orca.get_table(table_name).to_frame().columns.tolist()
    print(table_name.upper(), column_names, '', sep='\n')

PARCELS
['development_type_id', 'land_value', 'acres', 'county_id', 'zone_id', 'proportion_undevelopable', 'tax_exempt_status', 'apn', 'parcel_id_local', 'geom_id', 'imputation_flag', 'x', 'y', 'shape_area', 'block_id', 'node_id']

BUILDINGS
['parcel_id', 'development_type_id', 'improvement_value', 'residential_units', 'residential_sqft', 'sqft_per_unit', 'non_residential_sqft', 'building_sqft', 'nonres_rent_per_sqft', 'res_price_per_sqft', 'stories', 'year_built', 'redfin_sale_price', 'redfin_sale_year', 'redfin_home_type', 'costar_property_type', 'costar_rent', 'building_type_id', 'node_id']

UNITS
['Unnamed: 0', 'building_id', 'num_units', 'tenure', 'unit_num', 'unit_residential_price', 'unit_residential_rent', 'node_id']

HOUSEHOLDS
['household_id', 'serialno', 'persons', 'building_type', 'cars', 'income', 'race_of_head', 'hispanic_head', 'age_of_head', 'workers', 'state', 'county', 'tract', 'block group', 'children', 'tenure', 'recent_mover', 'block_group_id', 'single_family', 'un

### Run network aggregations

In [6]:
%%time
# Resolve the 'net' injectable from Orca; the cell is timed because this lookup is slow.
orca.get_injectable('net')

CPU times: user 17.1 s, sys: 494 ms, total: 17.6 s
Wall time: 17.6 s


<pandana.network.Network at 0x11f6e6048>

In [7]:
# Execute the registered 'network_aggregations' Orca step.
orca.run(['network_aggregations'])

Running step 'network_aggregations'
Computing accessibility variables
Computing sum_income_3000
Removed 2679684 rows because they contain missing values
Computing residential_units_500
Removed 1843351 rows because they contain missing values
Computing residential_units_1500
Removed 1843351 rows because they contain missing values
Computing population
Removed 2679684 rows because they contain missing values
Computing poor
Removed 648892 rows because they contain missing values
Computing renters
Removed 1144592 rows because they contain missing values
Computing ave_income_1500
Removed 2679684 rows because they contain missing values
Computing ave_income_500
Removed 2679684 rows because they contain missing values
       sum_income_3000  residential_units_500  residential_units_1500  \
count         226060.0               226060.0                226060.0   
mean               0.0                    0.0                     0.0   
std                0.0                    0.0               

### Look at some data

In [8]:
# Fetch the registered 'households' table from Orca and convert it with to_frame().
households = orca.get_table('households').to_frame()

In [13]:
# Number of rows in the households table.
print(households.shape[0])

2679684


In [9]:
# Preview the first rows of the households table.
households.head()

Unnamed: 0,household_id,serialno,persons,building_type,cars,income,race_of_head,hispanic_head,age_of_head,workers,...,county,tract,block group,children,tenure,recent_mover,block_group_id,single_family,unit_id,node_id
0,0,2010000487191,1,6.0,1.0,85000.0,1,no,47,1.0,...,85,500901,1,,2,0,60855009011,False,1711366,65430040
1,1,2013000554587,1,9.0,1.0,27000.0,6,no,52,1.0,...,85,500901,1,,2,0,60855009011,False,1711818,65529799
2,2,2011001140920,2,2.0,1.0,6000.0,6,no,60,1.0,...,85,500901,1,,2,0,60855009011,True,1711727,65529802
3,3,2012001376432,1,3.0,1.0,28000.0,8,yes,51,1.0,...,85,500901,1,,2,0,60855009011,True,1711548,65430040
4,4,2013000186929,1,6.0,1.0,10000.0,1,no,64,,...,85,500901,1,,2,0,60855009011,False,1711121,65430040


In [12]:
# Summary statistics for the households' income column.
households.income.describe()

count    2.679684e+06
mean     1.131234e+05
std      1.079445e+05
min     -1.160000e+04
25%      4.030000e+04
50%      8.400000e+04
75%      1.501000e+05
max      1.397000e+06
Name: income, dtype: float64

In [10]:
# Fetch the registered 'nodes' table from Orca and convert it with to_frame() for inspection.
nodes = orca.get_table('nodes').to_frame()

In [11]:
# Preview the first rows of the nodes table.
nodes.head()

Unnamed: 0,sum_income_3000,residential_units_500,residential_units_1500,population,poor,renters,ave_income_1500,ave_income_500
8,0.0,0.0,0.0,0.0,0.0,0.0,-inf,-inf
9,0.0,0.0,0.0,0.0,0.0,0.0,-inf,-inf
10,0.0,0.0,0.0,0.0,0.0,0.0,-inf,-inf
11,0.0,0.0,0.0,0.0,0.0,0.0,-inf,-inf
12,0.0,0.0,0.0,0.0,0.0,0.0,-inf,-inf


### Debug network aggregations that aren't working

In [14]:
# Keep a handle on the 'net' injectable so it can be called directly while debugging.
net = orca.get_injectable('net')

In [15]:
# Call net.set() directly with the households' node_id and income columns,
# outside the Orca step, to reproduce the aggregation problem in isolation.
net.set(households.node_id, households.income)

Removed 2679684 rows because they contain missing values


In [51]:
# Keep only the two columns that were passed to net.set() and show the row
# count followed by a preview of the first rows.
hh = households.loc[:, ['node_id', 'income']]
print(len(hh), hh.head(), sep='\n')

2679684
    node_id   income
0  65430040  85000.0
1  65529799  27000.0
2  65529802   6000.0
3  65430040  28000.0
4  65430040  10000.0


In [20]:
# Count the rows that remain after discarding every row with at least one missing value.
df = hh[hh.notna().all(axis=1)]
print(len(df))

2489915


In [49]:
# Wrap the network's node_ids in a Series so they can be summarized and compared.
ids = pd.Series(net.node_ids)

In [50]:
# Summary statistics of the network node ids (count and value range).
ids.describe()

count    226060.000000
mean     142550.592882
std      101599.344926
min           8.000000
25%       63110.750000
50%      122479.500000
75%      183096.250000
max      354696.000000
dtype: float64

In [41]:
# Count the households whose node_id appears among the network's node ids.
len(hh[hh.node_id.isin(ids)])

0

The problem seems to be that the node IDs in the network file are different from the node IDs in the MTC parcels table.

In [52]:
# Fetch the registered 'parcels' table from Orca and convert it with to_frame().
p = orca.get_table('parcels').to_frame()

In [61]:
# Inspect the first node_id values stored on the parcels table.
p.node_id.head()

b'parcel_id'
229116       53091636
244166       53059159
202378     1852532916
2004420    4533108955
340332      443532766
Name: node_id, dtype: object

Or maybe the IDs are fine and they're just not making it to the other tables correctly.

### Look into H5 file manually

In [6]:
# Open the HDF5 file in read-only ('r') mode; it is closed explicitly in a later cell.
hdf = pd.HDFStore('data/bayarea_ual.h5', 'r')

In [9]:
# List the keys stored in the HDF5 file.
hdf.keys()

['/buildings',
 '/establishments',
 '/households',
 '/parcels',
 '/persons',
 '/zones']

In [10]:
# Release the file handle before re-reading the file with pd.read_hdf below.
hdf.close()

In [11]:
# Read the 'buildings' key straight from the HDF5 file with pandas rather than through Orca.
buildings = pd.read_hdf('data/bayarea_ual.h5', 'buildings')

In [14]:
# Column names as stored in the file, for comparison with the Orca-registered table listed earlier.
buildings.columns.tolist()

['parcel_id',
 'development_type_id',
 'improvement_value',
 'residential_units',
 'residential_sqft',
 'sqft_per_unit',
 'non_residential_sqft',
 'building_sqft',
 'nonres_rent_per_sqft',
 'res_price_per_sqft',
 'stories',
 'year_built',
 'redfin_sale_price',
 'redfin_sale_year',
 'redfin_home_type',
 'costar_property_type',
 'costar_rent',
 'building_type_id']

In [15]:
# Read the 'parcels' key straight from the HDF5 file with pandas rather than through Orca.
parcels = pd.read_hdf('data/bayarea_ual.h5', 'parcels')

In [16]:
# Column names as stored in the file, for comparison with the Orca-registered table listed earlier.
parcels.columns.tolist()

['development_type_id',
 'land_value',
 'acres',
 'county_id',
 'zone_id',
 'proportion_undevelopable',
 'tax_exempt_status',
 'apn',
 'parcel_id_local',
 'geom_id',
 'imputation_flag',
 'x',
 'y',
 'shape_area',
 'block_id',
 'node_id']