# More troubleshooting of network aggregations

Sam Maurer, July 2018 - Python 3.6

Confirm what's causing the -1 aggregation values

In [14]:
import numpy as np
import orca
import pandana as pdna
import pandas as pd
import os; os.chdir('..')
from urbansim.utils import misc, networks
import warnings;warnings.simplefilter('ignore')

In [9]:
os.getcwd()

'/Users/maurer/Dropbox/Git-mbp13/ual/urbansim_parcel_bayarea'

In [8]:
pdna.__version__

'0.3.0'

In [2]:
# Data directory prefix; file paths below are built by string concatenation onto it,
# so it is resolved relative to the current working directory.
d = 'data/'

In [3]:
@orca.table(cache=True)
def parcels():
    """
    Load the parcel attributes CSV as a DataFrame indexed by ``primary_id``.

    ``primary_id`` is read as int and ``block_id`` as str.
    """
    # Alternate input, currently disabled: d + 'parcels_with_nodes.csv'
    csv_path = d + 'mtc_data_platform_format_7-6-18/' + 'parcel_attr.csv'
    column_types = {'primary_id': int, 'block_id': str}
    return pd.read_csv(csv_path, index_col='primary_id', dtype=column_types)

@orca.table(cache=True)
def buildings():
    """
    Load the buildings CSV as a DataFrame indexed by ``building_id``.

    Adds a derived ``res_sqft_per_unit`` column, computed as
    ``residential_sqft / residential_units``. Values that come out as positive
    infinity are replaced with 0.
    """
    df = pd.read_csv(
        d + 'mtc_data_platform_format_7-6-18/' + 'buildings_v2.csv',
        index_col='building_id', dtype={'building_id': int, 'parcel_id': int})
    df['res_sqft_per_unit'] = df['residential_sqft'] / df['residential_units']
    # Use one .loc assignment instead of chained indexing (df[col][mask] = 0):
    # the chained form may write to a temporary copy and leave df unchanged.
    # Only +inf is replaced, matching the original intent; other non-finite
    # values are left untouched.
    df.loc[df['res_sqft_per_unit'] == np.inf, 'res_sqft_per_unit'] = 0
    return df

@orca.table(cache=True)
def units():
    """
    Load the residential units CSV as a DataFrame indexed by ``unit_id``.

    ``unit_id`` and ``building_id`` are read as int.
    """
    csv_path = d + 'mtc_data_platform_format_7-6-18/' + 'units_v2.csv'
    column_types = {'unit_id': int, 'building_id': int}
    return pd.read_csv(csv_path, index_col='unit_id', dtype=column_types)

@orca.table(cache=True)
def households():
    """
    Load the households CSV as a DataFrame indexed by ``household_id``.

    Id columns (``household_id``, ``building_id``, ``unit_id``) are read as
    int; the census geography columns are read as str.
    """
    df = pd.read_csv(
        d + 'mtc_data_platform_format_7-6-18/' + 'households_v2.csv',
        index_col='household_id', dtype={
            # Key was previously misspelled 'househould_id', so it never
            # matched the index column it was meant to type.
            'household_id': int, 'block_group_id': str, 'state': str,
            'county': str, 'tract': str, 'block_group': str,
            'building_id': int, 'unit_id': int})
    return df

In [4]:
@orca.step()
def initialize_network_walk():
    """
    Register the walk network and the ``node_id_walk`` columns derived from it.

    Registers the ``netwalk`` injectable, adds a ``node_id_walk`` column to
    parcels, registers computed ``node_id_walk`` columns on the tables below
    parcels, and declares broadcasts from ``nodeswalk`` to parcels and units.

    This will be turned into a data loading template.
    """

    @orca.injectable('netwalk', cache=True)
    def make_walk_network():
        # Nodes are indexed by 'osmid'; edges reference nodes through 'u' and 'v'
        walk_nodes = pd.read_csv(d + 'bayarea_walk_nodes.csv').set_index('osmid')
        walk_edges = pd.read_csv(d + 'bayarea_walk_edges.csv')
        net = pdna.Network(
            walk_nodes.x, walk_nodes.y, walk_edges.u, walk_edges.v,
            walk_edges[['length']], twoway=True)
        net.precompute(2500)
        return net

    # Look up a network node id for each parcel from its x/y columns
    parcel_xy = orca.get_table('parcels').to_frame(columns=['x', 'y'])
    walk_net = orca.get_injectable('netwalk')
    parcel_node_ids = walk_net.get_node_ids(parcel_xy.x, parcel_xy.y)
    orca.add_column('parcels', 'node_id_walk', parcel_node_ids, cache=False)
    orca.broadcast('nodeswalk', 'parcels', cast_index=True, onto_on='node_id_walk')

    # Disabled: the same node assignment for a 'rentals' table
#     rentals = orca.get_table('rentals').to_frame(columns=['longitude', 'latitude'])
#     idswalk_rentals = orca.get_injectable('netwalk').get_node_ids(rentals.longitude, rentals.latitude)
#     orca.add_column('rentals', 'node_id_walk', idswalk_rentals, cache=False)
#     orca.broadcast('nodeswalk', 'rentals', cast_index=True, onto_on='node_id_walk')

    # Each table below takes node_id_walk from its parent table via a foreign key.
    # The columns are registered under the explicit name passed to orca.column,
    # so the helper function names are only labels.

    @orca.column('buildings', 'node_id_walk')
    def buildings_node_id(parcels, buildings):
        return misc.reindex(parcels.node_id_walk, buildings.parcel_id)

    @orca.column('units', 'node_id_walk')
    def units_node_id(buildings, units):
        return misc.reindex(buildings.node_id_walk, units.building_id)

    @orca.column('households', 'node_id_walk')
    def households_node_id(units, households):
        return misc.reindex(units.node_id_walk, households.unit_id)

    @orca.column('persons', 'node_id_walk')
    def persons_node_id(households, persons):
        return misc.reindex(households.node_id_walk, persons.household_id)

    @orca.column('jobs', 'node_id_walk')
    def jobs_node_id(buildings, jobs):
        return misc.reindex(buildings.node_id_walk, jobs.building_id)

    # While we're at it, we can use these node_id columns to define direct broadcasts
    # between the nodes table and lower-level ones, which speeds up merging

    orca.broadcast('nodeswalk', 'units', cast_index=True, onto_on='node_id_walk')

In [5]:
@orca.step()
def network_aggregations_walk_test(netwalk):
    """
    Compute the aggregations configured in ``network_aggregations_walk_test.yaml``
    on the walk network and register the result as the ``nodeswalk`` table.

    Missing values in the result are filled with 0, and summary statistics
    are printed before the table is registered.

    This will be turned into a network aggregation template.
    """
    node_vars = networks.from_yaml(
        netwalk, 'network_aggregations_walk_test.yaml').fillna(0)
    summary = node_vars.describe()
    print(summary)
    orca.add_table('nodeswalk', node_vars)

In [None]:
orca.run(["initialize_network_walk"])

In [17]:
orca.run(["network_aggregations_walk_test"])

Running step 'network_aggregations_walk_test'
Computing accessibility variables
Computing pop_500_walk
Removed 189769 rows because they contain missing values
        pop_500_walk
count  619889.000000
mean      561.022034
std       908.448853
min         0.000000
25%        11.000000
50%       224.000000
75%       770.000000
max     20222.000000
Time to execute step 'network_aggregations_walk_test': 90.43 s
Total time to execute iteration 1 with iteration value None: 90.43 s


## Confirm graph is undirected

In [18]:
edgeswalk = pd.read_csv(d + 'bayarea_walk_edges.csv')

In [19]:
edgeswalk.head(3)

Unnamed: 0,uniqueid,u,v,key,oneway,highway,name,length,lanes,width,est_width,maxspeed,access,service,bridge,tunnel,area,junction,osmid,ref
0,,25457938,2399878404,0,False,tertiary,Camino Arroyo,147.758,,,,,,,,,,,157807061,
1,,25457938,2876495484,0,False,residential,Lindsteadt Way,17.489,,,,,,,,,,,157807205,
2,,25457938,2399878306,0,False,residential,Lindsteadt Way,64.719,,,,,,,,,,,157807205,


In [21]:
edgeswalk.loc[(edgeswalk.u==25457938) & (edgeswalk.v==2399878404)]

Unnamed: 0,uniqueid,u,v,key,oneway,highway,name,length,lanes,width,est_width,maxspeed,access,service,bridge,tunnel,area,junction,osmid,ref
0,,25457938,2399878404,0,False,tertiary,Camino Arroyo,147.758,,,,,,,,,,,157807061,


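Yes, seems undirected: this u–v pair is listed only once and is flagged `oneway=False` (the reverse v–u lookup was not run here).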

## What are the "missing values" that were removed?

189769 rows

In [25]:
hh = orca.get_table('households').to_frame()

In [28]:
sum(hh.node_id_walk.isnull())

189769

This ultimately comes from the node ID on the parcels table, via buildings and units.

In [29]:
p = orca.get_table('parcels').to_frame()

In [30]:
sum(p.node_id_walk.isnull())

0

In [31]:
b = orca.get_table('buildings').to_frame()

In [32]:
sum(b.node_id_walk.isnull())

0

In [33]:
u = orca.get_table('units').to_frame()

In [34]:
sum(u.node_id_walk.isnull())

0

So it's a problem linking the households to units, maybe?

In [35]:
sum(hh.unit_id.isnull())

0

In [36]:
hh.unit_id.describe()

count    2.677468e+06
mean     1.288427e+06
std      8.497798e+05
min     -1.000000e+00
25%      5.409708e+05
50%      1.272990e+06
75%      2.024749e+06
max      2.781489e+06
Name: unit_id, dtype: float64

In [38]:
sum(hh.unit_id==-1)

189769

In [39]:
sum(hh.unit_id==-1)/len(hh)

0.070876290584985513

## Why didn't this show up on Sam B's machine?

Don't know! I don't see any discrepancies in the code. Best guess is that maybe the message just wasn't printed?