# Troubleshooting node IDs

Sam Maurer, July 2018

The problem is that a large portion of households aren't being matched to walk network aggregations, which might be affecting the regression results.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import sys
print(sys.version)

3.6.4 |Anaconda custom (x86_64)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


### Check if saved data tables exhibit the problem

In [40]:
# Saved parcel table, re-indexed by primary_id.
p = pd.read_csv('../data/parcels_with_nodes.csv').set_index('primary_id')

In [10]:
len(p)

1956207

In [5]:
p.columns

Index(['primary_id', 'development_type_id', 'land_value', 'acres', 'county_id',
       'zone_id', 'proportion_undevelopable', 'tax_exempt_status', 'apn',
       'parcel_id_local', 'geom_id', 'imputation_flag', 'x', 'y', 'shape_area',
       'block_id', 'node_id', 'node_id_small', 'node_id_walk'],
      dtype='object')

In [17]:
# Saved walk-network aggregation variables, indexed by osmid.
nw = pd.read_csv('../data/nodeswalk_vars.csv').set_index('osmid')

In [11]:
len(nw)

415716

In [8]:
nw.head(3)

Unnamed: 0,osmid,units_500_walk,sqft_unit_500_walk,singles_500_walk,elderly_hh_500_walk,children_500_walk,units_sf_500_walk,units_mf_500_walk,pop_500_walk,hh_500_walk,...,prop_elderly_1500_walk,prop_black_1500_walk,prop_white_1500_walk,prop_asian_1500_walk,prop_hisp_1500_walk,prop_rich_1500_walk,prop_poor_1500_walk,pop_jobs_ratio_1500_walk,avg_hhs_500_walk,avg_hhs_1500_walk
0,25457938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.103236,0.007344,0.694627,0.01121,0.877464,0.006958,0.124855,2.743372,0.0,3.98
1,25457939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.125,0.0,1.0,0.0,0.384615,0.076923,0.038462,0.05029,0.0,2.888889
2,26027651,336.0,1000.871399,86.0,153.0,178.0,66.0,270.0,783.0,334.0,...,0.282793,0.010092,0.614254,0.294418,0.098181,0.18602,0.063045,5.803326,2.337313,2.523899


In [15]:
# Number of parcels whose node_id_walk is not in the aggregation table's index.
len(p.loc[~p.node_id_walk.isin(nw.index)])

562628

In [16]:
# Same count, expressed as a share of all parcels.
len(p.loc[~p.node_id_walk.isin(nw.index)])/len(p)

0.2876116893559833

Yes, 29% of the parcels have node IDs that aren't in the aggregation table.

In [22]:
# How many nodes is that?

len(p.loc[~p.node_id_walk.isin(nw.index)].node_id_walk.unique())

115794

In [23]:
# Share of the distinct parcel node ids that are absent from the aggregation table.
len(p.loc[~p.node_id_walk.isin(nw.index)].node_id_walk.unique())/len(p.node_id_walk.unique())

0.35801984361328143

### What does the network file look like?

In [18]:
# Walk-network node file, indexed by osmid.
nodes = pd.read_csv('../data/bayarea_walk_nodes.csv').set_index('osmid')

  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
len(nodes)

619889

In [24]:
nodes.head(3)

Unnamed: 0_level_0,x,y,ref,highway
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
25457938,-121.553913,37.007868,,traffic_signals
25457939,-121.550781,37.002077,,
26392420,-121.384907,36.988574,,


In [21]:
# Number of parcels whose node_id_walk is not in the network node file's index.
len(p.loc[~p.node_id_walk.isin(nodes.index)])

0

### Recreate the aggregations

In [3]:
import orca
# Move up one directory so the relative 'data/' and 'configs/' paths used
# below resolve. NOTE: the chdir is relative, so re-running this cell moves
# up one more level each time.
import os; os.chdir('..')
import pandana as pdna
from urbansim.utils import misc, networks
# NOTE: this silences every warning raised from here on.
import warnings;warnings.simplefilter('ignore')
os.getcwd()

'/Users/maurer/Dropbox/Git-imac/ual/urbansim_parcel_bayarea'

In [4]:
pdna.__version__

'0.3.0'

In [5]:
d = 'data/'

In [6]:
@orca.table(cache=True)
def parcels():
    """Register the parcel attribute table, indexed by ``primary_id``.

    Reads ``parcel_attr.csv`` from the 7-6-18 MTC data platform folder under
    ``d``. ``primary_id`` is read as int and ``block_id`` as str.
    """
    # Alternative input that was tried here: d + 'parcels_with_nodes.csv'
    csv_path = d + 'mtc_data_platform_format_7-6-18/' + 'parcel_attr.csv'
    column_types = {'primary_id': int, 'block_id': str}
    return pd.read_csv(csv_path, index_col='primary_id', dtype=column_types)

@orca.table(cache=True)
def buildings():
    """Register the buildings table, indexed by ``building_id``.

    Adds a derived ``res_sqft_per_unit`` column equal to
    ``residential_sqft / residential_units``. Ratios that come out as
    positive infinity (division by zero units) are reset to 0; NaN ratios
    are left untouched.
    """
    df = pd.read_csv(
        d + 'mtc_data_platform_format_7-6-18/' + 'buildings_v2.csv',
        index_col='building_id', dtype={'building_id': int, 'parcel_id': int})
    df['res_sqft_per_unit'] = df['residential_sqft'] / df['residential_units']
    # Use one .loc assignment rather than chained indexing (df[col][mask] = 0):
    # the chained form raises SettingWithCopyWarning and may write to a
    # temporary copy instead of df.
    df.loc[df['res_sqft_per_unit'] == np.inf, 'res_sqft_per_unit'] = 0
    return df

@orca.table(cache=True)
def units():
    """Register the residential units table, indexed by ``unit_id``.

    Reads ``units_v2.csv`` from the 7-6-18 MTC data platform folder under
    ``d``, with ``unit_id`` and ``building_id`` read as int.
    """
    csv_path = d + 'mtc_data_platform_format_7-6-18/' + 'units_v2.csv'
    id_types = {'unit_id': int, 'building_id': int}
    return pd.read_csv(csv_path, index_col='unit_id', dtype=id_types)

@orca.table(cache=True)
def households():
    """Register the households table, indexed by ``household_id``.

    Reads ``households_v2.csv`` from the 7-6-18 MTC data platform folder
    under ``d``. The geographic identifier columns are read as str and the
    id columns (``household_id``, ``building_id``, ``unit_id``) as int.
    """
    df = pd.read_csv(
        d + 'mtc_data_platform_format_7-6-18/' + 'households_v2.csv',
        index_col='household_id', dtype={
            # The key must match index_col exactly for the dtype to apply.
            'household_id': int, 'block_group_id': str, 'state': str,
            'county': str, 'tract': str, 'block_group': str,
            'building_id': int, 'unit_id': int})
    return df

In [7]:
@orca.step()
def initialize_network_walk():
    """Build the walk network and attach ``node_id_walk`` to the data tables.

    Registers the cached ``netwalk`` injectable, assigns every parcel a node
    id from its x/y coordinates, broadcasts ``nodeswalk`` onto ``parcels``,
    and registers computed ``node_id_walk`` columns on buildings, units and
    households, each looked up through its parent table.
    """

    @orca.injectable('netwalk', cache=True)
    def build_networkwalk():
        walk_nodes = pd.read_csv(d + 'bayarea_walk_nodes.csv')
        walk_nodes = walk_nodes.set_index('osmid')
        walk_edges = pd.read_csv(d + 'bayarea_walk_edges.csv')
        net = pdna.Network(
            walk_nodes.x, walk_nodes.y,
            walk_edges.u, walk_edges.v, walk_edges[['length']],
            twoway=True)
        net.precompute(2500)
        return net

    # Map each parcel's coordinates to a node id on the walk network.
    parcel_xy = orca.get_table('parcels').to_frame(columns=['x', 'y'])
    walk_net = orca.get_injectable('netwalk')
    parcel_node_ids = walk_net.get_node_ids(parcel_xy.x, parcel_xy.y)
    orca.add_column('parcels', 'node_id_walk', parcel_node_ids, cache=False)
    orca.broadcast(
        'nodeswalk', 'parcels', cast_index=True, onto_on='node_id_walk')

    # The registered column name is passed explicitly to each decorator, so
    # the function names below are only local labels.
    @orca.column('buildings', 'node_id_walk')
    def buildings_node_id(parcels, buildings):
        return misc.reindex(parcels.node_id_walk, buildings.parcel_id)

    @orca.column('units', 'node_id_walk')
    def units_node_id(buildings, units):
        return misc.reindex(buildings.node_id_walk, units.building_id)

    @orca.column('households', 'node_id_walk')
    def households_node_id(units, households):
        return misc.reindex(units.node_id_walk, households.unit_id)

In [8]:
@orca.step()
def network_aggregations_walk_test(netwalk):
    """Run the walk-network aggregations and register them as ``nodeswalk``.

    Aggregations are defined in ``network_aggregations_walk_test.yaml``.
    Missing values in the result are filled with 0 and a ``describe()``
    summary is printed before the table is registered.
    """
    aggregated = networks.from_yaml(
        netwalk, 'network_aggregations_walk_test.yaml').fillna(0)
    print(aggregated.describe())
    orca.add_table('nodeswalk', aggregated)

In [9]:
orca.run(["initialize_network_walk"])

Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 119.73 s
Total time to execute iteration 1 with iteration value None: 119.74 s


### Aggregations are of households and buildings - what do those IDs look like?

In [11]:
h = orca.get_table('households').to_frame()

In [18]:
# Number of households whose node_id_walk is not in the saved aggregation table (nw).
len(h.loc[~h.node_id_walk.isin(nw.index)])

768814

In [19]:
b = orca.get_table('buildings').to_frame()

In [20]:
# Number of buildings whose node_id_walk is not in the saved aggregation table (nw).
len(b.loc[~b.node_id_walk.isin(nw.index)])

519055

### Ok, do the aggregations

In [22]:
orca.run(["network_aggregations_walk_test"])

Running step 'network_aggregations_walk_test'
Computing accessibility variables
Computing pop_500_walk
Removed 189769 rows because they contain missing values
Computing sqft_unit_500_walk
Computing singles_500_walk
Removed 52084 rows because they contain missing values
Computing elderly_hh_500_walk
Removed 34553 rows because they contain missing values
Computing children_500_walk
Removed 189769 rows because they contain missing values
        pop_500_walk  sqft_unit_500_walk  singles_500_walk  \
count  619889.000000       619889.000000     619889.000000   
mean      561.022034         1239.638062         67.199196   
std       908.448853         1012.501038        212.378616   
min         0.000000            0.000000          0.000000   
25%        11.000000          449.750000          0.000000   
50%       224.000000         1327.473999         13.000000   
75%       770.000000         1760.372437         61.000000   
max     20222.000000        30000.000000       8822.000000   

  

In [24]:
nw2 = orca.get_table('nodeswalk').to_frame()

In [25]:
# Number of households whose node_id_walk is not in the recomputed nodeswalk table (nw2).
len(h.loc[~h.node_id_walk.isin(nw2.index)])

189769

In [26]:
ids = orca.get_injectable('netwalk').node_ids

In [27]:
# Number of households whose node_id_walk is not among the network's own node ids.
len(h.loc[~h.node_id_walk.isin(ids)])

189769

Aggregations seem to be returning fewer rows than there are nodes in the network, which contradicts the pandana documentation: https://github.com/UDST/pandana/blob/master/pandana/network.py#L315-L319

Don't see anything obvious in the helper function either: https://github.com/UDST/urbansim/blob/master/urbansim/utils/networks.py

### Retry with a single aggregation, for easier troubleshooting

In [30]:
# Write a minimal aggregation config with a single variable (pop_500_walk:
# households' `persons` within radius 500, flat decay) to configs/net_test.yaml.
with open('configs/net_test.yaml', 'w') as t:
    t.write('''

name: network_aggregations
desc: Network aggregations
model_type: networks
node_col: node_id_walk
variable_definitions:

  - name: pop_500_walk
    dataframe: households
    varname: persons
    radius: 500
    decay: flat
''')

In [31]:
@orca.step()
def net_test(netwalk):
    """Run the single-variable test aggregation and register it as ``nodestest``.

    The aggregation is defined in ``net_test.yaml``. Missing values in the
    result are filled with 0 and a ``describe()`` summary is printed before
    the table is registered.
    """
    test_result = networks.from_yaml(netwalk, 'net_test.yaml').fillna(0)
    print(test_result.describe())
    orca.add_table('nodestest', test_result)

In [33]:
orca.run(["net_test"])

Running step 'net_test'
Computing accessibility variables
Computing pop_500_walk
Removed 189769 rows because they contain missing values
        pop_500_walk
count  619889.000000
mean      561.022034
std       908.448853
min         0.000000
25%        11.000000
50%       224.000000
75%       770.000000
max     20222.000000
Time to execute step 'net_test': 3.90 s
Total time to execute iteration 1 with iteration value None: 3.90 s


In [34]:
nt = orca.get_table('nodestest').to_frame()

In [35]:
len(h.loc[~h.node_id_walk.isin(nt.index)])

189769

In [36]:
# Same check restricted to households with building_id > -1
# (presumably -1 marks households not matched to a building; confirm against the data).
len(h.loc[(h.building_id>-1) & (~h.node_id_walk.isin(nt.index))])

0

In [37]:
# Number of households with a null node_id_walk.
sum(h.node_id_walk.isnull())

189769

This is fine - it's just the households that aren't matched to buildings.

In [42]:
len(nt)

619889

In [43]:
len(orca.get_injectable('netwalk').node_ids)

619889

In [47]:
p2 = orca.get_table('parcels').to_frame()

In [48]:
len(p2.node_id_walk.unique())

323429

In [49]:
# Number of parcels (with freshly assigned node ids) whose node_id_walk is not in the test aggregation table.
len(p2.loc[~p2.node_id_walk.isin(nt.index)])

0

Problem is not reproduced with this aggregation

In [21]:
os.chdir('..')

In [39]:
os.chdir('notebooks-sam')