In [1]:
import numpy as np
import pandas as pd

from setup import *

%load_ext autoreload
%autoreload 2

In [202]:
hh = pd.read_csv(survey_dir+"household.csv")
pr = pd.read_csv(survey_dir+"person.csv")
pl = pd.read_csv(survey_dir+"place.csv")
pt = pd.read_csv(survey_dir+"place_transit.csv")
lc = pd.read_csv(survey_dir+"location.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
num_pl = len(pl)
print(len(hh), "Households", len(pr), "People", len(pl), "Places", len(pl[pl.placeno!=1]), "Trips")

12391 Households 30683 People 128229 Places 99652 Trips


In [4]:
# Income
f_inc = (hh.hhinc >= 0) | (hh.hhinc2 >= -1)
print(np.sum(f_inc==0), 'households with no income')

30 households with no income


In [5]:
# Trip numbers
f_hhtrips = (hh.hhtrips>0)
print(np.sum(f_hhtrips==0), 'households with no trips')

349 households with no trips


In [6]:
hh = hh[f_inc & f_hhtrips]

In [7]:
# Age
f_age = (pr.aage >= -1)
print(np.sum(f_age==0), 'people with no age')

10 people with no age


In [8]:
# License
f_lic = (pr.lic >= -1)
print(np.sum(f_lic==0), 'people with no license info')

2 people with no license info


In [9]:
# Education
f_edu = (pr.educ > 0)
print(np.sum(f_edu==0), 'people with no education info')

42 people with no education info


In [10]:
pr = pr[f_age & f_lic & f_edu]

In [11]:
# Travel Mode
f_mode = (pl['mode'] >= -1) & (pl['mode'] // 100 < 8)
print(np.sum(f_mode==0), 'places with invalid modes (not available or air travel)')

331 places with invalid modes (not available or air travel)


In [12]:
pl = pl[f_mode]

In [13]:
# Trip Location
f_state_loc = (lc['state_fips'].isin([17,18,26,55]))
# tight restrictions to speed up the image download process
#f_state_loc = (lc['state_fips'].isin([17]))
pl = pd.merge(lc[f_state_loc][['sampno','locno']], pl, on=['sampno','locno'])

print(np.sum(f_state_loc==0), 'locations not in valid states')
print(num_pl-len(pl), 'places not in valid states')

1968 locations not in valid states
989 places not in valid states


In [14]:
lc = lc[f_state_loc]

In [16]:
print("Left with", len(hh), "Households", len(pr), "People", len(pl), "Places", len(pl[pl.placeno!=1]), "Trips")

Left with 12013 Households 30640 People 127240 Places 98899 Trips


## Turn Places into Trips

In [42]:
pl_new = pl[['sampno', 'perno', 'placeGroup', 'locno', 'arrtime', 'deptime', 'travtime', 'mode']].sort_values(by=['sampno','perno','placeGroup']).reset_index(drop=True)
pl_new['prev_placeGroup'] = pl_new['placeGroup'] - 1

In [65]:
tp = pd.merge(pl_new, pl_new, left_on=['sampno', 'perno', 'placeGroup'], right_on=['sampno', 'perno', 'prev_placeGroup'], suffixes=('_1','_2'))
tp = tp[['sampno', 'perno', 'placeGroup_1', 'placeGroup_2', 'locno_1', 'locno_2', 'deptime_1', 'arrtime_2', 'travtime_2', 'mode_2']]

In [66]:
lc = lc[['sampno', 'locno', 'loctype', 'state', 'country', 'state_fips',
       'county_fips', 'tract_fips', 'out_region', 'home', 'latitude',
       'longitude']]
tp = pd.merge(tp, lc, left_on=['sampno', 'locno_1'], right_on = ['sampno', 'locno'])
tp = pd.merge(tp, lc, left_on=['sampno', 'locno_2'], right_on = ['sampno', 'locno'], suffixes=("_1", "_2"))

In [69]:
tp.to_csv(data_dir+"trips.csv", index=False)

In [70]:
tp['tract_1'] = tp['state_fips_1']*1000000000+tp['county_fips_1']*1000000+tp['tract_fips_1']
tp['tract_2'] = tp['state_fips_2']*1000000000+tp['county_fips_2']*1000000+tp['tract_fips_2']

## Simplify Mode Representation

In [85]:
tp['mode'] = tp['mode_2']//100
tp['mode'] = tp['mode'].map({1:1,2:2,3:3,4:3,5:4,6:3,7:3})
print("1: Active; 2: Auto; 3: Mobility Services; 4: Public Transit")
print(tp['mode'].value_counts().sort_index() / len(tp))

1: Active; 2: Auto; 3: Mobility Services; 4: Public Transit
1    0.134597
2    0.706344
3    0.055057
4    0.104002
Name: mode, dtype: float64


## OD pairs

In [183]:
od = tp.groupby(['tract_1','tract_2'], as_index=False).count()[['tract_1','tract_2','sampno']]
print("# OD pairs:", len(od))
print("# OD pairs with more than 15 trips:", len(od[od['sampno']>15]), "accounting for", od[od['sampno']>15].sum()['sampno'], "trips")

# OD pairs: 46878
# OD pairs with more than 15 trips: 457 accounting for 16011 trips


## OD Mode

In [185]:
od_mode = tp.groupby(['tract_1','tract_2','mode'], as_index=False).count()[['tract_1','tract_2','mode','sampno']]

od_mode['key'] = 0
od['key'] = 0
# Use all OD
od_mode_full = pd.merge(od[['tract_1','tract_2','key']].drop_duplicates(), od_mode[['mode','key']].drop_duplicates()).drop("key", 1)
od_mode = od_mode.drop("key",1)
od = od.drop("key",1)
od_mode_full["i"] = 1

In [186]:
od_mode = pd.merge(od_mode_full, od_mode, on=['tract_1','tract_2','mode'], how='outer').fillna(0)
od_mode = pd.merge(od_mode, od, on=['tract_1','tract_2'], suffixes=("_mode","_od"))
od_mode['mode_share'] = od_mode['sampno_mode'] / od_mode['sampno_od']

In [187]:
od_mode.to_csv(data_dir+"od_mode.csv", index=False)

## Filter OD pairs
- A lot of public transit trips are filltered.

In [196]:
od = od[od['sampno']>15]

In [197]:
tracts = pd.concat([od['tract_1'], od['tract_2']]).drop_duplicates().tolist()
print("# Census tracts in these OD pairs:", len(tracts))

# Census tracts in these OD pairs: 282


In [190]:
od_mode = tp.groupby(['tract_1','tract_2','mode'], as_index=False).count()[['tract_1','tract_2','mode','sampno']]

od_mode['key'] = 0
od['key'] = 0
# Use filtered OD (>15 trips)
od_mode_full = pd.merge(od[['tract_1','tract_2','key']].drop_duplicates(), od_mode[['mode','key']].drop_duplicates()).drop("key", 1)
od_mode = od_mode.drop("key",1)
od = od.drop("key",1)
od_mode_full["i"] = 1

In [191]:
od_mode = pd.merge(od_mode_full, od_mode, on=['tract_1','tract_2','mode'], how='outer').fillna(0)
od_mode = pd.merge(od_mode, od, on=['tract_1','tract_2'], suffixes=("_mode","_od"))
od_mode['mode_share'] = od_mode['sampno_mode'] / od_mode['sampno_od']

In [198]:
od_mode.to_csv(data_dir+"od_mode_filtered.csv", index=False)

In [200]:
# Output Census Tracts to Get Images
tracts = pd.concat([od['tract_1'], od['tract_2']]).drop_duplicates().tolist()
lc = pd.read_csv(survey_dir+"location.csv")
lc['tract'] = lc['state_fips']*1000000000+lc['county_fips']*1000000+lc['tract_fips']

lc = lc[['tract','state_fips','county_fips','tract_fips','latitude','longitude']].drop_duplicates()
lc = lc[lc['tract'].isin(tracts)]

lc.to_csv(data_dir+"census_tracts_filtered.csv", index=False)