In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import statsmodels.api as sm
import numpy as np
import geopandas as gpd
from matplotlib import pyplot as plt
import matplotlib.ticker
from haversine import haversine_vector, Unit
pd.options.mode.chained_assignment = None  # default='warn'
from sttn.data.lehd import OriginDestinationEmploymentDataProvider
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
provider = OriginDestinationEmploymentDataProvider()
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import scipy.optimize as optimize
import seaborn as sns

import math
from sttn.network import SpatioTemporalNetwork
from sttn.utils import add_distance
import os
%matplotlib inline

In [135]:
# Metro areas to process. Each entry is (city label, list of "<County name>, <ST>" strings);
# the state abbreviation after the comma selects which LODES state files are loaded.
# Entries that are commented out are currently disabled.
cities = [
    (
        'New York City',
        [
            'Kings County, NY', 'Queens County, NY', 'New York County, NY', 'Bronx County, NY',
            'Richmond County, NY', 'Westchester County, NY', 'Bergen County, NJ', 'Hudson County, NJ',
            'Passaic County, NJ', 'Putnam County, NY', 'Rockland County, NY', 'Suffolk County, NY',
            'Nassau County, NY', 'Middlesex County, NJ', 'Monmouth County, NJ', 'Ocean County, NJ',
            'Somerset County, NJ', 'Essex County, NJ', 'Union County, NJ', 'Morris County, NJ',
            'Sussex County, NJ', 'Hunterdon County, NJ', 'Pike County, PA',
        ],
    ),

    # ('Los Angeles', ['Los Angeles County, CA','Orange County, CA']),

    (
        'Chicago',
        [
            'Cook County, IL', 'DeKalb County, IL', 'DuPage County, IL', 'Grundy County, IL',
            'Kankakee County, IL', 'Kane County, IL', 'Kendall County, IL', 'McHenry County, IL',
            'Will County, IL', 'Jasper County, IN', 'Lake County, IN', 'Newton County, IN',
            'Porter County, IN', 'Lake County, IL', 'Kenosha County, WI',
        ],
    ),

    # ('Houston', ['Harris County, TX','Austin County, TX','Brazoria County, TX',
    #              'Chambers County, TX','Fort Bend County, TX','Galveston County, TX','Harris County, TX',
    #              'Liberty County, TX','Montgomery County, TX','Waller County, TX']),

    (
        'Boston',
        [
            'Suffolk County, MA', 'Plymouth County, MA',
            'Norfolk County, MA', 'Essex County, MA', 'Middlesex County, MA',
            'Rockingham County, NH', 'Strafford County, NH',
        ],
    ),

    # ('Phoenix', ['Maricopa County, AZ','Pinal County, AZ','Gila County, AZ']),

    (
        'Philadelphia',
        [
            'Burlington County, NJ', 'Camden County, NJ', 'Gloucester County,NJ',
            'Bucks County, PA', 'Chester County, PA', 'Montgomery County, PA',
            'Delaware County, PA', 'Philadelphia County, PA',
            'New Castle County, DE', 'Cecil County, MD', 'Salem County, NJ',
        ],
    ),

    # ('San Antonio', ['Bexar County, TX','Atascosa County, TX','Bandera County, TX','Comal County, TX',
    #                  'Guadalupe County, TX','Kendall County, TX','Medina County, TX','Wilson County, TX']),

    # NOTE: this disabled entry has three elements, unlike the (label, counties) pairs above.
    # ('San Diego', 'ca', ['San Diego County, CA']),

    # ('Dallas',  ['Collin County, TX','Dallas County, TX','Denton County, TX','Ellis County, TX',
    #              'Hunt County, TX','Kaufman County, TX','Rockwall County, TX','Johnson County, TX',
    #              'Parker County, TX','Tarrant County, TX','Wise County, TX']),
]

In [136]:
def rwacbystate(state):
    """Download and cache the 2019 LODES7 RAC and WAC block totals for one state.

    Nothing happens when ``racwac/<state>.csv`` already exists. Otherwise the
    state's RAC file (keyed by ``h_geocode``) and WAC file (keyed by
    ``w_geocode``) are read from lehd.ces.census.gov, reduced to the ``C000``,
    ``CE01`` and ``CE03`` columns, outer-joined on the block code (stored in a
    column named ``ct``) and written to ``racwac/<state>.csv``. A block that
    appears in only one of the two files gets 0 for the missing side.

    Parameters
    ----------
    state : str
        State abbreviation as it appears in the LODES URLs (the processing
        loop passes it lower-cased, e.g. ``'ny'``).

    Returns
    -------
    None
        The result is the CSV file on disk.
    """
    cache_path = 'racwac/%s.csv'%state
    if os.path.exists(cache_path):
        return

    rac = pd.read_csv('https://lehd.ces.census.gov/data/lodes/LODES7/%s/rac/%s_rac_S000_JT00_2019.csv.gz'%(state,state),
                      compression='gzip')
    rac = rac[['h_geocode','C000','CE01','CE03']].rename(
        columns={'h_geocode':'ct','C000':'S000residence',
                 'CE01':'SE01residence','CE03':'SE03residence'})

    wac = pd.read_csv('https://lehd.ces.census.gov/data/lodes/LODES7/%s/wac/%s_wac_S000_JT00_2019.csv.gz'%(state,state),
                      compression='gzip')
    wac = wac[['w_geocode','C000','CE01','CE03']].rename(
        columns={'w_geocode':'ct','C000':'S000jobs',
                 'CE01':'SE01jobs','CE03':'SE03jobs'})

    # outer join keeps blocks that only have residents or only have jobs
    df = rac.merge(wac,on='ct',how='outer').fillna(0)

    # the cache folder is not guaranteed to exist on a fresh checkout
    os.makedirs('racwac', exist_ok=True)
    df.to_csv(cache_path,index=False)
    print(state,'rac, wac downloaded')

In [137]:
def xwalkbystate(state):
    """Download and cache the LODES7 geography crosswalk for one state.

    Nothing happens when ``xwalk/<state>.csv`` already exists. Otherwise the
    state's crosswalk is read from lehd.ces.census.gov, reduced to the block
    code (``tabblk2010``, stored as ``ct``), the county name and the block
    latitude/longitude, and written to ``xwalk/<state>.csv``. The county name
    is cut at the first comma, so only the part before it is kept.

    Parameters
    ----------
    state : str
        State abbreviation as it appears in the LODES URLs (the processing
        loop passes it lower-cased, e.g. ``'ny'``).

    Returns
    -------
    None
        The result is the CSV file on disk.
    """
    cache_path = 'xwalk/%s.csv'%state
    if os.path.exists(cache_path):
        return

    xwalk = pd.read_csv('https://lehd.ces.census.gov/data/lodes/LODES7/%s/%s_xwalk.csv.gz'%(state,state),
                        compression='gzip')
    xwalk = xwalk[['tabblk2010','ctyname','blklatdd','blklondd']].rename(columns={'tabblk2010':'ct'})
    # vectorised split; unlike a per-row lambda it leaves missing names as NaN instead of raising
    xwalk['ctyname'] = xwalk['ctyname'].str.split(',').str[0]

    # the cache folder is not guaranteed to exist on a fresh checkout
    os.makedirs('xwalk', exist_ok=True)
    xwalk.to_csv(cache_path,index=False)
    print(state,'xwalk downloaded')

In [138]:
def odbystate(state):
    """Download and cache the 2019 LODES7 origin-destination flows for one state.

    Nothing happens when ``od/<state>.csv`` already exists. Otherwise the
    state's ``od_aux`` file is read from lehd.ces.census.gov and written to
    ``od/<state>.csv`` with the columns, in this order:
    ``origin, destination, S000, SE01, SE02, SE03``.

    ``origin`` is the home block (``h_geocode``) and ``destination`` the work
    block (``w_geocode``). The home block is written first because the
    processing loop joins the RAC "residence" totals onto ``origin`` and the
    WAC "jobs" totals onto ``destination``. Cache files written with the
    opposite labelling have to be deleted to pick this up.

    NOTE(review): only the ``aux`` part is downloaded. According to the LODES
    file-naming documentation that part holds jobs whose residence is outside
    the state; confirm whether the ``main`` part should be loaded as well.

    Parameters
    ----------
    state : str
        State abbreviation as it appears in the LODES URLs (the processing
        loop passes it lower-cased, e.g. ``'ny'``).

    Returns
    -------
    None
        The result is the CSV file on disk.
    """
    cache_path = 'od/%s.csv'%state
    if os.path.exists(cache_path):
        return

    od = pd.read_csv('https://lehd.ces.census.gov/data/lodes/LODES7/%s/od/%s_od_aux_JT00_2019.csv.gz'%(state,state),
                     compression='gzip')
    # home block first: downstream code treats the first two columns as origin, destination
    od = od[['h_geocode', 'w_geocode','S000','SE01','SE02', 'SE03']]
    od = od.rename(columns={'h_geocode':'origin','w_geocode':'destination'})

    # the cache folder is not guaranteed to exist on a fresh checkout
    os.makedirs('od', exist_ok=True)
    od.to_csv(cache_path,index=False)
        

In [139]:
target_columns = ['S000','SE01','SE03']

# Create the cache/output folders up front so a fresh checkout does not fail
# on the first write (or, worse, on the last one after all the processing).
for folder in ('racwac', 'xwalk', 'od', 'processedOD'):
    os.makedirs(folder, exist_ok=True)

# For every metro area: load the per-state LODES tables, keep the flows whose two
# end blocks both lie in the listed counties, attach block totals and a
# great-circle distance, and write the result to processedOD/<city>.csv.
for city, counties in cities:
    print(city)
    ctlst = []
    stateDict = {}

    # group the "<county>, <ST>" labels by lower-cased state abbreviation
    for county in counties:
        ctyname = county.split(',')[0]
        state = county.split(',')[1].strip().lower()
        stateDict.setdefault(state, []).append(ctyname)

    # read rac/wac, xwalk and od by state (downloaded on first use, cached afterwards);
    # the per-state frames are collected and concatenated once after the loop
    odFrames, wacracFrames, xwalkFrames = [], [], []
    for state in stateDict.keys():
        print(state)
        rwacbystate(state)
        xwalkbystate(state)
        odbystate(state)

        statexwalk = pd.read_csv('xwalk/%s.csv'%state)
        statexwalk = statexwalk.loc[statexwalk['ctyname'].isin(stateDict[state])]
        ctlst += statexwalk['ct'].values.tolist()
        xwalkFrames.append(statexwalk)

        od = pd.read_csv('od/%s.csv'%state)
        # rename by name rather than by position so a reordered file cannot mislabel columns
        od = od.rename(columns={'S000':'S000flow', 'SE01':'SE01flow',
                                'SE02':'SE02flow', 'SE03':'SE03flow'})
        odFrames.append(od)

        wacrac = pd.read_csv('racwac/%s.csv'%state)
        wacracFrames.append(wacrac)

    xwalkConcat = pd.concat(xwalkFrames,axis=0)
    odConcat = pd.concat(odFrames,axis=0)
    wacracConcat = pd.concat(wacracFrames,axis=0)

    # keep only flows with both end blocks inside the selected counties
    odConcat = odConcat.loc[odConcat['origin'].isin(ctlst)]
    odConcat = odConcat.loc[odConcat['destination'].isin(ctlst)]
    print(odConcat.shape)

    # add rac, wac: residence totals go on the origin block, job totals on the destination block
    # NOTE(review): this presumes 'origin' in od/<state>.csv is the home block - verify
    # against the function that writes that file.
    odConcat = odConcat.merge(wacracConcat[['ct','S000residence','SE01residence',
                                           'SE03residence']],left_on='origin',right_on='ct')
    odConcat = odConcat.drop(columns='ct')
    odConcat = odConcat.merge(wacracConcat[['ct','S000jobs','SE01jobs',
                                           'SE03jobs']],left_on='destination',right_on='ct')
    odConcat = odConcat.drop(columns='ct')
    print(odConcat.shape)

    # add lat, lon of both end blocks
    blockCoords = xwalkConcat[['ct','blklatdd','blklondd']]
    odConcat = odConcat.merge(blockCoords,left_on='origin',right_on='ct')
    odConcat = odConcat.rename(columns={'blklatdd':'olat','blklondd':'olng'}).drop(columns='ct')
    odConcat = odConcat.merge(blockCoords,left_on='destination',right_on='ct')
    odConcat = odConcat.rename(columns={'blklatdd':'dlat','blklondd':'dlng'}).drop(columns='ct')
    print(odConcat.shape)

    # calculate distance (great-circle, in kilometres)
    from_points = list(zip(odConcat.olat, odConcat.olng))
    to_points = list(zip(odConcat.dlat, odConcat.dlng))
    odConcat['distance'] = haversine_vector(from_points, to_points, Unit.KILOMETERS)
    odConcat = odConcat.drop(['olat', 'olng', 'dlat', 'dlng'], axis=1)

    # replace 0 distance by the shortest positive distance observed from the same origin
    odConcat_above = odConcat.loc[odConcat.distance > 0]
    odConcat_below = odConcat.loc[odConcat.distance == 0].drop(columns='distance')
    misDistance = odConcat_above.groupby(['origin']).agg({'distance':'min'})
    # inner join: a zero-distance row whose origin has no positive-distance flow is dropped
    odConcat_below = odConcat_below.merge(misDistance,on=['origin'])
    odConcat = pd.concat([odConcat_above,odConcat_below],axis=0)

    odConcat.to_csv('processedOD/%s.csv'%city)



New York City
ny
nj
pa
(626551, 6)
(626551, 12)
(626551, 16)
Chicago
il
in
wi
(114101, 6)
(114101, 12)
(114101, 16)
Boston
ma
nh
(64277, 6)
(64277, 12)
(64277, 16)
Philadelphia
nj
pa
de
md
(250068, 6)
(250068, 12)
(250068, 16)
