In [11]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import statsmodels.api as sm
import numpy as np
import geopandas as gpd
import networkx as nx
from ipfn import ipfn
# import pycombo
from sklearn.linear_model import LinearRegression
from sklearn.metrics import normalized_mutual_info_score as nmi
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
import seaborn as sns
pd.options.mode.chained_assignment = None  # default='warn'
from sttn.data.lehd import OriginDestinationEmploymentDataProvider
provider = OriginDestinationEmploymentDataProvider()
from jenks import jenks
import math
from sttn.network import SpatioTemporalNetwork
from sttn.utils import add_distance

%matplotlib inline

In [59]:
cities = [
    ('New York City', 'ny', ['New York County, NY', 'Queens County, NY','Kings County, NY','Bronx County, NY','Richmond County, NY']),
    ('Los Angeles', 'ca', ['Los Angeles County, CA']),
    ('Chicago', 'il', ['Cook County, IL']),
    ('Houston', 'tx', ['Harris County, TX']),
    ('Boston', 'ma', ['Suffolk County, MA', 'Middlesex County, MA']),
    ('Phoenix', 'az', ['Maricopa County, AZ']),
    ('Philadelphia', 'pa', ['Philadelphia County, PA']),
    ('San Antonio', 'tx', ['Bexar County, TX']),
    ('San Diego', 'ca', ['San Diego County, CA']),
    ('Dallas', 'tx', ['Dallas County, TX']),
    ('San Jose', 'ca', ['Santa Clara County, CA']),
    ('Austin', 'tx', ['Travis County, TX']),
]

In [13]:
# get data for above cities - census tract level

allCity_dfs = []
job_column = 'S000'
comp_aggs={job_column: 'sum'}
for city, state, conties in cities:
    state_network = provider.get_data(state=state, year=2018)
    city_network = state_network.filter_nodes(state_network.nodes.county.isin(conties))
    with_distance = add_distance(city_network).edges
    
    city_jobs = city_network.agg_adjacent_edges(aggs=comp_aggs, outgoing=False).rename(columns={job_column: 'jobs'}).reset_index()
    city_pop = city_network.agg_adjacent_edges(aggs=comp_aggs, outgoing=True).rename(columns={job_column: 'residence'}).reset_index()
    
    city_dist = with_distance.merge(city_jobs, on='destination')
    city_cum = city_dist.merge(city_pop, on='origin')
    
    # edge cases
#     city_cum.loc[city_cum.distance == 0, 'distance'] = 0.2
#     city_cum.loc[city_cum['SE01'] == 0, 'SE01'] = 0.1
#     city_cum.loc[city_cum['SE02'] == 0, 'SE02'] = 0.1
#     city_cum.loc[city_cum['SE03'] == 0, 'SE03'] = 0.1
    
    allCity_dfs.append(city_cum)

In [14]:
# cities = [ny, la, ch]


# function for bins
def getbins(df, nbins=20):
    
    df.loc[df.distance == 0, 'distance'] = 0.2
    
    df['bin'] = pd.qcut(df['distance'], q=20)
    df.sort_values(by='bin', inplace=True)
    df.loc[df['SE01'] == 0, 'SE01'] = 0.1
    df.loc[df['SE02'] == 0, 'SE02'] = 0.1
    df.loc[df['SE03'] == 0, 'SE03'] = 0.1
    
    df.rename(columns={'jobs':'S000jobs', 'residence':'S000residence'}, inplace=True)
    
    return df

def findClass(num,jnb,i):
    if i < len(jnb)-1:
        lower = num-jnb[i]
        upper = num-jnb[i+1]
        if abs(lower) < 0.0000001:
            lower = 0
        if abs(upper) < 0.0000001:
            upper = 0
        if lower*upper <=0:
            return(i)
        else:
            i += 1
    #         print('here')
            return findClass(num,jnb,i)
    else:
        return(i)
    
def getbinsJenks(df,nbins=20):
    jnb = jenks(df['distance'].values,nbins)
    jnb = [round(i,2) for i in jnb]
    df.distance = df.distance.round(2)
    df['distGroup'] = df['distance'].apply(findClass,args=(jnb,0))
    


In [26]:
def balancing(test,target,iterationNum,iteration = 20):
#     print(target,'iteration', iterationNum)
    if target+'B' not in test.columns:
        test[target+'B'] = 1
    test[target+'BDF'] = test[target+'jobs']*test[target+'f(d)']*test[target+'B']
    if target+'A' in test.columns:
        del test[target+'A']
    del test[target+'B']
    test = test.groupby(['origin']).agg({target+'BDF':sum}).\
    rename(columns={target+'BDF':target+'A'}).reset_index().\
    merge(test,on=['origin'],how='right')
    test[target+'A'] = 1/test[target+'A']
    test[target+'AOF'] = test[target+'residence']*test[target+'f(d)']*test[target+'A']
    test = test.groupby(['destination']).agg({target+'AOF':sum}).\
    rename(columns={target+'AOF':target+'B'}).reset_index().\
    merge(test,on=['destination'],how='right')
    test[target+'B'] = 1/test[target+'B']
    test[target+'flowPred'] = test[target+'residence']*test[target+'jobs']*test[target+'f(d)']*\
                        test[target+'A']*test[target+'B']
    
    resultO = test[['origin',target+'residence']].drop_duplicates().\
    merge(test.groupby(['origin'])[[target+'flowPred']].sum().reset_index(),on=['origin'],how='left')
    resultO['percentage'] = np.abs(resultO[target+'residence'] - resultO[target+'flowPred'])/resultO[target+'residence']
    resultO = resultO['percentage'].mean()

    resultD = test[['destination',target+'jobs']].drop_duplicates().\
    merge(test.groupby(['destination'])[[target+'flowPred']].sum().reset_index(),on=['destination'],how='left')
    resultD['percentage'] = np.abs(resultD[target+'jobs'] - resultD[target+'flowPred'])/resultD[target+'jobs']
    resultD = resultD['percentage'].mean()
#     print(resultO,resultD)
    if resultO < 0.05 and resultD < 0.05:
        return test
    else:
        if iterationNum < iteration:
            return balancing(test,target,iterationNum = iterationNum+1,iteration = 20)
        else:
            return test
        

In [42]:
def doubly_constrained_model(data, separate_income=False):
    
    y_target = ['S000']  # target = total commute if no income segregation

    
    if separate_income == True:
        
        y_target = ['SE01', 'SE02', 'SE03'] # target = individual income commute if income segregation
        
        origin = data.groupby(['origin']).agg({'SE01':sum,'SE02':sum,'SE03':sum}).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg({'SE01':sum,'SE02':sum,'SE03':sum}).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])
    
    
    targetOutput = []
    for target in y_target:
        binoutput = pd.DataFrame()
        # estimate F for each bin
        for b in data['bin'].unique():
            
            subData = data[data['bin'] == b]
            X = subData[target+'residence'] * subData[target+'jobs']
            
            y = subData[target]

            model = sm.OLS(y,X).fit()
            
            subData[target+'f(d)'] = model.params[0]       
            binoutput = pd.concat([binoutput,subData])
        binoutput = balancing(binoutput,target,iterationNum=1,iteration = 20)
        
        binoutput = binoutput[['origin','destination',target,target+'A',target+'B',target+'f(d)','bin',target+'flowPred']]
        targetOutput.append(binoutput)
    if separate_income == True:
        targetOutput = targetOutput[0].merge(targetOutput[1],on=['origin','destination'],how='outer').\
                        merge(targetOutput[2],on=['origin','destination'],how='outer')
    else:
        targetOutput = targetOutput[0]
    targetOutput = targetOutput.merge(data[['origin','destination','S000']])
    return targetOutput

In [47]:
all_city = []

for city in allCity_dfs:
#     print(city)
    test = doubly_constrained_model(getbins(city), separate_income=True)
    all_city.append(test)


In [48]:
!mkdir constrainCTdistbinsABIncome
for i,citydf in enumerate(all_city):
    citydf['flowPred'] = citydf['SE01flowPred']+citydf['SE02flowPred']+citydf['SE03flowPred']
    citydf.to_csv('constrainCTdistbinsABIncome/'+cities[i][0]+'.csv',index=False)

mkdir: cannot create directory ‘constrainCTdistbinsABIncome’: File exists


In [52]:
all_city = []

for city in allCity_dfs:
#     print(city)
    test = doubly_constrained_model(getbins(city), separate_income=False)
    all_city.append(test)
#     break

In [60]:
!mkdir constrainCTdistbinsAB
for i,citydf in enumerate(all_city):
    citydf = citydf.rename(columns={'S000flowPred':'flowPred'})
    citydf.to_csv('constrainCTdistbinsAB/'+cities[i][0]+'.csv',index=False)

mkdir: cannot create directory ‘constrainCTdistbinsAB’: File exists


In [57]:
citydf

Unnamed: 0,origin,destination,S000,S000A,S000B,S000f(d),bin,flowPred
0,48453000101,48453000101,64,0.877945,0.914370,0.000005,"(0.199, 3.117]",17.628814
1,48453002402,48453002427,7,1.007890,1.194888,0.000005,"(0.199, 3.117]",11.384026
2,48453001826,48453001845,5,1.102114,1.329313,0.000005,"(0.199, 3.117]",2.979889
3,48453001820,48453001822,9,0.940128,1.071674,0.000005,"(0.199, 3.117]",5.960340
4,48453002409,48453002422,7,1.041796,1.093834,0.000005,"(0.199, 3.117]",7.609855
...,...,...,...,...,...,...,...,...
34772,48453001860,48453001772,1,1.309350,1.483298,0.000001,"(31.93, 57.928]",1.808251
34773,48453001860,48453001764,1,1.309350,1.741886,0.000001,"(31.93, 57.928]",3.917689
34774,48453001856,48453001768,4,1.287848,1.437418,0.000001,"(31.93, 57.928]",11.857308
34775,48453001779,48453002425,1,1.675301,1.946577,0.000001,"(31.93, 57.928]",1.180567


In [36]:
cities = ['New York City',
    'Los Angeles',
    'Chicago',
    'Houston',
    'Boston',
    'Phoenix',
    'Philadelphia',
    'San Antonio',
    'San Diego', 
    'Dallas', 
    'San Jose', 
    'Austin']
for city in cities:
    df = pd.read_csv('./constrainCTdistbinsABIncome/%s.csv'%city)
    break

In [54]:
citydf

Unnamed: 0,origin,destination,S000,S000A,S000B,S000f(d),bin,flowPred
0,48453000101,48453000101,64,0.877945,0.914370,0.000005,"(0.199, 3.117]",17.628814
1,48453002402,48453002427,7,1.007890,1.194888,0.000005,"(0.199, 3.117]",11.384026
2,48453001826,48453001845,5,1.102114,1.329313,0.000005,"(0.199, 3.117]",2.979889
3,48453001820,48453001822,9,0.940128,1.071674,0.000005,"(0.199, 3.117]",5.960340
4,48453002409,48453002422,7,1.041796,1.093834,0.000005,"(0.199, 3.117]",7.609855
...,...,...,...,...,...,...,...,...
34772,48453001860,48453001772,1,1.309350,1.483298,0.000001,"(31.93, 57.928]",1.808251
34773,48453001860,48453001764,1,1.309350,1.741886,0.000001,"(31.93, 57.928]",3.917689
34774,48453001856,48453001768,4,1.287848,1.437418,0.000001,"(31.93, 57.928]",11.857308
34775,48453001779,48453002425,1,1.675301,1.946577,0.000001,"(31.93, 57.928]",1.180567
