In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import statsmodels.api as sm
import numpy as np
import geopandas as gpd
from ipfn import ipfn
from matplotlib import pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
from sttn.data.lehd import OriginDestinationEmploymentDataProvider
provider = OriginDestinationEmploymentDataProvider()

import math
from sttn.network import SpatioTemporalNetwork
from sttn.utils import add_distance

%matplotlib inline

In [2]:
# Study areas, one tuple per city:
#   (display name, lower-case state code, list of county labels).
# The state code is what gets passed to the data provider and the county labels
# are matched against the `county` attribute of the network nodes.
cities = [
    (
        'New York City',
        'ny',
        [
            'New York County, NY',
            'Queens County, NY',
            'Kings County, NY',
            'Bronx County, NY',
            'Richmond County, NY',
        ],
    ),
    ('Los Angeles', 'ca', ['Los Angeles County, CA']),
    ('Chicago', 'il', ['Cook County, IL']),
    ('Houston', 'tx', ['Harris County, TX']),
    (
        'Boston',
        'ma',
        [
            'Suffolk County, MA',
            'Middlesex County, MA',
        ],
    ),
    ('Phoenix', 'az', ['Maricopa County, AZ']),
    ('Philadelphia', 'pa', ['Philadelphia County, PA']),
    ('San Antonio', 'tx', ['Bexar County, TX']),
    ('San Diego', 'ca', ['San Diego County, CA']),
    ('Dallas', 'tx', ['Dallas County, TX']),
    ('San Jose', 'ca', ['Santa Clara County, CA']),
    ('Austin', 'tx', ['Travis County, TX']),
]

In [10]:
# Build one origin/destination flow table per city (census tract level, 2018 data).
# Each table carries the edge attributes plus a distance, a per-destination 'jobs'
# total and a per-origin 'residence' total of the S000 column.

allCity_dfs = []
job_column = 'S000'
comp_aggs = {job_column: 'sum'}

for city, state, counties in cities:
    state_network = provider.get_data(state=state, year=2018)
    # Restrict the state-wide network to the nodes whose county belongs to the city.
    in_city = state_network.nodes.county.isin(counties)
    city_network = state_network.filter_nodes(in_city)
    edges_with_distance = add_distance(city_network).edges

    # S000 aggregated over adjacent edges with outgoing=False, labelled 'jobs'.
    jobs_by_destination = (
        city_network.agg_adjacent_edges(aggs=comp_aggs, outgoing=False)
        .rename(columns={job_column: 'jobs'})
        .reset_index()
    )
    # S000 aggregated over adjacent edges with outgoing=True, labelled 'residence'.
    residents_by_origin = (
        city_network.agg_adjacent_edges(aggs=comp_aggs, outgoing=True)
        .rename(columns={job_column: 'residence'})
        .reset_index()
    )

    allCity_dfs.append(
        edges_with_distance
        .merge(jobs_by_destination, on='destination')
        .merge(residents_by_origin, on='origin')
    )

In [13]:
import os

# Persist one CSV per city so that later cells can reload the tables from disk.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which reports an error as soon
# as the directory already exists (i.e. on every re-run of this cell).
os.makedirs('cities', exist_ok=True)

# Pair each table with its city entry directly instead of indexing by position.
for city_info, city_df in zip(cities, allCity_dfs):
    city = city_info[0]
    city_df.to_csv('cities/'+city+'.csv')

mkdir: cannot create directory ‘cities’: File exists


### log-mse doubly constrained model from Devashish

In [3]:


# function for bins
def getbins(df, nbins=20):
    """Prepare a city flow table for the distance-binned models.

    The frame is modified in place and also returned:

    * rows with ``distance == 0`` get a distance of 0.2;
    * a ``bin`` column with ``nbins`` quantile bins of ``distance`` is added and
      the rows are sorted by it;
    * zero values in ``SE01``/``SE02``/``SE03`` are replaced by 0.1;
    * ``jobs``/``residence`` are renamed to ``S000jobs``/``S000residence``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``distance``, ``SE01``, ``SE02`` and ``SE03``.
    nbins : int, default 20
        Number of quantile bins passed to ``pd.qcut``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    df.loc[df.distance == 0, 'distance'] = 0.2

    # Honour `nbins`; the bin count used to be hard-coded to 20, so the argument
    # had no effect.
    df['bin'] = pd.qcut(df['distance'], q=nbins)
    df.sort_values(by='bin', inplace=True)

    for income_col in ('SE01', 'SE02', 'SE03'):
        df.loc[df[income_col] == 0, income_col] = 0.1

    df.rename(columns={'jobs':'S000jobs', 'residence':'S000residence'}, inplace=True)

    return df

# doubly constrained model
def constrained_model(data, separate_income=False):
    """Fit the distance-binned, log-space doubly constrained model to one flow table.

    For every target flow column the function runs three stages:

    1. per distance bin, an OLS of the flow on a constant plus
       ``log(residence) + log(jobs)``; ``model.params[0]`` is stored as ``F``;
    2. per origin, an OLS whose ``model.params[0]`` is stored as ``V``; the
       per-origin values are then rescaled with ``ipfn`` against the
       ``<target>residence`` totals and stored as ``m``;
    3. per destination, an OLS whose ``model.params[0]`` is stored as ``U``; the
       per-destination values are rescaled with ``ipfn`` against the
       ``<target>jobs`` totals and stored as ``n``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per origin/destination pair with ``origin``, ``destination``,
        ``bin`` and the target flow columns. For the total-flow case the frame
        must already hold ``S000residence`` and ``S000jobs``.
    separate_income : bool, default False
        When true the targets are ``SE01``, ``SE02`` and ``SE03`` and their
        origin/destination totals are computed here from ``data``; otherwise the
        single target is ``S000``.

    Returns
    -------
    tuple
        ``(binF, frame)`` for the total-flow case, or ``(list of binF, list of
        frame)`` with one entry per income target. ``binF`` maps ``bin`` to ``F``;
        each frame is ``data`` extended with ``F``, ``V``, ``m``, ``U`` and ``n``.
    """
    y_target = ['S000']  # target = total commute if no income segregation

    if separate_income == True:

        y_target = ['SE01', 'SE02', 'SE03'] # target = individual income commute if income segregation

        # The marginals are taken from the `data` argument. They used to be read
        # from a notebook-global `df`, which only worked when that global
        # happened to be the very frame passed in.
        income_sums = {'SE01': 'sum', 'SE02': 'sum', 'SE03': 'sum'}
        origin = data.groupby(['origin']).agg(income_sums).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg(income_sums).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])

    dataF = []
    dataUV = []

    for target in y_target:
        allF = {}

        # estimate F for each bin
        for b in data['bin'].unique():

            subData = data[data['bin'] == b]

            X = sm.add_constant(np.log(subData[target+'residence']) + np.log(subData[target+'jobs']))
            y = subData[target]

            model = sm.OLS(y,X).fit()

            allF[b] = model.params[0]

        # Column labels are given as a list: a set has no defined order and newer
        # pandas releases reject set-valued `columns`.
        binF = pd.DataFrame.from_dict(allF, orient='index', columns=['F']).reset_index()
        binF.rename(columns={'index':'bin'}, inplace=True)

        dataF.append(binF)

        allU = {}
        allU_arr = []

        # temporary dataframe with F
        dataStep1 = data.merge(binF, on='bin', how='left')

        # estimate the per-origin factor (stored in column V)
        for o in data['origin'].unique():

            subData = dataStep1[dataStep1['origin'] == o]

            X = np.log(subData[target+'residence']*subData['F'])
            y = np.log(subData[target]) - np.log(subData[target+'jobs'])

            X = sm.add_constant(X)
            model = sm.OLS(y,X).fit()

            allU[o] = model.params[0]
            # one copy of the estimate per row of this origin, for the ipfn input
            allU_arr.append([model.params[0]]*len(subData))

        binU = pd.DataFrame.from_dict(allU, orient='index', columns=['V']).reset_index()
        binU.rename(columns={'index':'origin'}, inplace=True)

        # temporary dataframe with V
        dataStep2 = dataStep1.merge(binU, on='origin', how='left')
        dataStep2.loc[dataStep2.V < 0, 'V'] = 0.0001

        # proportional fitting of the per-origin values against the residence totals
        Ptotal = [np.array(dataStep2.drop_duplicates(subset='origin')[target+'residence'].rename('total'))]
        dimensions1 = [[0]]
        # rows have different lengths; pad with zeros to get a rectangular array
        pad = len(max(allU_arr, key=len))
        vs = np.array([i + [0]*(pad-len(i)) for i in allU_arr])
        IPF_1 = ipfn.ipfn(vs, Ptotal, dimensions1)
        m = IPF_1.iteration()
        temp1 = dataStep2.drop_duplicates(subset='origin')
        temp1['m'] = m[:,0]
        dataStep2 = dataStep2.merge(temp1[['origin', 'm']], on='origin', how='left')
        dataStep2.loc[dataStep2.m == 0, 'm'] = 0.0001

        allV = {}
        allV_arr = []

        # estimate the per-destination factor (stored in column U)
        for d in data['destination'].unique():

            subData = dataStep2[dataStep2['destination'] == d]

            X = np.log(subData['m']*subData[target+'jobs']*subData['F'])

            y = np.log(subData[target])

            X = sm.add_constant(X)
            model = sm.OLS(y,X).fit()

            allV[d] = model.params[0]
            allV_arr.append([model.params[0]]*len(subData))

        binV = pd.DataFrame.from_dict(allV, orient='index', columns=['U']).reset_index()
        binV.rename(columns={'index':'destination'}, inplace=True)

        # temporary dataframe with U
        dataStep3 = dataStep2.merge(binV, on='destination', how='left')

        # proportional fitting of the per-destination values against the jobs totals
        Wtotal = [np.array(dataStep3.drop_duplicates(subset='destination')[target+'jobs'].rename('total'))]
        dimensions1 = [[0]]
        pad = len(max(allV_arr, key=len))
        us = np.array([i + [0]*(pad-len(i)) for i in allV_arr])
        IPF_1 = ipfn.ipfn(us, Wtotal, dimensions1)
        n = IPF_1.iteration()

        ## final dataframe with F, V, m, U, n
        temp2 = dataStep3.drop_duplicates(subset='destination')
        temp2['n'] = n[:,0]
        dataStep3 = dataStep3.merge(temp2[['destination', 'n']], on='destination', how='left')
        dataStep3.loc[dataStep3.n == 0, 'n'] = 0.0001

        dataUV.append(dataStep3)

    if separate_income == False:
        return dataF[0], dataUV[0]
    else:
        return dataF, dataUV

In [18]:
import os

# Log-space doubly constrained model, fitted per income group, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('constrainCTdistbinsIncome/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        dataF, dataUV = constrained_model(getbins(df),separate_income=True)
        # one frame per income group, stacked on top of each other
        dataUV = pd.concat([i for i in dataUV])
        dataUV['flowPred'] = dataUV['F']*dataUV['m']*dataUV['n']
        # Predictions are summed over the income groups. S000 is repeated once per
        # stacked frame, so take it once instead of summing it (which tripled it).
        dataUV = dataUV.groupby(['origin','destination']).agg({'S000':'first','flowPred':'sum'}).reset_index()
        # The mapping used to be the set {'SOOO','flowReal'} (letter O, no key/value).
        dataUV = dataUV.rename(columns={'S000':'flowReal'})
        dataUV.to_csv('constrainCTdistbinsIncome/'+city,index=False)

mkdir: cannot create directory ‘constrainCTdistbinsIncome/’: File exists
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance
ipfn converged: convergence_rate not updating or below rate_tolerance


In [None]:
import os

# Log-space doubly constrained model on the total flow (S000) for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('constrainCTdistbins/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        # This cell writes the non-income results, so the model is fitted with
        # separate_income=False (it was a copy of the income cell and passed True).
        # In that mode a single frame is returned, so nothing needs concatenating.
        dataF, dataUV = constrained_model(getbins(df),separate_income=False)
        dataUV['flowPred'] = dataUV['F']*dataUV['m']*dataUV['n']
        dataUV = dataUV.groupby(['origin','destination']).agg({'S000':'sum','flowPred':'sum'}).reset_index()
        # The mapping used to be the set {'SOOO','flowReal'} (letter O, no key/value).
        dataUV = dataUV.rename(columns={'S000':'flowReal'})
        dataUV.to_csv('constrainCTdistbins/'+city,index=False)

### mse doubly constrained model from Mingyi

In [7]:
def constrained_model_nonlog(data, separate_income=False):
    """Fit the distance-binned doubly constrained model on untransformed flows.

    Same three stages as the log-space variant, but every regression is an OLS
    without intercept on the raw values:

    1. per distance bin, flow ~ ``residence * jobs``; ``model.params[0]`` -> ``F``;
    2. per origin, flow ~ ``residence * F * jobs``; ``model.params[0]`` -> ``V``,
       then an ``ipfn`` rescaling against the ``<target>residence`` totals -> ``m``;
    3. per destination, flow ~ ``m * jobs * F``; ``model.params[0]`` -> ``U``,
       then an ``ipfn`` rescaling against the ``<target>jobs`` totals -> ``n``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per origin/destination pair with ``origin``, ``destination``,
        ``bin`` and the target flow columns. For the total-flow case the frame
        must already hold ``S000residence`` and ``S000jobs``.
    separate_income : bool, default False
        When true the targets are ``SE01``, ``SE02`` and ``SE03`` and their
        origin/destination totals are computed from ``data``; otherwise the
        single target is ``S000``.

    Returns
    -------
    tuple
        ``(binF, frame)`` for the total-flow case, or ``(list of binF, list of
        frame)`` with one entry per income target.
    """
    y_target = ['S000']  # target = total commute if no income segregation

    if separate_income == True:

        y_target = ['SE01', 'SE02', 'SE03'] # target = individual income commute if income segregation

        income_sums = {'SE01': 'sum', 'SE02': 'sum', 'SE03': 'sum'}
        origin = data.groupby(['origin']).agg(income_sums).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg(income_sums).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])

    dataF = []
    dataUV = []

    for target in y_target:
        allF = {}

        # estimate F for each bin
        for b in data['bin'].unique():

            subData = data[data['bin'] == b]

            X = subData[target+'residence'] * subData[target+'jobs']

            y = subData[target]

            model = sm.OLS(y,X).fit()

            allF[b] = model.params[0]

        # Column labels are given as a list: a set has no defined order and newer
        # pandas releases reject set-valued `columns`.
        binF = pd.DataFrame.from_dict(allF, orient='index', columns=['F']).reset_index()
        binF.rename(columns={'index':'bin'}, inplace=True)

        dataF.append(binF)

        allU = {}
        allU_arr = []

        # temporary dataframe with F
        dataStep1 = data.merge(binF, on='bin', how='left')

        # estimate the per-origin factor (stored in column V)
        for o in data['origin'].unique():

            subData = dataStep1[dataStep1['origin'] == o]

            X = subData[target+'residence']*subData['F']*subData[target+'jobs']
            y = subData[target]

            model = sm.OLS(y,X).fit()

            allU[o] = model.params[0]
            # one copy of the estimate per row of this origin, for the ipfn input
            allU_arr.append([model.params[0]]*len(subData))

        binU = pd.DataFrame.from_dict(allU, orient='index', columns=['V']).reset_index()
        binU.rename(columns={'index':'origin'}, inplace=True)

        # temporary dataframe with V
        dataStep2 = dataStep1.merge(binU, on='origin', how='left')
        dataStep2.loc[dataStep2.V < 0, 'V'] = 0.0001

        # proportional fitting of the per-origin values against the residence totals
        Ptotal = [np.array(dataStep2.drop_duplicates(subset='origin')[target+'residence'].rename('total'))]
        dimensions1 = [[0]]
        # rows have different lengths; pad with zeros to get a rectangular array
        pad = len(max(allU_arr, key=len))
        vs = np.array([i + [0]*(pad-len(i)) for i in allU_arr])
        IPF_1 = ipfn.ipfn(vs, Ptotal, dimensions1)
        m = IPF_1.iteration()
        temp1 = dataStep2.drop_duplicates(subset='origin')
        temp1['m'] = m[:,0]
        dataStep2 = dataStep2.merge(temp1[['origin', 'm']], on='origin', how='left')
        dataStep2.loc[dataStep2.m == 0, 'm'] = 0.0001

        allV = {}
        allV_arr = []

        # estimate the per-destination factor (stored in column U)
        for d in data['destination'].unique():

            subData = dataStep2[dataStep2['destination'] == d]

            X = subData['m']*subData[target+'jobs']*subData['F']

            y = subData[target]

            model = sm.OLS(y,X).fit()

            allV[d] = model.params[0]
            allV_arr.append([model.params[0]]*len(subData))

        binV = pd.DataFrame.from_dict(allV, orient='index', columns=['U']).reset_index()
        binV.rename(columns={'index':'destination'}, inplace=True)

        # temporary dataframe with U
        dataStep3 = dataStep2.merge(binV, on='destination', how='left')

        # proportional fitting of the per-destination values against the jobs totals
        Wtotal = [np.array(dataStep3.drop_duplicates(subset='destination')[target+'jobs'].rename('total'))]
        dimensions1 = [[0]]
        pad = len(max(allV_arr, key=len))
        us = np.array([i + [0]*(pad-len(i)) for i in allV_arr])
        IPF_1 = ipfn.ipfn(us, Wtotal, dimensions1)
        n = IPF_1.iteration()

        ## final dataframe with F, V, m, U, n
        temp2 = dataStep3.drop_duplicates(subset='destination')
        temp2['n'] = n[:,0]
        dataStep3 = dataStep3.merge(temp2[['destination', 'n']], on='destination', how='left')
        dataStep3.loc[dataStep3.n == 0, 'n'] = 0.0001

        dataUV.append(dataStep3)

    if separate_income == False:
        return dataF[0], dataUV[0]
    else:
        return dataF, dataUV

In [10]:
# Non-log doubly constrained model, fitted per income group, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('constrainCTdistbinsNonlogIncome/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        dataF, dataUV = constrained_model_nonlog(getbins(df),separate_income=True)
        # one frame per income group, stacked on top of each other
        dataUV = pd.concat([i for i in dataUV])
        dataUV['flowPred'] = dataUV['F']*dataUV['m']*dataUV['n']
        # Predictions are summed over the income groups. S000 is repeated once per
        # stacked frame, so take it once instead of summing it (which tripled it).
        dataUV = dataUV.groupby(['origin','destination']).agg({'S000':'first','flowPred':'sum'}).reset_index()
        # The mapping used to be the set {'SOOO','flowReal'} (letter O, no key/value).
        dataUV = dataUV.rename(columns={'S000':'flowReal'})
        dataUV.to_csv('constrainCTdistbinsNonlogIncome/'+city,index=False)

In [12]:
# Non-log doubly constrained model on the total flow (S000) for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('constrainCTdistbinsNonlog/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        dataF, dataUV = constrained_model_nonlog(getbins(df),separate_income=False)
        dataUV['flowPred'] = dataUV['F']*dataUV['m']*dataUV['n']
        dataUV = dataUV.groupby(['origin','destination']).agg({'S000':'sum','flowPred':'sum'}).reset_index()
        # The mapping used to be the set {'SOOO','flowReal'} (letter O, no key/value).
        dataUV = dataUV.rename(columns={'S000':'flowReal'})
        dataUV.to_csv('constrainCTdistbinsNonlog/'+city,index=False)

mkdir: cannot create directory ‘constrainCTdistbinsNonlog/’: File exists


### MSE, doubly constrained, fit u,v together in iterations, from Mingyi

In [14]:
def balancing(test,target,iterationNum,iteration = 20,tolerance = 0.05):
    """Iteratively compute the balancing factors A (per origin) and B (per destination).

    One pass does the following, for the columns prefixed with ``target``:

    * ``A = 1 / sum over each origin of (jobs * f(d) * B)``, with ``B`` starting
      at 1 when the frame has no ``B`` column yet;
    * ``B = 1 / sum over each destination of (residence * f(d) * A)``;
    * ``flowPred = residence * jobs * f(d) * A * B``.

    After each pass the mean relative gap between the predicted origin totals and
    ``residence``, and between the predicted destination totals and ``jobs``, is
    computed. The loop stops when both gaps are below ``tolerance`` or when
    ``iterationNum`` has reached ``iteration``.

    Parameters
    ----------
    test : pandas.DataFrame
        Needs ``origin``, ``destination``, ``<target>jobs``,
        ``<target>residence`` and ``<target>f(d)``. It is copied, not modified.
    target : str
        Column prefix, e.g. ``'S000'``.
    iterationNum : int
        Number of the first pass (callers pass 1).
    iteration : int, default 20
        Highest pass number to run.
    tolerance : float, default 0.05
        Convergence threshold on both mean relative gaps.

    Returns
    -------
    pandas.DataFrame
        The frame with ``<target>A``, ``<target>B``, ``<target>BDF``,
        ``<target>AOF`` and ``<target>flowPred`` columns.
    """
    jobs_col = target+'jobs'
    residence_col = target+'residence'
    decay_col = target+'f(d)'
    a_col = target+'A'
    b_col = target+'B'
    bdf_col = target+'BDF'
    aof_col = target+'AOF'
    pred_col = target+'flowPred'

    # Work on a copy so the caller's frame does not pick up helper columns.
    test = test.copy()

    # A loop replaces the former recursion, whose recursive call hard-coded
    # `iteration = 20` and therefore ignored the limit requested by the caller.
    while True:
        if b_col not in test.columns:
            test[b_col] = 1
        test[bdf_col] = test[jobs_col]*test[decay_col]*test[b_col]
        # A and B are recomputed below; drop the stale columns before merging.
        test = test.drop(columns=[col for col in (a_col, b_col) if col in test.columns])

        origin_sums = test.groupby(['origin']).agg({bdf_col:'sum'}).\
            rename(columns={bdf_col:a_col}).reset_index()
        test = origin_sums.merge(test,on=['origin'],how='right')
        test[a_col] = 1/test[a_col]

        test[aof_col] = test[residence_col]*test[decay_col]*test[a_col]
        destination_sums = test.groupby(['destination']).agg({aof_col:'sum'}).\
            rename(columns={aof_col:b_col}).reset_index()
        test = destination_sums.merge(test,on=['destination'],how='right')
        test[b_col] = 1/test[b_col]

        test[pred_col] = test[residence_col]*test[jobs_col]*test[decay_col]*\
                            test[a_col]*test[b_col]

        resultO = _mean_relative_gap(test, 'origin', residence_col, pred_col)
        resultD = _mean_relative_gap(test, 'destination', jobs_col, pred_col)

        if resultO < tolerance and resultD < tolerance:
            return test
        if iterationNum >= iteration:
            return test
        iterationNum += 1


def _mean_relative_gap(frame, key, total_col, pred_col):
    """Mean over `key` of |total - summed prediction| / total."""
    totals = frame[[key, total_col]].drop_duplicates().\
        merge(frame.groupby([key])[[pred_col]].sum().reset_index(),on=[key],how='left')
    return (np.abs(totals[total_col] - totals[pred_col])/totals[total_col]).mean()
        
def doubly_constrained_model_AB(data, separate_income=False):
    """Doubly constrained model with jointly balanced origin/destination factors.

    For every target flow column, an OLS without intercept of the flow on
    ``residence * jobs`` is fitted per distance bin and ``model.params[0]`` is
    stored as ``<target>f(d)``. The rows of all bins are then passed to
    ``balancing`` to obtain ``<target>A``, ``<target>B`` and
    ``<target>flowPred``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per origin/destination pair with ``origin``, ``destination``,
        ``bin``, ``S000`` and the target flow columns. For the total-flow case
        the frame must already hold ``S000residence`` and ``S000jobs``.
    separate_income : bool, default False
        When true the targets are ``SE01``, ``SE02`` and ``SE03`` and their
        origin/destination totals are computed from ``data``; otherwise the
        single target is ``S000``.

    Returns
    -------
    pandas.DataFrame
        Per-target columns (flow, A, B, f(d), bin, flowPred) merged with
        ``origin``, ``destination`` and ``S000`` from ``data``. With income
        separation the three per-target frames are outer-merged on
        ``origin``/``destination`` first.
    """
    y_target = ['S000']  # target = total commute if no income segregation

    if separate_income == True:

        y_target = ['SE01', 'SE02', 'SE03'] # target = individual income commute if income segregation

        income_sums = {'SE01': 'sum', 'SE02': 'sum', 'SE03': 'sum'}
        origin = data.groupby(['origin']).agg(income_sums).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg(income_sums).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])

    targetOutput = []
    for target in y_target:
        # Collect the per-bin frames and concatenate once; growing a frame with
        # pd.concat inside the loop is quadratic and starts from an empty frame.
        bin_frames = []
        # estimate F for each bin
        for b in data['bin'].unique():

            # explicit copy: a new column is assigned on this slice below
            subData = data[data['bin'] == b].copy()
            X = subData[target+'residence'] * subData[target+'jobs']

            y = subData[target]

            model = sm.OLS(y,X).fit()

            subData[target+'f(d)'] = model.params[0]
            bin_frames.append(subData)
        binoutput = pd.concat(bin_frames)
        binoutput = balancing(binoutput,target,iterationNum=1,iteration = 20)

        binoutput = binoutput[['origin','destination',target,target+'A',target+'B',target+'f(d)','bin',target+'flowPred']]
        targetOutput.append(binoutput)
    if separate_income == True:
        targetOutput = targetOutput[0].merge(targetOutput[1],on=['origin','destination'],how='outer').\
                        merge(targetOutput[2],on=['origin','destination'],how='outer')
    else:
        targetOutput = targetOutput[0]
    # natural join on the shared columns brings S000 alongside the predictions
    targetOutput = targetOutput.merge(data[['origin','destination','S000']])
    return targetOutput

In [23]:
# Jointly balanced (A/B) doubly constrained model per income group, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('constrainCTdistbinsABIncome/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        dataUV = doubly_constrained_model_AB(getbins(df),separate_income=True)
        # total prediction = sum of the three income-group predictions
        dataUV['flowPred'] = dataUV['SE01flowPred']+dataUV['SE02flowPred']+dataUV['SE03flowPred']
        dataUV = dataUV.groupby(['origin','destination']).agg({'S000':'sum','flowPred':'sum'}).reset_index()
        dataUV.to_csv('constrainCTdistbinsABIncome/'+city,index=False)

mkdir: cannot create directory ‘constrainCTdistbinsABIncome/’: File exists


In [None]:
# Jointly balanced (A/B) doubly constrained model on the total flow, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('constrainCTdistbinsAB/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        dataUV = doubly_constrained_model_AB(getbins(df),separate_income=False)
        # Without income separation the model only produces 'S000flowPred'; the
        # SE01/SE02/SE03 prediction columns summed here before do not exist.
        dataUV['flowPred'] = dataUV['S000flowPred']
        dataUV = dataUV.groupby(['origin','destination']).agg({'S000':'sum','flowPred':'sum'}).reset_index()
        dataUV.to_csv('constrainCTdistbinsAB/'+city,index=False)

### Unconstrained model, power law, from Mingyi

In [30]:
import scipy.optimize as optimize
def power_law(x,k,a):
    """Return ``k * distance**a * jobs * residence`` for the rows of ``x``.

    ``x`` is a 2-D array whose columns are read as distance, jobs, residence.
    """
    return k*((x[:,0]**a)*x[:,1]*x[:,2])
def unconstrained_model(data, separate_income=False):
    """Fit ``flow = k * distance**a * jobs * residence`` with ``curve_fit``.

    ``data`` is first cleaned in place (zero distances -> 0.2, zero
    SE01/SE02/SE03 -> 0.1); origin and destination totals of each target are
    then merged in as ``<target>residence`` and ``<target>jobs``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``origin``, ``destination``, ``distance``, ``S000``, ``SE01``,
        ``SE02`` and ``SE03``.
    separate_income : bool, default False
        Fit SE01/SE02/SE03 separately instead of the total flow S000.

    Returns
    -------
    pandas.DataFrame
        The merged frame with ``<target>k``, ``<target>a`` and ``<target>pred``
        columns for every fitted target.
    """
    y_target = ['S000']  # target = total commute if no income segregation
    data.loc[data.distance == 0, 'distance'] = 0.2
    for income_col in ('SE01', 'SE02', 'SE03'):
        data.loc[data[income_col] == 0, income_col] = 0.1

    # Totals are computed from the `data` argument; they used to be read from a
    # notebook-global `df`.
    origin = data.groupby(['origin']).agg({'S000':'sum'}).reset_index()
    origin.columns = ['origin','S000residence']
    destination = data.groupby(['destination']).agg({'S000':'sum'}).reset_index()
    destination.columns = ['destination','S000jobs']
    data = data.merge(origin,on=['origin'])
    data = data.merge(destination,on=['destination'])

    if separate_income == True:

        y_target = ['SE01', 'SE02', 'SE03'] # target = individual income commute if income segregation
        income_sums = {'SE01': 'sum', 'SE02': 'sum', 'SE03': 'sum'}
        origin = data.groupby(['origin']).agg(income_sums).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg(income_sums).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])

    for target in y_target:
        X = data[['distance',target+'jobs',target+'residence']].values
        y = data[target].values
        pars, _ = optimize.curve_fit(f=power_law, xdata=X, ydata=y, bounds=(-np.inf, np.inf))
        # the fitted scalars are broadcast to every row
        data[target+'k'] = pars[0]
        data[target+'a'] = pars[1]
        data[target+'pred'] = data[target+'k']*(data['distance']**data[target+'a'])*data[target+'jobs']*data[target+'residence']
    return data

        

In [32]:
# Unconstrained power-law model on the total flow, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('unconstrainCTPowerlaw/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        df = unconstrained_model(df,separate_income=False)
        df.to_csv('unconstrainCTPowerlaw/'+city,index=False)


mkdir: cannot create directory ‘unconstrainCTPowerlaw/’: File exists


In [None]:
# Unconstrained power-law model per income group, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('unconstrainCTPowerlawIncome/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        df = unconstrained_model(df,separate_income=True)
        df.to_csv('unconstrainCTPowerlawIncome/'+city,index=False)


### Unconstrained model, full power law, from Mingyi

In [33]:

def power_law(x,k,a,b,c):
    """Return ``k * distance**a * jobs**b * residence**c`` for the rows of ``x``.

    ``x`` is a 2-D array whose columns are read as distance, jobs, residence.
    """
    return k*(x[:,0]**a)*(x[:,1]**b)*(x[:,2]**c)
def unconstrained_model(data, separate_income=False):
    """Fit ``flow = k * distance**a * jobs**b * residence**c`` with ``curve_fit``.

    ``data`` is first cleaned in place (zero distances -> 0.2, zero
    SE01/SE02/SE03 -> 0.1); origin and destination totals of each target are
    then merged in as ``<target>residence`` and ``<target>jobs``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``origin``, ``destination``, ``distance``, ``S000``, ``SE01``,
        ``SE02`` and ``SE03``.
    separate_income : bool, default False
        Fit SE01/SE02/SE03 separately instead of the total flow S000.

    Returns
    -------
    pandas.DataFrame
        The merged frame with ``<target>k``, ``<target>a``, ``<target>b``,
        ``<target>c`` and ``<target>pred`` columns for every fitted target.
    """
    data.loc[data.distance == 0, 'distance'] = 0.2
    for income_col in ('SE01', 'SE02', 'SE03'):
        data.loc[data[income_col] == 0, income_col] = 0.1
    y_target = ['S000']  # target = total commute if no income segregation

    # Totals are computed from the `data` argument; they used to be read from a
    # notebook-global `df`.
    origin = data.groupby(['origin']).agg({'S000':'sum'}).reset_index()
    origin.columns = ['origin','S000residence']
    destination = data.groupby(['destination']).agg({'S000':'sum'}).reset_index()
    destination.columns = ['destination','S000jobs']
    data = data.merge(origin,on=['origin'])
    data = data.merge(destination,on=['destination'])
    if separate_income == True:

        y_target = ['SE01', 'SE02', 'SE03'] # target = individual income commute if income segregation
        income_sums = {'SE01': 'sum', 'SE02': 'sum', 'SE03': 'sum'}
        origin = data.groupby(['origin']).agg(income_sums).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg(income_sums).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])

    for target in y_target:
        X = data[['distance',target+'jobs',target+'residence']].values
        y = data[target].values
        pars, _ = optimize.curve_fit(f=power_law, xdata=X, ydata=y, bounds=(-np.inf, np.inf))
        # the fitted scalars are broadcast to every row
        data[target+'k'] = pars[0]
        data[target+'a'] = pars[1]
        data[target+'b'] = pars[2]
        data[target+'c'] = pars[3]
        data[target+'pred'] = data[target+'k']*(data['distance']**data[target+'a'])*\
                        (data[target+'jobs']**data[target+'b'])*(data[target+'residence']**data[target+'c'])
    return data

        

In [None]:
# Unconstrained full power-law model on the total flow, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('unconstrainCTFullPowerlaw/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        df = unconstrained_model(df,separate_income=False)
        df.to_csv('unconstrainCTFullPowerlaw/'+city,index=False)


In [None]:
# Unconstrained full power-law model per income group, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('unconstrainCTFullPowerlawIncome/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        df = unconstrained_model(df,separate_income=True)
        df.to_csv('unconstrainCTFullPowerlawIncome/'+city,index=False)


### Unconstrained model, exponential, from Mingyi

In [34]:
import scipy.optimize as optimize
def exp(x, a,b):
    """Return ``a * e**(b * x)``."""
    return a*(np.e**(b*x))
def unconstrained_model(data, separate_income=False):
    """Fit ``flow / (jobs * residence) = a * e**(b * distance)`` with ``curve_fit``.

    ``data`` is first cleaned in place (zero distances -> 0.2, zero
    SE01/SE02/SE03 -> 0.1); origin and destination totals of each target are
    then merged in as ``<target>residence`` and ``<target>jobs``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``origin``, ``destination``, ``distance``, ``S000``, ``SE01``,
        ``SE02`` and ``SE03``.
    separate_income : bool, default False
        Fit SE01/SE02/SE03 separately instead of the total flow S000.

    Returns
    -------
    pandas.DataFrame
        The merged frame with ``<target>a``, ``<target>b`` and ``<target>pred``
        columns for every fitted target, where ``pred`` is the fitted curve
        multiplied back by ``jobs * residence``.
    """
    y_target = ['S000']  # target = total commute if no income segregation
    data.loc[data.distance == 0, 'distance'] = 0.2
    for income_col in ('SE01', 'SE02', 'SE03'):
        data.loc[data[income_col] == 0, income_col] = 0.1

    # Totals are computed from the `data` argument; they used to be read from a
    # notebook-global `df`.
    origin = data.groupby(['origin']).agg({'S000':'sum'}).reset_index()
    origin.columns = ['origin','S000residence']
    destination = data.groupby(['destination']).agg({'S000':'sum'}).reset_index()
    destination.columns = ['destination','S000jobs']
    data = data.merge(origin,on=['origin'])
    data = data.merge(destination,on=['destination'])
    if separate_income == True:

        y_target = ['SE01', 'SE02', 'SE03']
        income_sums = {'SE01': 'sum', 'SE02': 'sum', 'SE03': 'sum'}
        origin = data.groupby(['origin']).agg(income_sums).reset_index()
        origin.columns = ['origin','SE01residence','SE02residence','SE03residence']
        destination = data.groupby(['destination']).agg(income_sums).reset_index()
        destination.columns = ['destination','SE01jobs','SE02jobs','SE03jobs']
        data = data.merge(origin,on=['origin'])
        data = data.merge(destination,on=['destination'])

    for target in y_target:
        X = data.distance.values
        # the curve is fitted to the flow normalised by jobs * residence
        y = data[target]/(data[target+'jobs']*data[target+'residence'])
        pars, _ = optimize.curve_fit(f=exp, xdata=X, ydata=y, bounds=(-np.inf, np.inf))
        # the fitted scalars are broadcast to every row
        data[target+'a'] = pars[0]
        data[target+'b'] = pars[1]
        data[target+'pred'] = data[target+'a']*(np.e**(data['distance']*data[target+'b']))*data[target+'jobs']*data[target+'residence']
    return data

        

In [None]:
# Unconstrained exponential model on the total flow, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('unconstrainCTExp/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        df = unconstrained_model(df,separate_income=False)
        df.to_csv('unconstrainCTExp/'+city,index=False)


In [None]:
# Unconstrained exponential model per income group, for every city CSV.
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which errors on re-runs.
os.makedirs('unconstrainCTExpIncome/', exist_ok=True)
# A separate name keeps the `cities` list of city tuples from being overwritten.
city_files = os.listdir('cities/')
for city in city_files:
    if city.endswith('.csv'):
        df = pd.read_csv('cities/'+city)
        # This cell writes the per-income results, so the model is fitted with
        # separate_income=True (it was a copy of the total-flow cell and passed False).
        df = unconstrained_model(df,separate_income=True)
        df.to_csv('unconstrainCTExpIncome/'+city,index=False)
