In [1]:
import pandas as pd
import numpy as np
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta
from biogeme.expressions import log
import math
from tqdm import tqdm

In [2]:
pums = "us_estdata_full.csv"
acs = "ACS_2018.csv"
lodes = "wac_puma.csv"
lpuma = "leave_puma.csv"
migpuma_acs_file = "data/ACS_MIGPUMA_2018.csv"
migpuma_lodes_file = "data/wac_migpuma.csv"

In [3]:
df = pd.read_csv(f"data/{pums}", usecols=["NAICS", "SERIALNO", "SCHG", "AGEP", "NATIVITY", "PAOC", "ESR", "MAR", "MARHD", "MARHW", "MARHM", "MIL", "TYPE_ORIG", "STAY", "CHOSEN", "ORIGIN", "FES", "HHT", "CBSA_NAME_ORIG", "UNEMP_PCT_ORIG_ADJ", "PERNP_ORIG_ADJ", "HH_MED_VAL_ORIG", "SMOCP_ORIG_ADJ", "GRNTP_ORIG_ADJ", "OWN_JOB_ORIG", "TOT_JOBS_ORIG", "MIGSP", "POBP", "SCHL", "child", "REC_CHILD"])
df

Unnamed: 0,SERIALNO,AGEP,MAR,MARHD,MARHM,MARHW,MIL,SCHG,SCHL,ESR,...,NAICS,HH_MED_VAL_ORIG,TOT_JOBS_ORIG,SMOCP_ORIG_ADJ,GRNTP_ORIG_ADJ,PERNP_ORIG_ADJ,UNEMP_PCT_ORIG_ADJ,OWN_JOB_ORIG,TYPE_ORIG,CBSA_NAME_ORIG
0,2018GQ0000049,19,5,,,,4.0,15.0,18.0,6.0,...,RET,217900.000000,91687,1019.5,1028.0,36000.0,0.027939,8536,1,168.0
1,2018GQ0000058,18,5,,,,4.0,15.0,18.0,6.0,...,FOD,136400.000000,44030,819.0,846.5,33100.0,0.045455,3653,0,161.0
2,2018GQ0000219,53,5,,,,4.0,,17.0,6.0,...,PUB,143756.972263,166367,856.5,915.0,35000.0,0.039870,0,1,127.0
3,2018GQ0000246,28,5,,,,2.0,,19.0,6.0,...,,109000.000000,43827,623.5,687.0,29100.0,0.071856,0,3,-1.0
4,2018GQ0000251,25,5,,,,4.0,,12.0,6.0,...,MFG,130301.073819,177381,771.5,860.0,32000.0,0.057325,18404,1,48.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530894,2018HU1400326,41,3,2.0,2.0,2.0,4.0,,18.0,1.0,...,MED,207700.000000,53153,959.0,850.0,43600.0,0.018330,7425,3,-1.0
2530895,2018HU1400326,34,5,,,,4.0,,21.0,1.0,...,EDU,207700.000000,53153,959.0,850.0,43600.0,0.018330,4856,3,-1.0
2530896,2018HU1400502,49,3,2.0,2.0,2.0,4.0,,19.0,1.0,...,MED,290600.000000,54624,960.0,831.0,40000.0,0.012712,8056,3,-1.0
2530897,2018HU1400502,19,5,,,,4.0,,16.0,1.0,...,AGR,290600.000000,54624,960.0,831.0,40000.0,0.012712,882,3,-1.0


In [5]:
df["MED"] = np.where(df["NAICS"] == "MED", 1, 0)
df["MFG"] = np.where(df["NAICS"] == "MFG", 1, 0)
df["RET"] = np.where(df["NAICS"] == "RET", 1, 0)
df["EDU"] = np.where(df["NAICS"] == "EDU", 1, 0)
df["ADM"] = np.where(df["NAICS"] == "ADM", 1, 0)
df["FOD"] = np.where(df["NAICS"] == "FOD", 1, 0)
df["PRF"] = np.where(df["NAICS"] == "PRF", 1, 0)
df["TRN"] = np.where(df["NAICS"] == "TRN", 1, 0)
df["SRV"] = np.where(df["NAICS"] == "SRV", 1, 0)
df["FIN"] = np.where(df["NAICS"] == "FIN", 1, 0)
df["WHL"] = np.where(df["NAICS"] == "WHL", 1, 0)
df["AGR"] = np.where(df["NAICS"] == "AGR", 1, 0)
df["PUB"] = np.where(df["NAICS"] == "PUB", 1, 0)
df["INF"] = np.where(df["NAICS"] == "INF", 1, 0)
df["ENT"] = np.where(df["NAICS"] == "ENT", 1, 0)
df["REL"] = np.where(df["NAICS"] == "REL", 1, 0)
df["UTL"] = np.where(df["NAICS"] == "UTL", 1, 0)
df["EXT"] = np.where(df["NAICS"] == "EXT", 1, 0)
df["MNG"] = np.where(df["NAICS"] == "MNG", 1, 0)
df["CON"] = np.where(df["NAICS"] == "CON", 1, 0)
df["NO_IND"] = np.where(df["NAICS"].isna(), 1, 0)

In [6]:
# clean up the database (Biogeme Database can only have numerical values)
# df = df.select_dtypes(['number'])
df = df.fillna(0)

In [7]:
df["IN_COLLEGE"] = np.where((df["SCHG"] == 15) | (df["SCHG"] == 16), 1, 0)
df["IN_COLLEGE"]

0          1
1          1
2          0
3          0
4          0
          ..
2530894    0
2530895    0
2530896    0
2530897    0
2530898    0
Name: IN_COLLEGE, Length: 2530899, dtype: int64

In [8]:
df["STAY"].value_counts()

1    2202915
0     327984
Name: STAY, dtype: int64

In [9]:
df["AGE_18_34"] = np.where(df["AGEP"] <= 34, 1, 0)
df["AGE_35_64"] = np.where((df["AGEP"] >= 35) & (df["AGEP"] <= 64), 1, 0)
df["AGE_OVER_65"] = np.where((df["AGEP"] >= 65), 1, 0)
df["FOREIGN"] = np.where(df["NATIVITY"] == 2, 1, 0)

In [10]:
df["AGE_18_22"] = np.where(df["AGEP"] <= 22, 1, 0)
df["AGE_23_29"] = np.where((df["AGEP"] >= 23) & (df["AGEP"] <= 29), 1, 0)
df["AGE_30_39"] = np.where((df["AGEP"] >= 30) & (df["AGEP"] <= 39), 1, 0)
df["AGE_40_49"] = np.where((df["AGEP"] >= 40) & (df["AGEP"] <= 49), 1, 0)
df["AGE_50_64"] = np.where((df["AGEP"] >= 50) & (df["AGEP"] <= 64), 1, 0)

In [11]:
df["AGEP"].describe()

count    2.530899e+06
mean     5.002470e+01
std      1.886097e+01
min      1.800000e+01
25%      3.400000e+01
50%      5.100000e+01
75%      6.400000e+01
max      9.600000e+01
Name: AGEP, dtype: float64

In [12]:
df["EDU_LESS_HIGH"] = np.where(df["SCHL"] <= 15, 1, 0)
df["EDU_HIGH"] = np.where((df["SCHL"] <= 20) & (df["SCHL"] >= 16), 1, 0) 
df["EDU_COLLEGE"] = np.where(df["SCHL"] >= 21, 1, 0)
# df["EDU_NOCOLLEGE"] = np.where(df["EDU_COLLEGE"] == 0, 1, 0)

In [13]:
# df["WOMAN_CHILD"] = np.where((df["PAOC"] >= 1) & (df["PAOC"] <= 3), 1, 0)
# df["UNEMPLOYED"] = np.where(df["ESR"] == 3, 1, 0)

In [14]:
# df["MALE"] = np.where(df["SEX"] == 1, 1, 0)
# df["FEMALE"] = np.where(df["SEX"] == 0, 1, 0)

In [15]:
df["MARRIED"] = np.where(df["MAR"] == 1, 1, 0)

In [16]:
# df["child_old"] = np.where(df["child"] == df["REC_CHILD"], 0, df["child"])
# df["child_old"].value_counts()
df["child"].value_counts()

0    1755972
1     774927
Name: child, dtype: int64

In [17]:
df["REC_NO_MAR"] = np.where((df["MARHD"] == 1) | (df["MARHW"] == 1), 1, 0)
df["REC_NO_MAR"].value_counts()

0    2495055
1      35844
Name: REC_NO_MAR, dtype: int64

In [18]:
df["MARHM_new"] = np.where(df["MARHM"] == 2, 0, df["MARHM"])
df["MARHM_new"].value_counts()

0.0    2488855
1.0      42044
Name: MARHM_new, dtype: int64

In [19]:
# df["married_old"] = np.where((df["MARHM"] == df["MARRIED"]), 0, df["MARRIED"])
# df["married_old"].value_counts()

In [20]:
df["MILITARY"] = np.where(df["MIL"] == 1, 1, 0)

In [21]:
# up to debate still
df["AGR_EXT"] = np.where((df["AGR"] == 1) | (df["EXT"] == 1), 1, 0)
df["HIGH_ED"] = np.where((df["MED"] == 1) | (df["EDU"] == 1) | (df["PRF"] == 1) | (df["MED"] == 1) | (df["FIN"] == 1) | (df["INF"] == 1) | (df["MED"] == 1), 1, 0)
df["LICENSE"] = np.where((df["SRV"] == 1) | (df["REL"] == 1), 1, 0)
df["OTHER_JOB"] = np.where((df["AGR_EXT"] == 0) & (df["HIGH_ED"] == 0) & (df["LICENSE"] == 0) & (df["NO_IND"] == 0), 1, 0)

In [22]:
df["MICRO_adj_ORIG"] = np.where((df["TYPE_ORIG"] == 3) | (df["TYPE_ORIG"] == 2), 1, 0)
df["METRO"] = np.where(df["TYPE_ORIG"] == 1, 1, 0)
df["T34"] = np.where(df["TYPE_ORIG"] == 0, 1, 0)

In [23]:
df["UNEMPLOYED"] = np.where((df["ESR"] == 3) | (df["ESR"] == 6), 1, 0)
df["IN_LF"] = np.where(df["ESR"] == 6, 0, 1)

In [24]:
df["WORK2_MAR"] = np.where(df["FES"] == 1, 1, 0)
df["WORK1_MAR"] = np.where((df["FES"] <= 4) & (df["FES"] >= 2), 1, 0)
df["OTHER_FAMILY"] = np.where((df["HHT"] == 2) | (df["HHT"] == 3), 1, 0)

In [25]:
df.loc[df["CBSA_NAME_ORIG"] == -1, "CBSA_NAME_ORIG"] = -2

In [26]:
df["MIGSP_ORIG"] = df["ORIGIN"].astype(str).str.zfill(7).str[0:2].astype(int)

In [27]:
df.memory_usage().sum()

1660269872

In [28]:
# reading in external data
puma_acs_data = pd.read_csv(f"data/{acs}")
puma_lodes_data = pd.read_csv(f"data/{lodes}")

puma_acs_data = puma_acs_data.set_index("PUMA")
puma_lodes_data = puma_lodes_data.set_index("puma")

# puma_acs_data = puma_acs_data[~puma_acs_data["Geo_STATE"].isin([2, 15, 72])]

# puma_acs_data.index = puma_recodes.loc[puma_acs_data.index].values.reshape((-1))
# puma_lodes_data.index = puma_recodes.loc[puma_lodes_data.index].values.reshape((-1))

puma_acs_data.head()

Unnamed: 0_level_0,Geo_FIPS,Geo_GEOID,Geo_NAME,Geo_QName,Geo_STUSAB,Geo_LOGRECNO,Geo_STATE,Geo_PUMA5,Total Population,Population Density (Per Sq. Mile),...,Civilian Population 18 Years and Over Veteran,Civilian Population 18 Years and Over Veteran 18 to 64 Years,Civilian Population 18 Years and Over Veteran 65 Years and Over,Civilian Population 18 Years and Over Nonveteran,Civilian Population 18 Years and Over Nonveteran 18 to 64 Years,Civilian Population 18 Years and Over Nonveteran 65 Years and Over,Total Population Native Born,Total Population Foreign Born,Total Population Foreign Born Naturalized Citizen,Total Population Foreign Born Not a Citizen
PUMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100100,100100,79500US0100100,"Lauderdale, Colbert, Franklin & Marion (Northe...","Lauderdale, Alabama",al,57,1,100,184923,223.027706,...,9722,4332,5390,136809,105950,30859,178843.0,6080.0,1312.0,4768.0
100200,100200,79500US0100200,Limestone & Madison (Outer) Counties--Huntsvil...,Limestone & Madison (Outer) Counties--Huntsvil...,al,58,1,200,196618,237.132544,...,16167,9728,6439,135089,114171,20918,190579.0,6039.0,3440.0,2599.0
100301,100301,79500US0100301,Huntsville (North) & Madison (East) Cities PUM...,Huntsville (North) & Madison (East) Cities PUM...,al,59,1,301,133112,1459.68832,...,12813,8417,4396,91051,75831,15220,123801.0,9311.0,4154.0,5157.0
100302,100302,79500US0100302,"Huntsville City (Central & South) PUMA, Alabama","Huntsville City (Central & South) PUMA, Alabama",al,60,1,302,101737,1634.875764,...,8523,5271,3252,71585,57043,14542,95354.0,6383.0,3075.0,3308.0
100400,100400,79500US0100400,"DeKalb & Jackson Counties PUMA, Alabama","DeKalb & Jackson Counties PUMA, Alabama",al,61,1,400,123121,148.490962,...,5845,2846,2999,89462,70118,19344,117038.0,6083.0,669.0,5414.0


In [29]:
# reading in MIGPUMA-PUMA distance data
distances = pd.read_csv("data/puma_distance_matrix_fixed.csv").set_index("Unnamed: 0")
distances.index = distances.index.astype(int)
distances.columns = distances.columns.astype(int)
# distances.columns = puma_recodes.loc[distances.columns.astype(int)].values.reshape((-1))
distances

Unnamed: 0_level_0,600105,600102,608502,600108,600107,600101,600110,608504,600109,600106,...,2602200,2600900,4702402,4702300,4702401,4700500,4702501,5310400,5310300,5310100
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
600100,2.811224e+04,2.811224e+04,39057.0,2.811224e+04,2.811224e+04,2.811224e+04,2.811224e+04,32785.0,2.811224e+04,2.811224e+04,...,3659560.0,3658028.0,3704534.0,3707792.0,3697348.0,3709156.0,3675491.0,1510819.0,1366000.0,1521187.0
600700,2.676500e+05,2.541540e+05,308889.0,2.933460e+05,2.845210e+05,2.477670e+05,2.743320e+05,302616.0,2.903600e+05,2.738470e+05,...,3548479.0,3546947.0,3757628.0,3748449.0,3750726.0,3741417.0,3716148.0,1293877.0,1149057.0,1304245.0
601100,3.420630e+05,3.285670e+05,383302.0,3.677590e+05,3.589340e+05,3.221800e+05,3.618530e+05,377029.0,3.647730e+05,3.482590e+05,...,3646878.0,3645346.0,3856026.0,3846847.0,3849125.0,3839816.0,3814546.0,1246358.0,1101538.0,1256725.0
601300,5.418500e+04,4.434600e+04,90212.0,8.001100e+04,6.752000e+04,4.544600e+04,6.876300e+04,83940.0,7.168300e+04,5.517000e+04,...,3635231.0,3633699.0,3743085.0,3746342.0,3735899.0,3747707.0,3714042.0,1486490.0,1341670.0,1496858.0
601500,4.872780e+05,4.737820e+05,528517.0,5.129740e+05,5.041490e+05,4.673950e+05,5.070680e+05,522244.0,5.099870e+05,4.934740e+05,...,3594176.0,3592644.0,3803325.0,3794145.0,3796423.0,3787114.0,3761844.0,1149082.0,1004262.0,1159450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304007,3.996091e+06,4.004766e+06,3970737.0,3.980928e+06,3.987472e+06,4.012807e+06,3.958553e+06,3961958.0,3.991797e+06,3.985390e+06,...,1210028.0,1269788.0,373179.0,429406.0,391176.0,457262.0,418124.0,4143960.0,4251644.0,4527073.0
5151000,4.385759e+06,4.394434e+06,4360405.0,4.370596e+06,4.377141e+06,4.402476e+06,4.348221e+06,4351626.0,4.381466e+06,4.375058e+06,...,917732.0,1041456.0,718843.0,663499.0,695569.0,705881.0,692139.0,4231226.0,4338910.0,4661344.0
5151001,4.543694e+06,4.530198e+06,4542272.0,4.569390e+06,4.560565e+06,4.523811e+06,4.547511e+06,4533493.0,4.566404e+06,4.549890e+06,...,1050887.0,1169711.0,900710.0,845366.0,877436.0,887748.0,874006.0,4350356.0,4458040.0,4780474.0
4500600,4.316959e+06,4.325634e+06,4291605.0,4.301796e+06,4.308341e+06,4.333676e+06,4.279421e+06,4282827.0,4.312666e+06,4.306259e+06,...,1193418.0,1317143.0,712110.0,689738.0,721808.0,732120.0,718377.0,4444109.0,4551794.0,4827222.0


In [30]:
puma_recodes = pd.factorize(df["CHOSEN"].unique())[1]
puma_recodes = pd.DataFrame(index=puma_recodes, data=np.arange(1, 2337), columns=["code"])
puma_recodes

Unnamed: 0,code
101600,1
101900,2
102000,3
102400,4
102701,5
...,...
5600300,2332
5600100,2333
5600200,2334
5600500,2335


In [31]:
puma_recodes[puma_recodes.code == 12]

Unnamed: 0,code
102300,12


In [55]:
leave_puma = pd.read_csv(f"data/{lpuma}").set_index("CHOSEN")
leave_puma["UNEMP_PCT"] = leave_puma["UNEMP"] / (leave_puma["EMP"] + leave_puma["UNEMP"])
leave_puma

Unnamed: 0_level_0,JWMNP,FINCP,GRNTP,HINCP,SMOCP,PINCP,HH_COST,PERNP,WAGP,UNEMP,EMP,type,UNEMP_PCT
CHOSEN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100100,13.0,48000.0,680.0,33550.0,753.0,21800.0,0.262523,21800.0,20500.0,3,83,leavers,0.034884
100200,20.0,70000.0,785.0,70000.0,1107.0,35000.0,0.162381,34000.0,34000.0,3,86,leavers,0.033708
100301,15.0,77400.0,973.5,63600.0,977.0,30000.0,0.175737,30000.0,28600.0,8,77,leavers,0.094118
100302,15.0,62800.0,920.0,55000.0,1131.0,30000.0,0.217411,29050.0,28150.0,6,92,leavers,0.061224
100400,21.5,30300.0,580.0,35600.0,775.0,28600.0,0.159860,27000.0,27000.0,6,43,leavers,0.122449
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600100,10.0,76000.0,1005.0,55900.0,1001.0,26000.0,0.180750,26000.0,25000.0,3,84,leavers,0.034483
5600200,5.0,54000.0,922.0,61000.0,1130.0,35000.0,0.193548,25000.0,25000.0,2,41,leavers,0.046512
5600300,10.0,63700.0,930.0,42620.0,1338.0,20000.0,0.277500,19000.0,18000.0,12,135,leavers,0.081633
5600400,10.0,63800.0,820.0,61500.0,1206.5,25000.0,0.183302,24200.0,24000.0,2,60,leavers,0.032258


In [32]:
# reading in PUMA-MIGPUMA equivalency data
puma_migpuma = pd.read_excel("data/puma_migpuma.xlsx", dtype="str")
puma_migpuma["PUMA"] = puma_migpuma["State"] + puma_migpuma["PUMA"]
puma_migpuma["MIGPUMA"] = puma_migpuma["State"] + puma_migpuma["MIGPUMA"]
# some MIGPUMA fields have footnotes
puma_migpuma["MIGPUMA"] = puma_migpuma["MIGPUMA"].str.strip("*")
puma_migpuma = puma_migpuma.set_index("PUMA")
puma_migpuma = puma_migpuma.dropna()
puma_migpuma.index = puma_migpuma.index.astype(int)
puma_migpuma["State"] = puma_migpuma["State"].astype(int)
puma_migpuma

Unnamed: 0_level_0,JWMNP,FINCP,GRNTP,HINCP,SMOCP,PINCP,HH_COST,PERNP,WAGP,UNEMP,EMP,type,UNEMP_PCT
CHOSEN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100100,13.0,48000.0,680.0,33550.0,753.0,21800.0,0.262523,21800.0,20500.0,3,83,leavers,0.034884
100200,20.0,70000.0,785.0,70000.0,1107.0,35000.0,0.162381,34000.0,34000.0,3,86,leavers,0.033708
100301,15.0,77400.0,973.5,63600.0,977.0,30000.0,0.175737,30000.0,28600.0,8,77,leavers,0.094118
100302,15.0,62800.0,920.0,55000.0,1131.0,30000.0,0.217411,29050.0,28150.0,6,92,leavers,0.061224
100400,21.5,30300.0,580.0,35600.0,775.0,28600.0,0.159860,27000.0,27000.0,6,43,leavers,0.122449
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600100,10.0,76000.0,1005.0,55900.0,1001.0,26000.0,0.180750,26000.0,25000.0,3,84,leavers,0.034483
5600200,5.0,54000.0,922.0,61000.0,1130.0,35000.0,0.193548,25000.0,25000.0,2,41,leavers,0.046512
5600300,10.0,63700.0,930.0,42620.0,1338.0,20000.0,0.277500,19000.0,18000.0,12,135,leavers,0.081633
5600400,10.0,63800.0,820.0,61500.0,1206.5,25000.0,0.183302,24200.0,24000.0,2,60,leavers,0.032258


In [33]:
cbsas = pd.read_csv("data/puma_density.csv")
cbsas = cbsas.set_index("GEOID")
cbsas["type_num"] = 2
cbsas["type_num"] = np.where(cbsas["type"] == "T34", 0, cbsas["type_num"])
cbsas["type_num"] = np.where(cbsas["type"] == "Metro", 1, cbsas["type_num"])
# cbsas["type_num"] = np.where(cbsas["type"] == "Micro", 2, cbsas["type_num"])
cbsas["Name_num"] = pd.factorize(cbsas["CBSA_name"])[0]
cbsas

Unnamed: 0_level_0,Unnamed: 0,GISMATCH,GISJOIN,STATEFIP,State,PUMA,Name,center,type,CBSA_name,type_num,Name_num
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
600105,0,600105,G06000105,6,California,105,"Alameda County (West)--San Leandro, Alameda & ...",POINT (-2258808.793332302 339396.4722123478),T34,"San Francisco-Oakland-Hayward, CA Metro Area",0,0
600102,1,600102,G06000102,6,California,102,Alameda County (Northwest)--Oakland (Northwest...,POINT (-2262119.20387295 349529.9544024405),T34,"San Francisco-Oakland-Hayward, CA Metro Area",0,0
608502,2,608502,G06008502,6,California,8502,Santa Clara County (Northwest)--Sunnyvale & Sa...,POINT (-2251625.280558518 299700.2119385958),Metro,"San Jose-Sunnyvale-Santa Clara, CA Metro Area",1,1
600108,3,600108,G06000108,6,California,108,"Alameda County (Southwest)--Union City, Newark...",POINT (-2249381.571782368 314454.5304018197),T34,"San Francisco-Oakland-Hayward, CA Metro Area",0,0
600107,4,600107,G06000107,6,California,107,Alameda County (Central)--Hayward City PUMA,POINT (-2249209.156361292 325491.2460234718),T34,"San Francisco-Oakland-Hayward, CA Metro Area",0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4700500,2373,4700500,G47000500,47,Tennessee,500,Sumner County--Hendersonville City PUMA,POINT (845555.1346444382 -73006.03095774697),Metro,"Nashville-Davidson--Murfreesboro--Franklin, TN...",1,283
4702501,2374,4702501,G47002501,47,Tennessee,2501,Nashville-Davidson (East) PUMA,POINT (834713.9667902931 -111417.8817361052),Metro,"Nashville-Davidson--Murfreesboro--Franklin, TN...",1,283
5310400,2375,5310400,G53010400,53,Washington,10400,"Stevens, Okanogan, Pend Oreille & Ferry Counti...",POINT (-1684664.90787742 1429380.26889712),,,2,-1
5310300,2376,5310300,G53010300,53,Washington,10300,Chelan & Douglas Counties PUMA,POINT (-1811096.507300111 1383753.51149053),Metro,"Wenatchee, WA Metro Area",1,536


In [34]:
# reading in PUMA-MIGPUMA equivalency data
puma_migpuma = pd.read_excel("data/puma_migpuma.xlsx", dtype="str")
puma_migpuma["PUMA"] = puma_migpuma["State"] + puma_migpuma["PUMA"]
puma_migpuma["MIGPUMA"] = puma_migpuma["State"] + puma_migpuma["MIGPUMA"]
# some MIGPUMA fields have footnotes
puma_migpuma["MIGPUMA"] = puma_migpuma["MIGPUMA"].str.strip("*")
puma_migpuma = puma_migpuma.set_index("PUMA")
puma_migpuma = puma_migpuma.dropna()
puma_migpuma.index = puma_migpuma.index.astype(int)
puma_migpuma["State"] = puma_migpuma["State"].astype(int)
puma_migpuma

Unnamed: 0_level_0,State,MIGPUMA
PUMA,Unnamed: 1_level_1,Unnamed: 2_level_1
100100,1,0100190
100200,1,0100290
100301,1,0100290
100302,1,0100290
100400,1,0100400
...,...,...
7200902,72,7200900
7201001,72,7201001
7201002,72,7201002
7201101,72,7201101


In [35]:
df

Unnamed: 0,SERIALNO,AGEP,MAR,MARHD,MARHM,MARHW,MIL,SCHG,SCHL,ESR,...,OTHER_JOB,MICRO_adj_ORIG,METRO,T34,UNEMPLOYED,IN_LF,WORK2_MAR,WORK1_MAR,OTHER_FAMILY,MIGSP_ORIG
0,2018GQ0000049,19,5,0.0,0.0,0.0,4.0,15.0,18.0,6.0,...,1,0,1,0,1,0,0,0,0,13
1,2018GQ0000058,18,5,0.0,0.0,0.0,4.0,15.0,18.0,6.0,...,1,0,0,1,1,0,0,0,0,13
2,2018GQ0000219,53,5,0.0,0.0,0.0,4.0,0.0,17.0,6.0,...,1,0,1,0,1,0,0,0,0,1
3,2018GQ0000246,28,5,0.0,0.0,0.0,2.0,0.0,19.0,6.0,...,0,1,0,0,1,0,0,0,0,1
4,2018GQ0000251,25,5,0.0,0.0,0.0,4.0,0.0,12.0,6.0,...,1,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530894,2018HU1400326,41,3,2.0,2.0,2.0,4.0,0.0,18.0,1.0,...,0,1,0,0,0,1,0,0,0,56
2530895,2018HU1400326,34,5,0.0,0.0,0.0,4.0,0.0,21.0,1.0,...,0,1,0,0,0,1,0,0,0,56
2530896,2018HU1400502,49,3,2.0,2.0,2.0,4.0,0.0,19.0,1.0,...,0,1,0,0,0,1,0,0,1,56
2530897,2018HU1400502,19,5,0.0,0.0,0.0,4.0,0.0,16.0,1.0,...,0,1,0,0,0,1,0,0,1,56


In [36]:
utilities = np.empty((2530899, 2337), dtype=np.float64)
utilities.fill(0)

In [37]:
c_stay = 15.8
c_stay_age_18_22 = -0.426
c_stay_age_23_29 = -0.571
c_stay_age_30_39 = -0.317
c_stay_age_40_49 = 0
c_stay_age_50_64 = 0.231
c_stay_age_65 = 0.795
c_stay_child = -0.471
c_stay_rec_child = -0.0526
c_stay_rec_mar = -1.18
c_stay_rec_nomar = -0.57
c_stay_2work_mar = 1.02
c_stay_1work_mar = 0.899
c_stay_otherfamily = 0.632
c_stay_college = 0.692
c_stay_mil = -0.693
c_stay_ownjob = 0.97
c_stay_hh_val = 0.000000753
c_stay_rentcost = -0.000915
c_stay_owncost = 0.000689
c_stay_foreign = 0.204
c_stay_edu_college = -0.174
c_stay_edu_nohigh = 0.0563
c_stay_income = 0.00000101
c_stay_unemp_rate = 0

In [38]:
c_stay_micro = -0.208
c_stay_T34 = 0.579

In [39]:
utilities[:, 0] = c_stay \
    + c_stay_age_18_22 * df["AGE_18_22"] - c_stay_age_23_29 * df["AGE_23_29"] - c_stay_age_30_39 * df["AGE_30_39"] + c_stay_age_50_64 * df["AGE_50_64"] + c_stay_age_65 * df["AGE_OVER_65"] \
    + c_stay_child * df["child"] + c_stay_rec_child * df["REC_CHILD"] \
    + c_stay_rec_mar * df["MARHM_new"] + c_stay_rec_nomar * df["REC_NO_MAR"] \
    + df["MARRIED"].values * (c_stay_2work_mar * df["WORK2_MAR"].values + c_stay_1work_mar * df["WORK1_MAR"].values) + c_stay_otherfamily * df["OTHER_FAMILY"] \
    + c_stay_college * df["IN_COLLEGE"] + c_stay_mil * df["MILITARY"] \
    + c_stay_ownjob * df["OWN_JOB_ORIG"].values / df["TOT_JOBS_ORIG"].values \
    + c_stay_hh_val * df["HH_MED_VAL_ORIG"] + c_stay_rentcost * df["GRNTP_ORIG_ADJ"] + c_stay_owncost * df["SMOCP_ORIG_ADJ"] \
    + c_stay_foreign * df["FOREIGN"] \
    + c_stay_edu_college * df["EDU_COLLEGE"] + c_stay_edu_nohigh * df["EDU_LESS_HIGH"] \
    + df["IN_LF"].values * (c_stay_income * df["PERNP_ORIG_ADJ"].values) + c_stay_T34 * df["T34"] + c_stay_micro * df["MICRO_adj_ORIG"]

In [49]:
df[df["SERIALNO"] == "2018GQ0016994"]

Unnamed: 0,SERIALNO,AGEP,MAR,MARHD,MARHM,MARHW,MIL,SCHG,SCHL,ESR,...,OTHER_JOB,MICRO_adj_ORIG,METRO,T34,UNEMPLOYED,IN_LF,WORK2_MAR,WORK1_MAR,OTHER_FAMILY,MIGSP_ORIG
2337287,2018GQ0016994,40,5,0.0,0.0,0.0,4.0,0.0,17.0,6.0,...,1,0,1,0,1,0,0,0,0,51


In [41]:
pd.DataFrame(utilities[:, 0]).isna().sum()

0    0
dtype: int64

In [42]:
migpuma_acs_data = pd.read_csv(migpuma_acs_file).drop("Geo_FIPS", axis=1)
migpuma_lodes_data = pd.read_csv(migpuma_lodes_file).set_index("MIGPUMA")

In [43]:
migpuma_lodes_data

Unnamed: 0_level_0,Unnamed: 0,TOT_JOBS,JOBS_AGE_29,JOBS_AGE_30_54,JOBS_AGE_55,JOBS_EARN_1250,JOBS_EARN_1251_3333,JOBS_EARN_3334,AGR,EXT,...,EDU,MED,ENT,FOD,SRV,PUB,JOBS_EDU_NOHS,JOBS_EDU_HS,JOBS_EDU_NOBACH,JOBS_EDU_BACH
MIGPUMA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100190,0,98779,25260,51315,22204,24429,40764,33586,804,496,...,8442,14392,673,9445,2087,4809,9425,24176,24604,15314
100290,1,251113,61138,133922,56053,57381,83406,110326,978,110,...,17880,31247,3529,22888,4706,9503,22517,53389,60427,53642
100400,2,38349,9478,20490,8381,8663,17262,12424,339,20,...,3249,4185,103,2904,536,1831,4218,10119,9416,5118
100600,3,55347,12855,30219,12273,12340,20116,22891,224,68,...,3955,6304,283,4922,1059,2021,5588,13943,14067,8894
100700,4,37022,9531,19312,8179,8457,16222,12343,483,14,...,2931,4713,190,3076,679,1777,3639,9418,9358,5076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600100,970,54624,11754,29036,13834,14569,17808,22247,882,1574,...,5316,8056,1740,9497,1628,3025,4896,12946,14447,10581
5600200,971,45443,8889,24613,11941,9694,13269,22480,615,6767,...,4983,5484,335,3835,1408,3810,3921,12436,12994,7203
5600300,972,63021,15408,32819,14794,14940,20399,27682,499,791,...,8711,9714,682,6383,1698,7672,5195,13416,16437,12565
5600400,973,53153,11890,28441,12822,11754,16371,25028,526,4880,...,4856,7425,838,5363,1792,3215,4663,13825,14622,8153


In [44]:
df.index.astype(int)

RangeIndex(start=0, stop=2530899, step=1)

In [45]:
c_move = -0.00496
c_destchoice_dist = -0.000000302
c_destchoice_logdist = -0.35
c_destchoice_cbsa_dist = -0.0000278
c_destchoice_hhcost = -2.06
c_destchoice_college = 0.0000654
c_destchoice_foreign = 0.684
c_destchoice_age_18_34 = 3.68
c_destchoice_age_35_64 = 1.39
c_destchoice_age_over_65 = 4.62
c_destchoice_entscore_18_34 = 1.24
c_destchoice_entscore_35_64 = 2.98
c_destchoice_entscore_65 = 2.58
c_destchoice_unemp_rate = -2.1
c_destchoice_pincp  = 0
c_destchoice_T34_T34 = 0
c_destchoice_Metro_T34 = 0.0409
c_destchoice_Micro_T34 = -0.0367
c_destchoice_T34_Metro = 1.05
c_destchoice_Metro_Metro = 0.808
c_destchoice_Micro_Metro = 0.572
c_destchoice_T34_Micro = 1.17
c_destchoice_Metro_Micro = 1.25
c_destchoice_Micro_Micro = 2.01
c_destchoice_geo_spec_job = 6.35
c_destchoice_high_ed_job = 1.54
c_destchoice_license_job = 5.23
c_destchoice_other = 2.45
c_destchoice_military = 0.00018
c_destchoice_birthstate = 0.285
c_destchoice_samestate = 2.5

In [46]:
puma_lodes_data["filler"] = 0
df["NAICS"] = np.where(df["NAICS"] == 0, "filler", df["NAICS"])

In [47]:
puma_acs_data["COLLEGE"] = puma_acs_data["Population 3 Years and Over Enrolled in School Private School College"] + puma_acs_data["Population 3 Years and Over Enrolled in School Public School College"]

In [48]:
puma_acs_data["COLLEGE"] = puma_acs_data["COLLEGE"].fillna(0)
puma_acs_data["Total Population Foreign Born"] = puma_acs_data["Total Population Foreign Born"].fillna(0)

In [49]:
df["NAICS"].value_counts()

filler    698710
MED       247609
RET       200347
EDU       186754
MFG       186553
PRF       135138
ADM       130881
FOD       122090
CON       116691
SRV        91802
TRN        81404
FIN        80926
WHL        44625
ENT        43916
INF        35638
REL        35118
PUB        34602
AGR        29865
UTL        15897
EXT         9428
MNG         2905
Name: NAICS, dtype: int64

In [50]:
df["NAICS"] = np.where(df["NAICS"] == "SCA", "MED", df["NAICS"])

In [51]:
for puma, i in tqdm(puma_recodes.iterrows(), total=puma_recodes.shape[0]):
    index = i["code"]
    tot_pop = puma_acs_data.loc[puma, "Total Population"]
    dists = distances.loc[df["ORIGIN"], puma]
    cbsa_num = cbsas.loc[puma, 'Name_num']
    cbsa_type = cbsas.loc[puma, "type_num"]
    hh_cost = leave_puma.loc[puma, "HH_COST"]
    pincp = leave_puma.loc[puma, "PINCP"]
    unemp = leave_puma.loc[puma, "UNEMP"]
    emp = leave_puma.loc[puma, "EMP"]
    unemp_rate = unemp / (emp + unemp)
    foreign = puma_acs_data.loc[puma, "Total Population Foreign Born"]
    age_18_34 = puma_acs_data.loc[puma, "Total Population 18 to 34 Years"]
    age_35_64 = puma_acs_data.loc[puma, "Total Population 35 to 64 Years"]
    age_65 = puma_acs_data.loc[puma, "Total Population 65 and Over"]
    mil_num = puma_acs_data.loc[puma, "Population 16 Years and Over in Labor Force in Armed Forces"]
    tot_emp = puma_lodes_data.loc[puma, "TOT_JOBS"]
    ent = puma_lodes_data.loc[puma, "ENT"]
    own_job = puma_lodes_data.loc[puma, df["NAICS"]]
    state_num = puma_migpuma.loc[puma, "State"]
    college = puma_acs_data.loc[puma, "COLLEGE"]
    utilities[:, index] = c_move + np.log(tot_pop)  \
        + (df["CBSA_NAME_ORIG"].values != cbsa_num) * (c_destchoice_dist * dists.values + c_destchoice_logdist * np.log(dists + 1)) \
        + (df["CBSA_NAME_ORIG"].values == cbsa_num) * c_destchoice_cbsa_dist * dists.values \
        + c_destchoice_hhcost * hh_cost + c_destchoice_college * df["IN_COLLEGE"].values * college + c_destchoice_foreign * df["FOREIGN"].values * foreign / tot_pop \
        + c_destchoice_age_18_34 * df["AGE_18_34"].values * age_18_34 / tot_pop + c_destchoice_age_35_64 * df["AGE_35_64"].values * age_35_64 / tot_pop + c_destchoice_age_over_65 * df["AGE_OVER_65"].values * age_65 / tot_pop \
        + ent / tot_emp * (c_destchoice_entscore_18_34 * df["AGE_18_34"].values + c_destchoice_entscore_35_64 * df["AGE_35_64"].values + c_destchoice_entscore_65 * df["AGE_OVER_65"].values) \
        + df["IN_LF"].values * (c_destchoice_unemp_rate * unemp_rate) \
        + (c_destchoice_Metro_T34 * df["METRO"].values + c_destchoice_Micro_T34 * df["MICRO_adj_ORIG"].values) * (cbsa_type == 0) \
        + (c_destchoice_T34_Metro * df["T34"].values + c_destchoice_Metro_Metro * df["METRO"].values + c_destchoice_Micro_Metro * df["MICRO_adj_ORIG"].values) * (cbsa_type == 1) \
        + (c_destchoice_T34_Micro * df["T34"].values + c_destchoice_Metro_Micro * df["METRO"].values + c_destchoice_Micro_Micro * df["MICRO_adj_ORIG"].values) * (cbsa_type == 2) \
        + (c_destchoice_geo_spec_job * df["AGR_EXT"].values + c_destchoice_high_ed_job * df["HIGH_ED"].values + c_destchoice_license_job * df["LICENSE"].values + c_destchoice_other * df["OTHER_JOB"].values) * own_job.values / tot_emp + c_destchoice_military * df["MILITARY"].values * mil_num \
        + c_destchoice_birthstate * (df["POBP"].values == state_num) + c_destchoice_samestate * (df["MIGSP_ORIG"].values == state_num)

100%|██████████| 2336/2336 [25:24<00:00,  1.53it/s]


In [52]:
leave_puma

Unnamed: 0_level_0,JWMNP,FINCP,GRNTP,HINCP,SMOCP,PINCP,HH_COST,PERNP,WAGP,UNEMP,EMP,type,UNEMP_PCT
CHOSEN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100100,13.0,48000.0,680.0,33550.0,753.0,21800.0,0.262523,21800.0,20500.0,3,83,leavers,0.034884
100200,20.0,70000.0,785.0,70000.0,1107.0,35000.0,0.162381,34000.0,34000.0,3,86,leavers,0.033708
100301,15.0,77400.0,973.5,63600.0,977.0,30000.0,0.175737,30000.0,28600.0,8,77,leavers,0.094118
100302,15.0,62800.0,920.0,55000.0,1131.0,30000.0,0.217411,29050.0,28150.0,6,92,leavers,0.061224
100400,21.5,30300.0,580.0,35600.0,775.0,28600.0,0.159860,27000.0,27000.0,6,43,leavers,0.122449
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600100,10.0,76000.0,1005.0,55900.0,1001.0,26000.0,0.180750,26000.0,25000.0,3,84,leavers,0.034483
5600200,5.0,54000.0,922.0,61000.0,1130.0,35000.0,0.193548,25000.0,25000.0,2,41,leavers,0.046512
5600300,10.0,63700.0,930.0,42620.0,1338.0,20000.0,0.277500,19000.0,18000.0,12,135,leavers,0.081633
5600400,10.0,63800.0,820.0,61500.0,1206.5,25000.0,0.183302,24200.0,24000.0,2,60,leavers,0.032258


In [52]:
utilities

array([[15.61120057, 10.35559327, 11.20701926, ...,  7.53628642,
         7.65113166,  7.93253408],
       [16.14692987, 10.64702146, 11.72084163, ...,  7.4012468 ,
         7.52712173,  7.78608818],
       [15.4211525 , 10.53927704, 11.23509454, ...,  7.06356832,
         7.10459324,  7.22235387],
       ...,
       [16.52735333,  5.94725703,  6.34810336, ..., 11.78974064,
        12.08188584, 12.10047068],
       [15.97395915,  6.5958972 ,  6.89794892, ..., 11.57664029,
        11.76321075, 11.92017894],
       [16.3252697 ,  6.06065321,  6.51116883, ..., 12.0138607 ,
        12.62840124, 12.25067482]])

In [53]:
flows = pd.DataFrame(np.exp(utilities))

In [54]:
probs = pd.DataFrame(flows.values / flows.values.sum(axis=1)[:, None])
probs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2327,2328,2329,2330,2331,2332,2333,2334,2335,2336
0,0.099198,0.000518,0.001213,0.000378,0.000419,0.000070,0.000199,0.000161,0.000169,0.000345,...,0.000052,0.000063,0.000054,0.000077,0.000136,0.000165,0.000044,0.000031,0.000035,0.000046
1,0.158933,0.000650,0.001901,0.000516,0.000442,0.000087,0.000152,0.000181,0.000147,0.000380,...,0.000053,0.000065,0.000040,0.000068,0.000169,0.000133,0.000043,0.000025,0.000029,0.000037
2,0.312019,0.002366,0.004745,0.169818,0.008479,0.002436,0.004772,0.004021,0.007544,0.005327,...,0.000071,0.000079,0.000157,0.000180,0.000077,0.000103,0.000085,0.000073,0.000076,0.000086
3,0.392469,0.003842,0.007959,0.008529,0.032085,0.002074,0.011126,0.003295,0.013583,0.004818,...,0.000091,0.000094,0.000257,0.000355,0.000112,0.000305,0.000185,0.000157,0.000156,0.000213
4,0.259633,0.003148,0.004975,0.004742,0.007492,0.087434,0.006243,0.004037,0.008826,0.003258,...,0.000094,0.000068,0.000081,0.000129,0.000071,0.000076,0.000047,0.000040,0.000040,0.000056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530894,0.634738,0.000039,0.000058,0.000086,0.000235,0.000040,0.000253,0.000086,0.000233,0.000090,...,0.000060,0.000099,0.000390,0.000420,0.000060,0.018357,0.018690,0.015167,0.016275,0.031856
2530895,0.608538,0.000083,0.000109,0.000099,0.000321,0.000038,0.000268,0.000083,0.000226,0.000094,...,0.000067,0.000084,0.000299,0.000376,0.000101,0.028068,0.016976,0.015420,0.015877,0.033299
2530896,0.800479,0.000020,0.000030,0.000045,0.000123,0.000021,0.000132,0.000045,0.000122,0.000047,...,0.000034,0.000057,0.000225,0.000241,0.000035,0.006525,0.022477,0.007012,0.009391,0.009567
2530897,0.673511,0.000057,0.000077,0.000078,0.000268,0.000031,0.000218,0.000067,0.000214,0.000076,...,0.000061,0.000066,0.000293,0.000353,0.000085,0.010715,0.024605,0.008291,0.009992,0.011690


In [67]:
np.arange(2337)

array([   0,    1,    2, ..., 2334, 2335, 2336])

In [87]:
stay = 0

In [88]:
migrants = pd.DataFrame(data=0, index=distances.index, columns=puma_recodes.index)
migrants

Unnamed: 0_level_0,101600,101900,102000,102400,102701,100400,100600,102200,101303,100500,...,5541001,5541002,5500600,5501000,5540101,5600300,5600100,5600200,5600500,5600400
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
600100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
600700,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601300,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601500,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5151000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5151001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4500600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
translation = puma_recodes.reset_index().set_index("code")

In [90]:
for i, row in tqdm(probs.iterrows(), total=probs.shape[0]):
    choice = np.random.choice(2337, p=row)
    if choice == 0:
        stay += 1
    else:
        origin = df.loc[i, "ORIGIN"]
        dest = translation.loc[choice][0]
        migrants.loc[origin, dest] += 1

100%|██████████| 2530899/2530899 [10:00<00:00, 4211.54it/s]


In [91]:
stay

1293065

In [106]:
predicted_flows = migrants.sort_index().sort_index(axis=1)
predicted_flows

Unnamed: 0_level_0,100100,100200,100301,100302,100400,100500,100600,100700,100800,100900,...,5555102,5555103,5570101,5570201,5570301,5600100,5600200,5600300,5600400,5600500
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100190,17,5,9,9,22,24,9,32,9,10,...,0,0,0,0,1,0,1,1,0,1
100290,12,650,354,276,18,32,25,20,13,12,...,0,0,0,0,1,0,0,0,1,0
100400,7,4,3,5,355,15,3,7,2,5,...,0,0,0,0,0,0,0,0,0,0
100600,7,9,7,5,8,8,148,5,2,3,...,0,0,0,1,1,0,0,0,0,0
100700,1,7,2,1,7,15,5,183,2,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600100,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,24,3,7,4,4
5600200,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,31,4,8,5
5600300,0,0,0,0,0,2,0,1,0,0,...,0,0,0,0,0,4,7,158,10,2
5600400,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,6,9,7,18,8


In [109]:
actual_flows = actual_flows.sort_index()
actual_flows

CHOSEN,100100,100200,100301,100302,100400,100500,100600,100700,100800,100900,...,5555102,5555103,5570101,5570201,5570301,5600100,5600200,5600300,5600400,5600500
ORIGIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100190,104.0,5.0,1.0,3.0,3.0,4.0,5.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100290,8.0,102.0,80.0,117.0,9.0,109.0,4.0,5.0,1.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100400,3.0,2.0,2.0,0.0,67.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100600,4.0,3.0,3.0,6.0,2.0,5.0,61.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100700,4.0,2.0,1.0,0.0,0.0,4.0,5.0,65.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,68.0,5.0,7.0,0.0,4.0
5600200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,1.0,31.0,9.0,5.0,2.0
5600300,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,122.0,9.0,5.0
5600400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,3.0,4.0,49.0,5.0


In [111]:
((actual_flows - predicted_flows) ** 2).sum().sum() / (975 * 2336)

48.531950298559885

: 

In [95]:
migrants.to_csv("test_flows.csv")

In [94]:
actual_flows = df[df["STAY"] == 0]

In [98]:
actual_flows

Unnamed: 0,SERIALNO,AGEP,MAR,MARHD,MARHM,MARHW,MIL,SCHG,SCHL,ESR,...,OTHER_JOB,MICRO_adj_ORIG,METRO,T34,UNEMPLOYED,IN_LF,WORK2_MAR,WORK1_MAR,OTHER_FAMILY,MIGSP_ORIG
0,2018GQ0000049,19,5,0.0,0.0,0.0,4.0,15.0,18.0,6.0,...,1,0,1,0,1,0,0,0,0,13
1,2018GQ0000058,18,5,0.0,0.0,0.0,4.0,15.0,18.0,6.0,...,1,0,0,1,1,0,0,0,0,13
4,2018GQ0000251,25,5,0.0,0.0,0.0,4.0,0.0,12.0,6.0,...,1,0,1,0,1,0,0,0,0,1
9,2018GQ0001011,18,5,0.0,0.0,0.0,4.0,15.0,18.0,6.0,...,0,0,1,0,1,0,0,0,0,1
12,2018GQ0001284,57,4,2.0,2.0,2.0,2.0,0.0,19.0,6.0,...,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530840,2018HU1384891,20,5,0.0,0.0,0.0,4.0,15.0,20.0,1.0,...,1,1,0,0,0,1,0,0,0,31
2530841,2018HU1384891,19,5,0.0,0.0,0.0,4.0,15.0,16.0,1.0,...,0,1,0,0,0,1,0,0,0,8
2530860,2018HU1391612,45,1,2.0,2.0,2.0,4.0,0.0,20.0,1.0,...,0,1,0,0,0,1,1,0,0,56
2530861,2018HU1391612,57,1,2.0,2.0,2.0,4.0,0.0,18.0,1.0,...,1,1,0,0,0,1,1,0,0,56


In [99]:
actual_flows = actual_flows.groupby(["ORIGIN", "CHOSEN"])["SERIALNO"].count().unstack().fillna(0)

In [55]:
stay = probs.iloc[:, :1].sum()
stay

0    1.292800e+06
dtype: float64

In [56]:
stay.values

array([1292799.90167466])

In [57]:
non_stay = probs.loc[:, 1:]

In [63]:
non_stay.loc[0].sort_values()

297     0.000005
1744    0.000005
190     0.000006
350     0.000006
2277    0.000006
          ...   
671     0.011205
619     0.011946
639     0.013355
609     0.048075
631     0.340666
Name: 0, Length: 2336, dtype: float64

In [65]:
df.loc[0]["STAY"]

0

In [58]:
non_stay

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,2327,2328,2329,2330,2331,2332,2333,2334,2335,2336
0,0.000518,0.001213,0.000378,0.000419,0.000070,0.000199,0.000161,0.000169,0.000345,0.000229,...,0.000052,0.000063,0.000054,0.000077,0.000136,0.000165,0.000044,0.000031,0.000035,0.000046
1,0.000650,0.001901,0.000516,0.000442,0.000087,0.000152,0.000181,0.000147,0.000380,0.000178,...,0.000053,0.000065,0.000040,0.000068,0.000169,0.000133,0.000043,0.000025,0.000029,0.000037
2,0.002366,0.004745,0.169818,0.008479,0.002436,0.004772,0.004021,0.007544,0.005327,0.005037,...,0.000071,0.000079,0.000157,0.000180,0.000077,0.000103,0.000085,0.000073,0.000076,0.000086
3,0.003842,0.007959,0.008529,0.032085,0.002074,0.011126,0.003295,0.013583,0.004818,0.010811,...,0.000091,0.000094,0.000257,0.000355,0.000112,0.000305,0.000185,0.000157,0.000156,0.000213
4,0.003148,0.004975,0.004742,0.007492,0.087434,0.006243,0.004037,0.008826,0.003258,0.005327,...,0.000094,0.000068,0.000081,0.000129,0.000071,0.000076,0.000047,0.000040,0.000040,0.000056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2530894,0.000039,0.000058,0.000086,0.000235,0.000040,0.000253,0.000086,0.000233,0.000090,0.000276,...,0.000060,0.000099,0.000390,0.000420,0.000060,0.018357,0.018690,0.015167,0.016275,0.031856
2530895,0.000083,0.000109,0.000099,0.000321,0.000038,0.000268,0.000083,0.000226,0.000094,0.000260,...,0.000067,0.000084,0.000299,0.000376,0.000101,0.028068,0.016976,0.015420,0.015877,0.033299
2530896,0.000020,0.000030,0.000045,0.000123,0.000021,0.000132,0.000045,0.000122,0.000047,0.000144,...,0.000034,0.000057,0.000225,0.000241,0.000035,0.006525,0.022477,0.007012,0.009391,0.009567
2530897,0.000057,0.000077,0.000078,0.000268,0.000031,0.000218,0.000067,0.000214,0.000076,0.000211,...,0.000061,0.000066,0.000293,0.000353,0.000085,0.010715,0.024605,0.008291,0.009992,0.011690


In [59]:
migrants = pd.DataFrame(data=0, index=distances.index, columns=puma_recodes.index)

In [60]:
for i, row in tqdm(non_stay.iterrows(), total=non_stay.shape[0]):
    origin = df.loc[i, "ORIGIN"]
    migrants.loc[origin] += row.values

 10%|▉         | 241025/2530899 [01:33<14:46, 2584.24it/s]


KeyboardInterrupt: 

In [None]:
migrants.sum().sum()

In [None]:
migrants.to_csv("fixed_flows.csv")

In [None]:
# # exmaple
# puma = 5401300
# tot_pop = puma_acs_data.loc[puma, "Total Population"]
# dists = distances.loc[df["ORIGIN"].values, puma]
# cbsa_num = cbsas.loc[puma, 'Name_num']
# cbsa_type = cbsas.loc[puma, "type_num"]
# hh_cost = leave_puma.loc[puma, "HH_COST"]
# pincp = leave_puma.loc[puma, "PINCP"]
# unemp = leave_puma.loc[puma, "UNEMP"]
# emp = leave_puma.loc[puma, "EMP"]
# unemp_rate = unemp / (emp + unemp)
# foreign = puma_acs_data.loc[puma, "Total Population Foreign Born"]
# age_18_34 = puma_acs_data.loc[puma, "Total Population 18 to 34 Years"]
# age_35_64 = puma_acs_data.loc[puma, "Total Population 35 to 64 Years"]
# age_65 = puma_acs_data.loc[puma, "Total Population 65 and Over"]
# mil_num = puma_acs_data.loc[puma, "Population 16 Years and Over in Labor Force in Armed Forces"]
# tot_emp = puma_lodes_data.loc[puma, "TOT_JOBS"]
# ent = puma_lodes_data.loc[puma, "ENT"]
# own_job = puma_lodes_data.loc[puma, df["NAICS"]]
# state_num = puma_migpuma.loc[puma, "State"]
# college = puma_acs_data.loc[puma, "COLLEGE"]

# np.log(tot_pop)  \
#         + (df["CBSA_NAME_ORIG"].values != cbsa_num) * (c_destchoice_dist * dists.values + c_destchoice_logdist * np.log(dists + 1)) \
#         + (df["CBSA_NAME_ORIG"].values == cbsa_num) * c_destchoice_cbsa_dist * dists.values \
#         + c_destchoice_hhcost * hh_cost + c_destchoice_college * df["IN_COLLEGE"].values * college + c_destchoice_foreign * df["FOREIGN"].values * foreign / tot_pop \
#         + c_destchoice_age_18_34 * df["AGE_18_34"].values * age_18_34 / tot_pop + c_destchoice_age_35_64 * df["AGE_35_64"].values * age_35_64 / tot_pop + c_destchoice_age_over_65 * df["AGE_OVER_65"].values * age_65 / tot_pop \
#         + ent / (tot_emp + 1) * (c_destchoice_entscore_18_34 * df["AGE_18_34"].values + c_destchoice_entscore_35_64 * df["AGE_35_64"].values + c_destchoice_entscore_65 * df["AGE_OVER_65"].values) \
#         + df["IN_LF"].values * (c_destchoice_unemp_rate * unemp_rate + c_destchoice_pincp * pincp) \
#         + (c_destchoice_Metro_T34 * df["METRO"].values + c_destchoice_Micro_T34 * df["MICRO_adj_ORIG"].values) * (cbsa_type == 0) \
#         + (c_destchoice_T34_Metro * df["T34"].values + c_destchoice_Metro_Metro * df["METRO"].values + c_destchoice_Micro_Metro * df["MICRO_adj_ORIG"].values) * (cbsa_type == 1) \
#         + (c_destchoice_T34_Micro * df["T34"].values + c_destchoice_Metro_Micro * df["METRO"].values + c_destchoice_Micro_Micro * df["MICRO_adj_ORIG"].values) * (cbsa_type == 2) \
#         + (c_destchoice_geo_spec_job * df["AGR_EXT"].values + c_destchoice_high_ed_job * df["HIGH_ED"].values + c_destchoice_license_job * df["LICENSE"].values + c_destchoice_other * df["OTHER_JOB"].values) * own_job.values / (tot_emp + 1) + c_destchoice_military * df["MILITARY"].values * mil_num \
#         + c_destchoice_birthstate * (df["POBP"].values == state_num) + c_destchoice_samecbsa * (df["CBSA_NAME_ORIG"].values == cbsa_num) + c_destchoice_samestate * (df["MIGSP"].values == state_num)

In [None]:
# old utility formula (has issues when multiplying pandas dfs with index matching)
# utilities[:, index] = np.log(tot_pop)  \
#     + (df["CBSA_NAME_ORIG"] != cbsa_num).values * (c_destchoice_dist * dists + c_destchoice_logdist * np.log(dists + 1)) \
#     + (df["CBSA_NAME_ORIG"] == cbsa_num).values * c_destchoice_cbsa_dist * dists \
#     + c_destchoice_hhcost * hh_cost + c_destchoice_college * df["IN_COLLEGE"] * college + c_destchoice_foreign * df["FOREIGN"] * foreign / tot_pop \
#     + c_destchoice_age_18_34 * df["AGE_18_34"] * age_18_34 / tot_pop + c_destchoice_age_35_64 * df["AGE_35_64"] * age_35_64 / tot_pop + c_destchoice_age_over_65 * df["AGE_OVER_65"] * age_65 / tot_pop \
#     + ent / (tot_emp + 1) * (c_destchoice_entscore_18_34 * age_18_34 + c_destchoice_entscore_35_64 * age_35_64 + c_destchoice_entscore_65 * age_65) \
#     + df["IN_LF"] * (c_destchoice_unemp_rate * unemp_rate + c_destchoice_pincp * pincp) \
#     + (c_destchoice_Metro_T34 * df["METRO"] + c_destchoice_Micro_T34 * df["MICRO_adj_ORIG"]) * (cbsa_type == 0) \
#     + (c_destchoice_T34_Metro * df["T34"] + c_destchoice_Metro_Metro * df["METRO"] + c_destchoice_Micro_Metro * df["MICRO_adj_ORIG"]) * (cbsa_type == 1) \
#     + (c_destchoice_T34_Micro * df["T34"] + c_destchoice_Metro_Micro * df["METRO"] + c_destchoice_Micro_Micro * df["MICRO_adj_ORIG"]) * (cbsa_type == 2) \
#     + (c_destchoice_geo_spec_job * df["AGR_EXT"] + c_destchoice_high_ed_job * df["HIGH_ED"] + c_destchoice_license_job * df["LICENSE"] + c_destchoice_other * df["OTHER_JOB"]) * own_job / (tot_emp + 1) + c_destchoice_military * df["MILITARY"] * mil_num \
#     + c_destchoice_birthstate * (df["POBP"] == state_num) + c_destchoice_samecbsa * (df["CBSA_NAME_ORIG"] == cbsa_num) + c_destchoice_samestate * (df["MIGSP"] == state_num)