## Imports and setup

In [None]:
# Grab data from ACS with cenpy (don't need to run every time!)
# Variables:
#    - household income (B19001_001E)
#    - lower quartile house price (B25076_001E)
#    - median house price (B25077_001E)
#    - upper quartile house price (B25078_001E)
#    - number of housing units (B00002_001E)

countydata = gpd.GeoDataFrame()

states = pd.read_csv('../data/states.csv')['State'].values
for state in tqdm(states):
    countydata = countydata.append(acs.from_state(state=state, variables=['B19001_001E', 'B25076_001E', 'B25077_001E', 'B25078_001E', 'B00002_001E'], level='county'))

countydata.to_csv('../data/ACS_countydata.csv')

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from scipy.linalg import norm

from tqdm import tqdm
from cenpy.products import ACS
from spint.gravity import Gravity, Attraction, Production

acs = ACS()
LA_FIPS = '06037'  # LA County FIPS code
variables = ['B19001_001E', 'B25076_001E', 'B25077_001E', 'B25078_001E', 'B00002_001E']

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

## Load data

In [62]:
# Flows
inflows_raw = pd.read_csv('../data/LACounty_ACS_2014_2018_All_IN.csv', usecols=['State/County FIPS', 'Total', 'Margin of Error (+/-)']).replace("'", "", regex=True)
outflows_raw = pd.read_csv('../data/LACounty_ACS_2014_2018_All_OUT.csv', usecols=['State/County FIPS', 'Total', 'Margin of Error (+/-)']).replace("'", "", regex=True)
outflows_raw.rename(columns={'State/County FIPS' : 'destFIPS', 'Total' : 'total_out', 'Margin of Error (+/-)' : 'outMOE'}, inplace=True)
inflows_raw.rename(columns={'State/County FIPS' : 'originFIPS', 'Total' : 'total_in', 'Margin of Error (+/-)' : 'inMOE'}, inplace=True)

In [4]:
# County-level demographic data (and convert covariate columns)
countydata = gpd.read_file('../data/ACS_countydata.csv', GEOM_POSSIBLE_NAMES="geometry", KEEP_GEOM_COLUMNS="NO").replace('', 0)
countydata['B00002_001E'] = countydata['B00002_001E'].astype(float)
countydata['B19001_001E'] = countydata['B19001_001E'].astype(float)
countydata['B25076_001E'] = countydata['B25076_001E'].astype(float)
countydata['B25077_001E'] = countydata['B25077_001E'].astype(float)
countydata['B25078_001E'] = countydata['B25078_001E'].astype(float)

In [5]:
# Ping ACS for data for LA county
la_data = acs.from_county("Los Angeles County, CA", variables=['B19001_001E', 'B25076_001E', 'B25077_001E', 'B25078_001E', 'B00002_001E'], level='county')

## Rearrage data to prep for analysis

In [6]:
# Merge all the data into one big dataframe to make it simpler
data = pd.merge(pd.merge(countydata, outflows_raw, how='inner', left_on='GEOID', right_on='destFIPS'), inflows_raw, how='inner', left_on='GEOID', right_on='originFIPS').set_crs(epsg=3395)
la_covars = np.tile(la_data[variables].values, (data.shape[0], 1))  # repeat the data so that we account for the single location properly

In [7]:
# costs are all distances between LA and the out or in destination
coords = np.hstack((data.centroid.x.values.reshape(-1, 1), data.centroid.y.values.reshape(-1, 1)))
la_coords = np.array([la_data.centroid.x[0], la_data.centroid.y[0]])
cost = norm(la_coords - coords, axis=1)

## Calibrate model 

In [38]:
model = Gravity(flows=data['total_out'].values, o_vars=la_covars, d_vars=data[variables].values, cost=cost, cost_func='exp', constant=False)

In [60]:
paramnames = [*[x + '_o' for x in variables], *[x + '_d' for x in variables], 'distance']
pd.DataFrame(data={'paramname' : paramnames, 'paramval' : model.params, 'SE' : model.std_err, 'tvalue' : model.tvalues, 'pvalue' : model.pvalues})  # intercept, origin attrs, dest attrs, distance

Unnamed: 0,paramname,paramval,SE,tvalue,pvalue
0,B19001_001E_o,-37.81286,,,
1,B25076_001E_o,-244.6795,3680.112,-0.066487,0.9469901
2,B25077_001E_o,275.9205,,,
3,B25078_001E_o,-33.98551,,,
4,B00002_001E_o,41.80905,,,
5,B19001_001E_d,0.1402123,0.005594608,25.062049,1.2902979999999999e-138
6,B25076_001E_d,-2.337505,0.02246427,-104.054348,0.0
7,B25077_001E_d,3.840602,0.04427451,86.74523,0.0
8,B25078_001E_d,-0.9517738,0.0277881,-34.251126,4.196199e-257
9,B00002_001E_d,0.8157609,0.006249579,130.530532,0.0


In [58]:
model.SRMSE

153.4685439545562

In [17]:
out_model = Production(flows=data['total_out'].values, origins=data['originFIPS'].values, d_vars=data[variables].values, cost=cost, cost_func='exp')
in_model = Attraction(flows=data['total_in'].values, destinations=data['destFIPS'].values, o_vars=data[variables].values, cost=cost, cost_func='exp')

Exception: one or more input arrays have more columns than rows