ONE-TIME USE DATA COLLECTION CALL -- run once with level = 'county' and once with level = 'tract'

In [None]:
import pandas as pd
from cenpy.products import ACS
acs = ACS()
# Geography to pull.  Rerun this cell with 'county' to write the county-level
# file; the output filename below is derived from this value.
level = 'tract'

# ACS variable codes requested for every geography.  The list is named
# `acs_variables` rather than `vars` so it does not shadow the Python builtin.
acs_variables = [
    # total pop
    # NOTE(review): the Census variable list describes B00001 as the
    # unweighted sample count and B01003_001E as total population -- confirm
    # which one is intended before relying on this column.
    'B00001_001E',
    'B07411_001E',   # Median income in the past 12 months -- Total living in area 1 year ago
    'B02001_002E',   # total white people
]
states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware',
    'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
    'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
    'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# All-states variant, kept disabled for reference (one request per state,
# results concatenated into a single frame):
#for state in states:
#    if state == 'Alabama':
#        data = acs.from_state(state, level=level, variables=acs_variables)
#    else:
#        data = pd.concat([data, acs.from_state(state, level=level, variables=acs_variables)], ignore_index=True)
#    print(f'state {state} complete')

# Active path: California only, written to ../data/CA<level>_level_data.csv.
data = acs.from_state('California', level=level, variables=acs_variables)
data.to_csv(f'../data/CA{level}_level_data.csv')

  return _prepare_from_string(" ".join(pjargs))


## Imports and data loading

In [None]:
import sqlite3
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from scipy.linalg import norm
from spint.gravity import Gravity, Production
from spopt import MaxPHeuristic, RegionKMeansHeuristic, WardSpatial

In [None]:
# County-to-county daily flows: keep only the origin/destination ids, their
# centroid coordinates, and the two flow measures.
county_flow_columns = [
    'geoid_o', 'geoid_d',
    'lng_o', 'lat_o',
    'lng_d', 'lat_d',
    'visitor_flows', 'pop_flows',
]
county_level = pd.read_csv(
    '../data/daily_county2county_08_10.csv',
    usecols=county_flow_columns,
)

In [None]:
# Tract-to-tract daily flows.  Only the first 10 data rows are read (nrows=10),
# so this frame is a small preview rather than the full table.
tract_flow_columns = [
    'geoid_o', 'geoid_d',
    'lng_o', 'lat_o',
    'lng_d', 'lat_d',
    'visitor_flows', 'pop_flows',
]
ct_level = pd.read_csv(
    '../data/daily_ct2ct_08_10.csv',
    usecols=tract_flow_columns,
    nrows=10,
)

In [None]:
# Load the origin/destination census variables from the saved census table
# instead of querying cenpy directly (cenpy was too finicky).
# The extra keyword arguments are forwarded by geopandas to the file reader;
# presumably they tell the CSV driver to build geometry from the "geometry"
# column and not keep that column as a plain attribute -- confirm against the
# driver documentation.
county_csv_options = {
    'GEOM_POSSIBLE_NAMES': "geometry",
    'KEEP_GEOM_COLUMNS': "NO",
}
county_bds = gpd.read_file('../data/county_level_data.csv', **county_csv_options)

In [None]:
# Preview of the flows joined to the census table on the origin GEOID.  The
# result is not assigned, so it only appears as the cell output.
# NOTE(review): county_level was read without a string converter for geoid_o,
# so its dtype may not match the GEOID values in county_bds -- confirm.
census_by_geoid = county_bds.set_index('GEOID')
county_level.join(census_by_geoid, on='geoid_o')

In [None]:
# Create spatial cost via distances between centroids.
# Each row of county_level is one origin->destination flow, so the cost must be
# one distance per row.  The previous `cdist` call was never imported
# (NameError) and would have produced an all-pairs n x n matrix instead of a
# length-n vector; the row-wise norm below uses `norm`, which is already
# imported from scipy.linalg at the top of the notebook.
# NOTE(review): this is a straight-line distance in longitude/latitude degrees,
# not a ground distance -- confirm that is acceptable for the model.
o_coords = np.hstack((county_level['lng_o'].values.reshape(-1, 1), county_level['lat_o'].values.reshape(-1, 1)))
d_coords = np.hstack((county_level['lng_d'].values.reshape(-1, 1), county_level['lat_d'].values.reshape(-1, 1)))
cost = norm(o_coords - d_coords, axis=1)

In [None]:
# Gravity model on the full county table with an exponential cost function.
# NOTE(review): o_vars and d_vars are not assigned in any cell visible here, so
# this call raises NameError until they are built.
observed_flows = county_level['visitor_flows'].values
Gravity(observed_flows, o_vars, d_vars, cost, 'exp')

In [None]:
# TODO:
# o_vars and d_vars from cenpy
# boundaries from cenpy (join)
# cost -- spatial distance between centroids

## Pare it down
The dataset is far too large to handle at once, so we work with a subset to keep testing fast.

In [None]:
# TODO:
# select CA flows from county2county and ct2ct
# put gravity model on tracts and on counties
# test aggregation schemes

In [None]:
# California tract- and county-level census tables saved by the one-time
# collection cell.  Both files are read with the same reader options, which
# geopandas forwards to the file reader (presumably so geometry is built from
# the "geometry" column -- confirm against the driver documentation).
ca_csv_options = {
    'GEOM_POSSIBLE_NAMES': "geometry",
    'KEEP_GEOM_COLUMNS': "NO",
}
CAtracts = gpd.read_file('../data/CAtract_level_data.csv', **ca_csv_options)
CAcounties = gpd.read_file('../data/CAcounty_level_data.csv', **ca_csv_options)

In [None]:
def isCA_counties(x):
    """Flag which county GEOIDs are California counties.

    x is an iterable of GEOID strings.  Returns a list of booleans, one per
    element and in the same order, that is True where the GEOID begins
    with '06'.
    """
    return [geoid.startswith('06') for geoid in x]

def isCA_cts(x):
    """Flag which census-tract GEOIDs are California tracts.

    x is an iterable of GEOID strings.  Returns a list of booleans, one per
    element and in the same order, that is True where the GEOID is exactly
    10 characters long and begins with '6' (presumably the tract ids are
    stored without the leading zero of the '06' prefix -- confirm against
    the data).

    The length is checked before the first character is inspected, so an
    empty GEOID yields False instead of raising IndexError.
    """
    return [len(geoid) == 10 and geoid.startswith('6') for geoid in x]

In [None]:
def _whole_number(text):
    """Parse a numeric CSV field through float and truncate it to an int."""
    return int(float(text))


def _load_ca_flows(csv_path, is_ca):
    """Read a flow table and keep the rows whose origin and destination both pass is_ca.

    GEOID columns are kept as strings and the two flow columns are converted
    to integers while the file is parsed.
    """
    flows = pd.read_csv(
        csv_path,
        converters={
            'geoid_o': str,
            'geoid_d': str,
            'visitor_flows': _whole_number,
            'pop_flows': _whole_number,
        },
        usecols=['geoid_o', 'geoid_d', 'lng_o', 'lat_o', 'lng_d', 'lat_d', 'visitor_flows', 'pop_flows'],
    )
    within_ca = np.logical_and(is_ca(flows['geoid_o']), is_ca(flows['geoid_d']))
    return flows[within_ca]


# California-to-California flows at the county level, then at the tract level.
county_daily = _load_ca_flows('../data/daily_county2county_08_10.csv', isCA_counties)

ct_daily = _load_ca_flows('../data/daily_ct2ct_08_10.csv', isCA_cts)

In [None]:
# Per-flow spatial cost for the California county flows: stack each flow's
# origin and destination centroid into (lng, lat) rows, then take the
# Euclidean length of the row-wise difference.  The distance is measured in
# coordinate (degree) units.
o_coords = np.column_stack((county_daily['lng_o'].values, county_daily['lat_o'].values))
d_coords = np.column_stack((county_daily['lng_d'].values, county_daily['lat_d'].values))
centroid_offsets = o_coords - d_coords
cost = norm(centroid_offsets, axis=1)

In [None]:
# Attach the flow columns to the county table: index the flows by origin
# GEOID, then join them onto CAcounties through its 'GEOID' column.
flows_by_origin = county_daily.set_index('geoid_o')
CAcounties = CAcounties.join(flows_by_origin, on='GEOID')

In [None]:
# Gravity model on the California county flows with an exponential cost
# function.  pop_flows is the modelled flow; visitor_flows is passed as both
# the origin and the destination variable.
# NOTE(review): using the same column for both looks like a placeholder until
# census attributes are joined in -- confirm.
population_flows = county_daily['pop_flows'].values
visitor_counts = county_daily['visitor_flows'].values
Gravity(population_flows, visitor_counts, visitor_counts, cost, 'exp')