# This file constructs the node and edge files for Figure 1

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Python object stored in the .npy file; `.item()` unwraps the
# 0-d object array that np.load returns into the object itself.
# The cell that builds `poi2county` indexes it as pairs[cbg][poi] and adds
# those values up, so it is presumably {CBG id: {POI: visit weight}} -- TODO confirm.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# files from a trusted source.
pairs = np.load('../data/pairs_full_12.npy', allow_pickle=True).item()

In [None]:
from tqdm import tqdm

# poi2county[poi][county] accumulates the `pairs` values of every CBG that
# belongs to `county`, i.e. the CBG-level table is rolled up to counties.
poi2county = {}

for cbg in tqdm(pairs):
    # Integer-dividing the CBG id by 10^7 drops its last seven digits
    # (assumes 12-digit CBG ids, leaving the 5-digit county FIPS -- TODO confirm).
    county = cbg // 10000000
    cbg_visits = pairs[cbg]
    for poi in cbg_visits:
        county_totals = poi2county.setdefault(poi, {})
        county_totals[county] = county_totals.get(county, 0.0) + cbg_visits[poi]

In [4]:
# Load the CBG-level census table, dropping malformed rows instead of failing.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0,
# where passing it raises TypeError. `on_bad_lines='warn'` is its replacement
# and keeps the old behaviour (skip the bad line and emit a warning).
cbg_census_csv = '../data/census_cbg_with_predicted_hesitancy_vaccincation.csv'
try:
    CBG_data = pd.read_csv(cbg_census_csv, on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not know `on_bad_lines`; fall back to the legacy flag.
    CBG_data = pd.read_csv(cbg_census_csv, error_bad_lines=False)

In [5]:
# Table that is merged into CBG_data on 'census_block_group' in the next cell.
# Presumably it supplies the 'vac_rate_inferred' column used there -- TODO confirm.
prediction_vac = pd.read_csv('../data/vac_inferred_lvm.csv')

In [6]:
# County key: the CBG id with its last seven digits removed
# (assumes 12-digit CBG ids, giving the 5-digit county FIPS -- TODO confirm).
CBG_data['FIPS Code'] = CBG_data['census_block_group'] // 10000000
CBG_data = CBG_data.merge(prediction_vac, on='census_block_group')

# Population-weighted numerator for the county average.
CBG_data['vac_rate_inferred_times_total_population'] = (
    CBG_data['vac_rate_inferred'] * CBG_data['total_population']
)

# Per county: sum(rate * population) / sum(population).
CBG_data_sum = (
    CBG_data
    .groupby('FIPS Code')[['vac_rate_inferred_times_total_population', 'total_population']]
    .sum()
    .reset_index()
)
CBG_data_sum['county_level_weighted_average'] = (
    CBG_data_sum['vac_rate_inferred_times_total_population']
    / CBG_data_sum['total_population']
)

# Attach the county average back onto every CBG row of that county.
CBG_data = CBG_data.merge(
    CBG_data_sum[['FIPS Code', 'county_level_weighted_average']],
    on='FIPS Code',
)

# Complement of the rate (the division by 100 implies the rates are
# percentages), clamped into [0, 1]: first at CBG level, then at county level.
CBG_data['E_estimate_unsure'] = (
    1 - CBG_data['vac_rate_inferred'] / 100.0
).clip(lower=0.0, upper=1.0)
CBG_data['Estimated hesitant or unsure'] = (
    1 - CBG_data['county_level_weighted_average'] / 100.0
).clip(lower=0.0, upper=1.0)

# `vaccine` is an alias of the same DataFrame, not a copy.
vaccine = CBG_data

In [7]:
# Load a pickled Python object from the .npy file (`.item()` unwraps the 0-d
# object array). It is not referenced again in the visible part of this notebook.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# files from a trusted source.
dict_param = np.load('../data/dict_param_all_12.npy', allow_pickle=True).item()

In [8]:
# Map each census block group id to its total population.
# DataFrame.iterrows() builds a Series object per row, which is very slow on a
# table of this size. Iterating the underlying 2-D array once gives the same
# pairs: iterrows() itself yields rows of `.values`, so the (possibly upcast)
# key/value dtypes are unchanged, and later duplicates still overwrite earlier ones.
cbg2population = {
    cbg: population
    for cbg, population in vaccine[['census_block_group', 'total_population']].values
}

In [9]:
# Per-POI lookup tables, each unwrapped from a pickled 0-d object array.
# Both are indexed by POI when the edge weights are built further down, where
# a weight is scaled by sqrt(poi2dwell_corrects_total[poi] / poi2areas[poi]).
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# files from a trusted source.
poi2areas = np.load('../data/poi2area.npy', allow_pickle=True).item()
poi2dwell_corrects_total = np.load('../data/poi2dwell_corrects_total_12.npy', allow_pickle=True).item()

In [10]:
# POIs that have an entry for at least one county.
pois = {poi for poi in poi2county if len(poi2county[poi]) >= 1}

# County codes of all CBGs whose population exceeds 1, de-duplicated.
counties = set(sorted(
    cbg // 10000000
    for cbg, population in cbg2population.items()
    if population > 1
))

# Dense 0-based positions used as row (POI) and column (county) indices of
# the sparse bipartite matrix built later.
poi2idx = {poi: position for position, poi in enumerate(pois)}
county2idx = {county: position for position, county in enumerate(counties)}

## The next step uses a lot of memory. It is a good idea to save the files needed for downstream tasks and release the rest.

In [None]:
# `pairs` has already been rolled up into `poi2county` and is not referenced by
# any later cell shown here, so drop the name to let its memory be reclaimed.
del pairs
import gc

# Run a garbage-collection pass now, before the large allocations that follow.
gc.collect()

In [11]:

from tqdm import tqdm

# Coordinate-format triplets (row = POI index, col = county index, value)
# for the sparse POI x county matrix.
rows, cols, vals = [], [], []

for poi, county_totals in poi2county.items():
    # Skip POIs lacking an index, a dwell total or an area.
    if poi not in pois or poi not in poi2dwell_corrects_total or poi not in poi2areas:
        continue
    for county, total in county_totals.items():
        # Skip counties that were not given a column index.
        if county not in county2idx:
            continue
        rows.append(poi2idx[poi])
        cols.append(county2idx[county])
        # County total scaled by sqrt(dwell total / area) of the POI.
        vals.append(total * np.sqrt(poi2dwell_corrects_total[poi] / poi2areas[poi]))

[0.4680150483508192, 0.006433196540904731, 0.008041495676130914, 0.006433196540904731, 0.006433196540904731, 0.05762375883239318, 0.05762375883239318, 0.10084157795668806, 0.05762375883239318, 0.05762375883239318, 0.05762375883239318, 0.11524751766478636, 0.05762375883239318, 0.05762375883239318, 0.11524751766478636, 0.05762375883239318, 0.05762375883239318, 0.25523506571234106, 0.00622524550517905, 0.00622524550517905, 0.10285233725053743, 0.4502215517382016, 0.007762440547210372, 0.14748637039699708, 0.23869504682671894, 0.007762440547210372, 0.015524881094420745, 0.007762440547210372, 0.009703050684012966, 0.007762440547210372, 0.30079457120440195, 0.04657464328326223, 0.015524881094420745, 0.04463403314645964, 0.007762440547210372, 0.007762440547210372, 0.007762440547210372, 0.02522793177843371, 0.007762440547210372, 0.007762440547210372, 0.023287321641631116, 0.06986196492489335, 0.17465491231223337, 0.03493098246244668, 0.10673355752414262, 0.007762440547210372, 0.007762440547210

In [14]:
from scipy.sparse import csr_matrix
# Sparse POI x county matrix: entry (poi2idx[poi], county2idx[county]) holds the
# weight collected in `vals`. The shape is given explicitly so that POIs and
# counties without any triplet still get an (all-zero) row or column.
bipartite = csr_matrix((vals, (rows, cols)), shape=(len(poi2idx), len(county2idx)))

In [15]:
# Project the bipartite matrix onto counties: the result is county x county and
# entry (i, j) is the sum over POIs of weight[poi, i] * weight[poi, j].
# NOTE(review): despite the name, no normalisation is applied in the visible code.
bipartite_normed_product = bipartite.T @ bipartite


In [16]:
# Materialise the county x county product densely so that per-column
# percentiles can be computed on it in the next cell.
bipartite_normed_product = bipartite_normed_product.todense()

In [17]:
# Boolean county x county mask that keeps, in every column, only the entries
# above that column's (1 - 6/n) quantile, i.e. roughly its six largest values.
# The diagonal is not excluded here; self-pairs are filtered out when the edge
# list is written.
# The row count used to be the literal 3105, which fails with a shape mismatch
# as soon as the number of counties changes; it is now read from the matrix.
n_counties = bipartite_normed_product.shape[0]
column_cutoffs = np.percentile(
    bipartite_normed_product, 100 * (1 - 6 / n_counties), axis=0
)
mask = bipartite_normed_product > np.tile(column_cutoffs, [n_counties, 1])

In [18]:
# Node table: one row per distinct (county code, county weighted average) pair,
# written with the column names 'Id' and 'unvax'.
# NOTE(review): 'unvax' is filled with 'county_level_weighted_average' (the
# weighted vaccination rate), not with its complement
# 'Estimated hesitant or unsure' -- confirm this labelling is intended.
node_column_names = {'FIPS Code': 'Id', 'county_level_weighted_average': 'unvax'}
vaccine_node = (
    vaccine[list(node_column_names)]
    .drop_duplicates()
    .rename(columns=node_column_names)
)
vaccine_node.to_csv('node_US_county_test.csv', index=False)


In [19]:
# Inverse of county2idx: matrix position -> county code.
idx2county = {position: county for county, position in county2idx.items()}

In [20]:
# Write one 'source, target' line of county codes for every True cell of `mask`,
# skipping the diagonal (self-loops).
# our output is named edge_US_county.csv
# The file is opened with a context manager so the handle is closed even if a
# lookup or write fails part-way through (it used to be closed only on success).
with open('edge_US_county_test.csv', 'w') as fo:
    fo.write('Source, Target\n')

    # np.argwhere yields the (row, column) index pair of each True cell.
    for x, y in np.argwhere(mask):
        if x != y:
            # %-formatting is kept on purpose: '%d' also accepts float-valued
            # county codes, which an f-string ':d' format would reject.
            fo.write('%d, %d\n' % (idx2county[x], idx2county[y]))