In [73]:
import functools
import json
import numpy as np
import pandas as pd

from collections import defaultdict

**Step 1. Load data.**

In [74]:
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes, instead of being left open until garbage collection.
    """
    with open(path) as handle:
        return json.load(handle)


# NOTE(review): the ILI table is read from data/Small while every JSON input
# comes from data/Full -- confirm this mix is intentional.
ili_data = pd.read_csv("data/Small/input/Flu_ILI.csv")
tweets = _read_json("data/Full/input/Flu_Vacc_Tweet_TRAIN.json")
states = _read_json("data/Full/input/StateInfo.json")
regions_to_counties = _read_json("data/Full/input/Region2CountyMap.json")
county_adjacency = _read_json("data/Full/input/county_adjacency_lower48.json")

**Step 2. Gather all dates (in order).**

In [75]:
# 52 consecutive weekly dates, 08/10/2013 through 08/02/2014, formatted as
# MM/DD/YYYY strings.
dates = (
    pd.date_range(start='2013-08-10', periods=52, freq='7D')
    .strftime('%m/%d/%Y')
    .tolist()
)

In [76]:
# Sanity check: one entry per week of the study period.
print(f"Number of dates: {len(dates)}")

Number of dates: 52


**Step 3. Compute statistics for each county (indexed by FIPS code)**

Note: this is a hacky stand-in for a demographics-adjusted Twitter population: tweet counts are simply normalised by each county's total 2014 population estimate.

In [77]:
# Per-county containers, all keyed by FIPS code: two covariate series
# (one value per date) and the county population.
fips_to_cov1, fips_to_cov2 = defaultdict(list), defaultdict(list)
fips_to_pop = dict()

In [78]:
# Build two weekly covariate series per county (one value per entry of `dates`):
#   cov1 = log(tweets per 1,000 residents); the +0.1 keeps zero-tweet weeks finite.
#   cov2 = log-odds of the vaccination fraction; the +0.001 added to both the
#          numerator and the denominator keeps 0% and 100% finite.
for fips_code, blob in tweets.items():

    # Counties with no vaccination series are left out of all three maps.
    if 'Vaccination percentage %' not in blob:
        continue

    population = blob['Population, 2014 estimate']
    tweet_counts = blob['No. of Tweets']
    vacc_pct = blob['Vaccination percentage %']

    cov1_series = []
    cov2_series = []
    for date in dates:
        cov1_series.append(np.log((tweet_counts[date] + 0.1) / population * 1000))

        vacc_frac = vacc_pct[date] / 100
        cov2_series.append(np.log((vacc_frac + 0.001) / (1 - vacc_frac + 0.001)))

    # Assign the finished series instead of appending to the shared lists, so
    # re-running this cell replaces a county's data rather than doubling it.
    fips_to_cov1[fips_code] = cov1_series
    fips_to_cov2[fips_code] = cov2_series
    fips_to_pop[fips_code] = population

**Step 4. Construct regions; get *only* the relevant counties from the training data.**

In [79]:
# Region names are the ILI columns after the first four (the original loop
# skipped column positions 0-3; presumably those are non-region metadata --
# verify against the CSV header).
#
# Kept as a list in CSV column order rather than a set: the row order of the
# region matrices saved later follows the iteration order of `regions`, and
# the iteration order of a set of strings can differ between kernel sessions
# because of string hash randomization.
N_LEADING_NON_REGION_COLUMNS = 4
regions = list(ili_data.columns[N_LEADING_NON_REGION_COLUMNS:])

In [80]:
# Every county (FIPS code) that belongs to at least one selected region.
#
# Duplicates are removed through a set, then the result is sorted into a list:
# the position of each county in `counties` defines the row/column order of
# every matrix written to data_processed/, so it must not depend on set
# iteration order, which can change between kernel sessions.
counties = sorted({
    fips
    for r in regions
    for fips in regions_to_counties[r].keys()
})

In [81]:
# Report how many regions and counties were selected.
print(f'Number of regions: {len(regions)}')
print(f'Number of counties: {len(counties)}')

Number of regions: 9
Number of counties: 82


**Step 4b. Save data for counties in matrices.**

In [82]:
# FIPS code -> position of that county in the saved matrices.
county_to_index = dict()

In [83]:
# Number the counties in the order `counties` yields them.
county_to_index.update(
    (fips, position) for position, fips in enumerate(counties)
)

In [84]:
# Inverse lookup: matrix position -> FIPS code.
index_to_county = dict(zip(county_to_index.values(), county_to_index.keys()))

In [85]:
# One entry (population) or one row (covariate series) per county, in the
# order `counties` yields them.
county_pop_matrix = np.array([fips_to_pop[fips] for fips in counties])
cov1_matrix = np.array([fips_to_cov1[fips] for fips in counties])
cov2_matrix = np.array([fips_to_cov2[fips] for fips in counties])

# Write each array out as plain text.
np.savetxt('data_processed/county_pops.txt', county_pop_matrix)
np.savetxt('data_processed/covariates1.txt', cov1_matrix)
np.savetxt('data_processed/covariates2.txt', cov2_matrix)

In [86]:
# Shapes of the two covariate matrices and the population vector, one per line.
print(cov1_matrix.shape, cov2_matrix.shape, county_pop_matrix.shape, sep='\n')

(82, 52)
(82, 52)
(82,)


**Step 5. Construct matrix of indicators for whether regions contain counties.**

In [45]:
# Region-by-county indicator matrix (1 where the region contains the county)
# and the total population of each region's indexed counties.
n_regions, n_counties = len(regions), len(counties)
county_map_matrix = np.zeros((n_regions, n_counties))
region_pop_matrix = [0] * n_regions

for row, region in enumerate(regions):
    # Only counties that received a matrix index are counted.
    member_fips = [
        fips for fips in regions_to_counties[region] if fips in county_to_index
    ]
    for fips in member_fips:
        county_map_matrix[row, county_to_index[fips]] = 1
        region_pop_matrix[row] += fips_to_pop[fips]

np.savetxt('data_processed/county_map.txt', county_map_matrix)
np.savetxt('data_processed/region_pops.txt', region_pop_matrix)

In [46]:
# Expect (number of regions, number of counties).
print(np.shape(county_map_matrix))

(9, 82)


**Step 6. Get adjacency matrix for counties to construct covariance matrix for multivariate Gaussian.**

In [47]:
# Identity matrix with one row/column per county; county adjacency is not
# used here.
cov_matrix = np.identity(len(counties))

In [48]:
# Write the county covariance matrix out as plain text.
np.savetxt(fname='data_processed/corr_cov.txt', X=cov_matrix)

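**TODO:** replace the identity-matrix placeholder above with a covariance matrix built from `county_adjacency`.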

In [101]:
# Number of top-level entries in the adjacency data loaded from
# county_adjacency_lower48.json (displayed as the cell output).
len(county_adjacency)

3107