# Population Flow

Let's take a look at the flow between states in hopes of learning which states' data to aggregate in our final model. 

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import preprocessing

In [2]:
data = pd.read_csv('../data/graph.csv')
data.head()

Unnamed: 0,source_state,target_state,04-12-2020,04-13-2020,04-14-2020,04-15-2020,04-16-2020,04-17-2020,04-18-2020,04-19-2020,...,08-22-2020,08-23-2020,08-24-2020,08-25-2020,08-26-2020,08-27-2020,08-28-2020,08-29-2020,08-30-2020,08-31-2020
0,Alabama,Alabama,616434.0,784114.0,756555.0,786938.0,792072.0,846587.0,783567.0,619458.0,...,988618.0,873985.0,950066.0,952730.0,961319.0,994395.0,993710.0,951297.0,826922.0,991456.0
1,Alabama,Alaska,16.0,8.0,13.0,12.0,9.0,10.0,10.0,11.0,...,54.0,48.0,30.0,44.0,37.0,50.0,42.0,44.0,45.0,50.0
2,Alabama,Arizona,158.0,176.0,148.0,158.0,140.0,176.0,170.0,155.0,...,236.0,230.0,220.0,211.0,226.0,220.0,216.0,212.0,179.0,213.0
3,Alabama,Arkansas,298.0,399.0,370.0,401.0,649.0,683.0,650.0,412.0,...,772.0,647.0,622.0,738.0,683.0,660.0,686.0,751.0,698.0,585.0
4,Alabama,California,358.0,421.0,421.0,369.0,341.0,380.0,330.0,367.0,...,609.0,564.0,572.0,569.0,592.0,610.0,573.0,697.0,638.0,638.0


In [3]:
data.isna().sum()

source_state    0
target_state    0
04-12-2020      0
04-13-2020      0
04-14-2020      0
               ..
08-27-2020      0
08-28-2020      0
08-29-2020      0
08-30-2020      0
08-31-2020      0
Length: 144, dtype: int64

Nice, no interpolation is needed. 

In [4]:
data.describe()

Unnamed: 0,04-12-2020,04-13-2020,04-14-2020,04-15-2020,04-16-2020,04-17-2020,04-18-2020,04-19-2020,04-20-2020,04-21-2020,...,08-22-2020,08-23-2020,08-24-2020,08-25-2020,08-26-2020,08-27-2020,08-28-2020,08-29-2020,08-30-2020,08-31-2020
count,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,...,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0
mean,10311.67,11707.23,11511.4,11784.38,11797.31,12305.19,11631.54,10573.03,12052.44,12799.9,...,16198.5,14530.65,15117.24,15292.83,15351.36,15388.28,15771.8,15309.74,13971.03,16095.96
std,103450.1,116882.6,113902.4,116793.4,116989.9,121821.0,114933.3,104512.4,119046.2,126838.4,...,155182.9,139663.7,146165.2,148519.4,148198.8,148253.5,151739.9,147772.5,134310.4,155286.7
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,24.0,24.75,24.0,24.0,25.0,24.0,23.0,27.0,29.0,...,64.0,57.0,57.0,58.0,57.0,58.0,59.0,58.0,55.0,61.0
50%,71.5,81.0,79.5,83.0,83.0,85.0,79.0,76.0,87.0,99.0,...,199.0,177.0,171.0,168.0,170.5,174.0,178.0,181.0,167.5,182.0
75%,251.0,288.0,302.5,305.0,307.0,311.25,297.0,268.25,310.25,351.0,...,786.25,672.25,610.5,603.5,611.25,635.0,669.0,696.0,623.25,652.0
max,2865996.0,3208712.0,3080286.0,3198044.0,3216221.0,3350439.0,3152302.0,2825683.0,3248109.0,3475880.0,...,4268882.0,3792737.0,3951518.0,4058344.0,3963659.0,3934572.0,4108671.0,4061164.0,3611719.0,4156280.0


Aight, now let's make some helper functions to extract a state's data as a time series. 

In [5]:
# Given a single data record, returns the total flow as a time series. 
def extract_timeseries(point):
    ts = pd.Series(point[2:])
    ts.index = pd.to_datetime(point.index[2:])
    return ts

# Graph Construction

Let's construct a dense adjacency matrix. 

In [6]:
states = np.unique(data['source_state'])

# Adjacency Matrix is a 50x50 array. 
# adjacency[s, t] = # people going from source state s to target state t
# s and t are state "numbers"; ie. the index of the state in "states"
adj = np.zeros((len(states), len(states)))

for s, source_state in enumerate(states): 
    src_data = data.loc[data['source_state'] == source_state]
    for t, target_state in enumerate(states): 
        entry = src_data.loc[src_data['target_state'] == target_state].iloc[0]     
        adj[s, t] = np.sum(extract_timeseries(entry))

assert adj[0, 0] == extract_timeseries(data.iloc[0]).sum()

In [7]:
print(adj)

[[1.34305209e+08 6.18500000e+03 3.53990000e+04 ... 1.88160000e+04
  3.40060000e+04 1.83540000e+04]
 [4.78500000e+03 6.76601400e+06 1.35080000e+04 ... 5.62000000e+02
  3.53800000e+03 1.70900000e+03]
 [2.79350000e+04 2.57200000e+04 9.33214980e+07 ... 6.82600000e+03
  1.12522000e+05 5.66850000e+04]
 ...
 [1.79120000e+04 9.68000000e+02 6.54700000e+03 ... 2.63978590e+07
  1.42340000e+04 3.92500000e+03]
 [3.10290000e+04 6.53000000e+03 5.78300000e+04 ... 1.35210000e+04
  9.02275640e+07 2.86220000e+04]
 [3.86800000e+03 2.68100000e+03 1.94000000e+04 ... 1.68400000e+03
  5.77500000e+03 7.32258200e+06]]


For simplification purposes, we'll make our graph undirected. 
We assume that the total population of states is relatively constant over time; ie. that all people who enter a state must eventually leave to their home state. 
With this assumption, "total connection" between states will be defined as the sum of all traffic between two states. 

In [8]:
# If they change the number of states, our code will still work ;)
for i in range(len(states)): 
    for j in range(len(states)):
        tot = adj[i, j] + adj[j, i]
        adj[i, j] = tot
        adj[j, i] = tot

Now, we trim our adjacency matrix to only those entries above a certain threshold $\epsilon$. 

More complex functions can be done as needed (eg. as a percentage of state population). Perhaps we could even do a GMM style soft correlation matrix?

In [53]:
ε = 5e6

# connected components; 1 if connection, 0 if not. 
conn = np.zeros((len(states), len(states)))
conn[adj > ε] = 1
conn[adj <= ε] = 0

print(conn)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


Looks good so far. 

## Clustering States

In [10]:
import scipy.sparse as sparse
import scipy.sparse.csgraph as csgraph

In [11]:
n, clusters = csgraph.connected_components(
    csgraph=sparse.csr_matrix(conn), 
    directed=False, 
    return_labels=True)

In [12]:
print(n)
print(clusters)

27
[ 0  1  2  3  2  4  5  6  0  0  7  8  9  9 10  9 11 12 13  0 14 11  9  0
  9 15 16  2 17  0 18  0  0 19 11 12 20  0 21  0 22  0 12 23 24  0 20 25
  9 26]


In [13]:
for label in np.unique(clusters): 
    print(f'Cluster {label}:')
    for ind, val in enumerate(clusters): 
        if val == label: 
            print(f'\t{states[ind]}')
    print()

Cluster 0:
	Alabama
	Florida
	Georgia
	Maryland
	Mississippi
	New Jersey
	New York
	North Carolina
	Pennsylvania
	South Carolina
	Tennessee
	Virginia

Cluster 1:
	Alaska

Cluster 2:
	Arizona
	California
	Nevada

Cluster 3:
	Arkansas

Cluster 4:
	Colorado

Cluster 5:
	Connecticut

Cluster 6:
	Delaware

Cluster 7:
	Hawaii

Cluster 8:
	Idaho

Cluster 9:
	Illinois
	Indiana
	Kansas
	Minnesota
	Missouri
	Wisconsin

Cluster 10:
	Iowa

Cluster 11:
	Kentucky
	Michigan
	Ohio

Cluster 12:
	Louisiana
	Oklahoma
	Texas

Cluster 13:
	Maine

Cluster 14:
	Massachusetts

Cluster 15:
	Montana

Cluster 16:
	Nebraska

Cluster 17:
	New Hampshire

Cluster 18:
	New Mexico

Cluster 19:
	North Dakota

Cluster 20:
	Oregon
	Washington

Cluster 21:
	Rhode Island

Cluster 22:
	South Dakota

Cluster 23:
	Utah

Cluster 24:
	Vermont

Cluster 25:
	West Virginia

Cluster 26:
	Wyoming



By tweaking the value of $\epsilon$, we can get anywhere from 6 to 40 clusters. 
Nice. 

## Evaluating Clustering Results

If our clustering is accurate, we should be getting correlations between the deaths in these states. Let's verify this with a covariance matrix. 

In [14]:
train = pd.read_csv("../data/train.csv")
train.head()

Unnamed: 0,ID,Province_State,Date,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
0,0,Alabama,04-12-2020,3563,93,,3470.0,75.98802,21583.0,437.0,2.61016,460.300152,12.264945
1,1,Alaska,04-12-2020,272,8,66.0,264.0,45.504049,8038.0,31.0,2.941176,1344.711576,11.397059
2,2,Arizona,04-12-2020,3542,115,,3427.0,48.662422,42109.0,,3.246753,578.522286,
3,3,Arkansas,04-12-2020,1280,27,367.0,1253.0,49.439423,19722.0,130.0,2.109375,761.753354,10.15625
4,4,California,04-12-2020,22795,640,,22155.0,58.137726,190328.0,5234.0,2.81202,485.423869,22.961176


In [50]:
deaths = []
for i, state in enumerate(states): 
    state_deaths = train.loc[train['Province_State'] == state]['Deaths']
    deaths.append(state_deaths)
    
print(deaths)

[0         93
50        99
100      114
150      118
200      133
        ... 
6850    2076
6900    2107
6950    2152
7000    2162
7050    2182
Name: Deaths, Length: 142, dtype: int64, 1        8
51       8
101      9
151      9
201      9
        ..
6851    37
6901    37
6951    37
7001    37
7051    37
Name: Deaths, Length: 142, dtype: int64, 2        115
52       122
102      131
152      142
202      150
        ... 
6852    4929
6902    4978
6952    5007
7002    5030
7052    5029
Name: Deaths, Length: 142, dtype: int64, 3        27
53       29
103      32
153      33
203      37
       ... 
6853    739
6903    756
6953    772
7003    784
7053    797
Name: Deaths, Length: 142, dtype: int64, 4         640
54        714
104       767
154       860
204       956
        ...  
6854    12677
6904    12805
6954    12894
7004    12937
7054    13022
Name: Deaths, Length: 142, dtype: int64, 5        289
55       306
105      327
155      328
205      355
        ... 
6855    1931
6905    19

In [51]:
cov = np.cov(deaths)

In [52]:
print(cov)

[[3.71758174e+05 4.73740805e+03 9.53834757e+05 ... 2.68674527e+04
  1.61807596e+05 5.83210069e+03]
 [4.73740805e+03 6.55976925e+01 1.24499029e+04 ... 3.48119019e+02
  1.93650744e+03 7.21797023e+01]
 [9.53834757e+05 1.24499029e+04 2.48224822e+06 ... 6.78657748e+04
  4.02396636e+05 1.46519950e+04]
 ...
 [2.68674527e+04 3.48119019e+02 6.78657748e+04 ... 2.11316417e+03
  1.19248834e+04 4.36684947e+02]
 [1.61807596e+05 1.93650744e+03 4.02396636e+05 ... 1.19248834e+04
  7.57247312e+04 2.66501898e+03]
 [5.83210069e+03 7.21797023e+01 1.46519950e+04 ... 4.36684947e+02
  2.66501898e+03 9.69609430e+01]]
