In [1]:
import pandas as pd
import datetime
import numpy as np
import altair as alt
import fidap
import config
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt

# Instantiate the connection to fidap using the API key stored in the local `config` module.
# NOTE: this rebinds the name `fidap` from the imported module to the client object,
# so every later `fidap.sql(...)` call goes through the client, not the module.
fidap = fidap.fidap_client(api_key = config.api_key)

The question at hand is the current surge in Covid-19 cases in the US, with a particular focus on colleges. As usual, we begin by looking at how the US as a whole has fared since July 4th, 2021.

In [9]:
# National 7-day moving average of new Covid-19 cases per 100k since 2021-07-04.
#
# Query outline:
#   covid_cases - sums cumulative confirmed cases over all counties for each date and
#                 differences consecutive dates (LAG) to obtain daily new cases.
#   census_join - averages new cases over a trailing window and scales the result by
#                 100000 / 332721914 (hard-coded denominator; presumably a US
#                 population estimate - TODO confirm its source).
#
# Window notes:
#   * The frame is `ROWS BETWEEN 6 PRECEDING AND CURRENT ROW`, i.e. exactly 7 rows.
#     The shorthand `ROWS 7 PRECEDING` ends at the current row and so spans 8 rows.
#   * The first row's new_cases is NULL (LAG has no predecessor) and AVG skips NULLs,
#     so the first window that holds 7 daily values ends at row 8: `row_num >= 8`.
national_average = fidap.sql("""
WITH covid_cases AS (
SELECT date,
ROW_NUMBER() OVER (ORDER BY date ASC) AS row_num,
(SUM(confirmed_cases) - (LAG(SUM(confirmed_cases), 1) OVER (ORDER BY date ASC))) AS new_cases,
FROM bigquery-public-data.covid19_nyt.us_counties
WHERE date >= CAST("2021-07-04" AS DATE)
AND confirmed_cases >= 0
AND deaths >= 0
GROUP BY date),

census_join AS (
SELECT CAST(c.date AS STRING) AS nyt_date, c.new_cases, c.row_num,
ROUND(100000*(AVG(c.new_cases) OVER (ORDER BY c.date ASC ROWS BETWEEN 6 PRECEDING AND CURRENT ROW))/332721914, 2) AS new_cases_ma7
FROM covid_cases AS c)

SELECT nyt_date, new_cases, new_cases_ma7
FROM census_join
WHERE row_num >= 8
ORDER BY nyt_date DESC
""")

# The query returns dates as strings; convert them so the x axis is temporal.
national_average['nyt_date'] = pd.to_datetime(national_average['nyt_date'])
alt.Chart(national_average).mark_line().encode(
    x = alt.X('nyt_date', title = "Date"),
    y = alt.Y('new_cases_ma7', title = "7-day Moving Average New Cases per 100k")
)

In [11]:
# 7-day moving average of new cases per 100k since 2021-07-04 for the 10 counties
# with the highest share of population in student housing.
#
# Query outline:
#   covid_cases  - per-county daily new cases (cumulative count minus the previous
#                  day's count, via LAG partitioned by county_fips_code).
#   census_join  - joins each county to the census row whose 5-character GEOCODE
#                  equals its FIPS code and scales the trailing average by POP100.
#   final_query  - keeps only rows whose window holds a full 7 daily values.
#   college_town - top 10 counties by P0050008 / POP100 (aliased student_housing / total).
#
# Window notes:
#   * `ROWS BETWEEN 6 PRECEDING AND CURRENT ROW` is exactly 7 rows; the shorthand
#     `ROWS 7 PRECEDING` ends at the current row and so spans 8 rows.
#   * Each county's first new_cases is NULL (no LAG predecessor) and AVG skips NULLs,
#     so the first complete 7-value window ends at row 8: `row_num >= 8`.
college_counties_average_july4 = fidap.sql("""
WITH covid_cases AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY county_fips_code ORDER BY date ASC) AS row_num,
(confirmed_cases - (LAG(confirmed_cases, 1) OVER (PARTITION BY county_fips_code ORDER BY date ASC))) AS new_cases,
FROM bigquery-public-data.covid19_nyt.us_counties
WHERE date >= CAST("2021-07-04" AS DATE)
AND confirmed_cases >= 0
AND deaths >= 0),

census_join AS (
SELECT CAST(c.date AS STRING) AS nyt_date, c.county, c.state_name, c.confirmed_cases, c.new_cases, c.row_num, c.county_fips_code,
ROUND(100000*(AVG(c.new_cases) OVER (PARTITION BY c.county_fips_code ORDER BY c.date ASC ROWS BETWEEN 6 PRECEDING AND CURRENT ROW))/CAST(census.POP100 AS INT64), 2) AS new_cases_ma7
FROM covid_cases AS c
INNER JOIN fidap-301014.us_census_2020.Redistricting_Data_Complete AS census
ON census.GEOCODE = c.county_fips_code
WHERE CHAR_LENGTH(census.GEOCODE) = 5),

final_query AS (
SELECT nyt_date, county, state_name, confirmed_cases, new_cases, county_fips_code, new_cases_ma7
FROM census_join
WHERE row_num >= 8
),

college_town AS (
SELECT CAST(POP100 AS INT64) AS total,
CAST(P0050008 AS INT64) AS student_housing,
CONCAT(c.NAMELSAD, ", ", STUSAB) AS county,
ROUND(100*SAFE_DIVIDE(CAST(P0050008 AS INT64),CAST(POP100 AS INT64)), 2) AS pct_student_housing,
GEOCODE
FROM fidap-301014.us_census_2020.Redistricting_Data_Complete AS r
INNER JOIN fidap-301014.us_census_2020.2020_county_boundaries AS c
ON c.GEOID = GEOCODE
WHERE CHAR_LENGTH(r.GEOID) = 14
AND GEOVAR = '00'
AND COUNTY IS NOT NULL
ORDER BY pct_student_housing DESC
LIMIT 10
)

SELECT nyt_date, ct.county, state_name, GEOCODE, pct_student_housing, confirmed_cases, new_cases, new_cases_ma7
FROM final_query
INNER JOIN college_town AS ct
ON ct.GEOCODE = county_fips_code
WHERE new_cases_ma7 IS NOT NULL;
""")

# The query returns dates as strings; convert them so the x axis is temporal.
college_counties_average_july4['nyt_date'] = pd.to_datetime(college_counties_average_july4['nyt_date'])
alt.Chart(college_counties_average_july4).mark_line().encode(
    x = alt.X('nyt_date', title = "Date"),
    y = alt.Y('new_cases_ma7', title = "7-day Moving Average New Cases per 100k"),
    color = alt.Color('county', title = "County")
)

We see that there is a steady rise in the number of new cases. But how do these counties compare with the national rate? Instead of plotting the US national infection rate directly, we can express the 7-day moving average of each of these 10 college counties as a percentage of the US national rate.

In [14]:
# Give the national series its own column name so it can sit next to the county series.
national_average = national_average.rename(columns={'new_cases_ma7': 'national_ma7'})

# Attach the national value for the same date to every college-county row.
aggregated_rate = college_counties_average_july4.merge(
    national_average[['nyt_date', 'national_ma7']],
    on='nyt_date',
)

# County moving average as a percentage of the national moving average (1 decimal place).
aggregated_rate['pct_rate'] = (
    100 * aggregated_rate['new_cases_ma7'] / aggregated_rate['national_ma7']
).round(1)

# One line per county, relative to the national rate.
alt.Chart(aggregated_rate).mark_line().encode(
    alt.X('nyt_date', title="Date"),
    alt.Y('pct_rate', title="7-day Moving Average New Cases per 100k (% of US MA7)"),
    alt.Color('county', title="County"),
)

For now, counties with a large student population are not seeing a surge in cases as colleges reopen, with the exception of those in VA. That could be the result of a more localized Covid-19 situation.  
  
We might also want to consider the possibility that counties' case loads are correlated with those of their neighbors, because people may cross county lines.   

In [43]:
# Compare each top-10 college county's 7-day moving average (per 100k, since
# 2021-08-01) with that of every county adjacent to it.
#
# Query outline:
#   covid_cases          - per-county daily new cases via LAG over cumulative counts.
#   census_join          - trailing average scaled by the county's POP100.
#   final_query          - keeps rows whose window holds a full 7 daily values.
#   college_counties     - top 10 counties by P0050008 / POP100
#                          (aliased student_housing / total).
#   college_counties_adj - one row per (college county, neighboring FIPS code), from
#                          unnesting adjacent_counties.neighbors_fips_code.
#   college_counties_ma  - adds the college county's own moving average per date.
#   final SELECT         - adds the neighbor's moving average for the same date.
#
# Window notes:
#   * `ROWS BETWEEN 6 PRECEDING AND CURRENT ROW` is exactly 7 rows; the shorthand
#     `ROWS 7 PRECEDING` ends at the current row and so spans 8 rows.
#   * Each county's first new_cases is NULL (no LAG predecessor) and AVG skips NULLs,
#     so the first complete 7-value window ends at row 8: `row_num >= 8`.
college_counties_adj_comparison = fidap.sql("""
WITH covid_cases AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY county_fips_code ORDER BY date ASC) AS row_num,
(confirmed_cases - (LAG(confirmed_cases, 1) OVER (PARTITION BY county_fips_code ORDER BY date ASC))) AS new_cases,
FROM bigquery-public-data.covid19_nyt.us_counties
WHERE date >= CAST("2021-08-01" AS DATE)
AND confirmed_cases >= 0
AND deaths >= 0),

census_join AS (
SELECT CAST(c.date AS STRING) AS nyt_date, c.county, c.state_name, c.confirmed_cases, c.new_cases, c.row_num, c.county_fips_code,
ROUND(100000*(AVG(c.new_cases) OVER (PARTITION BY c.county_fips_code ORDER BY c.date ASC ROWS BETWEEN 6 PRECEDING AND CURRENT ROW))/CAST(census.POP100 AS INT64), 2) AS new_cases_ma7
FROM covid_cases AS c
INNER JOIN fidap-301014.us_census_2020.Redistricting_Data_Complete AS census
ON census.GEOCODE = c.county_fips_code
WHERE CHAR_LENGTH(census.GEOCODE) = 5),

final_query AS (
SELECT nyt_date, county, state_name, confirmed_cases, new_cases, county_fips_code, new_cases_ma7
FROM census_join
WHERE row_num >= 8),

college_counties AS (
SELECT CAST(POP100 AS INT64) AS total,
CAST(P0050008 AS INT64) AS student_housing,
CONCAT(c.NAMELSAD, ", ", STUSAB) AS county,
ROUND(100*SAFE_DIVIDE(CAST(P0050008 AS INT64),CAST(POP100 AS INT64)), 2) AS pct_student_housing,
GEOCODE
FROM fidap-301014.us_census_2020.Redistricting_Data_Complete AS r
INNER JOIN fidap-301014.us_census_2020.2020_county_boundaries AS c
ON c.GEOID = GEOCODE
WHERE CHAR_LENGTH(r.GEOID) = 14
AND GEOVAR = '00'
AND COUNTY IS NOT NULL
ORDER BY pct_student_housing DESC
LIMIT 10
),

college_counties_adj AS (
SELECT cc.county, state, county_fips_code, cc.pct_student_housing, cc.total AS total_pop, neighboring_counties_fips_codes
FROM bigquery-public-data.geo_us_boundaries.adjacent_counties
CROSS JOIN UNNEST(neighbors_fips_code) AS neighboring_counties_fips_codes
INNER JOIN college_counties AS cc
ON cc.GEOCODE = county_fips_code),

college_counties_ma AS(
SELECT cma.nyt_date, cca.county, state, cca.county_fips_code, pct_student_housing, total_pop, neighboring_counties_fips_codes, cma.new_cases_ma7 AS college_county_ma7
FROM college_counties_adj AS cca
INNER JOIN final_query AS cma
ON cma.county_fips_code = cca.county_fips_code
WHERE new_cases_ma7 IS NOT NULL)

SELECT cca.nyt_date, cca.county, state, cca.county_fips_code, pct_student_housing, total_pop, college_county_ma7, neighboring_counties_fips_codes, cma.new_cases_ma7 AS neighboring_counties_ma7
FROM college_counties_ma AS cca
INNER JOIN final_query AS cma
ON cma.county_fips_code = cca.neighboring_counties_fips_codes
AND cma.nyt_date = cca.nyt_date
WHERE new_cases_ma7 IS NOT NULL
""")

# The query returns dates as strings; convert them so charts treat them as temporal.
college_counties_adj_comparison['nyt_date'] = pd.to_datetime(college_counties_adj_comparison['nyt_date'])
# Neighbor's moving average as a percentage of the college county's moving average.
college_counties_adj_comparison = college_counties_adj_comparison.assign(
    neighboring_counties_ma7_pct = lambda x: round(100*x.neighboring_counties_ma7/x.college_county_ma7, 2))

In [49]:
# Scatter plot: each point pairs a college county's moving average (x) with a
# neighboring county's moving average (y).
alt.Chart(college_counties_adj_comparison).mark_circle(opacity=0.6, color='#80b1d3').encode(
    alt.X('college_county_ma7', title="College Towns 7MA"),
    alt.Y('neighboring_counties_ma7', title="Neighboring Counties 7MA"),
)

In [44]:
# One facet per college county; each line is a neighboring county (keyed by FIPS code)
# showing its moving average as a percentage of the college county's moving average.
alt.Chart(college_counties_adj_comparison).mark_line().encode(
    x = alt.X('nyt_date', title = "Date"),
    # The plotted field is a percentage ratio, not a per-100k rate, so the axis title says so.
    y = alt.Y('neighboring_counties_ma7_pct', title = "Neighboring County MA7 (% of College County MA7)"),
    # The color channel holds neighbor FIPS codes, so the legend is titled accordingly.
    color = alt.Color('neighboring_counties_fips_codes:N', title = "Neighboring County (FIPS)"),
    facet = alt.Facet('county:O', columns = 2)
)

There is not much evidence that the reopening of colleges has resulted in a surge of cases, whether in the college counties themselves or in the surrounding ones.

In [2]:
# Most recent 7-day moving average of new cases per 100k for the 20 counties with
# the highest share of population in student housing.
#
# Query outline:
#   covid_cases  - per-county daily new cases over the recent lookback window.
#   census_join  - trailing average scaled by the county's POP100.
#   final_query  - blanks out the moving average where the window is incomplete.
#   college_town - top 20 counties by P0050008 / POP100 (aliased student_housing / total).
#
# Window notes:
#   * A 7-day average of daily differences needs 8 consecutive cumulative counts,
#     because the first row's new_cases is NULL (no LAG predecessor). The lookback
#     is therefore 8 days, and rows before row 8 are set to NULL.
#   * `ROWS BETWEEN 6 PRECEDING AND CURRENT ROW` is exactly 7 rows; the shorthand
#     `ROWS 7 PRECEDING` ends at the current row and so spans 8 rows.
#   * The result depends on CURRENT_DATE(), so it changes from day to day.
college_cases = fidap.sql("""
WITH covid_cases AS (
SELECT *,
ROW_NUMBER() OVER (PARTITION BY county_fips_code ORDER BY date ASC) AS row_num,
(confirmed_cases - (LAG(confirmed_cases, 1) OVER (PARTITION BY county_fips_code ORDER BY date ASC))) AS new_cases,
FROM bigquery-public-data.covid19_nyt.us_counties
WHERE date >= DATE_SUB(CURRENT_DATE(), INTERVAL 8 DAY)
AND confirmed_cases >= 0
AND deaths >= 0),

census_join AS (
SELECT CAST(c.date AS STRING) AS nyt_date, c.county, c.state_name, c.confirmed_cases, c.new_cases, c.row_num, c.county_fips_code,
ROUND(100000*(AVG(c.new_cases) OVER (PARTITION BY c.county_fips_code ORDER BY c.date ASC ROWS BETWEEN 6 PRECEDING AND CURRENT ROW))/CAST(census.POP100 AS INT64), 2) AS new_cases_ma7
FROM covid_cases AS c
INNER JOIN fidap-301014.us_census_2020.Redistricting_Data_Complete AS census
ON census.GEOCODE = c.county_fips_code
WHERE CHAR_LENGTH(census.GEOCODE) = 5),

final_query AS (
SELECT nyt_date, county, state_name, confirmed_cases, new_cases, county_fips_code,
IF(row_num < 8, NULL, new_cases_ma7) new_cases_ma7
FROM census_join),

college_town AS (
SELECT CAST(POP100 AS INT64) AS total,
CAST(P0050008 AS INT64) AS student_housing,
CONCAT(c.NAMELSAD, ", ", STUSAB) AS county,
ROUND(100*SAFE_DIVIDE(CAST(P0050008 AS INT64),CAST(POP100 AS INT64)), 2) AS pct_student_housing,
GEOCODE
FROM fidap-301014.us_census_2020.Redistricting_Data_Complete AS r
INNER JOIN fidap-301014.us_census_2020.2020_county_boundaries AS c
ON c.GEOID = GEOCODE
WHERE CHAR_LENGTH(r.GEOID) = 14
AND GEOVAR = '00'
AND COUNTY IS NOT NULL
ORDER BY pct_student_housing DESC
LIMIT 20
)

SELECT nyt_date, ct.county, state_name, GEOCODE, pct_student_housing, confirmed_cases, new_cases, new_cases_ma7
FROM final_query
INNER JOIN college_town AS ct
ON ct.GEOCODE = county_fips_code
WHERE new_cases_ma7 IS NOT NULL;
""")

In [3]:
# Top 20 counties ranked by the share of their population in student housing
# (P0050008 / POP100, as a percentage), labelled "<county name>, <state abbreviation>".
college_counties_query = """
SELECT CAST(POP100 AS INT64) AS total, 
CAST(P0050008 AS INT64) AS student_housing,  
CONCAT(c.NAMELSAD, ", ", STUSAB) AS county,
ROUND(100*SAFE_DIVIDE(CAST(P0050008 AS INT64),CAST(POP100 AS INT64)), 2) AS pct_student_housing,
GEOCODE
FROM fidap-301014.us_census_2020.Redistricting_Data_Complete AS r
INNER JOIN fidap-301014.us_census_2020.2020_county_boundaries AS c
ON c.GEOID = GEOCODE
WHERE CHAR_LENGTH(r.GEOID) = 14
AND GEOVAR = '00'
AND COUNTY IS NOT NULL
ORDER BY pct_student_housing DESC
LIMIT 20
"""
college_counties = fidap.sql(college_counties_query)

In [None]:
# Flatten the county adjacency table: one row per (county, neighboring FIPS code),
# produced by unnesting the `neighbors_fips_code` array.
adjacent_counties_query = """
SELECT county, state, county_fips_code, neighboring_counties_fips_codes
FROM bigquery-public-data.geo_us_boundaries.adjacent_counties
CROSS JOIN UNNEST(neighbors_fips_code) AS neighboring_counties_fips_codes
"""
unnested_adj_counties = fidap.sql(adjacent_counties_query)

In [4]:
# For each of the top 20 counties by student-housing share, list the FIPS codes of
# its adjacent counties (one row per college county / neighbor pair).
college_adjacency_query = """
WITH college_counties AS (
SELECT CAST(POP100 AS INT64) AS total, 
CAST(P0050008 AS INT64) AS student_housing,  
CONCAT(c.NAMELSAD, ", ", STUSAB) AS county,
ROUND(100*SAFE_DIVIDE(CAST(P0050008 AS INT64),CAST(POP100 AS INT64)), 2) AS pct_student_housing,
GEOCODE
FROM fidap-301014.us_census_2020.Redistricting_Data_Complete AS r
INNER JOIN fidap-301014.us_census_2020.2020_county_boundaries AS c
ON c.GEOID = GEOCODE
WHERE CHAR_LENGTH(r.GEOID) = 14
AND GEOVAR = '00'
AND COUNTY IS NOT NULL
ORDER BY pct_student_housing DESC
LIMIT 20
)

SELECT cc.county, state, county_fips_code, neighboring_counties_fips_codes
FROM bigquery-public-data.geo_us_boundaries.adjacent_counties
CROSS JOIN UNNEST(neighbors_fips_code) AS neighboring_counties_fips_codes
INNER JOIN college_counties AS cc
ON cc.GEOCODE = county_fips_code
"""
college_adj_counties = fidap.sql(college_adjacency_query)