In [None]:
import glob
import json
import math
import os
from datetime import datetime

import epiweeks
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm

In [None]:
# Forecast horizon, in epiweeks, evaluated by this notebook.
weeks_ahead = 2

# Epiweeks that contain the first and last calendar dates of the study window.
first_prediction_week, last_prediction_week = (
    epiweeks.Week.fromdate(datetime.strptime(day, '%Y-%m-%d'))
    for day in ('2020-01-22', '2021-06-28')
)

# Confirmed cases (groundtruth)

In [None]:
# Load the incident-case truth file and keep only county-level rows.
# NOTE(review): this path starts with '../' while every other hub path in this
# notebook starts with '../../' -- confirm which prefix is correct.
dtype = {'date': 'string', 'location': 'string', 'location_name': 'string', 'value': 'float'}
confirmed = pd.read_csv('../covid19-forecast-hub/data-truth/truth-Incident Cases.csv', dtype=dtype)

# Map the national code 'US' to 0 so that every location code parses as an integer.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# confirmed['location'] acts on an intermediate Series, which pandas 2.x warns about
# and which has no effect on the frame under Copy-on-Write. The replacement is the
# string '0' so it can be stored in the 'string'-typed column before the int cast.
confirmed['location'] = confirmed['location'].replace({'US': '0'}).astype('int64')
confirmed['date'] = pd.to_datetime(confirmed['date']).dt.date

# Same threshold as the other cells of this notebook (was 999 here).
confirmed = confirmed[confirmed['location'] >= 1000] # only consider counties

In [None]:
# Build the weekly ground truth: for every location and every target epiweek,
# sum the confirmed values dated between the week's start and end dates.
# The result is indexed by (location, week), where week is the CDC-format integer.

# The target weeks are the same for every location, so compute their bounds once.
week_bounds = []
currweek = first_prediction_week
while currweek <= last_prediction_week:
    week = currweek + weeks_ahead
    week_bounds.append((int(week.cdcformat()), week.startdate(), week.enddate()))
    currweek += 1

locations = confirmed['location'].unique()
rows = []
for loc in tqdm(locations):
    # Filter this location's rows once instead of once per week.
    aux = confirmed[confirmed['location']==loc]
    for week_id, start_date, end_date in week_bounds:
        in_week = (aux['date'] >= start_date) & (aux['date'] <= end_date)
        sumcases = aux.loc[in_week, 'value'].sum()
        rows.append({'location': loc, 'week': week_id, 'start_date': start_date, 'end_date': end_date, 'sum': sumcases})

# Explicit columns keep set_index working even when no rows were produced.
groundtruth = pd.DataFrame(rows, columns=['location', 'week', 'start_date', 'end_date', 'sum'])
groundtruth = groundtruth.set_index(['location', 'week'])
groundtruth = groundtruth.sort_index()

# Predictions

In [None]:
# Every sub-directory of data-processed is taken as one model; the model name
# is the last component of the normalized directory path.
paths = glob.glob('../../covid19-forecast-hub/data-processed/*/')
models = [os.path.normpath(folder).split(os.sep)[-1] for folder in paths]

In [None]:
# Load, for every model, the county-level point forecasts of the
# '<weeks_ahead> wk ahead inc case' target from all of its CSV files.
# predictions[model] is indexed by (location, week), where week is the
# CDC-format epiweek of target_end_date. Models with no matching rows are skipped.
dtype = {'forecast_date': 'string', 'quantile': 'float', 'location': 'string', 'target_end_date': 'string', 'type': 'string'}

predictions = {}
for model in tqdm(models):
    frames = []
    files = sorted(glob.glob('../../covid19-forecast-hub/data-processed/%s/*.csv'%model))
    for file in files:
        curr = pd.read_csv(file, dtype=dtype)
        # Assign back instead of replace(..., inplace=True) on the selected column:
        # the chained inplace form is deprecated and is a no-op under Copy-on-Write.
        curr['location'] = curr['location'].replace({'US': '0'}).astype('int64')
        curr['forecast_date'] = pd.to_datetime(curr['forecast_date']).dt.date
        curr['target_end_date'] = pd.to_datetime(curr['target_end_date']).dt.date
        curr = curr[curr['target']=='%d wk ahead inc case'%weeks_ahead]
        curr = curr[curr['location'] >= 1000] # only consider counties
        curr = curr[curr['type'] == 'point'] # only point predictions
        if len(curr) > 0:
            frames.append(curr)

    if frames:
        # One concat per model: DataFrame.append was removed in pandas 2.0 and
        # re-copied the accumulated frame on every call.
        df = pd.concat(frames)
        df['week'] = df['target_end_date'].apply(epiweeks.Week.fromdate).apply(epiweeks.Week.cdcformat).astype('int')
        df = df.set_index(['location', 'week'])
        df = df.sort_index()
        predictions[model] = df
        print(model, len(df))

# Computing errors

In [None]:
# Add a 'diff' column to every model: predicted value minus the ground-truth
# weekly sum for the same (location, week). The difference is signed, not absolute.
# Rows whose (location, week) has no ground-truth entry get NaN and are dropped.
for model in tqdm(predictions):
    df = predictions[model]
    print(model)
    # Look up all ground-truth sums at once; keys missing from groundtruth become NaN.
    gt_sum = groundtruth['sum'].reindex(df.index)
    # Subtract positionally: df.index may still contain duplicate keys at this
    # point, so label-based alignment is avoided.
    df['diff'] = df['value'].to_numpy() - gt_sum.to_numpy()
    predictions[model] = df.dropna(subset=['diff'])

# Remove duplicates (keep last)

In [None]:
# When several rows share the same index key, keep only the last one.
for model in tqdm(predictions):
    frame = predictions[model]
    is_repeat = frame.index.duplicated(keep='last')
    predictions[model] = frame[~is_repeat]

# Weights and eligibility

In [None]:
# Defaults: zero weight and not eligible until the metadata says otherwise.
for frame in predictions.values():
    frame['weight'] = 0.0
    frame['eligibility'] = False

In [None]:
# Copy the ensemble weights from the hub's metadata files into the 'weight'
# column of the matching prediction rows.
files = sorted(glob.glob('../../covid19-forecast-hub/ensemble-metadata/*-inc_case-model-weights.csv'))
for file in tqdm(files):
    # The file name starts with the ensemble date (YYYY-MM-DD).
    date = os.path.basename(file)
    date = date[:date.find('-inc')]
    ensemble_week = epiweeks.Week.fromdate(datetime.strptime(date, '%Y-%m-%d'))
    # The prediction frames are indexed by (location, target week). The previous
    # key (location, week, week) had three elements for a two-level index, so the
    # membership test was always False and no weight was ever written.
    # NOTE(review): assumes the hub convention that the 1-week-ahead target of a
    # forecast falls in the forecast's own epiweek, i.e. the N-week-ahead target
    # week is ensemble week + (N - 1). Confirm against the hub documentation.
    week = int((ensemble_week + (weeks_ahead - 1)).cdcformat())

    df = pd.read_csv(file)
    aux = 'locations' if 'locations' in df else 'location'
    # Assign back instead of the chained replace(..., inplace=True), which is
    # deprecated and a no-op under Copy-on-Write.
    df[aux] = df[aux].replace({'US': '0'}).astype('int64')
    df = df[df[aux] >= 1000] # only consider counties

    # Only models that have both predictions and a weight column in this file.
    for model in predictions:
        if model not in df.columns:
            continue
        model_index = predictions[model].index
        for location, weight in zip(df[aux], df[model]):
            key = (location, week)
            if key in model_index:
                predictions[model].loc[key, 'weight'] = weight

In [None]:
# Mark prediction rows as eligible according to the hub's ensemble metadata files.
files = sorted(glob.glob('../../covid19-forecast-hub/ensemble-metadata/*-inc_case-model-eligibility.csv'))
for file in tqdm(files):
    # The file name starts with the ensemble date (YYYY-MM-DD).
    date = os.path.basename(file)
    date = date[:date.find('-inc')]
    ensemble_week = epiweeks.Week.fromdate(datetime.strptime(date, '%Y-%m-%d'))
    # The prediction frames are indexed by (location, target week). The previous
    # key (location, week, week) had three elements for a two-level index, so the
    # membership test was always False and no row was ever marked eligible.
    # NOTE(review): assumes the hub convention that the 1-week-ahead target of a
    # forecast falls in the forecast's own epiweek, i.e. the N-week-ahead target
    # week is ensemble week + (N - 1). Confirm against the hub documentation.
    week = int((ensemble_week + (weeks_ahead - 1)).cdcformat())

    df = pd.read_csv(file, dtype={'location': 'string'})
    aux = 'locations' if 'locations' in df else 'location'
    # Assign back instead of the chained replace(..., inplace=True), which is
    # deprecated and a no-op under Copy-on-Write. '0' (a string) is used so the
    # value can be stored in a 'string'-typed column before the int cast.
    df[aux] = df[aux].replace({'US': '0'}).astype('int64')
    df = df[df[aux] >= 1000] # only consider counties

    for location, model, eligibility in zip(df[aux], df['model'], df['overall_eligibility']):
        if eligibility != 'eligible' or model not in predictions:
            continue
        key = (location, week)
        if key in predictions[model].index:
            predictions[model].loc[key, 'eligibility'] = True

# Saving

In [None]:
# Persist one pickle per model under '<weeks_ahead>-week/'.
# to_pickle does not create missing directories, so make sure it exists first.
os.makedirs('%d-week'%weeks_ahead, exist_ok=True)
for model in predictions:
    predictions[model].to_pickle('%d-week/%s.pkl'%(weeks_ahead,model))

In [None]:
# Persist the ground truth next to the model pickles; create the directory if needed.
os.makedirs('%d-week'%weeks_ahead, exist_ok=True)
groundtruth.to_pickle('%d-week/groundtruth.pkl'%(weeks_ahead))

# JSON

In [None]:
# Recover the model names from the pickles saved under '<weeks_ahead>-week/'.
# Only '*.pkl' files are listed, because the loading cell reads '<model>.pkl';
# any other file in the directory would otherwise be treated as a model.
# Sorting makes the model order independent of the file-system listing order.
paths = sorted(glob.glob('./%d-week/*.pkl'%weeks_ahead))
models = []
for path in paths:
    # splitext removes only the final extension, so names containing '.' survive
    # (split('.')[0] would have truncated them).
    model = os.path.splitext(os.path.basename(os.path.normpath(path)))[0]
    if model != 'groundtruth':
        models.append(model)

In [None]:
# Reload the saved frames: one entry per model name, plus the ground truth.
predictions = {
    name: pd.read_pickle('%d-week/%s.pkl'%(weeks_ahead,name))
    for name in models
}

groundtruth = pd.read_pickle('%d-week/groundtruth.pkl'%weeks_ahead)

In [None]:
# Reference forecast date and number of weeks spanned, both taken from the
# first and last rows of the 'COVIDhub-baseline' frame.
# NOTE(review): the frame is ordered by its (location, week) index, so these are
# the first/last rows in that order, not necessarily the earliest/latest
# forecast dates overall -- confirm this is what is wanted.
baseline = predictions['COVIDhub-baseline']

# Kept as a one-element Series (double brackets): the cell that builds the
# JSON dictionary subtracts it from each forecast date and then reads the
# single element of the result.
first_week = baseline.iloc[0][['forecast_date']]
num_weeks = baseline.iloc[-1]['forecast_date'] - first_week
# .iloc[0] instead of [0]: the Series is labelled 'forecast_date', and integer
# keys falling back to positional access is deprecated in pandas.
num_weeks = int(num_weeks.iloc[0].days / 7)

In [None]:
# Ensemble json
# Builds modelsdict[model][location] = list of num_weeks+1 slots, one per week
# counted from first_week; each slot is {'value': ..., 'diff': ...}, or the
# 'NaN' placeholder when the model has no row for that location and week.
# NOTE(review): a forecast date earlier than first_week gives a negative slot
# index, which Python resolves from the end of the list; one more than
# num_weeks weeks later raises IndexError. Confirm neither can occur.

# first_week is a one-element Series; .iloc[0] reads it without relying on the
# deprecated positional fallback of [0] on a labelled Series.
first_date = first_week.iloc[0]

modelsdict = {}
for model in models:
    frame = predictions[model]
    by_location = {}
    # Iterate the needed columns directly instead of iterrows(), which builds a
    # Series per row.
    records = zip(frame.index.get_level_values(0), frame['forecast_date'], frame['value'], frame['diff'])
    for location, week, value, diff in records:
        location = str(location)
        if location not in by_location:
            # Separate placeholder dicts rather than one dict repeated by reference.
            by_location[location] = [{'value': 'NaN', 'diff': 'NaN'} for _ in range(num_weeks+1)]
        week_index = math.ceil((week - first_date).days/7)
        by_location[location][week_index] = {'value': value, 'diff': diff}
    modelsdict[model] = by_location

In [None]:
# Write the per-model dictionary for the visualization; compact separators
# keep the file small. json is imported with the other imports at the top of
# the notebook, so this cell and the population cell no longer depend on each other.
with open('../vis/src/assets/models.json', 'w') as f:
    json.dump(modelsdict, f, separators=(',', ':'))

In [None]:
# Load the hub's location table and convert location codes to integers ('US' -> 0).
population = pd.read_csv('../../covid19-forecast-hub/data-locations/locations.csv')
# Assign back instead of the chained replace(..., inplace=True), which is
# deprecated and a no-op under Copy-on-Write.
population['location'] = population['location'].replace({'US': '0'}).astype('int64')
# Displayed as the cell output: sum of the 'population' column over all rows.
population['population'].sum()

In [None]:
# Map each location code to its population, skipping rows without a population,
# and write the mapping for the visualization.
known = population.dropna(subset=['population'])
# Read the two columns directly and coerce with int(): iterrows() does not
# preserve dtypes across a row, so the key type would depend on the other columns.
pop = {
    int(location): int(value)
    for location, value in zip(known['location'], known['population'])
}

with open('../vis/src/assets/population.json', 'w') as f:
    json.dump(pop, f, separators=(',', ':'))