In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever poster style this install
# provides (falling back to the default style if neither exists).
_poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    'default',
)
plt.style.use(_poster_style)
from datetime import datetime as dt
from datetime import timedelta
import glob
    
from Reff_functions import *
from Reff_constants import *
from scipy.stats import gamma


In [None]:
# Load the NNDSS line list directly. Adapted from read_in_cases in Reff_functions;
# that function's preprocessing was not helpful for this situation.

case_file_date='29Jun'
path = "../data/COVID-19 UoM "+case_file_date+"*.xlsx"

# Fail with a clear message instead of a NameError further down when nothing matches.
matching_files = sorted(glob.glob(path))
if not matching_files:
    raise FileNotFoundError("No NNDSS extract matches " + path)

# Only one extract is used downstream, so read just one file: when several match
# the pattern, the last one in sorted order is taken.
file = matching_files[-1]
df_NNDSS = pd.read_excel(file,
                   parse_dates=['SPECIMEN_DATE','NOTIFICATION_DATE','NOTIFICATION_RECEIVE_DATE','TRUE_ONSET_DATE'],
                   dtype= {'PLACE_OF_ACQUISITION':str})

# Fill blanks with simply unknown. The result is assigned back because an in-place
# fillna on a column selected from the frame is chained assignment, which pandas
# with Copy-on-Write does not propagate to the frame.
df_NNDSS['PLACE_OF_ACQUISITION'] = df_NNDSS['PLACE_OF_ACQUISITION'].fillna('00038888')

# The TRUE_ONSET_DATE-based `date_inferred` column built by read_in_cases is
# deliberately not created here.

# A case is imported when its place-of-acquisition code ends in '8888' but is not
# the "unknown" code '00038888'; every other case counts as local.
acquisition = df_NNDSS['PLACE_OF_ACQUISITION']
df_NNDSS['imported'] = ((acquisition.str[-4:] == '8888') & (acquisition != '00038888')).astype('int64')
df_NNDSS['local'] = 1 - df_NNDSS.imported

df_interim = df_NNDSS[['NOTIFICATION_DATE','STATE','imported','local']]
# Importantly, imported and local are indicator variables in df_interim.

In [None]:
# Keep only the rows whose NOTIFICATION_DATE is present.
df_interim = df_interim.dropna(subset=['NOTIFICATION_DATE'])

In [None]:
# Exclude the territories (NT and ACT).
is_territory = df_interim['STATE'].isin(['NT', 'ACT'])

# Go from wide to long: the `imported` / `local` indicator columns become a single
# SOURCE column with the indicator value in n_cases. This allows multiple draws
# for the infection date. Rows whose indicator is zero are then discarded.
# The index is reset at the end, otherwise the later column-wise join doesn't work.
df_linel = (
    df_interim[~is_territory]
    .melt(id_vars=['NOTIFICATION_DATE', 'STATE'], var_name='SOURCE', value_name='n_cases')
    .loc[lambda long_df: long_df.n_cases != 0]
    .reset_index()
)


## Part 1: Inferring infection dates
$\Lambda$ depends on the infection date (ID), while the data contains the notification date (ND). We obtain ID through the following relationship:
$$
ID = ND - reporting\_delay - incubation\_period.
$$



Gamma distributions were fitted to case data by maximum likelihood estimation (MLE), one for the reporting delay and one for the incubation period.

In [None]:
##uncomment for debugging
# notification_dates = df_linel['NOTIFICATION_DATE']
# mean_rd = 5.47
# sd_rd = 4.04
# mean_inc = 2.0
# sd_inc = 1.41
# nreplicates = 3

In [None]:
##gamma draws take arguments (shape, scale)
def create_inf_dates(notification_dates, mean_rd=5.47, sd_rd=4.04,
                    mean_inc=2.0, sd_inc=1.41, nreplicates=1):
    """Sample infection dates for a vector of notification dates.

    Each infection date is the notification date minus a gamma-distributed
    reporting delay and a gamma-distributed incubation period, with the summed
    lag rounded up to whole days.

    Parameters
    ----------
    notification_dates : array-like of datetimes with a ``.shape`` attribute
        (e.g. a pandas Series). One entry per case.
    mean_rd, sd_rd : float
        Mean and standard deviation (days) of the reporting delay.
    mean_inc, sd_inc : float
        Mean and standard deviation (days) of the incubation period.
    nreplicates : int
        Number of independent draws per notification date.

    Returns
    -------
    numpy.ndarray of shape (nsamples, nreplicates)
        Row i holds the sampled infection dates for notification_dates[i].
    """
    nsamples = notification_dates.shape[0]

    # A gamma distribution with shape k and scale theta has mean k*theta and
    # variance k*theta**2, hence theta = sd**2/mean and k = mean/theta.
    # (Using mean/sd**2 as the scale keeps the mean but gives the wrong spread.)

    # DEFINE DELAY DISTRIBUTION
    scale_rd = sd_rd**2/mean_rd
    shape_rd = mean_rd/scale_rd

    # DEFINE INCUBATION PERIOD DISTRIBUTION
    scale_inc = sd_inc**2/mean_inc
    shape_inc = mean_inc/scale_inc

    #Draw from distributions - these are long vectors
    inc_period = np.random.gamma(shape_inc, scale_inc, size = (nsamples*nreplicates))
    rep_delay = np.random.gamma(shape_rd, scale_rd, size = (nsamples*nreplicates))

    #infection date is id_nd_diff days before notification date. This is also a long vector.
    id_nd_diff = inc_period + rep_delay

    #Dates carry no time of day, so take the ceiling: a day runs from 0000 to 2359.
    #Go through int64 so the conversion to whole-day timedeltas is explicit.
    whole_day_diff = np.ceil(id_nd_diff).astype('int64')
    #Coerce the long vector into an nsamples by nreplicates array.
    time_day_diffmat = whole_day_diff.astype('timedelta64[D]').reshape((nsamples, nreplicates))

    #notification_dates is repeated as a column nreplicates times.
    notification_mat = np.tile(np.asarray(notification_dates), (nreplicates,1)).T

    #Subtract the sampled lags, column by column, from the notification dates.
    infection_dates = notification_mat - time_day_diffmat

    return infection_dates

In [None]:
nreps = 100

# One column of sampled infection dates per replicate, labelled "0", "1", ...
datecolnames = [str(rep) for rep in range(nreps)]
infdates = create_inf_dates(df_linel['NOTIFICATION_DATE'], nreplicates=nreps)
infdates_df = pd.DataFrame(data=infdates, columns=datecolnames)

# Attach the sampled infection dates to the line list as extra columns;
# verify_integrity makes the concat fail on duplicate column labels.
df_inf = pd.concat([df_linel, infdates_df], axis=1, verify_integrity=True)
df_inf.head()

In [None]:
# For every replicate, total n_cases per (STATE, sampled infection date, SOURCE).
replicate_counts = [
    df_inf[['STATE', 'SOURCE', datecol, 'n_cases']]
    .groupby(['STATE', datecol, 'SOURCE'])
    .sum()
    for datecol in datecolnames[:nreps]
]

# Join all replicates side by side in a single concat (one column per replicate,
# renumbered 0..nreps-1) rather than growing the frame inside a loop.
df_combined = pd.concat(replicate_counts, axis=1, ignore_index=True)

#NaNs are inserted for missing values when concatenating. If it's missing, there were zero infections
df_combined = df_combined.fillna(0)
df_combined.index = df_combined.index.set_names(["STATE","INFECTION_DATE","SOURCE"])

In [None]:
# The (STATE, INFECTION_DATE, SOURCE) index is deliberately kept (no reset_index
# yet): the indices are used for filling in dates, so don't collapse them here.
df_combined.head()
# df_combined is expected to have considerably fewer rows than the line list,
# since each row now represents more than one case.

In [None]:
# Reindex to include days with zero total infections.
local_infs = df_combined.xs('local',level='SOURCE')
imported_infs = df_combined.xs('imported',level='SOURCE')
statelist = [*df_combined.index.get_level_values('STATE').unique()]

# Should all states have the same start date? Here every state shares one fixed
# start date; rows dated before it are dropped by the reindex below.
start_date = np.datetime64("2020-02-01")

#Determine end dates as the last infected date by state (over both sources).
index_only = df_combined.index.to_frame().reset_index(drop=True)
maxdates = index_only.groupby(['STATE'])['INFECTION_DATE'].max()


def _fill_missing_dates(infs, states, first_date, last_dates):
    """Return {state: frame with one row per day from first_date to last_dates[state]}.

    `infs` is indexed by (STATE, INFECTION_DATE). Days without a row get zeros,
    and a state with no rows at all in `infs` gets an all-zero frame.
    """
    states_present = set(infs.index.get_level_values('STATE'))
    filled = {}
    for aus_state in states:
        #All days from first_date to the last infection day of this state.
        alldates = pd.date_range(first_date, last_dates[aus_state])
        if aus_state in states_present:
            filled[aus_state] = infs.xs(aus_state, level='STATE').reindex(alldates, fill_value=0)
        else:
            # xs would raise KeyError for a state that is absent from this source.
            filled[aus_state] = pd.DataFrame(0.0, index=alldates, columns=infs.columns)
    return filled


local_statedict = _fill_missing_dates(local_infs, statelist, start_date, maxdates)
imported_statedict = _fill_missing_dates(imported_infs, statelist, start_date, maxdates)

In [None]:
# Stack each per-state dictionary into a single frame (the dictionary keys become
# the outer index level) and tag every row with its source.
df_local_inc_zeros = pd.concat(local_statedict).assign(SOURCE='local')
df_imp_inc_zeros = pd.concat(imported_statedict).assign(SOURCE='imported')

In [None]:
# Stack local and imported rows, name the two unnamed index levels produced by the
# earlier concat (exposed by reset_index as level_0 / level_1), and index the
# result by (STATE, INFECTION_DATE, SOURCE).
stacked_sources = (
    pd.concat([df_local_inc_zeros, df_imp_inc_zeros])
    .reset_index()
    .rename(columns={'level_0': 'STATE', 'level_1': 'INFECTION_DATE'})
)
df_inc_zeros = stacked_sources.groupby(['STATE', 'INFECTION_DATE', 'SOURCE']).sum()

In [None]:
# Preview the zero-filled counts indexed by (STATE, INFECTION_DATE, SOURCE).
df_inc_zeros.head()

# TODO: persist the result (to_csv, etc.).

In [None]:
# Column totals, one per replicate. Open question: are the differences in the
# numbers caused by the start date?
df_inc_zeros.sum(axis=0)

## Part 2: Calculating Lambda

$$
\Lambda_t(w_s) = \sum_{s=1}^t (I_{t-s}^{local} + I_{t-s}^{imported})w_s = \sum_{s=1}^t I_{t-s}w_s,
$$
where $w_s$ is the probability that the generation interval is $s$ and $I_t$ is the number of infected individuals at time $t$. 

### Part 2a: Discretizing the gamma generation interval distribution

The formula for $\Lambda_t$ sums over the weights $w_s$ at integer values of $s$, so the generation interval must be treated as a discrete random variable here.

In [None]:
#Define gamma distribution for generation interval
mean_gen = 2.5
sd_gen = 1.75
# A gamma distribution with shape k and scale theta has mean k*theta and variance
# k*theta**2, so theta = sd**2/mean and k = mean/theta. (mean/sd**2 is the rate,
# not the scale, and would give a standard deviation of mean/sd instead of sd.)
scale_gen = sd_gen**2/mean_gen
shape_gen = mean_gen/scale_gen


In [None]:
## Plot gamma pdf over random samples from distribution
# xrange = np.linspace(0,25,150)

# fig,ax = plt.subplots(figsize=(12,9))
# x = np.random.gamma(shape_gen, scale_gen, size = 10000)
# print("Generation time: \nMean: %f.2" %np.mean(x))
# print("Std: %f.2" %np.std(x))
# w = ax.hist(x,bins=40, density=True)
# ax.set_title("Generation time distribution")
# ax.plot(xrange, gamma.pdf(xrange, a=shape_gen, scale=scale_gen), linewidth=4,alpha=0.8)

In [None]:
trunc_days = 21
shift=0
# Points at which the generation-interval density is evaluated for the
# discretisation: 0, 1, ..., trunc_days, each offset by `shift`.
xmids = [shift + day for day in range(trunc_days + 1)]
gamma_vals = gamma.pdf(xmids, a=shape_gen, scale=scale_gen)
# Rescale the density values so that the discrete weights sum to one.
gamma_total = sum(gamma_vals)
disc_gamma = gamma_vals/gamma_total

In [None]:
# Sanity check: report the raw density sum and the normalised weight sum.
print(f"Sum of gamma values is {sum(gamma_vals)!s}; \n Sum of discretised gamma values is {sum(disc_gamma)!s}")

In [None]:
# Bars: the discretised weights. Orange curve: the continuous gamma density they
# were derived from, evaluated on a fine grid over [0, trunc_days].
fig, ax = plt.subplots(figsize=(12, 9))
w = ax.bar(xmids, height=disc_gamma, width=1)

xrange = np.linspace(0, trunc_days, 150)
pdf_on_grid = gamma.pdf(xrange, a=shape_gen, scale=scale_gen)
ax.plot(xrange, pdf_on_grid, linewidth=4, alpha=0.8, color="orange")

ax.set_title("Generation time distribution")
ax.set_xlabel('Days')

### Part 2b: Actually calculating $\Lambda$

In [None]:
# We use the total number of infected individuals, so use total I.
# NOTE(review): df_total is not defined in any visible cell of this notebook; it
# has to exist before this cell for a fresh-kernel run. Later cells expect it to
# provide 'STATE', 'infection_date' and 'total' - confirm where it comes from.
# `columns=` replaces the positional axis argument of drop('local', 1), which
# pandas 2.0 no longer accepts.
I_total = df_total.drop(columns='local')


In [None]:
#Summing over s from 1 to t means that t is the maximum value of the generation interval
#(21 here). So only the 21 days before the date at which Lambda is evaluated
#need to be kept.
#Move the index into ordinary columns so it can be filtered on below
#(presumably STATE and infection_date live in the index - TODO confirm).
I_total=I_total.reset_index()

In [None]:
#Test on one state before parallelising?
onestate='NSW'
# Filter on `onestate` rather than repeating the literal, so that changing the
# state above actually changes the data that is selected.
input_state = I_total[I_total.STATE==onestate]

In [None]:
# Latest infection date present for the selected state.
tmax = input_state.infection_date.max()

In [None]:
#Intended input: a vector of infection dates and total values.
#Start of the window: trunc_days days before the latest infection date.
tstart= tmax-np.timedelta64(trunc_days,'D')


In [None]:
# Window of dates from tstart up to the day before tmax.
relevant_dates = pd.date_range(tstart, tmax-np.timedelta64(1,'D'))
in_window = input_state.infection_date.isin(relevant_dates)
reldates = input_state.loc[in_window, 'total']

# Weights in reverse order (w_trunc_days, ..., w_1) so they line up with the window.
# NOTE(review): this pairing assumes one row per date in ascending date order - confirm.
ws = list(disc_gamma[trunc_days:0:-1])
lambda_t = sum(reldates*ws)

In [None]:
# Display the weighted sum computed above for the selected state.
lambda_t