## Optimal Baseline
At first all four datasets are modified in a way, that all of them contain the same countries with at least the last 36 months of observations. Countries that are not present in all datasets are not used for the minimization of the CRPS in dependecy of w or s. 

### Set Variables

In [13]:
# set the model type
estimModel = 'hurdle' #nbinom or hurdle

# list of the (prediction) windows
max_w = 36 #36
window_list = list(range(2, max_w+1))

# remove countries
removeCountries = False
lastX = 2 # all but the last x countries are removed

In [14]:
import pandas as pd
import numpy as np
import os
import CRPS.CRPS as pscore
import copy
from joblib import dump, load
from scipy.stats import nbinom, poisson
from time import sleep
from tqdm import tqdm
import warnings

## functions for the distribtion models
# cdf of the truncated negative binomial distribution
def truncNbinomCdf(y, n, p, log=True):

    ## error/input handling part one
    # n and p have to be greater than zero and p <= 1
    if n <= 0 or p <= 0 or p > 1:
        if not isinstance(y, (list, np.ndarray)):
            return np.nan
        else:
            return np.full(len(y), np.nan)

    ## calculation
    is_scalar = np.isscalar(y)  # check if y is scalar or array

    if is_scalar:
        y = np.array([y])  # typecast scalar to onedimensional array
    elif isinstance(y, list):
        y = np.array(y)

    f_zero = nbinom.pmf(0, n, p) # f(0) untruncated density

    # general formula for lower trunc. distributions
    cdf_y = (nbinom.cdf(y, n, p) - nbinom.cdf(0, n, p)) / (1 - f_zero)
    ## error/input handling part two
    # VERY IMPORTANT STEP: there might be numerical instabilities, which may lead to
    # nbinomo.cdf(x) < nbinom.cdf(0)!!!!! this then leads to negative values of the cdf
    # or nans in the log version (np.log(neg number))
    cdf_y[cdf_y < 0] = 0
    # set values to 0, if y <= 0 (y <=0 not allowed per definition of a 0 truncated count distribution)
    cdf_y[y <= 0] = 0

    # in case of log CDF
    if log: 
        # ignore the 'RuntimeWarning: divide by zero encountered in log' warning (np.log(0))
        warnings.filterwarnings('ignore', category=RuntimeWarning)
        log_cdf_y = np.log(cdf_y) # general formula for lower trunc. distributions
        warnings.filterwarnings('default', category=RuntimeWarning)

        if is_scalar:
            return log_cdf_y[0]  # return scalar, if onedimensional array
        else:
            return log_cdf_y
    # normal CDF
    else:
        if is_scalar:
            return cdf_y[0]  
        else:
            return cdf_y
    

#log.p	logical; if TRUE, probabilities p are given as log(p)    
def qnbinom_trunc(p, nNbinom, pNbinom, log_p=False):
    ## calculation of the quantile
    # if f(0)=0 no truncation is needed
    if nbinom.pmf(0, nNbinom, pNbinom) == 0:
        return nbinom.ppf(p, nNbinom, pNbinom) # if p=0, -1 is returned instead of 0 (0 is truncated)
        # but this is not that important and is ignored here (because the 0 quantile does not make sense)
    else:
        # n and p have to be greater than zero and p <= 1
        if nNbinom <= 0 or pNbinom <= 0 or pNbinom > 1:
            if not isinstance(p, (list, np.ndarray)):
                return np.nan
            else:
                return np.full(len(p), np.nan)

        # Convert p (quantile) to array if it's a scalar
        if not isinstance(p, (list, np.ndarray)):
            p = np.array([p])
        elif isinstance(p, list):
            p = np.array(p)
        
        n = len(p) # number of quantiles

        # Set log-probabilities (lower tail)
        if log_p:
            logp = p
        else:
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            logp = np.log(p)
            warnings.filterwarnings('default', category=RuntimeWarning)
        
        # error handling/deal with special cases (outputs NA and Inf)
        quantiles = np.full(n, np.nan)
        na = np.isnan(logp) # nan <-> p < 0 -> return nan
        neginf = np.isneginf(logp) # -inf <-> p = 0 -> return 0 (due to truncation, otherwise -1)
        zero = logp == 0  # 0 <-> p = 1 -> return inf
        aboveZero = logp > 0 # >0 <-> p > 1 -> return nan

        # set quantile array if one of the restrictions is not fulfilled
        quantiles[neginf] = 0
        quantiles[zero] = np.inf
        quantiles[aboveZero] = np.nan
        
        # mask array, true if conditions are fulfilled
        mask = np.logical_not(np.logical_or(na, np.logical_or(neginf, np.logical_or(zero, aboveZero))))
        validLogp = logp[mask]

        if len(validLogp) == 0:
            # Return output
            if len(quantiles) == 1:
                return quantiles[0] # if single quantile is handed over
            else:
                return quantiles

        # find valid max value with mask
        lp_max = np.max(validLogp)
        p_max = np.exp(lp_max)

        # calculate mean and variance out of n and p
        mean = (nNbinom * (1 - pNbinom)) / pNbinom
        var = (nNbinom * (1 - pNbinom)) / (pNbinom**2)

        # find an adequate upper limit, starting from the extreme conservative chebychev inequality
        upper = int(mean + np.sqrt(var/(1-np.exp(lp_max)))) #Chebychev inequality

        # if upper < 1000 there is an log(0)=-inf with warning -> ignore this warning
        warnings.filterwarnings('ignore', category=RuntimeWarning)
        # lower the upper limit (saves computation time)
        while truncNbinomCdf(upper-1000, nNbinom, pNbinom, log=False) > p_max:
            upper = upper - 1000

        # after this section warnings are enabled again
        warnings.filterwarnings('default', category=RuntimeWarning)

        yarray = np.arange(1, int(upper)+1) # the y values for which the CDF is going to be calculated
        logcdf = truncNbinomCdf(yarray, nNbinom, pNbinom) # calculate log CDF (faster computation time)

        # Compute output
        for i in range(n): # for all quantiles   
            if not na[i] and not neginf[i] and not zero[i] and not aboveZero[i]:
                    quantiles[i] = np.sum(logcdf < np.array(logp[i])) + 1 #+1 because 0 is truncated
        
        # Return output
        if len(quantiles) == 1:
            return quantiles[0] # if single quantile is handed over
        else:
            return quantiles

# truncated poisson---------------------------------------------------
# cdf of the truncated poisson distribution
def truncPoisCdf(y, mu, log=True):
    # values of lam <= 0 (0 truncation -> 0= not allowed) are not allowed. return nan
    if mu <= 0:
        if not isinstance(y, (list, np.ndarray)):
            return np.nan
        else:
            return np.full(len(y), np.nan)

    is_scalar = np.isscalar(y)  # check if y is scalar or array

    if is_scalar:
        y = np.array([y])  # typecast scalar to onedimensional array
    elif isinstance(y, list):
        y = np.array(y)

    f_zero = poisson.pmf(0, mu) # f(0) untruncated density

    # general formula for lower trunc. distributions
    cdf_y = (poisson.cdf(y, mu) - poisson.cdf(0, mu)) / (1 - f_zero)
    # VERY IMPORTANT STEP: there might be numerical instabilities, which may lead to
    # poisson.cdf(x) < poisson.cdf(0)!!!!! this then leads to negative values of the cdf
    # or nans in the log version (np.log(neg number))
    cdf_y[cdf_y < 0] = 0 # set negative values to zero
    # set values to 0, if y <= 0 (y <=0 not allowed per definition of a 0 truncated count distribution)
    cdf_y[y <= 0] = 0

    # in case of log CDF
    if log:
        # ignore the 'RuntimeWarning: divide by zero encountered in log' warning (np.log(0))
        warnings.filterwarnings('ignore', category=RuntimeWarning)
        log_cdf_y = np.log(cdf_y) # general formula for lower trunc. distributions
        warnings.filterwarnings('default', category=RuntimeWarning)

        if is_scalar:
            return log_cdf_y[0]  # return scalar, if onedimensional array
        else:
            return log_cdf_y
    # normal CDF    
    else:
        if is_scalar:
            return cdf_y[0]
        else:
            return cdf_y
        
def log1mexp(x):
    if np.any((x < 0) & (~np.isnan(x))):
        raise ValueError("Inputs need to be non-negative!")
    return np.where(x <= np.log(2), np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))
        

#log.p	logical; if TRUE, probabilities p are given as log(p)    
def qpois_trunc(p, lam, log_p=False):
    ## calculation of the quantile
    # if f(0)=0 no truncation is needed
    if poisson.pmf(0, lam) == 0:
        return poisson.ppf(p, lam)
    else:
        # values of lam <= 0 (0 truncation -> 0= not allowed) are not allowed. return nan
        if lam <= 0:
            if not isinstance(p, (list, np.ndarray)):
                return np.nan
            else:
                return np.full(len(p), np.nan)


        # Convert p (quantile) to array if it's a scalar
        if not isinstance(p, (list, np.ndarray)):
            p = np.array([p])
        elif isinstance(p, list):
            p = np.array(p)
        
        n = len(p) # number of quantiles

        # Set log-probabilities (lower tail)
        if log_p:
            logp = p
        else:
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            logp = np.log(p)
            warnings.filterwarnings('default', category=RuntimeWarning)
        
        # error handling/deal with special cases (outputs NA and Inf)
        quantiles = np.full(n, np.nan)
        na = np.isnan(logp) # nan <-> p < 0 -> return nan
        neginf = np.isneginf(logp) # -inf <-> p = 0 -> return 0 (due to truncation, otherwise -1)
        zero = logp == 0  # 0 <-> p = 1 -> return inf
        aboveZero = logp > 0 # >0 <-> p > 1 -> return nan

        # set quantile array if one of the restrictions is not fulfilled
        quantiles[neginf] = 0
        quantiles[zero] = np.inf
        quantiles[aboveZero] = np.nan
        
        # mask array, true if conditions are fulfilled
        mask = np.logical_not(np.logical_or(na, np.logical_or(neginf, np.logical_or(zero, aboveZero))))
        validLogp = logp[mask]

        if len(validLogp) == 0:
            # Return output
            if len(quantiles) == 1:
                return quantiles[0] # if single quantile is handed over
            else:
                return quantiles

        # find valid max value with mask
        lp_max = np.max(validLogp)
        p_max = np.exp(lp_max)

        # find an adequate upper limit, starting from the extreme conservative chebychev inequality
        upper = int(lam + np.sqrt(lam * np.exp(-log1mexp(-lp_max)))) #Chebychev inequality

        # if upper < 1000 there is an log(0)=-inf with warning -> ignore this warning
        warnings.filterwarnings('ignore', category=RuntimeWarning)

        while truncPoisCdf(upper-1000, lam, log=False) > p_max:
            upper = upper - 1000

        # after this section warnings are enabled again
        warnings.filterwarnings('default', category=RuntimeWarning)

        yarray = np.arange(1, int(upper)+1) # the y values for which the CDF is going to be calculated
        logcdf = truncPoisCdf(yarray, lam) # calculate log CDF (faster computation time)

        # Compute output
        for i in range(n): # for all quantiles   
            if not na[i] and not neginf[i] and not zero[i] and not aboveZero[i]:
                    quantiles[i] = np.sum(logcdf < np.array(logp[i])) + 1 #+1 because 0 is truncated
        
        # Return output
        if len(quantiles) == 1:
            return quantiles[0] # if single quantile is handed over
        else:
            return quantiles

### function to compute distribution-------------------------------------------------
def baseFatalModel_quantiles(featureSeries, quantiles, w=None, model='hurdle'):
    # list to store quantiles 
    dummy_fatalities_list = []
    # string to store model distribution
    dist_string = ''

    mean = None
    var = None

    numberQuantiles = len(quantiles)

    # hurdle model
    if model == 'hurdle':
        if w == None:
            
            # calculate pt, i.e. the probability that y>0
            p_t = 1 - (featureSeries.value_counts().get(0, 0) / featureSeries.count())
            # calculate n (r) and p via average/variance without the zero values
            mean = pd.Series.mean(featureSeries[featureSeries != 0])
            var = pd.Series.var(featureSeries[featureSeries != 0])

        elif w <= 0:
            return 'w has to be > 0'
        
        else:
            features = featureSeries.tail(w).loc[:,'ged_sb']
            # calculate pt, i.e. the probability that y>0
            p_t = 1 - (features.value_counts().get(0, 0) / features.count())
            # calculate n (r) and p via average/variance without the zero values
            mean = pd.Series.mean(features[features != 0])
            var = pd.Series.var(features[features != 0])

        # pd.Series.var or mean returns Nan in case of a passed series of length 1
        if np.isnan(mean):
            mean = 0
        if np.isnan(var):
            var = 0

        # check if there are values above zero, otherwise no second component (trunc dist.) needed
        if p_t > 0:
            # component 1, y=0: Bernoulli
            comp2_quantiles = [q for q in quantiles if q > (1-p_t)] #quantiles for the second component
            removed_elements_length = numberQuantiles-len(comp2_quantiles)
            zeros_array = np.zeros(removed_elements_length) #zero values that originate from the bernoulli dist

            # component 2, y>0
            if var != 0 and var > mean:
                n = (mean**2) / (var - mean) # equivalent to r
                p = mean / var

                trunc_nbinom_quantiles = qnbinom_trunc(comp2_quantiles, n, p)

                dummy_fatalities_list = np.concatenate((zeros_array, trunc_nbinom_quantiles)).tolist()
                dist_string = 'BernoullitruncNbinom'

            else:  # equivalent to all means and 0 < var <= mean (mean cant be 0, because of hurdle)
                trunc_pois_quantiles = qpois_trunc(comp2_quantiles, mean)
                
                dummy_fatalities_list = np.concatenate((zeros_array, trunc_pois_quantiles)).tolist()
                dist_string = 'BernoulliTruncPois'
            
        # p_t = 0 so no second component is needed    
        else:
            dummy_fatalities_list = [0] * numberQuantiles
            dist_string = 'BernoulliHurdle'

    # nbinom model
    elif model == 'nbinom':
        if w == None:
             # calculate n (r) and p via average/variance
            mean = pd.Series.mean(featureSeries)
            var = pd.Series.var(featureSeries)
        elif w <= 0:
            return 'w has to be > 0'
        else:
            # calculate n (r) and p via average/variance
            mean = pd.Series.mean(featureSeries.tail(w).loc[:,'ged_sb'])
            var = pd.Series.var(featureSeries.tail(w).loc[:,'ged_sb'])

        if var != 0 and var > mean:
                n = (mean**2) / (var - mean) # equivalent to r
                p = mean / var

                dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()
                dist_string = 'NBinom'

        elif mean == 0 and var == 0: # due to faster calculation
                dummy_fatalities_list = [0] * numberQuantiles
                dist_string = 'Pois'

        else: # equivalent to all means and 0 < var <= mean
                dummy_fatalities_list = poisson.ppf(quantiles, mean).tolist()
                dist_string = 'Pois'

    return {'fatalities': dummy_fatalities_list, 'dist': dist_string, 'mean': mean, 'var': var}

In [15]:
import pyarrow.parquet as pq
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

""" # Authenticate and create the PyDrive client.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

data_folder_id = '1RigGnEyyNGnO_SPBSc_RwO9jjdbnPTAV'
result_folder_id = '1CNBTHtBOTFXh01WUpP2EF1aEmDi0rSyg'   """

" # Authenticate and create the PyDrive client.\ngauth = GoogleAuth()\ngauth.LocalWebserverAuth()\ndrive = GoogleDrive(gauth)\n\ndata_folder_id = '1RigGnEyyNGnO_SPBSc_RwO9jjdbnPTAV'\nresult_folder_id = '1CNBTHtBOTFXh01WUpP2EF1aEmDi0rSyg'   "

In [16]:

# create the feature- and actuals-data list
# set the feature and actuals year lists
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

# path to the current directory
current_dir = os.getcwd()

for i in range(len(feature_years)):
    # relative paths to the parquet files
    relative_path_features = os.path.join('..', 'data', 'cm_features_to_oct' + feature_years[i] + '.parquet')
    relative_path_actuals = os.path.join('..', 'data', 'cm_actuals_' + actual_years[i] + '.parquet')

    path_features = os.path.join(current_dir, relative_path_features)
    path_actuals = os.path.join(current_dir, relative_path_actuals)

    # append datasets to the lists
    actuals_df_list.append({'year':actual_years[i], 'data':pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year':feature_years[i], 'data':pd.read_parquet(path_features, engine='pyarrow')})

# concat the feature datasets, so that every data contains the observations to_oct_17
for i in range(1,len(features_df_list)):
    features_df_list[i]['data'] = pd.concat([features_df_list[i-1]['data'], features_df_list[i]['data']])

# function to check, if the last n months are in the dataset of a country,
# other than that the last month of a country in the feature dataset has to be 3 months before the first actuals month!!
def check_last_nMonths(n, country_id, yearindex):
    country = country_feature_group_list[yearindex].get_group(country_id)

    # reference month of the actual dataset
    actual_month_list = actuals_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()

    # if the last month of the feature dataset in the country does not match the first of the actuals return false
    if (actual_month_list[0] - 3) != country.index.get_level_values('month_id').unique().tolist()[-1]:
        return False
    else:
        month_list = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
        last_month = month_list[-1] # equals the first month - 3 from the corresponding actuals dataset
        first_month = month_list[0]

        last_n_months = True

        if last_month-n+1 < first_month:
            last_n_months = False
        else:
            month_list = list(range(last_month-n+1, last_month+1))
            
            for month in month_list:
                if month not in country.index.get_level_values('month_id'):
                    last_n_months = False
                    break

        return last_n_months
        #return True

country_list = sorted(features_df_list[3]['data'].index.get_level_values('country_id').unique().tolist())

# country group list of all four datasets
country_feature_group_list = []
country_actual_group_list = []
# fill list 
for i in range(len(features_df_list)):
    country_feature_group_list.append(features_df_list[i]['data'].groupby('country_id'))
    country_actual_group_list.append(actuals_df_list[i]['data'].groupby('country_id'))

print(len(country_list))


213


In [17]:
# modify country_list so that it contains only country_ids 
# that have at least the last n months of observations in the last dataset (2020)!
numberMonths_toOct20 = 72 # 72 = 3*12 (3 jahre für 2017) + 3*12 (jedes Jahr 12 Monate mehr also 2020 6 Jahre)

#-- note------
# dataset 2020 is used, because of the structure of the other datasets.
# 2020 is dataset 2019 with 12 additional rows (months) etc.
# for the CRPS calculation  of the datasets != 2020 the last 12*x windows are deleted
# this procedure is saving computation time
#-------------


#IMPORTANT
#if you do not minimize over all countries but only the single countries, 
# it is sufficient to check if all countries contain the last month in the features dataset (this way you use the full information). 
# But you still have to check check_last_nMonths(len(countrymonths), countryIndex, 3), so that no month is missing in between.

# => so currently not all information is used for each country

dummy_list = []
for countryIndex in country_list:
    dummy_hasLastN_months = True

    # index 3 is the last dataset
    if check_last_nMonths(numberMonths_toOct20, countryIndex, 3) is not True:
        dummy_hasLastN_months = False  
    
    if dummy_hasLastN_months is True:
        dummy_list.append(countryIndex)

# the values in country_list are the 'country_id'
country_list = dummy_list

#IMPORTANT
# all countries that have the last month as observation have the last 72 months as observations (in 2020)!!! so no country is excluded
# checked by modifing the check_last_nMonths function -> else: return True

len(country_list)

191

In [18]:
## changes, so that the calculation does not take a long time -------------------
if removeCountries == True:
    # remove all but last x countries
    elements_to_remove = country_list[0:(len(country_list)-lastX)] # only last x countries
    country_list = [element for element in country_list if element not in elements_to_remove]
    
len(country_list)

191

#### The minimization is based on calculating the quantiles for each country, w and year (of the datasets).

In [19]:
# list to save the predictions for each country
baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': [], '2020': [], '2021': []}} for country in country_list]
index_list = ['2018', '2019', '2020', '2021']
s_prediction_list = list(range(3, 15))


number_countries = len(country_list)
number_dataframes = len(features_df_list)
number_w = len(window_list)

quantiles = np.arange(0.001, 0.9999, 0.001)
quantiles = [round(q, 3) for q in quantiles] # due to binary inaccuracies
dummy_quantile_list = [f"{round(q * 100, 1)}%" for q in quantiles]

# loop through all countries (that are present in each dataset)
for index in range(number_countries):
    country = country_list[index]

    # list to store the predictions for each year temporally
    baseline_predict_list = [[] for _ in range(number_dataframes)]
    
    # loop through datasets
    for i in range(len(index_list)): #range(number_dataframes): 
        features = country_feature_group_list[i].get_group(country) # features of country in dataset i
        
        baseline_predict_list[i] = []

        # loop through windows
        for j in range(number_w):
            w = window_list[j] # current window

            fit = baseFatalModel_quantiles(features, quantiles, w=w, model=estimModel)

            baseline_predict_list[i].append({'window':w, 'country_id':country, 'dist':fit['dist'], 
                                             'mean':fit['mean'], 'var':fit['var'], 'quantile':[], 'fatalities':[]}) 

            baseline_predict_list[i][j]['quantile'] = dummy_quantile_list    
            baseline_predict_list[i][j]['fatalities'] = fit['fatalities']

            baseline_predict_list[i][j] = pd.DataFrame(baseline_predict_list[i][j])
            baseline_predict_list[i][j].set_index(['window', 'quantile'], inplace=True)

        baseline_country_predict_list[index]['prediction'][index_list[i]] = baseline_predict_list[i]

        # combine each w dataset together
        baseline_country_predict_list[index]['prediction'][index_list[i]] = pd.concat(baseline_country_predict_list[index]['prediction'][index_list[i]], axis=0)
        baseline_country_predict_list[index]['prediction'][index_list[i]].sort_index(axis=0, inplace=True)

In [20]:
print(country_list)
#baseline_country_predict_list[8]['prediction']['2019'].xs(3, level = 'window')

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 67, 69, 70, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 198, 199, 205, 206, 209, 213, 214, 218, 220, 222, 223, 231, 232, 233, 234, 235, 237, 242, 243, 244, 245, 246]


## Baseline 1-4

Optimize **w** (through the CRPS) regarding
|            | datasets    | countries   | prediction windows |
|------------|-------------|-------------|--------------------|
| baseline 1 | all         | all         | all                |
| baseline 2 | all         | inidvidual  | all                |
| baseline 3 | all         | all         | individual         |
| baseline 4 | all         | inidvidual  | individual         |

In [21]:
# list to store all crps values
baseline_crps_list = [
    {
        'country_id': country,
        'baseline': [
            {'s': s, 'w': [], 'CRPS': []}
            for s in s_prediction_list
        ]
    }
    for country in country_list
]

# numver of prediction windows
number_s = len(s_prediction_list)

# fill list with crps calculations
for s in s_prediction_list:

    for index in range(number_countries):
        country = country_list[index]
            
        for i in range(number_w):
            w = window_list[i]
            dummy_crps_list = [] 

            for j in range(number_dataframes):
                year = actual_years[j]
                monthly_totals_actuals = country_actual_group_list[j].get_group(country)
                true_obs = monthly_totals_actuals.iloc[s-3,0]

                NB_prediction = baseline_country_predict_list[index]['prediction'][year].xs(w, level="window")

                crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs).compute()[0]
                dummy_crps_list.append(crps)

            baseline_crps_list[index]['baseline'][s-3]['w'].append(w)
            baseline_crps_list[index]['baseline'][s-3]['CRPS'].append(np.mean(dummy_crps_list))
    
# time to calculate: ~66 min with all 190 countries

In [22]:
v1_baseline_crps_dict = {'w':[],'CRPS':[]}
v2_baseline_crps_list = [{'country_id': country, 'baseline': {'w':[],'CRPS':[]}} for country in country_list]
v3_baseline_crps_list = [{'s':s,'w':[],'CRPS':[]} for s in s_prediction_list]

## baseline v1---------------------------------------------------------------------------
# loop over w
for j in range(number_w):
    w = window_list[j]
    dummy_crps_v1_list = []
    # loop over countries
    for i in range(number_countries):
        # loop over prediction windows s
        for k in range(number_s):
            dummy_crps_v1_list.append(baseline_crps_list[i]['baseline'][k]['CRPS'][j])
    v1_baseline_crps_dict['w'].append(w)
    v1_baseline_crps_dict['CRPS'].append(np.mean(dummy_crps_v1_list))

v1_baseline_crps = pd.DataFrame(v1_baseline_crps_dict)
v1_baseline_crps = v1_baseline_crps[v1_baseline_crps.CRPS == v1_baseline_crps.loc[:,'CRPS'].min()]
v1_baseline_crps.set_index(pd.Index(range(len(v1_baseline_crps))), inplace=True)
    
    
#----------------------------------------------------------------------------------------

## baseline v2----------------------------------------------------------------------------
# list for baseline v2
for i in range(number_countries):
    for j in range(number_w):
        w = window_list[j]
        dummy_crps_v2_list = []
        for k in range(number_s):
            dummy_crps_v2_list.append(baseline_crps_list[i]['baseline'][k]['CRPS'][j])
        v2_baseline_crps_list[i]['baseline']['w'].append(w)
        v2_baseline_crps_list[i]['baseline']['CRPS'].append(np.mean(dummy_crps_v2_list))
    
# dataframe with the w that minimizes the CRPS for every country (v2)
data_v2 = {
    'country_id':[],
    'w':[],
    'CRPS':[]
}
for i in range(len(v2_baseline_crps_list)):
    # get the index of the minimal CRPS value
    min_index = v2_baseline_crps_list[i]['baseline']['CRPS'].index(min(v2_baseline_crps_list[i]['baseline']['CRPS']))
    
    # store values in dict
    data_v2['country_id'].append(v2_baseline_crps_list[i]['country_id'])
    data_v2['w'].append(v2_baseline_crps_list[i]['baseline']['w'][min_index])
    data_v2['CRPS'].append(v2_baseline_crps_list[i]['baseline']['CRPS'][min_index])
    
v2_baseline_crps = pd.DataFrame(data_v2)
#----------------------------------------------------------------------------------------


## baseline v3---------------------------------------------------------------------------
for s_index in range(number_s):
    dummy_crps_v3_list = []
    s = s_prediction_list[s_index]
    for w_index in range(number_w):
        w = window_list[w_index]
        for i in range(number_countries):
            dummy_crps_v3_list.append(baseline_crps_list[i]['baseline'][s_index]['CRPS'][w_index])
        v3_baseline_crps_list[s_index]['w'].append(w)
        v3_baseline_crps_list[s_index]['CRPS'].append(np.mean(dummy_crps_v3_list))

# dataframe with the w that minimize the CRPS for each prediction window s
data_v3 = {
    's':[],
    'w':[],
    'CRPS':[]
}
# length of the v3_baseline list is the number of prediction windows
for i in range(len(v3_baseline_crps_list)):
    s = s_prediction_list[i]
    # get the index of the minimal CRPS value
    min_index = v3_baseline_crps_list[i]['CRPS'].index(min(v3_baseline_crps_list[i]['CRPS']))

    # store values in dict
    data_v3['s'].append(s)
    data_v3['w'].append(v3_baseline_crps_list[i]['w'][min_index])
    data_v3['CRPS'].append(v3_baseline_crps_list[i]['CRPS'][min_index])

v3_baseline_crps = pd.DataFrame(data_v3)
#----------------------------------------------------------------------------------------

## baseline v4---------------------------------------------------------------------------
v4_baseline_crps = [{'country_id':country,
                    's':[],
                    'w':[],
                    'CRPS':[]
                    } for country in country_list]

# loop over all countries
for i in range(len(baseline_crps_list)):
    # loop over all prediction windows
    for s_index in range(number_s):
        s = s_prediction_list[s_index]
        # get the index of the minimal CRPS value
        min_index = baseline_crps_list[i]['baseline'][s_index]['CRPS'].index(min(baseline_crps_list[i]['baseline'][s_index]['CRPS']))
    
        # store values in dict
        v4_baseline_crps[i]['s'].append(s)
        v4_baseline_crps[i]['w'].append(baseline_crps_list[i]['baseline'][s_index]['w'][min_index])
        v4_baseline_crps[i]['CRPS'].append(baseline_crps_list[i]['baseline'][s_index]['CRPS'][min_index])

    v4_baseline_crps[i] = pd.DataFrame(v4_baseline_crps[i])
#----------------------------------------------------------------------------------------

In [23]:
# calculation of the overall CRPS to compare the impact of the level of detail in modeling 
dummy_array_v4 = []
for countryData in v4_baseline_crps:
    dummy_array_v4.append(np.mean(countryData.loc[:,'CRPS']))

print('Overall CRPS')
print('baseline 1: ' + str(np.round(v1_baseline_crps.iloc[0,1], decimals = 4)))
print('baseline 2: ' + str(np.round(np.mean(v2_baseline_crps.loc[:,'CRPS']), decimals = 4)))
print('baseline 3: ' + str(np.round(np.mean(v3_baseline_crps.loc[:,'CRPS']), decimals = 4)))
print('baseline 4: ' + str(np.round(np.mean(dummy_array_v4), decimals = 4)))

Overall CRPS
baseline 1: 15.3619
baseline 2: 13.8713
baseline 3: 15.5362
baseline 4: 12.9773


In [24]:
# save variables in joblib files
variable_string = str(estimModel)+'Wmax'+str(max_w)+'last'+str(lastX)+'ctrs'
filename = 'task2_optimal_baseline_' + variable_string + '.joblib'
# save variables in joblib file
""" dump([country_list, baseline_country_predict_list, baseline_crps_list, v1_baseline_crps_dict,
      v2_baseline_crps_list, v3_baseline_crps_list,
      v1_baseline_crps, v2_baseline_crps, v3_baseline_crps, v4_baseline_crps], filename)

file1 = drive.CreateFile({'parents': [{'id': result_folder_id}]})
file1.SetContentFile(filename)
file1.Upload()
print(filename) """

" dump([country_list, baseline_country_predict_list, baseline_crps_list, v1_baseline_crps_dict,\n      v2_baseline_crps_list, v3_baseline_crps_list,\n      v1_baseline_crps, v2_baseline_crps, v3_baseline_crps, v4_baseline_crps], filename)\n\nfile1 = drive.CreateFile({'parents': [{'id': result_folder_id}]})\nfile1.SetContentFile(filename)\nfile1.Upload()\nprint(filename) "