### google drive runtime connection

In [None]:
# Ctrl + F9 runs all cells

# Notebook shell commands: install the PyDrive and CRPS packages that are
# imported by the following cells.
!pip install PyDrive
!pip install CRPS

In [None]:
import pyarrow.parquet as pq
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth) 

# Google Drive folder IDs.
# NOTE(review): presumably the input-data and the result folders, judging by
# the names -- confirm against the cells that use them.
data_folder_id = '1RigGnEyyNGnO_SPBSc_RwO9jjdbnPTAV'
result_folder_id = '1CNBTHtBOTFXh01WUpP2EF1aEmDi0rSyg'  

### Import packages and define functions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import CRPS.CRPS as pscore
import copy
from joblib import dump, load, Parallel, delayed
import statsmodels.api as sm
from scipy.stats import nbinom, poisson
from time import sleep
from tqdm import tqdm

## functions for the distribution models
# truncated negative binomial---------------------------------------------------
def truncNegBin_CDF(y, n, p):
    """CDF of the zero-truncated negative binomial distribution at ``y``.

    The untruncated CDF is shifted by its value at zero and renormalised by
    the probability mass that remains after removing the outcome 0.

    Parameters
    ----------
    y : scalar
        Evaluation point; anything that is not > 0 yields 0.
    n, p : scalar
        Parameters passed to ``scipy.stats.nbinom``.

    Returns
    -------
    The truncated CDF value, or the integer 0 when ``y`` is not positive.
    """
    if not y > 0:
        return 0

    zero_mass = nbinom.pmf(0, n, p)
    mass_between = nbinom.cdf(y, n, p) - nbinom.cdf(0, n, p)
    return mass_between / (1 - zero_mass)
    
def truncNegBin_PPF(x, n, p, epsilon=1e-6, max_iterations=100):
    """Quantile function of the zero-truncated negative binomial distribution.

    With F the untruncated CDF and f0 = F(0), the truncated CDF is
    (F(y) - f0) / (1 - f0). It reaches level ``x`` exactly when F(y) reaches
    f0 + x * (1 - f0), so the quantile is obtained directly from
    ``nbinom.ppf`` at that transformed level.

    This replaces a continuous bisection whose early exit
    ``abs(cdf - x) < epsilon`` returned the current midpoint whenever the
    step-shaped CDF was within ``epsilon`` of ``x``; e.g. every level
    x >= 1 - 1e-6 produced 500000000.

    Parameters
    ----------
    x : scalar
        Quantile level in [0, 1].
    n, p : scalar
        Parameters passed to ``scipy.stats.nbinom``.
    epsilon, max_iterations :
        Accepted for backward compatibility only; the closed-form inversion
        needs neither a tolerance nor an iteration limit.

    Returns
    -------
    The quantile as a float. On the truncated path the result is at least 1,
    the smallest value in the support of the truncated distribution.
    """
    f_zero = nbinom.pmf(0, n, p)

    # if f(0)=0 no truncation is needed
    if (1 - f_zero) == 1:
        return nbinom.ppf(x, n, p)

    # Level of the untruncated distribution that corresponds to x; capped at 1
    # so that rounding can never push it outside the valid range of ppf.
    untruncated_level = np.minimum(f_zero + x * (1 - f_zero), 1.0)

    # Zero is excluded from the support, hence the floor at 1.
    return np.maximum(nbinom.ppf(untruncated_level, n, p), 1.0)

def calculate_trunc_nbinom_quantile(quantile, n, p):
    """Return the zero-truncated negative binomial quantile at level ``quantile``.

    Thin wrapper that forwards to ``truncNegBin_PPF`` and leaves that
    function's optional arguments at their defaults.
    """
    trunc_quantile = truncNegBin_PPF(quantile, n, p)
    return trunc_quantile

# truncated poisson---------------------------------------------------
def truncPois_CDF(y, mu):
    """CDF of the zero-truncated Poisson distribution at ``y``.

    The untruncated CDF is shifted by its value at zero and renormalised by
    the probability mass that remains after removing the outcome 0.

    Parameters
    ----------
    y : scalar
        Evaluation point; anything that is not > 0 yields 0.
    mu : scalar
        Rate parameter passed to ``scipy.stats.poisson``.

    Returns
    -------
    The truncated CDF value, or the integer 0 when ``y`` is not positive.
    """
    if not y > 0:
        return 0

    zero_mass = poisson.pmf(0, mu)
    mass_between = poisson.cdf(y, mu) - poisson.cdf(0, mu)
    return mass_between / (1 - zero_mass)
    
def truncPois_PPF(x, mu, epsilon=1e-6, max_iterations=100):
    """Quantile function of the zero-truncated Poisson distribution.

    With F the untruncated CDF and f0 = F(0), the truncated CDF is
    (F(y) - f0) / (1 - f0). It reaches level ``x`` exactly when F(y) reaches
    f0 + x * (1 - f0), so the quantile is obtained directly from
    ``poisson.ppf`` at that transformed level.

    This replaces a continuous bisection whose early exit
    ``abs(cdf - x) < epsilon`` returned the current midpoint whenever the
    step-shaped CDF was within ``epsilon`` of ``x``; e.g. every level
    x >= 1 - 1e-6 produced 500000000.

    Parameters
    ----------
    x : scalar
        Quantile level in [0, 1].
    mu : scalar
        Rate parameter passed to ``scipy.stats.poisson``.
    epsilon, max_iterations :
        Accepted for backward compatibility only; the closed-form inversion
        needs neither a tolerance nor an iteration limit.

    Returns
    -------
    The quantile as a float. On the truncated path the result is at least 1,
    the smallest value in the support of the truncated distribution.
    """
    f_zero = poisson.pmf(0, mu)

    # if f(0)=0 no truncation is needed
    if (1 - f_zero) == 1:
        return poisson.ppf(x, mu)

    # Level of the untruncated distribution that corresponds to x; capped at 1
    # so that rounding can never push it outside the valid range of ppf.
    untruncated_level = np.minimum(f_zero + x * (1 - f_zero), 1.0)

    # Zero is excluded from the support, hence the floor at 1.
    return np.maximum(poisson.ppf(untruncated_level, mu), 1.0)

def calculate_trunc_pois_quantile(quantile, mu):
    """Return the zero-truncated Poisson quantile at level ``quantile``.

    Thin wrapper that forwards to ``truncPois_PPF`` and leaves that
    function's optional arguments at their defaults.
    """
    trunc_quantile = truncPois_PPF(quantile, mu)
    return trunc_quantile

### function to compute distribution-------------------------------------------------
def _hurdle_split_levels(quantiles, p_t):
    """Split hurdle quantile levels into zero-mass and positive-part levels.

    Returns ``(is_positive, levels)``: a flag per input level telling whether
    it lies above the zero mass ``1 - p_t``, and, for the flagged ones only,
    the level rescaled to the (0, 1] range of the positive component.
    """
    zero_prob = 1 - p_t
    is_positive = [q > zero_prob for q in quantiles]
    levels = [
        min((q - zero_prob) / p_t, 1.0)  # cap guards against rounding above 1
        for q, positive in zip(quantiles, is_positive)
        if positive
    ]
    return is_positive, levels


def _hurdle_merge(is_positive, positive_values):
    """Put positive-component quantiles back at their original positions."""
    remaining = iter(positive_values)
    return [float(next(remaining)) if positive else 0.0 for positive in is_positive]


def baseFatalModel_quantiles(featureSeries, quantiles, w=None, model='hurdle'):
    """Fit a count distribution by moments and return its quantiles.

    Parameters
    ----------
    featureSeries :
        With ``w=None`` a pandas Series of counts. With a positive ``w`` an
        object supporting ``.tail(w).loc[:, 'ged_sb']``; only the last ``w``
        rows of column ``'ged_sb'`` are used.
    quantiles : sequence of float
        Quantile levels to evaluate.
    w : int or None
        Number of trailing observations to fit on; ``None`` uses everything.
    model : {'hurdle', 'nbinom'}
        ``'hurdle'``: point mass at zero with probability ``1 - p_t`` (``p_t``
        is the share of non-zero observations) combined with a zero-truncated
        negative binomial (if var > mean of the non-zero values) or a
        zero-truncated Poisson for the positive part.
        ``'nbinom'``: negative binomial if var > mean, otherwise Poisson.
        Any other value leaves the result empty.

    Returns
    -------
    dict with keys ``'fatalities'`` (list of quantiles, in the order of
    ``quantiles``), ``'dist'`` (name of the fitted distribution), ``'mean'``
    and ``'var'`` (the moments used). For ``w <= 0`` with a known model the
    string ``'w has to be > 0'`` is returned instead (kept for backward
    compatibility with existing callers).

    Notes
    -----
    The hurdle quantile at level q is 0 for q <= 1 - p_t and otherwise the
    quantile of the positive component at (q - (1 - p_t)) / p_t. Earlier
    revisions compared q against ``p_t`` itself and passed q on unscaled,
    which e.g. returned only zeros for a series without any zero.
    """
    # list to store quantiles
    dummy_fatalities_list = []
    # string to store model distribution
    dist_string = ''

    mean = None
    var = None

    numberQuantiles = len(quantiles)

    # select the observations the model is fitted on
    if model in ('hurdle', 'nbinom'):
        if w is None:
            features = featureSeries
        elif w <= 0:
            return 'w has to be > 0'
        else:
            features = featureSeries.tail(w).loc[:, 'ged_sb']

    # hurdle model
    if model == 'hurdle':
        # calculate pt, i.e. the probability that y>0
        p_t = 1 - (features.value_counts().get(0, 0) / features.count())
        # calculate n (r) and p via average/variance without the zero values
        non_zero = features[features != 0]
        mean = non_zero.mean()
        var = non_zero.var()

        # mean is NaN for an empty selection, var also for a single value
        if np.isnan(mean):
            mean = 0
        if np.isnan(var):
            var = 0

        # check if there are values above zero, otherwise no second component (trunc dist.) needed
        if p_t > 0:
            # component 1, y=0: levels up to the zero mass 1 - p_t map to 0
            is_positive, levels = _hurdle_split_levels(quantiles, p_t)

            # component 2, y>0
            if var != 0 and var > mean:
                n = (mean**2) / (var - mean)  # equivalent to r
                p = mean / var
                # skip the worker pool when every level falls into the zero mass
                positive_values = Parallel(n_jobs=-1)(
                    delayed(calculate_trunc_nbinom_quantile)(level, n, p)
                    for level in levels
                ) if levels else []

                dummy_fatalities_list = _hurdle_merge(is_positive, positive_values)
                dist_string = 'BernoulliTruncNBinom'

            elif mean == 0 and var == 0:  # due to faster calculation
                dummy_fatalities_list = [0] * numberQuantiles
                dist_string = 'BernoulliTruncPois'

            else:  # equivalent to all means and 0 < var <= mean
                positive_values = Parallel(n_jobs=-1)(
                    delayed(calculate_trunc_pois_quantile)(level, mean)
                    for level in levels
                ) if levels else []

                dummy_fatalities_list = _hurdle_merge(is_positive, positive_values)
                dist_string = 'BernoulliTruncPois'

        # p_t = 0 so no second component is needed
        else:
            dummy_fatalities_list = [0] * numberQuantiles
            dist_string = 'BernoulliHurdle'

    # nbinom model
    elif model == 'nbinom':
        # calculate n (r) and p via average/variance
        mean = features.mean()
        var = features.var()

        if var != 0 and var > mean:
            n = (mean**2) / (var - mean)  # equivalent to r
            p = mean / var
            dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()
            dist_string = 'NBinom'

        elif mean == 0 and var == 0:  # due to faster calculation
            dummy_fatalities_list = [0] * numberQuantiles
            dist_string = 'Pois'

        else:  # equivalent to all means and 0 < var <= mean
            dummy_fatalities_list = poisson.ppf(quantiles, mean).tolist()
            dist_string = 'Pois'

    return {'fatalities': dummy_fatalities_list, 'dist': dist_string, 'mean': mean, 'var': var}

In [None]:
# Bare name expressions: each line only evaluates the variable, so running
# this cell raises NameError if one of them has not been defined yet.
# NOTE(review): none of these names is assigned in the visible part of the
# notebook -- presumably they are restored from an earlier run; confirm.
actual_years
feature_years
estimModel
s_prediction_list
window_list
country_list
number_countries
number_dataframes
number_w
baseline_estimate_list







#### Compute the average over all individual moving windows per w and year

In [None]:
# Containers for the aggregated CRPS values: one entry per country, each
# holding one {'s', 'w', 'CRPS'} record per prediction window s.
baseline_crps_list_to_oct20 = []
for country in country_list:
    baseline_crps_list_to_oct20.append({
        'country_id': country,
        'baseline': [{'s': s, 'w': [], 'CRPS': []} for s in s_prediction_list],
    })

# independent copies of the same empty structure for the other data sets
(baseline_crps_list_to_oct19,
 baseline_crps_list_to_oct18,
 baseline_crps_list_to_oct17) = (copy.deepcopy(baseline_crps_list_to_oct20) for _ in range(3))

# number of prediction windows
number_s = len(s_prediction_list)

# Each data set averages a different slice of the per-window CRPS series.
# NOTE(review): the step of 12 presumably stands for twelve monthly windows
# per year -- confirm against how 'predictionWindowsN' is built.
slice_step = 12
crps_targets = (
    (baseline_crps_list_to_oct17, slice(None, -(3 * slice_step))),
    (baseline_crps_list_to_oct18, slice(slice_step, -(2 * slice_step))),
    (baseline_crps_list_to_oct19, slice(2 * slice_step, -slice_step)),
    (baseline_crps_list_to_oct20, slice(3 * slice_step, None)),
)

# fill lists with crps calculations
for s in tqdm(s_prediction_list):
    sleep(3)
    # records and actuals are addressed by s - 3
    s_position = s - 3

    for index in range(number_countries):
        country = country_list[index]

        for i in range(number_w):
            w = window_list[i]
            # all subset windows of this country for window length w
            subset_windows = baseline_estimate_list[i]['country_predict_list'][index]['predictionWindowsN']

            # CRPS of every subset window: predicted quantiles vs. the actual value
            dummy_crps_list = [
                pscore(
                    np.array(subset_window[0]['fatalities']),
                    subset_window[1]['unreal_actuals'][s_position],
                ).compute()[0]
                for subset_window in subset_windows
            ]

            # store w and the mean CRPS of the matching slice in every data set
            for target_list, crps_slice in crps_targets:
                record = target_list[index]['baseline'][s_position]
                record['w'].append(w)
                record['CRPS'].append(np.mean(dummy_crps_list[crps_slice]))

task2_baseline_list = [
    baseline_crps_list_to_oct17,
    baseline_crps_list_to_oct18,
    baseline_crps_list_to_oct19,
    baseline_crps_list_to_oct20,
]

#### Minimization
'w_minimization_list' contains, for each prediction year, the CRPS-minimizing w of every baseline variant (v1-v4)

In [None]:
# list to store the results of the minimal w's
w_minimization_list = [{'predictionYear': year, 'minWData': []} for year in actual_years]

# list to store the list to compute the minimal w's
w_compute_list = [{'predictionYear': year, 'data': []} for year in actual_years]

# loop over the four different datasets to predict (18-21)
for task2_index in range(len(task2_baseline_list)):
    # per-country CRPS records of the current data set
    crps_by_country = task2_baseline_list[task2_index]

    v1_baseline_crps_dict = {'w': [], 'CRPS': []}
    v2_baseline_crps_list = [{'country_id': country, 'baseline': {'w': [], 'CRPS': []}} for country in country_list]
    v3_baseline_crps_list = [{'s': s, 'w': [], 'CRPS': []} for s in s_prediction_list]

    ## baseline v1---------------------------------------------------------------------------
    # one w for everything: mean CRPS over all countries and prediction windows
    for j in range(number_w):
        w = window_list[j]
        dummy_crps_v1_list = [
            crps_by_country[i]['baseline'][k]['CRPS'][j]
            for i in range(number_countries)
            for k in range(number_s)
        ]
        v1_baseline_crps_dict['w'].append(w)
        v1_baseline_crps_dict['CRPS'].append(np.mean(dummy_crps_v1_list))

    v1_baseline_crps = pd.DataFrame(v1_baseline_crps_dict)

    w_compute_list[task2_index]['data'].append(v1_baseline_crps)

    # keep only the row(s) with the minimal CRPS; reset_index returns a new
    # frame, so the filtered selection is never modified in place
    v1_baseline_crps = v1_baseline_crps[v1_baseline_crps.CRPS == v1_baseline_crps.loc[:, 'CRPS'].min()]
    v1_baseline_crps = v1_baseline_crps.reset_index(drop=True)

    w_minimization_list[task2_index]['minWData'].append(v1_baseline_crps)
    #----------------------------------------------------------------------------------------

    ## baseline v2----------------------------------------------------------------------------
    # one w per country: mean CRPS over all prediction windows
    for i in range(number_countries):
        for j in range(number_w):
            w = window_list[j]
            dummy_crps_v2_list = [
                crps_by_country[i]['baseline'][k]['CRPS'][j]
                for k in range(number_s)
            ]
            v2_baseline_crps_list[i]['baseline']['w'].append(w)
            v2_baseline_crps_list[i]['baseline']['CRPS'].append(np.mean(dummy_crps_v2_list))

    # dataframe with the w that minimizes the CRPS for every country (v2)
    data_v2 = {
        'country_id': [],
        'w': [],
        'CRPS': []
    }
    for i in range(len(v2_baseline_crps_list)):
        country_baseline = v2_baseline_crps_list[i]['baseline']
        # get the index of the minimal CRPS value
        min_index = country_baseline['CRPS'].index(min(country_baseline['CRPS']))

        # store values in dict
        data_v2['country_id'].append(v2_baseline_crps_list[i]['country_id'])
        data_v2['w'].append(country_baseline['w'][min_index])
        data_v2['CRPS'].append(country_baseline['CRPS'][min_index])

    v2_baseline_crps = pd.DataFrame(data_v2)
    w_minimization_list[task2_index]['minWData'].append(v2_baseline_crps)
    w_compute_list[task2_index]['data'].append(v2_baseline_crps_list)
    #----------------------------------------------------------------------------------------

    ## baseline v3---------------------------------------------------------------------------
    # one w per prediction window s: mean CRPS over all countries
    for s_index in range(number_s):
        s = s_prediction_list[s_index]
        for w_index in range(number_w):
            w = window_list[w_index]
            # Collected anew for every (s, w) pair. The list used to be reset
            # per s only, so the mean for a given w also contained the values
            # of all previously processed w's.
            dummy_crps_v3_list = [
                crps_by_country[i]['baseline'][s_index]['CRPS'][w_index]
                for i in range(number_countries)
            ]
            v3_baseline_crps_list[s_index]['w'].append(w)
            v3_baseline_crps_list[s_index]['CRPS'].append(np.mean(dummy_crps_v3_list))

    # dataframe with the w that minimize the CRPS for each prediction window s
    data_v3 = {
        's': [],
        'w': [],
        'CRPS': []
    }
    # length of the v3_baseline list is the number of prediction windows
    for i in range(len(v3_baseline_crps_list)):
        s = s_prediction_list[i]
        # get the index of the minimal CRPS value
        min_index = v3_baseline_crps_list[i]['CRPS'].index(min(v3_baseline_crps_list[i]['CRPS']))

        # store values in dict
        data_v3['s'].append(s)
        data_v3['w'].append(v3_baseline_crps_list[i]['w'][min_index])
        data_v3['CRPS'].append(v3_baseline_crps_list[i]['CRPS'][min_index])

    v3_baseline_crps = pd.DataFrame(data_v3)

    w_minimization_list[task2_index]['minWData'].append(v3_baseline_crps)
    w_compute_list[task2_index]['data'].append(v3_baseline_crps_list)
    #----------------------------------------------------------------------------------------

    ## baseline v4---------------------------------------------------------------------------
    # one w per country and prediction window s: no averaging, direct minimum
    v4_baseline_crps = [{'country_id': country,
                        's': [],
                        'w': [],
                        'CRPS': []
                        } for country in country_list]

    # loop over all countries
    for i in range(len(crps_by_country)):
        # loop over all prediction windows
        for s_index in range(number_s):
            s = s_prediction_list[s_index]
            window_record = crps_by_country[i]['baseline'][s_index]
            # get the index of the minimal CRPS value
            min_index = window_record['CRPS'].index(min(window_record['CRPS']))

            # store values in dict
            v4_baseline_crps[i]['s'].append(s)
            v4_baseline_crps[i]['w'].append(window_record['w'][min_index])
            v4_baseline_crps[i]['CRPS'].append(window_record['CRPS'][min_index])

        v4_baseline_crps[i] = pd.DataFrame(v4_baseline_crps[i])

    w_minimization_list[task2_index]['minWData'].append(v4_baseline_crps)
    w_compute_list[task2_index]['data'].append(crps_by_country)

In [None]:
# Persist the CRPS tables, the minimizing w's and the intermediate tables
# together in a single joblib file.
dump(
    [task2_baseline_list, w_minimization_list, w_compute_list],
    'task2_baseline_wmin_vars.joblib',
)