## Baseline model
At first all four datasets are modified in a way, that all of them contain the same countries with at least the last 36 months of observations. Countries that are not present in all datasets are not used for the minimization of the CRPS in dependecy of w or s. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import os
from scipy.stats import nbinom
from scipy.stats import poisson
import CRPS.CRPS as pscore

# create the feature- and actuals-data list
# set the feature and actuals year lists
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

for i in range(len(feature_years)):
    # paths to the data
    absolute_path = os.path.abspath('')
    relative_path_features = "data\cm_features_to_oct" + feature_years[i] + ".parquet"
    relative_path_actuals = "data\cm_actuals_" + actual_years[i] + ".parquet"

    path_features = os.path.join(absolute_path, relative_path_features)
    path_actuals = os.path.join(absolute_path, relative_path_actuals)

    # append datasets to the lists
    actuals_df_list.append({'year':actual_years[i], 'data':pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year':feature_years[i], 'data':pd.read_parquet(path_features, engine='pyarrow')})

# concat the feature datasets, so that every data contains the observations to_oct_17
for i in range(1,len(features_df_list)):
    features_df_list[i]['data'] = pd.concat([features_df_list[i-1]['data'], features_df_list[i]['data']])

# function to check, if the last 36 months are in the dataset of a country
def check_last_36Months(country, yearindex):
    month_list = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
    month_list = month_list[-36:]
     
    last_36_months = True
    for month in month_list:
        if month not in country.index.get_level_values('month_id'):
            last_36_months = False
            break

    return last_36_months


# list of all countries that are present in all four datasets
country_list = []
for i in range(len(features_df_list)):
    country_list.extend(features_df_list[i]['data'].index.get_level_values('country_id').unique().tolist())

unique_list = []

for item in country_list:
    if country_list.count(item) == 4:
        unique_list.append(item)

country_list = list(set(unique_list))

# country group list of all four datasets
country_feature_group_list = []
country_actual_group_list = []
# fill list 
for i in range(len(features_df_list)):
    country_feature_group_list.append(features_df_list[i]['data'].groupby('country_id'))
    country_actual_group_list.append(actuals_df_list[i]['data'].groupby('country_id'))


# modify country_list so that it contains only country_ids 
# that have the last 36 months of observations in ALL DATASETS!
dummy_list = []
for countryIndex in country_list:
    # loop through datasets
    for i in range(len(features_df_list)):
        dummy_hasLast36_months = True
        if check_last_36Months(country_feature_group_list[i].get_group(countryIndex), i) is not True:
            dummy_hasLast36_months = False
    
    if dummy_hasLast36_months is True:
        dummy_list.append(countryIndex)

# the values in country_list are the 'country_id'
country_list = dummy_list


#### The minimization is based on calculating the quantiles for each country, w and year (of the datasets).

In [2]:
# list to save the predictions for each country
baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': [], '2020': [], '2021': []}} for country in country_list]
index_list = ['2018', '2019', '2020', '2021']
# list of the (prediction) windows
window_list = list(range(2, 37))
s_prediction_list = list(range(3, 15))



""" ## changes, so that the calculation does not take a long time -------------------
#shorter windows
window_list = list(range(2, 25))
# remove all but ten countries
elements_to_remove = country_list[0:(len(country_list)-10)]
country_list = [element for element in country_list if element not in elements_to_remove]

baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': []}} for country in country_list]
index_list = ['2018', '2019']
##---------------------------------------------------------------------------------- """


number_countries = len(country_list)
number_dataframes = len(features_df_list)
number_w = len(window_list)


# loop through all countries (that are present in each dataset)
for index in range(number_countries):
    country = country_list[index]

    print('country ' + str(index+1) + '/' + str(number_countries), end='\r')

    # list to store the predictions for each year temporally
    baseline_predict_list = [[] for _ in range(number_dataframes)]
    
    # loop through datasets
    for i in range(number_dataframes): 
        features = country_feature_group_list[i].get_group(country) # features of country in dataset i
        
        baseline_predict_list[i] = []

        quantiles = np.arange(0.001, 0.9999, 0.001)
        quantiles = [round(q, 3) for q in quantiles] # due to binary inaccuracies

        dummy_quantile_list = [f"{round(q * 100, 1)}%" for q in quantiles]

        # loop through windows
        for j in range(number_w):
            w = window_list[j] # current window

            # calculate n (r) and p via average/variance
            mean = pd.Series.mean(features.tail(w).loc[:,'ged_sb'])
            var = pd.Series.var(features.tail(w).loc[:,'ged_sb'])

            #hier verteilung = nbinom ppf als
            dummy_fatalities_list = []

            # string to store distribution
            dist_string = ''

            if var != 0 and var > mean:
                n = (mean**2) / (var - mean) # equivalent to r
                p = mean / var
                dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()

                dist_string = 'NBinom'

            elif var != 0 and var <= mean:
                    dummy_fatalities_list = poisson.ppf(quantiles, mean).tolist()

                    dist_string = 'Pois'

            else:
                    dummy_fatalities_list = [0] * 999
                    dist_string = 'None'

            baseline_predict_list[i].append({'window':w, 'country_id':country, 'dist':dist_string, 
                                             'mean':mean, 'var':var, 'quantile':[], 'fatalities':[]}) 

            baseline_predict_list[i][j]['quantile'] = dummy_quantile_list    
            baseline_predict_list[i][j]['fatalities'] = dummy_fatalities_list

            baseline_predict_list[i][j] = pd.DataFrame(baseline_predict_list[i][j])
            baseline_predict_list[i][j].set_index(['window', 'quantile'], inplace=True)

        baseline_country_predict_list[index]['prediction'][index_list[i]] = baseline_predict_list[i]

        # combine each w dataset together
        baseline_country_predict_list[index]['prediction'][index_list[i]] = pd.concat(baseline_country_predict_list[index]['prediction'][index_list[i]], axis=0)
        baseline_country_predict_list[index]['prediction'][index_list[i]].sort_index(axis=0, inplace=True)

country 191/191

In [3]:
print(country_list)
baseline_country_predict_list[9]['prediction']['2018'].xs(12, level = 'window')

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 67, 69, 70, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 89, 90, 92, 93, 94, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 198, 199, 205, 206, 209, 213, 214, 218, 220, 222, 223, 231, 232, 233, 234, 235, 237, 242, 243, 244, 245, 246]


Unnamed: 0_level_0,country_id,dist,mean,var,fatalities
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.1%,10,,0.0,0.0,0
0.2%,10,,0.0,0.0,0
0.3%,10,,0.0,0.0,0
0.4%,10,,0.0,0.0,0
0.5%,10,,0.0,0.0,0
...,...,...,...,...,...
99.5%,10,,0.0,0.0,0
99.6%,10,,0.0,0.0,0
99.7%,10,,0.0,0.0,0
99.8%,10,,0.0,0.0,0


## Baseline 1-4

Optimize **w** (through the CRPS) regarding
|            | datasets    | countries   | prediction windows |
|------------|-------------|-------------|--------------------|
| baseline 1 | all         | all         | all                |
| baseline 2 | all         | inidvidual  | all                |
| baseline 3 | all         | all         | individual         |
| baseline 4 | all         | inidvidual  | individual         |

In [4]:
# list to store all crps values
baseline_crps_list = [
    {
        'country_id': country,
        'baseline': [
            {'s': s, 'w': [], 'CRPS': []}
            for s in s_prediction_list
        ]
    }
    for country in country_list
]

# numver of prediction windows
number_s = len(s_prediction_list)

# fill list with crps calculations
for s in s_prediction_list:
    print('                  prediction window ' + str(s-2) + '/' + str(number_s), end='\r')

    for index in range(number_countries):
        country = country_list[index]
        print('country ' + str(index+1) + '/' + str(number_countries), end='\r')
            
        for i in range(number_w):
            w = window_list[i]
            dummy_crps_list = [] 

            for j in range(number_dataframes):
                year = actual_years[j]
                monthly_totals_actuals = country_actual_group_list[j].get_group(country)
                true_obs = monthly_totals_actuals.iloc[s-3,0]

                NB_prediction = baseline_country_predict_list[index]['prediction'][year].xs(w, level="window")

                crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs).compute()[0]
                dummy_crps_list.append(crps)

            baseline_crps_list[index]['baseline'][s-3]['w'].append(w)
            baseline_crps_list[index]['baseline'][s-3]['CRPS'].append(np.mean(dummy_crps_list))
    


country 191/191   prediction window 12/12

In [5]:
v1_baseline_crps_dict = {'w':[],'CRPS':[]}
v2_baseline_crps_list = [{'country_id': country, 'baseline': {'w':[],'CRPS':[]}} for country in country_list]
v3_baseline_crps_list = [{'s':s,'w':[],'CRPS':[]} for s in s_prediction_list]

## baseline v1---------------------------------------------------------------------------
# loop over w
for j in range(number_w):
    w = window_list[j]
    dummy_crps_v1_list = []
    # loop over countries
    for i in range(number_countries):
        # loop over prediction windows s
        for k in range(number_s):
            dummy_crps_v1_list.append(baseline_crps_list[i]['baseline'][k]['CRPS'][j])
    v1_baseline_crps_dict['w'].append(w)
    v1_baseline_crps_dict['CRPS'].append(np.mean(dummy_crps_v1_list))

v1_baseline_crps = pd.DataFrame(v1_baseline_crps_dict)
v1_baseline_crps = v1_baseline_crps[v1_baseline_crps.CRPS == v1_baseline_crps.loc[:,'CRPS'].min()]
v1_baseline_crps.set_index(pd.Index(range(len(v1_baseline_crps))), inplace=True)
    
    
#----------------------------------------------------------------------------------------

## baseline v2----------------------------------------------------------------------------
# list for baseline v2
for i in range(number_countries):
    for j in range(number_w):
        w = window_list[j]
        dummy_crps_v2_list = []
        for k in range(number_s):
            dummy_crps_v2_list.append(baseline_crps_list[i]['baseline'][k]['CRPS'][j])
        v2_baseline_crps_list[i]['baseline']['w'].append(w)
        v2_baseline_crps_list[i]['baseline']['CRPS'].append(np.mean(dummy_crps_v2_list))
    
# dataframe with the w that minimizes the CRPS for every country (v2)
data_v2 = {
    'country_id':[],
    'w':[],
    'CRPS':[]
}
for i in range(len(v2_baseline_crps_list)):
    # get the index of the minimal CRPS value
    min_index = v2_baseline_crps_list[i]['baseline']['CRPS'].index(min(v2_baseline_crps_list[i]['baseline']['CRPS']))
    
    # store values in dict
    data_v2['country_id'].append(v2_baseline_crps_list[i]['country_id'])
    data_v2['w'].append(v2_baseline_crps_list[i]['baseline']['w'][min_index])
    data_v2['CRPS'].append(v2_baseline_crps_list[i]['baseline']['CRPS'][min_index])
    
v2_baseline_crps = pd.DataFrame(data_v2)
#----------------------------------------------------------------------------------------


## baseline v3---------------------------------------------------------------------------
for s_index in range(number_s):
    dummy_crps_v3_list = []
    s = s_prediction_list[s_index]
    for w_index in range(number_w):
        w = window_list[w_index]
        for i in range(number_countries):
            dummy_crps_v3_list.append(baseline_crps_list[i]['baseline'][s_index]['CRPS'][w_index])
        v3_baseline_crps_list[s_index]['w'].append(w)
        v3_baseline_crps_list[s_index]['CRPS'].append(np.mean(dummy_crps_v3_list))

# dataframe with the w that minimize the CRPS for each prediction window s
data_v3 = {
    's':[],
    'w':[],
    'CRPS':[]
}
# length of the v3_baseline list is the number of prediction windows
for i in range(len(v3_baseline_crps_list)):
    s = s_prediction_list[i]
    # get the index of the minimal CRPS value
    min_index = v3_baseline_crps_list[i]['CRPS'].index(min(v3_baseline_crps_list[i]['CRPS']))

    # store values in dict
    data_v3['s'].append(s)
    data_v3['w'].append(v3_baseline_crps_list[i]['w'][min_index])
    data_v3['CRPS'].append(v3_baseline_crps_list[i]['CRPS'][min_index])

v3_baseline_crps = pd.DataFrame(data_v3)
#----------------------------------------------------------------------------------------

## baseline v4---------------------------------------------------------------------------
v4_baseline_crps = [{'country_id':country,
                    's':[],
                    'w':[],
                    'CRPS':[]
                    } for country in country_list]

# loop over all countries
for i in range(len(baseline_crps_list)):
    # loop over all prediction windows
    for s_index in range(number_s):
        s = s_prediction_list[s_index]
        # get the index of the minimal CRPS value
        min_index = baseline_crps_list[i]['baseline'][s_index]['CRPS'].index(min(baseline_crps_list[i]['baseline'][s_index]['CRPS']))
    
        # store values in dict
        v4_baseline_crps[i]['s'].append(s)
        v4_baseline_crps[i]['w'].append(baseline_crps_list[i]['baseline'][s_index]['w'][min_index])
        v4_baseline_crps[i]['CRPS'].append(baseline_crps_list[i]['baseline'][s_index]['CRPS'][min_index])

    v4_baseline_crps[i] = pd.DataFrame(v4_baseline_crps[i])
#----------------------------------------------------------------------------------------

In [6]:
# calculation of the overall CRPS to compare the impact of the level of detail in modeling 
dummy_array_v4 = []
for countryData in v4_baseline_crps:
    dummy_array_v4.append(np.mean(countryData.loc[:,'CRPS']))

print('Overall CRPS')
print('baseline 1: ' + str(np.round(v1_baseline_crps.iloc[0,1], decimals = 4)))
print('baseline 2: ' + str(np.round(np.mean(v2_baseline_crps.loc[:,'CRPS']), decimals = 4)))
print('baseline 3: ' + str(np.round(np.mean(v3_baseline_crps.loc[:,'CRPS']), decimals = 4)))
print('baseline 4: ' + str(np.round(np.mean(dummy_array_v4), decimals = 4)))

Overall CRPS
baseline 1: 15.1751
baseline 2: 13.7932
baseline 3: 15.3875
baseline 4: 12.9081


In [8]:
from joblib import dump, load

# save variables in joblib file
dump([country_list, baseline_country_predict_list, baseline_crps_list, v1_baseline_crps_dict,
      v2_baseline_crps_list, v3_baseline_crps_list,
      v1_baseline_crps, v2_baseline_crps, v3_baseline_crps, v4_baseline_crps], 'baseline_variables.joblib')

# load variables
#geladene_variablen = load('baseline_variables.joblib')


['baseline_variables.joblib']