## Baseline model
At first all four datasets are modified in a way, that all of them contain the same countries with at least the last 36 months of observations. Countries that are not present in all datasets are not used for the minimization of the CRPS in dependecy of w or s. 

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import os
from scipy.stats import nbinom
from scipy.stats import poisson
import CRPS.CRPS as pscore

# create the feature- and actuals-data list
# set the feature and actuals year lists
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

for i in range(len(feature_years)):
    # paths to the data
    absolute_path = os.path.abspath('')
    relative_path_features = "data\cm_features_to_oct" + feature_years[i] + ".parquet"
    relative_path_actuals = "data\cm_actuals_" + actual_years[i] + ".parquet"

    path_features = os.path.join(absolute_path, relative_path_features)
    path_actuals = os.path.join(absolute_path, relative_path_actuals)

    # append datasets to the lists
    actuals_df_list.append({'year':actual_years[i], 'data':pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year':feature_years[i], 'data':pd.read_parquet(path_features, engine='pyarrow')})

# concat the feature datasets, so that every data contains the observations to_oct_17
for i in range(1,len(features_df_list)):
    features_df_list[i]['data'] = pd.concat([features_df_list[i-1]['data'], features_df_list[i]['data']])

# function to check, if the last 36 months are in the dataset of a country
def check_last_36Months(country, yearindex):
    month_list = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
    month_list = month_list[-36:]
     
    last_36_months = True
    for month in month_list:
        if month not in country.index.get_level_values('month_id'):
            last_36_months = False
            break

    return last_36_months


# list of all countries that are present in all four datasets
country_list = []
for i in range(len(features_df_list)):
    country_list.extend(features_df_list[i]['data'].index.get_level_values('country_id').unique().tolist())

unique_list = []

for item in country_list:
    if country_list.count(item) == 4:
        unique_list.append(item)

country_list = list(set(unique_list))

# country group list of all four datasets
country_feature_group_list = []
country_actual_group_list = []
# fill list 
for i in range(len(features_df_list)):
    country_feature_group_list.append(features_df_list[i]['data'].groupby('country_id'))
    country_actual_group_list.append(actuals_df_list[i]['data'].groupby('country_id'))


# modify country_list so that it contains only country_ids 
# that have the last 36 months of observations in ALL DATASETS!
dummy_list = []
for countryIndex in country_list:
    # loop through datasets
    for i in range(len(features_df_list)):
        dummy_hasLast36_months = True
        if check_last_36Months(country_feature_group_list[i].get_group(countryIndex), i) is not True:
            dummy_hasLast36_months = False
    
    if dummy_hasLast36_months is True:
        dummy_list.append(countryIndex)

# the values in country_list are the 'country_id'
country_list = dummy_list


#### The minimization is based on calculating the quantiles for each country, w and year (of the datasets).

In [105]:
# list to save the predictions for each country
baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': [], '2020': [], '2021': []}} for country in country_list]
index_list = ['2018', '2019', '2020', '2021']
# list of the prediction windows
#window_list = list(range(2, 37))



## changes, so that the calculation does not take a long time --------
#shorter windows
window_list = list(range(2, 25))
# remove all but ten countries
""" elements_to_remove = country_list[0:(len(country_list)-10)]
country_list = [element for element in country_list if element not in elements_to_remove] """

baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': []}} for country in country_list]
index_list = ['2018', '2019']
##----------------------------------------------------------------------------------


number_countries = len(country_list)
number_dataframes = len(features_df_list)
number_w = len(window_list)


# loop through all countries (that are present in each dataset)
for index in range(number_countries):
    country = country_list[index]

    print('country ' + str(index+1) + '/' + str(number_countries), end='\r')

    # list to store the predictions for each year temporally
    baseline_predict_list = [[] for _ in range(number_dataframes)]
    
    # loop through datasets
    for i in range(2): #range(number_dataframes): 
        features = country_feature_group_list[i].get_group(country) # features of country in dataset i
        
        baseline_predict_list[i] = []

        quantiles = np.arange(0.001, 0.9999, 0.001)
        quantiles = [round(q, 3) for q in quantiles] # due to binary inaccuracies

        dummy_quantile_list = [f"{round(q * 100, 1)}%" for q in quantiles]

        # loop through windows
        for j in range(number_w):
            w = window_list[j] # current window

            # calculate n (r) and p via average/variance
            mean = pd.Series.mean(features.tail(w).loc[:,'ged_sb'])
            var = pd.Series.var(features.tail(w).loc[:,'ged_sb'])

            #hier verteilung = nbinom ppf als
            dummy_fatalities_list = []

            # string to store distribution
            dist_string = ''

            if var != 0 and var > mean:
                n = (mean**2) / (var - mean) # equivalent to r
                p = mean / var
                dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()

                dist_string = 'NBinom'

            elif var != 0 and var <= mean:
                    dummy_fatalities_list = poisson.ppf(quantiles, mean).tolist()

                    dist_string = 'Pois'

            else:
                    dummy_fatalities_list = [0] * 999
                    dist_string = 'None'

            baseline_predict_list[i].append({'window':w, 'country_id':country, 'dist':dist_string, 
                                             'mean':mean, 'var':var, 'quantile':[], 'fatalities':[]}) 

            baseline_predict_list[i][j]['quantile'] = dummy_quantile_list    
            baseline_predict_list[i][j]['fatalities'] = dummy_fatalities_list

            baseline_predict_list[i][j] = pd.DataFrame(baseline_predict_list[i][j])
            baseline_predict_list[i][j].set_index(['window', 'quantile'], inplace=True)

        baseline_country_predict_list[index]['prediction'][index_list[i]] = baseline_predict_list[i]

        # combine each w dataset together
        baseline_country_predict_list[index]['prediction'][index_list[i]] = pd.concat(baseline_country_predict_list[index]['prediction'][index_list[i]], axis=0)
        baseline_country_predict_list[index]['prediction'][index_list[i]].sort_index(axis=0, inplace=True)

country 10/10

In [78]:
print(country_list)
baseline_country_predict_list[9]['prediction']['2018'].xs(12, level = 'window')

[232, 233, 234, 235, 237, 242, 243, 244, 245, 246]


Unnamed: 0_level_0,country_id,dist,mean,var,fatalities
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.1%,246,NBinom,33.416667,1375.537879,0.0
0.2%,246,NBinom,33.416667,1375.537879,0.0
0.3%,246,NBinom,33.416667,1375.537879,0.0
0.4%,246,NBinom,33.416667,1375.537879,0.0
0.5%,246,NBinom,33.416667,1375.537879,0.0
...,...,...,...,...,...
99.5%,246,NBinom,33.416667,1375.537879,198.0
99.6%,246,NBinom,33.416667,1375.537879,207.0
99.7%,246,NBinom,33.416667,1375.537879,218.0
99.8%,246,NBinom,33.416667,1375.537879,234.0


## Baseline 1 and 2

### Variant 1
Optimize **w** (through the CRPS) regarding
* all **task 2 datasets** (2018-2021)
* **all countries**
* **all prediciton windows**

### Variant 2
Optimize **w** (through the CRPS) regarding
* all **task 2 datasets** (2018-2021)
* **inidividual countries**
* **all prediciton windows**

In [145]:
import cProfile

s_prediction_list = list(range(3, 14))
v1_baseline_crps_list = {'w':[],'CRPS':[]}
v2_baseline_crps_list = [{'country_id': country, 'baseline': {'w':[],'CRPS':[]}} for country in country_list]

def your_function_to_profile():
    # calculate the CRPS over all countries for each w
    for i in range(number_w):
        w = window_list[i]
        print('window ' + str(w-1) + '/' + str(number_w), end='\r')

        # dictionary variant 2
        #v2_crps_dict = {'country_id':[], 'CRPS':[]}
        v2_crps_country_list = [{'country_id': 0, 'crps': []} for _ in range(number_countries)]

        for j in range(2): #range(len(actual_years)):
            year = actual_years[j]
            #print('year ' + str(year))

            # lists variant 1
            v1_yearly_crps_list = []    
            v1_crps_mean_list = []
            

            for index in range(number_countries):
                country = country_list[index]
                monthly_totals_actuals = country_actual_group_list[j].get_group(country)
                dummy_crps_list = [0]*len(s_prediction_list)

                NB_prediction = baseline_country_predict_list[index]['prediction'][year].xs(w, level="window")
                
                for k in s_prediction_list:
                    true_obs = monthly_totals_actuals.iloc[3-k,0]
                    crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs).compute()[0]
                    dummy_crps_list[3-k] = crps
                    
                # v1 list with mean over the s windows    
                v1_yearly_crps_list.append(np.mean(dummy_crps_list))

                # v2 list with mean over the s windows  
                if j == 0:
                    v2_crps_country_list[index]['country_id'] = country
                v2_crps_country_list[index]['crps'].append(np.mean(dummy_crps_list))

            # v1 mean over all countries
            v1_crps_mean_list.append(np.mean(v1_yearly_crps_list))

            #print(v1_crps_mean_list)

        # v1 save results
        v1_baseline_crps_list['w'].append(w)
        v1_baseline_crps_list['CRPS'].append(np.mean(v1_crps_mean_list))

        # v2 save results
        for country_index in range(number_countries):
            if country_list[country_index] == v2_baseline_crps_list[country_index]['country_id']:
                v2_baseline_crps_list[country_index]['baseline']['w'].append(w)
                v2_baseline_crps_list[country_index]['baseline']['CRPS'].append(np.mean(v2_crps_country_list[country_index]['crps']))
    
cProfile.run('your_function_to_profile()')

         3301897 function calls (3286375 primitive calls) in 58.004 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       23    0.000    0.000    0.000    0.000 2264627338.py:15(<listcomp>)
        1    0.279    0.279   58.004   58.004 2264627338.py:7(your_function_to_profile)
        8    0.000    0.000    0.000    0.000 <__array_function__ internals>:177(copyto)
     4541    0.017    0.000    0.453    0.000 <__array_function__ internals>:177(linspace)
     1219    0.004    0.000    0.070    0.000 <__array_function__ internals>:177(mean)
     4541    0.012    0.000    0.033    0.000 <__array_function__ internals>:177(ndim)
     4541    0.014    0.000    0.073    0.000 <__array_function__ internals>:177(result_type)
     5060    0.018    0.000    0.276    0.000 <__array_function__ internals>:177(sort)
    15180    0.090    0.000    2.127    0.000 <__array_function__ internals>:177(sum)
        1    0.000    0.000   58.004 

In [146]:
# dataframe with the w that minimizes the CRPS for all countries (v1)
v1_baseline_crps = pd.DataFrame(v1_baseline_crps_dict)
v1_baseline_crps = v1_baseline_crps[v1_baseline_crps.CRPS == v1_baseline_crps.loc[:,'CRPS'].min()]
v1_baseline_crps.set_index(pd.Index(range(len(v1_baseline_crps))), inplace=True)

# dataframe with the w that minimizes the CRPS for every country (v2)
data = {
    'country_id':[],
    'w':[],
    'CRPS':[]
}
for i in range(len(v2_baseline_crps_list)):
    # get the index of the minimal CRPS value
    min_index = v2_baseline_crps_list[i]['baseline']['CRPS'].index(min(v2_baseline_crps_list[i]['baseline']['CRPS']))
    
    # store values in dict
    data['country_id'].append(v2_baseline_crps_list[i]['country_id'])
    data['w'].append(v2_baseline_crps_list[i]['baseline']['w'][min_index])
    data['CRPS'].append(v2_baseline_crps_list[i]['baseline']['CRPS'][min_index])
    
v2_baseline_crps = pd.DataFrame(data)

#v1_baseline_crps
v2_baseline_crps

Unnamed: 0,country_id,w,CRPS
0,232,2,0.0
1,233,2,0.0
2,234,2,0.0
3,235,2,0.0
4,237,24,3.403092
5,242,2,0.0
6,243,2,0.0
7,244,2,0.0
8,245,17,9.15141
9,246,2,18.77008


In [163]:
print(v1_baseline_crps_list)
v1_baseline_crps

{'w': [], 'CRPS': []}


Unnamed: 0,w,CRPS
0,3,1.650465


## Baseline 3 and 4
### Variant 3
Optimize **w** (through the CRPS) regarding
* all **task 2 datasets** (2018-2021)
* **all countries**
* **inidvidual prediciton windows**

In [181]:
s_prediction_list = list(range(3, 14))

baseline_crps_list = [
    {
        'country_id': country,
        'baseline': [
            {'s': s, 'w': [], 'CRPS': []}
            for s in s_prediction_list
        ]
    }
    for country in country_list
]

number_s = len(s_prediction_list)

for s in s_prediction_list:
    print('                  prediction window ' + str(s-2) + '/' + str(number_s), end='\r')

    for index in range(number_countries):
        country = country_list[index]
        print('country ' + str(index+1) + '/' + str(number_countries), end='\r')
            
        for i in range(number_w):
            w = window_list[i]
            dummy_crps_list = [] 

            for j in range(2): #range(number_dataframes):
                year = actual_years[j]
                monthly_totals_actuals = country_actual_group_list[j].get_group(country)
                true_obs = monthly_totals_actuals.iloc[3-s,0]

                NB_prediction = baseline_country_predict_list[index]['prediction'][year].xs(w, level="window")

                crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs).compute()[0]
                dummy_crps_list.append(crps)

            baseline_crps_list[index]['baseline'][s-3]['w'].append(w)
            baseline_crps_list[index]['baseline'][s-3]['CRPS'].append(np.mean(dummy_crps_list))
    


country 10/10     prediction window 11/11

In [166]:
v1_baseline_crps_dict = {'w':[],'CRPS':[]}
v2_baseline_crps_list = [{'country_id': country, 'baseline': {'w':[],'CRPS':[]}} for country in country_list]
v3_baseline_crps_dict = {'s':[],'w':[],'CRPS':[]}

## baseline v1---------------------------------------------------------------------------
for j in range(number_w):
    w = window_list[j]
    dummy_crps_v1_list = []
    for i in range(number_countries):
        for k in range(number_s):
            dummy_crps_v1_list.append(baseline_crps_list[i]['baseline'][k]['CRPS'][j])
    v1_baseline_crps_dict['w'].append(w)
    v1_baseline_crps_dict['CRPS'].append(np.mean(dummy_crps_v1_list))

v1_baseline_crps = pd.DataFrame(v1_baseline_crps_dict)
v1_baseline_crps = v1_baseline_crps[v1_baseline_crps.CRPS == v1_baseline_crps.loc[:,'CRPS'].min()]
v1_baseline_crps.set_index(pd.Index(range(len(v1_baseline_crps))), inplace=True)
    
    
#----------------------------------------------------------------------------------------

## baseline v2----------------------------------------------------------------------------
# list for baseline v2
for i in range(number_countries):
    for j in range(number_w):
        w = window_list[j]
        dummy_crps_v2_list = []
        for k in range(number_s):
            dummy_crps_v2_list.append(baseline_crps_list[i]['baseline'][k]['CRPS'][j])
        v2_baseline_crps_list[i]['baseline']['w'].append(w)
        v2_baseline_crps_list[i]['baseline']['CRPS'].append(np.mean(dummy_crps_v2_list))
    
# dataframe with the w that minimizes the CRPS for every country (v2)
data = {
    'country_id':[],
    'w':[],
    'CRPS':[]
}
for i in range(len(v2_baseline_crps_list)):
    # get the index of the minimal CRPS value
    min_index = v2_baseline_crps_list[i]['baseline']['CRPS'].index(min(v2_baseline_crps_list[i]['baseline']['CRPS']))
    
    # store values in dict
    data['country_id'].append(v2_baseline_crps_list[i]['country_id'])
    data['w'].append(v2_baseline_crps_list[i]['baseline']['w'][min_index])
    data['CRPS'].append(v2_baseline_crps_list[i]['baseline']['CRPS'][min_index])
    
v2_baseline_crps = pd.DataFrame(data)
#----------------------------------------------------------------------------------------


## baseline v3---------------------------------------------------------------------------
data = {
    's':[],
    'w':[],
    'CRPS':[]
}
    
    
#----------------------------------------------------------------------------------------

## baseline v4---------------------------------------------------------------------------
data = [{'country_id':country,
         's':[],
         'w':[],
         'CRPS':[]
         } for country in country_list]
    
    
#----------------------------------------------------------------------------------------

#v1_baseline_crps
#v2_baseline_crps

Unnamed: 0,w,CRPS
0,3,3.426688


In [178]:
baseline_crps_list

[{'country_id': 232,
  'baseline': [{'s': 3,
    'w': [2,
     3,
     4,
     5,
     6,
     7,
     8,
     9,
     10,
     11,
     12,
     13,
     14,
     15,
     16,
     17,
     18,
     19,
     20,
     21,
     22,
     23,
     24],
    'CRPS': [0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0]},
   {'s': 4,
    'w': [2,
     3,
     4,
     5,
     6,
     7,
     8,
     9,
     10,
     11,
     12,
     13,
     14,
     15,
     16,
     17,
     18,
     19,
     20,
     21,
     22,
     23,
     24],
    'CRPS': [0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0,
     0.0]},
   {'s': 5,
    'w': [2,
     3,
     4,
     5,
    