In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import os
from scipy.stats import nbinom
from scipy.stats import poisson
import CRPS.CRPS as pscore

# Load the feature and actuals datasets.
# feature_years[k] and actual_years[k] belong together (position-wise pairing).
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

# Directory with the parquet files, resolved once relative to the current working directory.
# os.path.join is used instead of a hard-coded "data\..." literal: the backslash made the
# path Windows-only and "\c" is an invalid escape sequence in a normal string literal.
data_dir = os.path.join(os.path.abspath(''), 'data')

actuals_df_list = []
features_df_list = []

for feature_year, actual_year in zip(feature_years, actual_years):
    # paths to the data
    path_features = os.path.join(data_dir, 'cm_features_to_oct' + feature_year + '.parquet')
    path_actuals = os.path.join(data_dir, 'cm_actuals_' + actual_year + '.parquet')

    # append datasets to the lists (each entry: {'year': <str>, 'data': <DataFrame>})
    actuals_df_list.append({'year': actual_year, 'data': pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year': feature_year, 'data': pd.read_parquet(path_features, engine='pyarrow')})

# Make the feature datasets cumulative: dataset k is the concatenation of
# datasets 0..k, so every later dataset also contains the earlier observations.
for k in range(1, len(features_df_list)):
    features_df_list[k]['data'] = pd.concat([features_df_list[k-1]['data'], features_df_list[k]['data']])

def check_last_36Months(country, yearindex):
    """Tell whether a country has observations for the final 36 months of a dataset.

    Parameters
    ----------
    country : pandas.DataFrame
        Rows of a single country; its index must contain a 'month_id' level.
    yearindex : int
        Position of the feature dataset in ``features_df_list`` whose last
        36 distinct 'month_id' values (in order of appearance) are required.

    Returns
    -------
    bool
        True if every required month occurs in ``country``'s 'month_id'
        index level, otherwise False.
    """
    dataset_months = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
    required_months = dataset_months[-36:]

    available_months = country.index.get_level_values('month_id')

    # all() stops at the first missing month, like the explicit break would
    return all(required in available_months for required in required_months)


# list of all countries that are present in every feature dataset
# (one set of distinct country_ids per dataset, then their intersection;
#  this no longer depends on there being exactly four datasets)
country_id_sets = [
    set(entry['data'].index.get_level_values('country_id').unique().tolist())
    for entry in features_df_list
]

# sorted, so that the order of country_list (and any subset taken from it later)
# is reproducible across runs
country_list = sorted(set.intersection(*country_id_sets)) if country_id_sets else []

# groupby objects (grouped by 'country_id') for every dataset position;
# entry k of each list belongs to features_df_list[k] / actuals_df_list[k]
dataset_positions = range(len(features_df_list))

country_feature_group_list = [
    features_df_list[pos]['data'].groupby('country_id') for pos in dataset_positions
]
country_actual_group_list = [
    actuals_df_list[pos]['data'].groupby('country_id') for pos in dataset_positions
]


# modify country_list so that it contains only country_ids
# that have the last 36 months of observations in ALL DATASETS!
# all() requires the check to pass for every dataset (and stops at the first
# failure); a per-iteration flag reset would let only the last dataset decide.
# the values in country_list are the 'country_id'
country_list = [
    country_id
    for country_id in country_list
    if all(
        check_last_36Months(country_feature_group_list[i].get_group(country_id), i)
        for i in range(len(features_df_list))
    )
]

In [21]:
# inspect the feature rows of country_id 1 in the first feature dataset (list position 0)
country_feature_group_list[0].get_group(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,gleditsch_ward,ged_sb,ged_ns,ged_os,acled_sb,acled_sb_count,acled_os,ged_sb_tsum_24,wdi_sp_pop_totl,ged_sb_tlag_1,...,ind_efficiency_t48,irr_agr_efficiency_t48,services_efficiency_t48,general_efficiency_t48,water_stress_t48,renewable_internal_pcap_t48,renewable_pcap_t48,splag_1_decay_ged_sb_5,splag_1_decay_ged_os_5,splag_1_decay_ged_ns_5
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
121,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752897.0,0.0,...,43.141952,0.151779,27.952694,1.380676,3.333333,315260.986388,354505.092577,0.093750,1.034032,0.093750
122,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752897.0,0.0,...,43.141952,0.151779,27.952694,1.380676,3.333333,315260.986388,354505.092577,0.091081,1.060721,0.091081
123,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752897.0,0.0,...,43.141952,0.151779,27.952694,1.380676,3.333333,315260.986388,354505.092577,0.088488,1.058992,0.088488
124,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752897.0,0.0,...,43.141952,0.151779,27.952694,1.380676,3.333333,315260.986388,354505.092577,0.085969,1.057313,0.085969
125,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752897.0,0.0,...,43.141952,0.151779,27.952694,1.380676,3.333333,315260.986388,354505.092577,0.083522,1.027213,0.083522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,759087.0,0.0,...,43.355336,0.113272,22.032310,1.653920,3.298402,317403.873381,356914.728988,0.000210,1.048196,1.192779
451,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,759087.0,0.0,...,43.355336,0.113272,22.032310,1.653920,3.298402,317403.873381,356914.728988,0.000204,1.046824,1.187291
452,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,759087.0,0.0,...,43.355336,0.113272,22.032310,1.653920,3.298402,317403.873381,356914.728988,0.000198,1.017023,1.181959
453,1,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,759087.0,0.0,...,43.355336,0.113272,22.032310,1.653920,3.298402,317403.873381,356914.728988,0.000193,1.044196,1.176779


In [7]:
# list of the prediction windows s
s_prediction_list = list(range(3, 15))

## changes, so that the calculation does not take a long time -------------------
# shorter windows (the full setting would be list(range(2, 37)))
window_list = list(range(2, 25))
# remove all but the last ten countries
country_list = country_list[-10:]

# only the first two actuals years are evaluated in this reduced run
actual_years = ['2018', '2019']
index_list = list(actual_years)
##----------------------------------------------------------------------------------

# list to save the predictions for each country: one (initially empty) slot per
# evaluated year; built once, after the reduced country/year selection is final
baseline_country_predict_list = [
    {'country_id': country, 'prediction': {year: [] for year in index_list}}
    for country in country_list
]

In [8]:



number_countries = len(country_list)
number_dataframes = len(actual_years)
number_w = len(window_list)

# quantile levels 0.001, 0.002, ..., 0.999; they are the same for every
# country/dataset/window, so they are built once outside the loops
quantiles = [round(q, 3) for q in np.arange(0.001, 0.9999, 0.001)] # rounding due to binary inaccuracies
dummy_quantile_list = [f"{round(q * 100, 1)}%" for q in quantiles]


def _baseline_quantile_forecast(observations, quantiles):
    """Fit the baseline count distribution by moments and evaluate its quantiles.

    Parameters
    ----------
    observations : pandas.Series
        Values of the training window.
    quantiles : list of float
        Probability levels at which the fitted distribution is evaluated.

    Returns
    -------
    tuple
        ``(dist_string, mean, var, fatalities)``. ``dist_string`` is 'NBinom'
        when the sample variance exceeds the sample mean, 'Pois' when the
        variance is non-zero but not larger than the mean, and 'None'
        otherwise (all quantile values are then 0). ``fatalities`` is a list
        with one value per entry of ``quantiles``.
    """
    mean = observations.mean()
    var = observations.var()

    if var != 0 and var > mean:
        # calculate n (r) and p via average/variance
        n = (mean**2) / (var - mean) # equivalent to r
        p = mean / var
        return 'NBinom', mean, var, nbinom.ppf(quantiles, n, p).tolist()

    if var != 0 and var <= mean:
        return 'Pois', mean, var, poisson.ppf(quantiles, mean).tolist()

    # zero variance (or mean/variance not comparable, e.g. NaN): predict 0 everywhere
    return 'None', mean, var, [0] * len(quantiles)


# TODO: the planned evaluation over equal parts of the feature dataset (train data of
# length w, actuals = the following observations) is not implemented here; the
# unfinished scaffolding for it was removed because it raised before any
# prediction was computed.

# loop through all countries (that are present in each dataset)
for index in range(number_countries):
    country = country_list[index]

    print('country ' + str(index+1) + '/' + str(number_countries), end='\r')

    # loop through datasets
    for i in range(number_dataframes):
        features = country_feature_group_list[i].get_group(country) # features of country in dataset i

        # one DataFrame per window, indexed by (window, quantile)
        window_frames = []

        # loop through windows
        for w in window_list:
            # the last w observations of 'ged_sb' are the training data of window w
            dist_string, mean, var, dummy_fatalities_list = _baseline_quantile_forecast(
                features.tail(w).loc[:, 'ged_sb'], quantiles
            )

            # scalar entries are broadcast over the quantile rows
            window_frame = pd.DataFrame({'window': w, 'country_id': country, 'dist': dist_string,
                                         'mean': mean, 'var': var,
                                         'quantile': dummy_quantile_list,
                                         'fatalities': dummy_fatalities_list})
            window_frames.append(window_frame.set_index(['window', 'quantile']))

        # combine each w dataset together
        baseline_country_predict_list[index]['prediction'][index_list[i]] = (
            pd.concat(window_frames, axis=0).sort_index(axis=0)
        )

country 10/10

In [9]:
# show the 2018 prediction of the country at list position 9 for window w = 12
baseline_country_predict_list[9]['prediction']['2018'].xs(12, level = 'window')
#baseline_country_predict_list[9]['prediction']['2018']['window'][12] = list of dicts with the distribution and the actual realisations (vector of length 12!) from the features
# the list has the length floor(len(featuresdata)/max window w + 1)

Unnamed: 0_level_0,country_id,dist,mean,var,fatalities
quantile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.1%,246,NBinom,33.416667,1375.537879,0.0
0.2%,246,NBinom,33.416667,1375.537879,0.0
0.3%,246,NBinom,33.416667,1375.537879,0.0
0.4%,246,NBinom,33.416667,1375.537879,0.0
0.5%,246,NBinom,33.416667,1375.537879,0.0
...,...,...,...,...,...
99.5%,246,NBinom,33.416667,1375.537879,198.0
99.6%,246,NBinom,33.416667,1375.537879,207.0
99.7%,246,NBinom,33.416667,1375.537879,218.0
99.8%,246,NBinom,33.416667,1375.537879,234.0


In [10]:
# list to store all crps values:
# one entry per country, holding one {'s', 'w', 'CRPS'} record per prediction window s
baseline_crps_list = [
    {
        'country_id': country,
        'baseline': [
            {'s': s, 'w': [], 'CRPS': []}
            for s in s_prediction_list
        ]
    }
    for country in country_list
]

# number of prediction windows
number_s = len(s_prediction_list)

# fill list with crps calculations
# s_pos is the position of s inside s_prediction_list; it addresses the result
# record directly instead of relying on s_prediction_list starting at 3
for s_pos, s in enumerate(s_prediction_list):
    print('                  prediction window ' + str(s_pos+1) + '/' + str(number_s), end='\r')

    for index in range(number_countries):
        country = country_list[index]
        print('country ' + str(index+1) + '/' + str(number_countries), end='\r')

        # observed value per evaluated year; it does not depend on w, so it is
        # looked up once per (s, country).
        # row s-3 / first column of the actuals — presumably s = 3 maps to the
        # first actuals row; TODO confirm
        true_obs_list = [
            country_actual_group_list[j].get_group(country).iloc[s-3,0]
            for j in range(number_dataframes)
        ]

        result_record = baseline_crps_list[index]['baseline'][s_pos]

        for w in window_list:
            dummy_crps_list = []

            for j in range(number_dataframes):
                year = actual_years[j]

                NB_prediction = baseline_country_predict_list[index]['prediction'][year].xs(w, level="window")

                # the predicted quantile values serve as the ensemble for the CRPS
                crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs_list[j]).compute()[0]
                dummy_crps_list.append(crps)

            # store the CRPS averaged over the evaluated years for this (s, w)
            result_record['w'].append(w)
            result_record['CRPS'].append(np.mean(dummy_crps_list))

# time to calculate: ~66 min with all 190 countries

country 10/10     prediction window 12/12