In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import os
from scipy.stats import nbinom
from scipy.stats import poisson
import CRPS.CRPS as pscore

# Load the yearly feature and actuals parquet files.
# feature_years[k] and actual_years[k] belong together: the features file
# "to_oct<year>" is paired with the actuals file of the following year.
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

# Resolve the data directory once, relative to the current working directory.
# The path is assembled with os.path.join instead of a hard-coded "data\..."
# literal: the backslash form only works on Windows and "\c" is an invalid
# string escape sequence.
absolute_path = os.path.abspath('')
data_dir = os.path.join(absolute_path, 'data')

for feature_year, actual_year in zip(feature_years, actual_years):
    path_features = os.path.join(data_dir, 'cm_features_to_oct' + feature_year + '.parquet')
    path_actuals = os.path.join(data_dir, 'cm_actuals_' + actual_year + '.parquet')

    # append datasets to the lists, each tagged with the year it belongs to
    actuals_df_list.append({'year':actual_year, 'data':pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year':feature_year, 'data':pd.read_parquet(path_features, engine='pyarrow')})

# Concatenate the feature datasets cumulatively: entry k ends up holding the rows
# of entries 0..k, so that every dataset contains the observations starting with january 1990
for i in range(1,len(features_df_list)):
    features_df_list[i]['data'] = pd.concat([features_df_list[i-1]['data'], features_df_list[i]['data']])

# function to check, if the last n months are in the dataset of a country,
# other than that the last month of a country in the feature dataset has to be 3 months before the first actuals month!!
def check_last_nMonths(n, country_id, yearindex):
    """Check whether a country has gap-free feature data for the last ``n`` months.

    Reads the module-level ``country_feature_group_list``, ``actuals_df_list``
    and ``features_df_list`` at position ``yearindex``.

    Parameters
    ----------
    n : int
        Number of trailing months that must be present.
    country_id
        Key passed to ``get_group`` on the grouped feature data.
    yearindex : int
        Position of the feature/actuals dataset pair to check.

    Returns
    -------
    bool
        ``False`` if the last ``month_id`` listed for the country is not exactly
        3 less than the first ``month_id`` of the actuals dataset, if ``n``
        months do not fit between the first and last ``month_id`` of the feature
        dataset, or if any of the last ``n`` months is missing for the country.
        ``True`` otherwise.
    """
    country = country_feature_group_list[yearindex].get_group(country_id)

    # month_ids of this country, extracted once (order of first appearance)
    country_months = country.index.get_level_values('month_id').unique().tolist()

    # reference month of the actual dataset
    actual_month_list = actuals_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()

    # the last feature month of the country has to be 3 months before the first actuals month
    if (actual_month_list[0] - 3) != country_months[-1]:
        return False

    month_list = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
    last_month = month_list[-1] # equals the first month - 3 from the corresponding actuals dataset
    first_month = month_list[0]

    # the requested span would start before the dataset does
    if last_month - n + 1 < first_month:
        return False

    # set lookup so that each of the n probes is O(1)
    available_months = set(country_months)
    return all(month in available_months for month in range(last_month - n + 1, last_month + 1))

#--------------------------------------------------------------------------------------------
# because of the concatenation only the last dataframe is used (later on the appended months are dropped for datasets before 2020)

# IF THIS CODE IS USED FOR THE 2024 PREDICTION ADJUST THE features_1990to2020 in the whole file to 1990to2023

# The last (cumulatively concatenated) feature dataset holds all rows of the earlier ones.
features_1990to2020_df = features_df_list[3]['data']

# Sorted list of every distinct country_id found in that dataset.
country_list = features_1990to2020_df.index.get_level_values('country_id').unique().tolist()
country_list.sort()

# Per-dataset groupby objects keyed by country_id, in the same order as
# features_df_list / actuals_df_list.
dataset_positions = range(len(features_df_list))
country_feature_group_list = [
    features_df_list[position]['data'].groupby('country_id')
    for position in dataset_positions
]
country_actual_group_list = [
    actuals_df_list[position]['data'].groupby('country_id')
    for position in dataset_positions
]

# Grouped view of the last feature dataset (the one used as features_1990to2020_df).
country_feature_group_1990to2020 = country_feature_group_list[3]


print(len(country_list))

213


In [34]:
#country_feature_group_list[0].get_group(191)
#country_feature_group_list[3].get_group(254).index.get_level_values('month_id').unique().tolist()
#country_actual_group_list[0].get_group(246).index.get_level_values('month_id').unique().tolist()

[144]

In [37]:
# modify country_list so that it contains only country_ids 
# that have at least the last n months of observations in the last dataset (2020)!
numberMonths_toOct20 = 96 # 96 = 5*12 (5 years for 2017) + 3*12 (12 more months each year, so 8 years for 2020)
#-- note------
# dataset 2020 is used because of the structure of the other datasets:
# 2020 is dataset 2019 with 12 additional rows (months) etc.
# for the CRPS calculation of the datasets != 2020 the last 12*x windows are deleted
# this procedure saves computation time
#-------------


#IMPORTANT
# if you do not minimize over all countries but only over single countries,
# it is sufficient to check that each country contains the last month of the features dataset
# (this way the full information is used).
# But you still have to check check_last_nMonths(len(countrymonths), countryIndex, 3), so that no month is missing in between.

# => so currently not all information is used for each country

# Keep only the country_ids for which the check on the last dataset (index 3)
# returns exactly True for the last numberMonths_toOct20 months.
# (older note, translated: "76, because e.g. country 246 has exactly the last
# 112 months (in '2020') as observations" -- the numbers do not match the 96 used here)
dummy_list = [
    countryIndex
    for countryIndex in country_list
    if check_last_nMonths(numberMonths_toOct20, countryIndex, 3) is True
]

# the values in country_list are the 'country_id'
country_list = dummy_list

#IMPORTANT
# all countries that have the last month as observation have the last 96 months as observations (in 2020)!!! so no country is excluded
# checked by modifying the check_last_nMonths function -> else: return True

len(country_list)

191

In [38]:
# Configuration of the full run: one (initially empty) prediction slot per country and actuals year.
# NOTE: these two assignments are overridden by the shortened-run settings further down.
baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': [], '2020': [], '2021': []}} for country in country_list]
index_list = ['2018', '2019', '2020', '2021']
# list of the (prediction) windows
#window_list = list(range(2, 37))
# forecast horizons s (months ahead)
s_prediction_list = list(range(3, 15))



## changes, so that the calculation does not take a long time -------------------
# shorter windows
window_list = list(range(2, 25))
# Keep only the last ten countries. A plain tail slice also behaves correctly
# when fewer than ten countries are available (everything is kept), whereas a
# slice with a computed negative stop would drop countries in that case.
country_list = country_list[-10:]

baseline_country_predict_list = [{'country_id': country, 'prediction': {'2021': []}} for country in country_list]
# NOTE(review): this overwrites the four-year actual_years list that the data
# loading cell iterates over; re-running that cell afterwards needs the full list again.
actual_years = ['2021']
index_list = ['2021']

In [39]:
# Display the country_ids that remain after the shortened-run reduction.
country_list

[232, 233, 234, 235, 237, 242, 243, 244, 245, 246]

In [132]:
# Scratch inspection: ged_sb values of country 246 for month_ids 488..490.
features = country_feature_group_1990to2020.get_group(246)
# NOTE(review): fatalities_index points at the 'acled_os' column and is not used
# by the lookup below, which reads 'ged_sb' -- confirm whether it is still needed.
fatalities_index = features.columns.get_loc('acled_os')
features.loc[(slice(488, 490), slice(None)), 'ged_sb'].values

array([24.,  1.,  0.])

In [52]:
# Scratch cell.
# NOTE(review): number_countries is assigned in a cell that appears further down,
# so this cell fails on a fresh kernel when run top to bottom.
print(number_countries)

10


In [86]:
# Scratch calculation: first column of a window of length x after Z placements
# counted from the right edge of a grid with N columns.
# NOTE(review): presumably a sanity check for the start-month formula of the
# rolling windows -- confirm.
N = 100  # number of columns in the grid
x = 2   # window length in columns
Z = 5   # number of shifts from right to left

column_index = N-x+1-Z+1

print(column_index)


95


In [None]:
#asdf

In [141]:
number_countries = len(country_list)
number_dataframes = len(actual_years)
number_w = len(window_list)

# Number of rolling windows used to estimate a distribution and validate it
# against 12 months of held-back observations.
# Each window needs w training months, a 2-month gap and 12 validation months
# (horizons s=3 to s=14), i.e. w + 2 + 12 values of ged_sb.
# The count is derived from the largest w so that every w gets the same number
# of windows: e.g. w=24 -> 24 + 2 + 12 = 38 months per window, and with 96
# available months there are 96 - 38 = 58 windows.
# NOTE(review): 96 months admit 96 - 38 + 1 = 59 placements of the largest
# window, so the earliest placement is not used -- confirm this is intended.
numberWindows = numberMonths_toOct20 - (window_list[-1] + 2 + 12)


# list to store the predictions for each w
baseline_predict_list = []


quantiles = np.arange(0.001, 0.9999, 0.001)
quantiles = [round(q, 3) for q in quantiles] # due to binary inaccuracies
dummy_quantile_list = [f"{round(q * 100, 1)}%" for q in quantiles] # string labels of the quantiles

# last month of the dataframe as reference for the moving prediction windows
last_month = features_1990to2020_df.index.get_level_values('month_id').tolist()[-1]

# loop through windows
for i in range(number_w):

    print('                              window ' + str(i+1) + '/' + str(number_w)  , end='\r')

    w = window_list[i] # current window
    windowLength = w + 2 + 12 # training months + gap + validation months

    country_predict_list = [{'country_id':country, 'predictionWindowsN':[]} for country in country_list]
    baseline_predict_list.append({'window':w, 'country_predict_list':country_predict_list})

    # loop through all countries
    for index in range(number_countries):
        country = country_list[index]

        print('country ' + str(index+1) + '/' + str(number_countries), end='\r')

        features = country_feature_group_1990to2020.get_group(country) # features of country
        prediction_windows = country_predict_list[index]['predictionWindowsN']

        # slide the window: j = numberWindows - 1 is the placement whose
        # validation months end exactly at last_month
        for j in range(numberWindows):
            starting_month_window = last_month - windowLength + 1 - numberWindows + 1  + j
            ending_month_window = starting_month_window + w - 1

            # validation months start 3 months after the last training month and cover 12 months
            starting_month_actuals = ending_month_window + 3
            ending_month_actuals = starting_month_actuals + 11

            window_features = features.loc[(slice(starting_month_window, ending_month_window), slice(None)), 'ged_sb']
            window_actuals = features.loc[(slice(starting_month_actuals, ending_month_actuals), slice(None)), 'ged_sb']

            # sample moments of the training window (pandas var is the sample variance)
            mean = window_features.mean()
            var = window_features.var()

            if var != 0 and var > mean:
                # overdispersed: negative binomial fitted by the method of moments
                n = (mean**2) / (var - mean) # equivalent to r
                p = mean / var
                dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()
                dist_string = 'NBinom'

            elif var != 0 and var <= mean:
                # not overdispersed: Poisson with the sample mean
                dummy_fatalities_list = poisson.ppf(quantiles, mean).tolist()
                dist_string = 'Pois'

            else:
                # zero (or undefined) variance: predict 0 for every quantile
                dummy_fatalities_list = [0] * len(quantiles)
                dist_string = 'None'

            # quantile forecast of this window, indexed by (w, quantile)
            prediction_frame = pd.DataFrame(
                {'country_id':country, 'w':w, 'dist':dist_string,
                 'mean':mean, 'var':var, 'first_month_feature':starting_month_window,
                 'quantile':dummy_quantile_list, 'fatalities':dummy_fatalities_list}
            ).set_index(['w', 'quantile'])

            # held-back observations of this window, indexed by (s, month_id)
            actuals_frame = pd.DataFrame(
                {'s':s_prediction_list,
                 'month_id': window_actuals.index.get_level_values('month_id'),
                 'unreal_actuals':window_actuals.values}
            ).set_index(['s', 'month_id'])

            prediction_windows.append([prediction_frame, actuals_frame])







#baseline_predict_list

country 10/10                 window 23/23

In [142]:

""" baseline_predict_list[i]['country_predict_list'][index]['predictionWindowsN'].append(
                 [{'country_id':country, 'w':w, 'dist':dist_string, 
                   'mean':mean, 'var':var, 'last_month_feature':ending_month_window, 
                   'quantile':dummy_quantile_list, 'fatalities':dummy_fatalities_list}, 
                   [{'s':s_prediction_list, 'unreal_actuals':window_actuals}]]) """

# Held-back observations of the last rolling window for the first w and the country at position 9.
#                     w                        country         rollingWindow|prediction/actuals
baseline_predict_list[0]['country_predict_list'][9]['predictionWindowsN'][-1][1]

Unnamed: 0_level_0,Unnamed: 1_level_0,unreal_actuals
s,month_id,Unnamed: 2_level_1
3,479,0.0
4,480,0.0
5,481,2.0
6,482,0.0
7,483,0.0
8,484,1.0
9,485,13.0
10,486,3.0
11,487,9.0
12,488,24.0


In [112]:
# Scratch inspection: last 30 feature rows of country 246.
country_feature_group_1990to2020.get_group(246).tail(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,gleditsch_ward,ged_sb,ged_ns,ged_os,acled_sb,acled_sb_count,acled_os,ged_sb_tsum_24,wdi_sp_pop_totl,ged_sb_tlag_1,...,ind_efficiency_t48,irr_agr_efficiency_t48,services_efficiency_t48,general_efficiency_t48,water_stress_t48,renewable_internal_pcap_t48,renewable_pcap_t48,splag_1_decay_ged_sb_5,splag_1_decay_ged_os_5,splag_1_decay_ged_ns_5
month_id,country_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
461,246,626,174.0,0.0,67.0,126.0,24.0,52.0,1584.0,10658226.0,110.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.965161,5.188033,5.443064
462,246,626,73.0,32.0,6.0,125.0,55.0,76.0,1618.0,10658226.0,174.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.909217,5.20793,5.42964
463,246,626,11.0,137.0,10.0,26.0,28.0,33.0,1283.0,10658226.0,73.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.937861,5.144264,5.386533
464,246,626,10.0,48.0,9.0,22.0,10.0,19.0,1117.0,10658226.0,11.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.854226,5.08241,5.458502
465,246,626,25.0,28.0,3.0,27.0,26.0,40.0,1142.0,10658226.0,10.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.800629,5.214415,5.388513
466,246,626,5.0,102.0,13.0,12.0,32.0,69.0,1055.0,10658226.0,25.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.777026,5.151375,5.292049
467,246,626,0.0,25.0,5.0,7.0,3.0,39.0,1002.0,10658226.0,5.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.852544,5.089319,5.281327
468,246,626,39.0,0.0,10.0,17.0,9.0,36.0,1027.0,10658226.0,0.0,...,14.815546,0.024892,13.758357,9.110691,4.226076,2463.315547,4689.77383,4.798995,5.138131,5.27091
469,246,626,38.0,49.0,8.0,61.0,15.0,530.0,1040.0,10395329.0,39.0,...,24.300152,0.04418,36.366656,18.992252,4.226076,2426.355899,4619.408346,4.719313,5.13179,5.177793
470,246,626,24.0,13.0,1.0,47.0,10.0,48.0,1061.0,10395329.0,38.0,...,24.300152,0.04418,36.366656,18.992252,4.226076,2426.355899,4619.408346,4.934043,5.210224,5.087327


In [None]:
# NOTE(review): in the cells shown here the 'prediction' entries are initialised as
# empty lists and only the '2021' key is kept for the shortened run, so this lookup
# depends on state filled in elsewhere -- confirm before running.
baseline_country_predict_list[9]['prediction']['2018'].xs(12, level = 'window')
#baseline_country_predict_list[9]['prediction']['2018']['window'][12] = list of dicts with the distribution and the actual realisations (vector of length 12!) from the features
# the list has length floor(len(featuresdata)/max window w + 1)

In [None]:
# list to store all crps values: one entry per country with one sub-entry per
# horizon s, each collecting the evaluated windows w and their mean CRPS
baseline_crps_list = [
    {
        'country_id': country,
        'baseline': [
            {'s': s, 'w': [], 'CRPS': []}
            for s in s_prediction_list
        ]
    }
    for country in country_list
]

# number of prediction windows
number_s = len(s_prediction_list)

# Look the grouped actuals up by year instead of by position in actual_years:
# actual_years may be a reduced list (e.g. only '2021'), in which case position 0
# would select the first loaded actuals dataset rather than the one for that year.
actual_groups_by_year = {
    actuals_df_list[k]['year']: country_actual_group_list[k]
    for k in range(len(actuals_df_list))
}

# fill list with crps calculations
for s_position, s in enumerate(s_prediction_list):
    print('                  prediction window ' + str(s_position+1) + '/' + str(number_s), end='\r')

    for index in range(number_countries):
        country = country_list[index]
        print('country ' + str(index+1) + '/' + str(number_countries), end='\r')

        # The observation depends on (country, year, s) only, so it is read once
        # per country rather than once per window w.
        # Row s-3 of the country's actuals: horizon s=3 maps to the first row.
        # NOTE(review): column 0 is presumably the observed fatalities -- confirm.
        true_obs_by_year = {
            year: actual_groups_by_year[year].get_group(country).iloc[s-3,0]
            for year in actual_years
        }
        predictions_by_year = baseline_country_predict_list[index]['prediction']
        crps_entry = baseline_crps_list[index]['baseline'][s_position]

        for w in window_list:
            dummy_crps_list = []

            for year in actual_years:
                NB_prediction = predictions_by_year[year].xs(w, level="window")

                # first element of compute() is used as the CRPS value
                crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs_by_year[year]).compute()[0]
                dummy_crps_list.append(crps)

            # average over the evaluated years
            crps_entry['w'].append(w)
            crps_entry['CRPS'].append(np.mean(dummy_crps_list))

# time to calculate: ~66 min with all 190 countries