## Baseline variant 1
Optimize **w** (through the CRPS) regarding
* all **task 2 datasets** (2018-2021)
* **all countries**
* **all prediciton windows**
-> alle Monate rausnehmen, die nicht in allen Ländern sind -> gemeinsame Schnittmenge

for i in w's
    for i in dataset
        nbinomquant = ...
        for i in s vorhersagehorizont
            crps(w,s) = ...
        meancrps(dataset) = meancrps über alle s
    datensatz(w) = aktuelles w
    datensatz(w's meanCrps) = meanCrps über alle datasets

optimales w = min CRPS w
-> CRPS des optimalen w's


Problem: einige Länder haben << 36 Monate als Beobachtung
-> also unterscheiden in Länder? Ansatz w für alle Länder mit >=36 monaten berechnen und ergebnis oder falls nur kleiner verfügbar für die anderen verwenden

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import os
from scipy.stats import nbinom
from scipy.stats import poisson
import CRPS.CRPS as pscore

# create the feature- and actuals-data list
# set the feature and actuals year lists
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

for i in range(len(feature_years)):
    # paths to the data
    absolute_path = os.path.abspath('')
    relative_path_features = "data\cm_features_to_oct" + feature_years[i] + ".parquet"
    relative_path_actuals = "data\cm_actuals_" + actual_years[i] + ".parquet"

    path_features = os.path.join(absolute_path, relative_path_features)
    path_actuals = os.path.join(absolute_path, relative_path_actuals)

    # append datasets to the lists
    actuals_df_list.append({'year':actual_years[i], 'data':pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year':feature_years[i], 'data':pd.read_parquet(path_features, engine='pyarrow')})

# concat the feature datasets, so that every data contains the observations to_oct_17
for i in range(1,len(features_df_list)):
    features_df_list[i]['data'] = pd.concat([features_df_list[i-1]['data'], features_df_list[i]['data']])

# function to check, if the last 36 months are in the dataset of a country
def check_last_36Months(country, yearindex):
    month_list = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
    month_list = month_list[-36:]
     
    last_36_months = True
    for month in month_list:
        if month not in country.index.get_level_values('month_id'):
            last_36_months = False
            break

    return last_36_months


In [2]:
# check how many countries have the last 36 months as observations
grouped_features = features_df_list[3]['data'].groupby('country_id')

liste = []
for name, country in grouped_features:
    liste.append(check_last_36Months(country, 3))

print(str(liste.count(False)) + " aus " + str(len(liste)) + " sind False.")

22 aus 213 sind False.


In [3]:
# list of all countries that are present in all four datasets
country_list = []
for i in range(len(features_df_list)):
    country_list.extend(features_df_list[i]['data'].index.get_level_values('country_id').unique().tolist())

unique_list = []

for item in country_list:
    if country_list.count(item) == 4:
        unique_list.append(item)

country_list = list(set(unique_list))

In [4]:
country_feature_group_list = []
country_actual_group_list = []
# country group list of all four datasets 
for i in range(len(features_df_list)):
    country_feature_group_list.append(features_df_list[i]['data'].groupby('country_id'))
    country_actual_group_list.append(actuals_df_list[i]['data'].groupby('country_id'))

In [5]:
# modify country_list so that it contains only country_ids that have the last 36 months of observations in ALL DATASETS!
dummy_list = []
for countryIndex in country_list:
    # loop through datasets
    for i in range(len(features_df_list)):
        dummy_hasLast36_months = True
        if check_last_36Months(country_feature_group_list[i].get_group(countryIndex), i) is not True:
            dummy_hasLast36_months = False
    
    if dummy_hasLast36_months is True:
        dummy_list.append(countryIndex)

country_list = dummy_list

In [82]:
import cProfile

# list to save the predictions for each country
baseline_country_predict_list = [{'country_id': country, 'prediction': {'2018': [], '2019': [], '2020': [], '2021': []}} for country in country_list]
index_list = ['2018', '2019', '2020', '2021']
# list of the prediction windows
window_list = list(range(2, 37))
#window_list = list(range(2, 3))

# ACHTUNG ! für Variante 1 gilt:
# Es werden nur Länder zur Bestimmung des idealen w verwendet,
# die sowohl in allen Datensätzen vorkommen, als auch die letzten 36 Monate als Beobachtungen besitzen

#def your_function_to_profile():
# loop through all countries (that are present in each dataset)
for index in range(len(country_list)):
    country = country_list[index]
    #baseline_country_predict_list.append({'country_id':country, 'prediction':{'2018':[],'2019':[],'2020':[],'2021':[]}})
    #baseline_country_predict_list[index]['country_id'] = country
    # list to store the predictions for each year temporally
    baseline_predict_list = [[]]*len(features_df_list)
    # loop through datasets
    for i in range(1): #len(features_df_list)
        features = country_feature_group_list[i].get_group(country) # features of country in dataset i
        
        baseline_predict_list[i] = []#[[]]*len(window_list)

        quantiles = np.arange(0.001, 0.9999, 0.001)
        quantiles = [round(q, 3) for q in quantiles] # due to binary inaccuracies

        dummy_quantile_list = [f"{round(q * 100, 1)}%" for q in quantiles]

        # loop through windows
        for j in range(len(window_list)):
            w = window_list[j] # current window

            baseline_predict_list[i].append({'window':w, 'country_id':country, 'quantile':[], 'fatalities':[]}) #

            # calculate n (r) and p via average/variance
            mean = pd.Series.mean(features.tail(w).loc[:,'ged_sb'])
            var = pd.Series.var(features.tail(w).loc[:,'ged_sb'])

            #if w=1 mean=variance? -> Verschiebungssatz Var auch Null

            #hier verteilung = nbinom ppf als
            dummy_fatalities_list = []
            #dummy_quantile_list = []

            

            if var != 0 and var != mean:
                n = (mean**2) / (var - mean) # equivalent to r
                p = mean / var
                
                
                dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()
                #dummy_quantile_list = quantiles.tolist()
            else:
                dummy_fatalities_list = [0] * 999
                #dummy_quantile_list = [f"{round((k/10000)*100, 1)}%" for k in range(10, 9999, 10)]




            """ dummy_fatalities_list.append(0)
            dummy_quantile_list.append(str((k/10000)*100)+'%') """




            """ if var != 0 and var != mean:
                n = (mean**2)/(var-mean) # equivalent to r
                p = mean/var
                
                # compute all quantiles from 0.1% to 99.9%
                for k in range(10, 9999, 10):
                    dummy_fatalities_list.append(nbinom.ppf(k/10000, n, p))
                    dummy_quantile_list.append(str((k/10000)*100)+'%')
            else:
                # compute all quantiles from 0.1% to 99.9%
                for k in range(10, 9999, 10):
                    dummy_fatalities_list.append(0)
                    dummy_quantile_list.append(str((k/10000)*100)+'%') """

            baseline_predict_list[i][j]['quantile'] = dummy_quantile_list    
            baseline_predict_list[i][j]['fatalities'] = dummy_fatalities_list

            baseline_predict_list[i][j] = pd.DataFrame(baseline_predict_list[i][j])
            baseline_predict_list[i][j].set_index(['window', 'quantile'], inplace=True)

            

        baseline_country_predict_list[index]['prediction'][index_list[i]] = baseline_predict_list[i]

        # combine each w dataset together
        baseline_country_predict_list[index]['prediction'][index_list[i]] = pd.concat(baseline_country_predict_list[index]['prediction'][index_list[i]], axis=0)
        baseline_country_predict_list[index]['prediction'][index_list[i]].sort_index(axis=0, inplace=True)


# list to save the predictions for each country
#cProfile.run('your_function_to_profile()')

In [83]:
baseline_country_predict_list[190]['prediction']['2018']

Unnamed: 0_level_0,Unnamed: 1_level_0,country_id,fatalities
window,quantile,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.1%,246,5.0
2,0.2%,246,6.0
2,0.3%,246,6.0
2,0.4%,246,7.0
2,0.5%,246,7.0
...,...,...,...
36,99.5%,246,396.0
36,99.6%,246,418.0
36,99.7%,246,446.0
36,99.8%,246,485.0


In [81]:
combined_df = pd.concat(baseline_country_predict_list[190]['prediction']['2018'], axis=0)
combined_df.sort_index(axis=0, inplace=True)
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,country_id,fatalities
window,quantile,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.1%,246,5.0
2,0.2%,246,6.0
2,0.3%,246,6.0
2,0.4%,246,7.0
2,0.5%,246,7.0
...,...,...,...
36,99.5%,246,396.0
36,99.6%,246,418.0
36,99.7%,246,446.0
36,99.8%,246,485.0


In [None]:
# find the minimal windwo w for all countries


In [21]:
g = features_df_list[0]['data'].groupby('country_id')
g.get_group(23).index.get_level_values('country_id')[0]

23

### Prediction with all countries summed

In [29]:
baseline_predict_list = [[]]*len(feature_years)

window_list = list(range(2, 37))


# loop through datasets
for i in range(len(feature_years)):
    # sum of the conflict related deaths for each month
    monthly_totals_features = features_df_list[i]['data'].groupby('month_id').sum()

    baseline_predict_list[i] = {'actual_year':actual_years[i], 'prediction':[]}
    # loop through windows
    for j in range(len(window_list)):
        w = window_list[j] # current window

        baseline_predict_list[i]['prediction'].append({'window':w, 'quantile':[], 'fatalities':[]})

        # calculate n (r) and p via average/variance
        mean = pd.Series.mean(monthly_totals_features.tail(w).loc[:,'ged_sb'])
        var = pd.Series.var(monthly_totals_features.tail(w).loc[:,'ged_sb'])

        """ if w=1 mean=variance? -> Verschiebungssatz Var auch Null """

        n = (mean**2)/(var-mean) # equivalent to r
        p = mean/var

        dummy_fatalities_list = []
        dummy_quantile_list = []

        # compute all quantiles from 0.1% to 99.9%
        for k in range(10, 9999, 10):
            dummy_fatalities_list.append(nbinom.ppf(k/10000, n, p))
            dummy_quantile_list.append(str((k/10000)*100)+'%')

        baseline_predict_list[i]['prediction'][j]['quantile'] = dummy_quantile_list    
        baseline_predict_list[i]['prediction'][j]['fatalities'] = dummy_fatalities_list

        baseline_predict_list[i]['prediction'][j] = pd.DataFrame(baseline_predict_list[i]['prediction'][j])
        baseline_predict_list[i]['prediction'][j].set_index('quantile')

    


#baseline_predict_list[1]['prediction']['window']


In [109]:
prediction_window_list = list(range(3, 14))
yearly_crps_mean_list = []
baseline_predict_crps_list = {'w':[],'CRPS':[]}

for i in range(len(window_list)):
    for j in range(len(feature_years)):
        monthly_totals_actuals = actuals_df_list[j]['data'].groupby('month_id').sum()
        dummy_crps_list = [0]*len(prediction_window_list)
        for k in prediction_window_list:
            true_obs = monthly_totals_actuals.iloc[3-k,0]
            NB_prediction = pd.DataFrame(baseline_predict_list[j]['prediction'][i]['fatalities'])
            crps = pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs).compute()[0]
            dummy_crps_list[3-k] = crps
        yearly_crps_mean_list.append(np.mean(dummy_crps_list))
    baseline_predict_crps_list['w'].append(window_list[i])
    baseline_predict_crps_list['CRPS'].append(np.mean(yearly_crps_mean_list))
        
baseline_predict_crps = pd.DataFrame(baseline_predict_crps_list)        



In [118]:
baseline_predict_crps[baseline_predict_crps.CRPS == baseline_predict_crps.loc[:,'CRPS'].min()]

Unnamed: 0,w,CRPS
20,22,1665.88427


In [107]:
monthly_totals_actuals = actuals_df_list[0]['data'].groupby('month_id').sum()
true_obs = monthly_totals_actuals.iloc[0,0]
true_obs
NB_prediction = pd.DataFrame(baseline_predict_list[0]['prediction'][0]['fatalities'])
pscore(NB_prediction.loc[:,'fatalities'].to_numpy(),true_obs).compute()[0]


1754.787838889941

In [38]:
# Gruppiere den Datensatz nach 'Land ID' und zähle die Anzahl der Monate für jedes Land
land_counts = df_features.groupby('country_id').size()

# Finde den minimalen und maximalen Monat über alle Länder
min_month = df_features.index.get_level_values('month_id').min()
max_month = df_features.index.get_level_values('month_id').max()

""" # Iteriere über die Länder und markiere fehlende Monate zum Löschen
to_delete = []
for land, count in land_counts.items():
    if count != max_month - min_month + 1:
        missing_months = set(range(min_month, max_month+1)) - set(df_features.loc[land].index.get_level_values('month_id'))
        to_delete.extend([(land, month) for month in missing_months])

# Filtere den Datensatz, um nur die Monate beizubehalten, die nicht zum Löschen markiert wurden
data_filtered = df_features.drop(to_delete)
data_filtered """
land_df = pd.DataFrame(land_counts)
land_df[land_df.iloc[:,0] < 36]

Unnamed: 0_level_0,0
country_id,Unnamed: 1_level_1
185,10
186,10
188,23
189,20
192,3
196,5
197,5
230,21
247,6
248,1


## Baseline variant 2
Optimize **w** (through the CRPS) regarding
* all **task 2 datasets** (2018-2021)
* **inidividual countries**
* **all prediciton windows**

for country x
    for i in w's
        for i in dataset
            nbinomquant = ...
            for i in s vorhersagehorizont
                crps(x,w,s) = ...
            meancrps(x,dataset) = meancrps über alle s
        datensatz(x,w) = aktuelles w
        datensatz(x,w's meanCrps) = meanCrps über alle datasets des landes x
    datensatz2(x,opimales w) = min CRPS w
    datensatz2(x, w's meanCrps) = optimaler Crps des landes x unter opti. w

-> CRPS = mittelrwert der CRPS über aller länder

## Baseline variant 3
Optimize **w** (through the CRPS) regarding
* all **task 2 datasets** (2018-2021)
* **all countries**
* **inidvidual prediciton windows**
-> alle Monate rausnehmen, die nicht in allen Ländern sind -> gemeinsame Schnittmenge

for i in s (prediction window)
    for i in w's
        for i in dataset
            nbinomquant = ...
                crps(w,s) = ...
                aktuelles w speichern
    mincrps(s) = kleinster crps über alle w
    w = zugehöriges w des mincrps
    s
CRPS = mean über alle mincrps

optimales w = min CRPS w
-> CRPS des optimalen w's