In [1]:
import pandas as pd
from joblib import load
from joblib import dump
import numpy as np
import os

# Build two parallel lists, one entry per evaluation window:
#   features_df_list[k] -> {'year': feature_years[k], 'data': features up to the window's cutoff month}
#   actuals_df_list[k]  -> {'year': actual_years[k],  'data': actuals read from cm_actuals_<year>.parquet}
feature_years = ['2017','2018','2019','2020','2021','2022']
actual_years = ['2018','2019','2020','2021','2022','2023']
assert len(feature_years) == len(actual_years), 'feature and actual years must pair up one-to-one'

# Last month_id included in the first feature window; each later window moves
# the cutoff forward by 12 months.
# NOTE(review): presumably October 2017, judging by the name feature_data_toOct -- TODO confirm.
FIRST_FEATURE_CUTOFF_MONTH_ID = 454

actuals_df_list = []
features_df_list = []

# path to the current directory
current_dir = os.getcwd()

# read feature dataset
relative_path_features = os.path.join('..', 'data', 'cm_features.parquet')
path_features = os.path.join(current_dir, relative_path_features)
feature_data = pd.read_parquet(path_features, engine='pyarrow')
# true future task
#feature_data_toApr2024 = feature_data

for window, (feature_year, actual_year) in enumerate(zip(feature_years, actual_years)):
    # path to this window's actuals file
    relative_path_actuals = os.path.join('..', 'data', 'cm_actuals_' + actual_year + '.parquet')
    path_actuals = os.path.join(current_dir, relative_path_actuals)

    # Filter and re-index in one chained expression: set_index returns a new
    # frame, so nothing is mutated in place on a slice of feature_data.
    cutoff_month_id = FIRST_FEATURE_CUTOFF_MONTH_ID + (window * 12)
    feature_data_toOct = (
        feature_data[feature_data['month_id'] <= cutoff_month_id]
        .set_index(['month_id', 'country_id'])
    )

    # append datasets to the lists
    actuals_df_list.append({'year': actual_year, 'data': pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year': feature_year, 'data': feature_data_toOct})


# Sorted, de-duplicated country ids found in the most recent feature window.
latest_feature_index = features_df_list[-1]['data'].index
country_list = sorted(latest_feature_index.get_level_values('country_id').unique().tolist())

# Per-window groupby objects keyed by country_id, in the same order as
# features_df_list / actuals_df_list.
window_count = len(features_df_list)
country_feature_group_list = [
    features_df_list[k]['data'].groupby('country_id') for k in range(window_count)
]
country_actual_group_list = [
    actuals_df_list[k]['data'].groupby('country_id') for k in range(window_count)
]

# Benchmark View

In [3]:
# Home directory of the current user; later cells reuse user_dir to locate
# the stored prediction files.
user_dir = os.path.expanduser('~')

# Path to the benchmark parquet file. Every folder is passed to os.path.join
# as its own component instead of embedding '\\' separators, so the path is
# built with the separator of the running platform (unchanged on Windows).
parquet_file_path = os.path.join(
    user_dir,
    'iCloudDrive',
    'VIEWS Joblib Predicitons',
    'VIEWS benchmark',
    'bm_last_cm_2018.parquet',
)

# Read the parquet file into a DataFrame.
benchmark_data = pd.read_parquet(parquet_file_path)

benchmark_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome
month_id,country_id,draw,Unnamed: 3_level_1
457,1,0,0
457,1,1,0
457,1,2,0
457,1,3,0
457,1,4,0
...,...,...,...
468,246,995,15
468,246,996,27
468,246,997,18
468,246,998,17


# Nbinom Variant 3

In [47]:

# Location of the stored nbinom baseline results. Folder names are separate
# os.path.join components so the path does not depend on '\\' separators.
file_path = os.path.join(
    user_dir,
    'iCloudDrive',
    'VIEWS Joblib Predicitons',
    'Baseline',
    'FinalTask2_baseline_predct_nbinomWmax24.joblib',
)
# NOTE: joblib.load unpickles the file -- only load files from a trusted source.
loaded_vars_baseline = load(file_path)

# The first seven stored entries, in order:
#   task2_baseline_list       crps averages for all 4 datasets
#   w_minimization_list       minimal w's of the different baselines for each year and country
#   baseline_prediction_list  predictions with the minimal w's for each dataset and country
#   baseline1..4_average_crps mean CRPS from the baseline_prediction_list
(
    task2_baseline_list,
    w_minimization_list,
    baseline_prediction_list,
    baseline1_average_crps,
    baseline2_average_crps,
    baseline3_average_crps,
    baseline4_average_crps,
) = loaded_vars_baseline[:7]

print('Overall CRPS, max. w = 24')
baseline_average_crps = (
    baseline1_average_crps,
    baseline2_average_crps,
    baseline3_average_crps,
    baseline4_average_crps,
)
for baseline_number, average_crps in enumerate(baseline_average_crps, start=1):
    print('baseline ' + str(baseline_number) + ': ' + str(np.round(average_crps, decimals = 4)))

Overall CRPS, max. w = 24
baseline 1: 56.2833
baseline 2: 77.1115
baseline 3: 56.1098
baseline 4: 69.2462


In [93]:
# Year whose actuals the nbinom predictions are exported for.
prediction_year_nbinom = '2023' # 2019, 2020, 2021, 2022, 2023

# Position of that year in actual_years selects the matching actuals frame.
dataset_index = actual_years.index(prediction_year_nbinom)
actual_data = actuals_df_list[dataset_index]['data']

# Distinct month_id values of the selected actuals, in index order.
actuals_month_level = actual_data.index.get_level_values('month_id')
actuals_months = actuals_month_level.unique()

In [94]:
# Show the distinct month_id values of the selected actuals year.
actuals_months

Index([517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528], dtype='int64', name='month_id')

In [95]:
# Reshape the stored nbinom baseline draws into one long frame indexed by
# (month_id, country_id, draw) with a single integer column 'outcome'.
# The per-(country, month) frames are collected in a list and concatenated
# once at the end; concatenating inside the loop re-copies all earlier rows
# on every iteration.
prediction_frames = []

# NOTE(review): the original remark here read "attention: nbinom 1, hurdle 0",
# while the code reads entry [2] of baseline_prediction_list -- confirm which
# baseline variant is intended.
for country in baseline_prediction_list[2]:
    country_id = country['country_id']

    # s runs over 3..14; s - 3 is the 0-based position into both
    # actuals_months and this country's per-month prediction list.
    for s in range(3,15):
        month_id = actuals_months[s-3]

        distribution = country['prediction'][prediction_year_nbinom][s-3]['fatalities']
        # cast every draw to a plain int
        outcome = [int(num) for num in distribution]

        index = pd.MultiIndex.from_tuples(
            [(month_id, country_id, draw) for draw in range(len(outcome))],
            names=['month_id', 'country_id', 'draw'],
        )
        prediction_frames.append(pd.DataFrame({'outcome': outcome}, index=index))

# pd.concat rejects an empty list, so fall back to an empty frame when there
# was nothing to collect.
monthly_fatlities_pred = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()

In [90]:
# Third-party scorer used only by the CRPS cells (numpy is already imported
# in the first cell).
import CRPS.CRPS as pscore

# Mean CRPS of the predictions in monthly_fatlities_pred against actual_data:
# average over 12 months per country, then average over countries.
actual_group = actual_data.groupby('country_id')

baseline_country_group = monthly_fatlities_pred.groupby('country_id')
crps_values_test = []

# Iterating the groupby already yields each country's prediction rows, and the
# country's actuals do not change across months, so both are looked up once
# per country instead of once per month.
for country_id, country_predictions in baseline_country_group:
    country_actuals = actual_group.get_group(country_id)

    country_crps_list = []
    for i in range(0,12):
        # NOTE(review): positional lookup -- row i, first column of this
        # country's actuals. Assumes the rows are ordered like actuals_months;
        # TODO confirm.
        y_true = country_actuals.iloc[i,0]
        month = actuals_months[i]
        prediction = country_predictions.xs(month, level='month_id').values.flatten()
        # compute() returns several scores; the first entry is used as the CRPS
        crps = pscore(prediction,y_true).compute()[0]
        country_crps_list.append(crps)
    crps_values_test.append(np.mean(country_crps_list))

mean_crps_test = np.mean(crps_values_test)


# The printed line ends in a literal ' \\' (two backslashes).
print('Mean CRPS = ' + str(mean_crps_test) + ' \\'+'\\')

Mean CRPS = 125.61592006015138 \\


In [96]:
# Write monthly_fatlities_pred to a parquet file whose name carries the
# selected nbinom prediction year (relative path, no directory given).
monthly_fatlities_pred.to_parquet('cm_nbinom_test_window_' + prediction_year_nbinom + '.parquet')


In [92]:
# Display the assembled prediction frame.
# NOTE(review): the saved output of this cell lists month_ids 505-516, while
# the month_ids printed for the currently selected year are 517-528 -- the
# output looks stale; re-run the cells in order to refresh it.
monthly_fatlities_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,outcome
month_id,country_id,draw,Unnamed: 3_level_1
505,1,0,0
505,1,1,0
505,1,2,0
505,1,3,0
505,1,4,0
...,...,...,...
516,246,994,101
516,246,995,107
516,246,996,114
516,246,997,124


# Hurdle Variant 1

In [4]:
# Location of the stored hurdle baseline results. Folder names are separate
# os.path.join components so the path does not depend on '\\' separators.
file_path = os.path.join(
    user_dir,
    'iCloudDrive',
    'VIEWS Joblib Predicitons',
    'Baseline',
    'FinalTask2_baseline_predct_hurdleWmax24.joblib',
)
# NOTE: joblib.load unpickles the file -- only load files from a trusted source.
loaded_vars_baseline = load(file_path)

# The first seven stored entries, in order:
#   task2_baseline_list       crps averages for all 4 datasets
#   w_minimization_list       minimal w's of the different baselines for each year and country
#   baseline_prediction_list  predictions with the minimal w's for each dataset and country
#   baseline1..4_average_crps mean CRPS from the baseline_prediction_list
(
    task2_baseline_list,
    w_minimization_list,
    baseline_prediction_list,
    baseline1_average_crps,
    baseline2_average_crps,
    baseline3_average_crps,
    baseline4_average_crps,
) = loaded_vars_baseline[:7]

print('Overall CRPS, max. w = 24')
baseline_average_crps = (
    baseline1_average_crps,
    baseline2_average_crps,
    baseline3_average_crps,
    baseline4_average_crps,
)
for baseline_number, average_crps in enumerate(baseline_average_crps, start=1):
    print('baseline ' + str(baseline_number) + ': ' + str(np.round(average_crps, decimals = 4)))

Overall CRPS, max. w = 24
baseline 1: 56.6621
baseline 2: 77.4704
baseline 3: 58.6499
baseline 4: 74.6854


In [25]:
# Year whose actuals the hurdle predictions are exported for.
prediction_year_hurdle = '2023' # 2019, 2020, 2021, 2022, 2023

# Position of that year in actual_years selects the matching actuals frame.
dataset_index = actual_years.index(prediction_year_hurdle)
actual_data = actuals_df_list[dataset_index]['data']

# Distinct month_id values of the selected actuals, in index order.
actuals_month_level = actual_data.index.get_level_values('month_id')
actuals_months = actuals_month_level.unique()

In [26]:
# Reshape the stored hurdle baseline draws into one long frame indexed by
# (month_id, country_id, draw) with a single integer column 'outcome'.
# The per-(country, month) frames are collected in a list and concatenated
# once at the end; concatenating inside the loop re-copies all earlier rows
# on every iteration.
prediction_frames = []

# Entry [0] of baseline_prediction_list is used here (author's remark:
# "hurdle 0 because variant 1").
for country in baseline_prediction_list[0]:
    country_id = country['country_id']
    year_entry = country['prediction'][prediction_year_hurdle][0]

    # The draws do not depend on s, so they are cast to plain ints once.
    # NOTE(review): every month therefore receives the same distribution --
    # confirm that this is intended for this baseline variant.
    outcome = [int(num) for num in year_entry['fatalities']]
    draw_count = len(outcome)

    # s - 3 is the 0-based position of step s in actuals_months.
    for s in year_entry['s']:
        month_id = actuals_months[s-3]

        index = pd.MultiIndex.from_tuples(
            [(month_id, country_id, draw) for draw in range(draw_count)],
            names=['month_id', 'country_id', 'draw'],
        )
        prediction_frames.append(pd.DataFrame({'outcome': outcome}, index=index))

# pd.concat rejects an empty list, so fall back to an empty frame when there
# was nothing to collect.
monthly_fatlities_pred = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()

In [27]:
# Third-party scorer used only by the CRPS cells (numpy is already imported
# in the first cell).
import CRPS.CRPS as pscore

# Mean CRPS of the predictions in monthly_fatlities_pred against actual_data:
# average over 12 months per country, then average over countries.
actual_group = actual_data.groupby('country_id')

baseline_country_group = monthly_fatlities_pred.groupby('country_id')
crps_values_test = []

# Iterating the groupby already yields each country's prediction rows, and the
# country's actuals do not change across months, so both are looked up once
# per country instead of once per month.
for country_id, country_predictions in baseline_country_group:
    country_actuals = actual_group.get_group(country_id)

    country_crps_list = []
    for i in range(0,12):
        # NOTE(review): positional lookup -- row i, first column of this
        # country's actuals. Assumes the rows are ordered like actuals_months;
        # TODO confirm.
        y_true = country_actuals.iloc[i,0]
        month = actuals_months[i]
        prediction = country_predictions.xs(month, level='month_id').values.flatten()
        # compute() returns several scores; the first entry is used as the CRPS
        crps = pscore(prediction,y_true).compute()[0]
        country_crps_list.append(crps)
    crps_values_test.append(np.mean(country_crps_list))

mean_crps_test = np.mean(crps_values_test)


# The printed line ends in a literal ' \\' (two backslashes).
print('Mean CRPS = ' + str(mean_crps_test) + ' \\'+'\\')

Mean CRPS = 87.95059459068102 \\


In [28]:
# Write monthly_fatlities_pred to a parquet file whose name carries the
# selected hurdle prediction year (relative path, no directory given).
monthly_fatlities_pred.to_parquet('cm_hurdle_test_window_' + prediction_year_hurdle + '.parquet')

# Neural Net

In [None]:
# Year to export. The result file name further down is built from this value,
# so the year only has to be set in one place.
# (`load` comes from the joblib import in the first cell.)
prediction_year_NN = '2018' # 2019, 2020, 2021

dataset_index = actual_years.index(prediction_year_NN)
actual_data = actuals_df_list[dataset_index]['data']
actuals_months = actual_data.index.get_level_values('month_id').unique()

# Alternative result file names noted by the author:
# FinalTask2_NN_2021_Hyperparamctr2204all
# FinalTask2_NN_2021_HyperparamctrallIndividual
nn_result_file = 'FinalTask2_NNNegBin_' + prediction_year_NN + '_HyperparamFixed.joblib'

# NOTE: joblib.load unpickles the file -- only load files from a trusted source.
# The result is bound to loaded_vars_nn rather than `vars`, which would shadow
# the Python builtin of that name.
loaded_vars_nn = load(nn_result_file)

# NOTE: this rebinds country_list, which the first cell also defines.
# NOTE(review): pred_year_string is used as the dictionary key in the next
# cell; presumably it corresponds to prediction_year_NN -- TODO confirm.
NNet_prediction_list, country_list, pred_year_string, seed, zero_fatalities_country_list, nan_list = loaded_vars_nn

In [None]:
# Reshape the stored neural-net draws into one long frame indexed by
# (month_id, country_id, draw) with a single integer column 'outcome'.
# The per-(country, month) frames are collected in a list and concatenated
# once at the end; concatenating inside the loop re-copies all earlier rows
# on every iteration.
prediction_frames = []

for country in NNet_prediction_list:
    country_id = country['country_id']
    year_entry = country[pred_year_string][0]

    # s - 3 is the 0-based position of step s in both actuals_months and the
    # stored list of per-month distributions.
    for s in year_entry['s']:
        month_id = actuals_months[s-3]

        # cast every draw to a plain int
        outcome = [int(element) for element in year_entry['distribution'][s-3]]

        index = pd.MultiIndex.from_tuples(
            [(month_id, country_id, draw) for draw in range(len(outcome))],
            names=['month_id', 'country_id', 'draw'],
        )
        prediction_frames.append(pd.DataFrame({'outcome': outcome}, index=index))

# pd.concat rejects an empty list, so fall back to an empty frame when there
# was nothing to collect.
monthly_fatlities_pred = pd.concat(prediction_frames) if prediction_frames else pd.DataFrame()

In [None]:
# Write monthly_fatlities_pred to a parquet file whose name carries the
# selected neural-net prediction year (relative path, no directory given).
monthly_fatlities_pred.to_parquet('cm_NN_test_window_' + prediction_year_NN + '.parquet')