### Set the seed so that the model outputs are reproducible
Restarting the kernel and re-running the notebook reproduces the same results.

In [1]:
import keras
import tensorflow as tf
from tensorflow.keras import initializers
# Set the seed using keras.utils.set_random_seed. This will set:
# 1) `numpy` seed
# 2) `tensorflow` random seed
# 3) `python` random seed
# Single seed for the whole notebook; it is also written into the joblib dump
# together with the predictions so a result file records the seed it was made with.
seed = 0
keras.utils.set_random_seed(seed)

# This will make TensorFlow ops as deterministic as possible, but it will
# affect the overall performance, so it's not enabled by default.
# `enable_op_determinism()` is introduced in TensorFlow 2.9.
tf.config.experimental.enable_op_determinism()

## Load the data

In [2]:
import pandas as pd
import numpy as np
import os
import CRPS.CRPS as pscore
import copy
from joblib import dump, load
from time import sleep
from tqdm import tqdm
import warnings


def check_Actuals(country_id, dataindex):
    """Tell whether a country has a group in the selected actuals dataset.

    Parameters
    ----------
    country_id : hashable
        Group key to look up.
    dataindex : int
        Position in the module-level ``country_actual_group_list``.

    Returns
    -------
    bool
        True when ``country_id`` is one of the group keys, False otherwise.
    """
    actual_group_keys = country_actual_group_list[dataindex].groups.keys()
    return country_id in actual_group_keys

# check if the last month of a country in the feature dataset is 3 months before the first month that has to be predicted
def check_last_featureMonth(country_id, dataindex):
    """Check that the feature data of a country ends 3 months before its actuals start.

    Parameters
    ----------
    country_id : hashable
        Group key of the country.
    dataindex : int
        Position in the module-level ``country_feature_group_list`` and
        ``country_actual_group_list``.

    Returns
    -------
    bool
        True when ``first actual month - 3`` equals the last feature month.

    Raises
    ------
    ValueError
        If the country has no group in the actuals dataset.
    """
    actual_groupby = country_actual_group_list[dataindex]
    if country_id not in actual_groupby.groups.keys():
        raise ValueError('country does not have actuals')

    def _distinct_months(groupby_obj):
        # distinct 'month_id' values of the country's rows, in index order
        country_rows = groupby_obj.get_group(country_id)
        return country_rows.index.get_level_values('month_id').unique().tolist()

    last_feature_month = _distinct_months(country_feature_group_list[dataindex])[-1]
    first_actual_month = _distinct_months(actual_groupby)[0]

    return (first_actual_month - 3) == last_feature_month


# Build the feature and actuals dataset lists.
# Feature file i is paired with actuals file i (same position in both year lists).
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

# path to the current directory
current_dir = os.getcwd()

for feature_year, actual_year in zip(feature_years, actual_years):
    # parquet files live in ../data relative to the current directory
    path_features = os.path.join(current_dir, '..', 'data', 'cm_features_to_oct' + feature_year + '.parquet')
    path_actuals = os.path.join(current_dir, '..', 'data', 'cm_actuals_' + actual_year + '.parquet')

    # each list entry keeps the year label next to its DataFrame
    actuals_df_list.append({'year': actual_year, 'data': pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year': feature_year, 'data': pd.read_parquet(path_features, engine='pyarrow')})

# Cumulative concat: every feature dataset is prefixed with the (already
# accumulated) previous one, so that each entry contains all earlier observations.
for previous_entry, current_entry in zip(features_df_list, features_df_list[1:]):
    current_entry['data'] = pd.concat([previous_entry['data'], current_entry['data']])



### Drop features that contain missing values

In [3]:
from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor


# The last dataset holds all others (because of the concat), so it decides
# which columns are complete.
data = features_df_list[-1]['data']
if 'gleditsch_ward' in data.columns:
    data = data.drop(columns='gleditsch_ward') # column not necessary

## Features without missing values
columns_without_missing_values = data.columns[data.notna().all()]

# restrict every yearly dataset to the complete columns
for yearly_entry in features_df_list:
    yearly_entry['data'] = yearly_entry['data'][columns_without_missing_values]

# column index of the last (largest) dataset, used as the 'all' feature subset
all_features = features_df_list[-1]['data'].columns


### Feature subsets

In [4]:
## different feature selection from views
# 28 features that map the conflict history of a country
# (the first five entries also open the `vdem` and `wdi` subsets)
conflict_history = [
    'ged_sb', 'decay_ged_sb_5', 'decay_ged_os_5',
    'splag_1_decay_ged_sb_5', 'wdi_sp_pop_totl', 'ged_sb_tlag_1',
    'ged_sb_tlag_2', 'ged_sb_tlag_3', 'ged_sb_tlag_4',
    'ged_sb_tlag_5', 'ged_sb_tlag_6', 'ged_sb_tsum_24',
    'decay_ged_sb_100', 'decay_ged_sb_500', 'decay_ged_os_100',
    'decay_ged_ns_5', 'decay_ged_ns_100', 'ged_ns', 'ged_os',
    'acled_sb', 'acled_sb_count', 'acled_os',
    'ged_os_tlag_1', 'decay_acled_sb_5', 'decay_acled_os_5',
    'decay_acled_ns_5', 'splag_1_decay_ged_os_5',
    'splag_1_decay_ged_ns_5'
]


# 59 features: the five leading conflict/population columns shared with the other
# subsets, the Varieties of Democracy (vdem_*) indicators incl. their spatial lags
# (splag_vdem_*), and two further WDI columns at the end
vdem = [
    'ged_sb', 'decay_ged_sb_5', 'decay_ged_os_5',
    'splag_1_decay_ged_sb_5', 'wdi_sp_pop_totl', 'vdem_v2x_delibdem',
    'vdem_v2x_egaldem', 'vdem_v2x_libdem', 'vdem_v2x_libdem_48',
    'vdem_v2x_partip', 'vdem_v2x_accountability',
    'vdem_v2x_civlib', 'vdem_v2x_clphy', 'vdem_v2x_cspart',
    'vdem_v2x_divparctrl', 'vdem_v2x_edcomp_thick', 'vdem_v2x_egal',
    'vdem_v2x_execorr', 'vdem_v2x_frassoc_thick', 'vdem_v2x_gencs',
    'vdem_v2x_gender', 'vdem_v2x_genpp', 'vdem_v2x_horacc',
    'vdem_v2x_neopat', 'vdem_v2x_pubcorr', 'vdem_v2x_rule',
    'vdem_v2x_veracc', 'vdem_v2x_freexp', 'vdem_v2xcl_acjst',
    'vdem_v2xcl_dmove', 'vdem_v2xcl_prpty', 'vdem_v2xcl_rol',
    'vdem_v2xcl_slave', 'vdem_v2xdl_delib', 'vdem_v2xeg_eqdr',
    'vdem_v2xeg_eqprotec', 'vdem_v2xel_frefair', 'vdem_v2xel_regelec',
    'vdem_v2xme_altinf', 'vdem_v2xnp_client', 'vdem_v2xnp_regcorr',
    'vdem_v2xpe_exlecon', 'vdem_v2xpe_exlpol', 'vdem_v2xpe_exlgeo',
    'vdem_v2xpe_exlgender', 'vdem_v2xpe_exlsocgr', 'vdem_v2xps_party',
    'vdem_v2xcs_ccsi', 'vdem_v2xnp_pres', 'vdem_v2xeg_eqaccess',
    'vdem_v2x_diagacc', 'vdem_v2clrgunev', 'splag_vdem_v2x_libdem',
    'splag_vdem_v2xcl_dmove', 'splag_vdem_v2x_accountability',
    'splag_vdem_v2xpe_exlsocgr', 'splag_vdem_v2xcl_rol', 'wdi_sm_pop_netm',
    'wdi_sp_dyn_imrt_in'
]

# 30 features that are drawn from the WDI (wdi_* and their spatial lags splag_wdi_*)
# as well as some conflict history indicators (the first four entries)
wdi = [
    'ged_sb', 'decay_ged_sb_5', 'decay_ged_os_5',
    'splag_1_decay_ged_sb_5', 'wdi_sp_pop_totl', 'wdi_ag_lnd_frst_k2',
    'wdi_dt_oda_odat_pc_zs', 'wdi_ms_mil_xpnd_gd_zs', 'wdi_ms_mil_xpnd_zs',
    'wdi_nv_agr_totl_kd', 'wdi_nv_agr_totl_kn', 'wdi_ny_gdp_pcap_kd',
    'wdi_sp_dyn_le00_in', 'wdi_se_prm_nenr', 'wdi_sh_sta_maln_zs',
    'wdi_sh_sta_stnt_zs', 'wdi_sl_tlf_totl_fe_zs', 'wdi_sm_pop_refg_or',
    'wdi_sm_pop_netm', 'wdi_sm_pop_totl_zs', 'wdi_sp_dyn_imrt_in',
    'wdi_sh_dyn_mort_fe', 'wdi_sp_pop_1564_fe_zs', 'wdi_sp_pop_65up_fe_zs',
    'wdi_sp_pop_grow', 'wdi_sp_urb_totl_in_zs',
    'splag_wdi_sl_tlf_totl_fe_zs', 'splag_wdi_sm_pop_refg_or',
    'splag_wdi_sm_pop_netm', 'splag_wdi_ag_lnd_frst_k2'
]

# minimal subset: only the 'ged_sb' column
ged = ['ged_sb']

# Lookup table from subset name to its column selection; the keys are the values
# the 'featureSubset' hyperparameter can take. 'all' is the column index of the
# last feature dataset (every column without missing values), the others are lists.
feature_subset_dict = {'conflict_history':conflict_history,
                       'vdem':vdem,
                       'wdi':wdi,
                       'all':all_features,
                       'ged':ged}

### Group data by country_id

In [5]:
# sorted distinct country ids of the fourth (index 3) feature dataset
country_ids_level = features_df_list[3]['data'].index.get_level_values('country_id')
country_list = sorted(country_ids_level.unique().tolist())

# one groupby-by-country object per dataset, for features and for actuals;
# both lists are indexed by the same dataset position
country_feature_group_list = [
    feature_entry['data'].groupby('country_id') for feature_entry in features_df_list
]
country_actual_group_list = [
    actuals_df_list[position]['data'].groupby('country_id')
    for position in range(len(features_df_list))
]

### List of the countries for which a prediction is requested

In [6]:
relative_path_countrylist = os.path.join('..', 'data', 'country_list.csv')
path_countrylist = os.path.join(current_dir, relative_path_countrylist)

# read the CSV file into a pandas DataFrame
countryList_prediction = pd.read_csv(path_countrylist)
country_list_views = countryList_prediction.loc[:,'country_id'].values.tolist()

# month_list collects one list per requested country:
#   [number_of_feature_months, country_id]  when all checks pass
#   ['<country_id> last month missing']     when the feature data does not end 3 months before the actuals
#   ['<country_id> no actuals']             when the country has no actuals
# Every entry is a list so that the entries stay comparable with each other.
month_list = []
countries_to_remove = []
for country_id in country_list:

    if country_id in country_list_views:
        # NOTE(review): country_list comes from the last dataset, the lookup here uses
        # the first one; get_group raises KeyError if a country is missing there.
        feature_data_views = country_feature_group_list[0].get_group(country_id)

        # numbers of months from the feature dataset
        month_list_feature_data_original = feature_data_views.index.get_level_values('month_id').tolist()
        number_months_feature_data = len(month_list_feature_data_original)

        # check if actuals exist for the country
        if check_Actuals(country_id, 0):
            # check if the last feature month is 3 months before the first actuals month
            if not check_last_featureMonth(country_id, 0):
                month_list.append([str(country_id) +' last month missing'])
            else:
                month_list.append([number_months_feature_data, country_id])
        else:
            month_list.append([str(country_id) + ' no actuals'])
    else:
        # no prediction requested for this country
        countries_to_remove.append(country_id)

# keep only the countries for which a prediction is requested
country_list = list(set(country_list) - set(countries_to_remove))

# Numeric entries first (ordered as before), message entries afterwards.
# A plain sort() would raise TypeError as soon as message and numeric entries are mixed.
month_list.sort(key=lambda entry: (isinstance(entry[0], str), entry))

## Feed Forward Neural Network
Goal is to estimate the empirical distribution of the fatalities per month.
### Definition of the CRPS loss function and the Feed forward Neural Network subclass

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Layer, Input, Dense, Lambda, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Loss
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.callbacks import EarlyStopping

# crps loss function 
def crps(y_true, S):
    """
    Sample-based continuous ranked probability score.

    For every row the score is
    ``sum_j |y - S_j| / n  -  sum_i sum_j |S_i - S_j| / (2 * n**2)``,
    where ``|.|`` is evaluated as ``sqrt(x**2 + K.epsilon())``.

    Parameters
    ----------
    y_true : tf tensor of shape (BATCH_SIZE, 1)
        True values.
    S : tf tensor of shape (BATCH_SIZE, N_SAMPLES)
        Predictive samples.

    Returns
    -------
    tf tensor of shape (BATCH_SIZE,)
        Scores.

    """
    exponent = 1
    sample_count = S.shape[-1]

    def _summed_distance(delta):
        # smoothed absolute value, raised to `exponent`, summed over the sample axis
        smoothed_abs = K.sqrt(K.square(delta) + K.epsilon())
        return K.sum(K.pow(smoothed_abs, exponent), axis=-1)

    # distance between the observation and every sample
    observation_term = _summed_distance(y_true - S)

    # pairwise distances between the samples, accumulated one sample column at a time
    spread_term = 0
    for column in range(sample_count):
        reference_sample = K.expand_dims(S[:, column])
        spread_term = spread_term + _summed_distance(reference_sample - S)

    return observation_term / sample_count - spread_term / (2 * sample_count ** 2)


class CRPSLoss(Loss):
    """Keras loss wrapper that delegates to the module-level `crps` function."""

    def call(self, y_true, S):
        # y_true: true values, S: predictive samples (see `crps` for the shapes it documents)
        return crps(y_true, S)

# Define custom ReLU activation function
class ReLUTransform(Layer):
    """Layer that applies `tf.nn.relu` to its inputs (no trainable parameters are defined here)."""

    def call(self, inputs):
        return tf.nn.relu(inputs)

# number of neurons per hidden layer (equally spaced)
def get_numberNeurons_per_hiddenlayer(numberHiddenLayers, numberNeurons):
    """Spread a total neuron budget equally over the hidden layers.

    Parameters
    ----------
    numberHiddenLayers : int
        Number of hidden layers; must be at least one.
    numberNeurons : int or float
        Total number of neurons over all hidden layers.

    Returns
    -------
    int
        Neurons per hidden layer, ``numberNeurons / numberHiddenLayers`` rounded
        with ``np.round`` (halves are rounded to the nearest even number).

    Raises
    ------
    ValueError
        If there is no hidden layer or the rounded width is less than one.
    """
    # guard the division: zero layers would otherwise fail with ZeroDivisionError
    if numberHiddenLayers < 1:
        raise ValueError('Number of hidden layers less than one.')

    # plain int instead of np.float64, because the value is used as a layer width
    neurons_per_HL = int(np.round(numberNeurons/numberHiddenLayers))

    if neurons_per_HL <= 0:
        raise ValueError('Number of neurons per hidden layer less than one.')

    return neurons_per_HL

# Define the Feed Forward Neural Network subclass
class FeedForwardNN(tf.keras.Model):
    """Feed forward network: ReLU hidden layers, a linear output layer and a final ReLU.

    Parameters
    ----------
    input_shape : shape passed on to ``self.build``
    name : str
        Model name.
    neurons_output : int
        Width of the output layer.
    number_hidden_layers : int
        Number of hidden Dense layers.
    number_neurons : int
        Total neuron budget, split equally over the hidden layers by
        ``get_numberNeurons_per_hiddenlayer``.
    dropout_rate : float or None
        When not None, one Dropout layer is inserted after the first hidden layer only.
    """

    def __init__(self, input_shape, name="FeedFwdNN", neurons_output = 200, number_hidden_layers=1, 
                 number_neurons=10, dropout_rate=None):
        super().__init__(name=name)

        units_per_layer = get_numberNeurons_per_hiddenlayer(number_hidden_layers, number_neurons)

        # first hidden layer, optionally followed by dropout
        self.hidden_layers = []
        self.hidden_layers.append(Dense(units_per_layer, activation='relu'))
        if dropout_rate is not None:
            self.hidden_layers.append(Dropout(dropout_rate))

        # remaining hidden layers (none when there is a single hidden layer)
        remaining_layers = number_hidden_layers - 1 if number_hidden_layers > 1 else 0
        for _ in range(remaining_layers):
            self.hidden_layers.append(Dense(units_per_layer, activation='relu'))

        # linear output followed by a ReLU so that the network output is non-negative
        self.untransformed_output = Dense(neurons_output)
        self.final_output = Lambda(ReLUTransform())

        # NOTE(review): stores whatever build() returns -- presumably None, confirm
        self.model = self.build(input_shape)

    def call(self, inputs):
        """Run the inputs through all hidden layers, the output layer and the final ReLU."""
        activations = inputs
        for hidden_layer in self.hidden_layers:
            activations = hidden_layer(activations)
        return self.final_output(self.untransformed_output(activations))

### Definition of the functions to perform the train-/test-split with rolling windows

In [8]:
from sklearn import preprocessing

## conflict trap
# drops all months before a starting conflict
# definition of a beginning conflict: fatalities(monthX) > 0 with mean(fatalities(window_size months starting with monthX)) >= threshold
# (average fatalities per month in the starting window are at least the threshold)
# iterates through the dataset beginning with the first entry
def drop_before_conflict_trap(data, threshold, window_size, minimal_data_size):
    """Cut off all rows before the first detected conflict onset.

    An onset is the first row position whose 'ged_sb' value is > 0 and whose
    window of ``window_size`` rows (starting there) has a mean of at least
    ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'ged_sb' column; rows are scanned from the first entry.
    threshold : number
        Minimal mean of 'ged_sb' inside the window.
    window_size : int
        Number of rows in the window.
    minimal_data_size : int
        Minimal number of rows of a truncated result.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when it has fewer than ``minimal_data_size`` rows or no
        onset is found; the last ``minimal_data_size`` rows when the cut would
        leave fewer rows than that; otherwise the rows from the onset on.
    """
    fatalities = data.iloc[:, data.columns.get_loc('ged_sb')].to_list()
    # number of window start positions that still fit completely into the data
    search_end = len(fatalities) - window_size + 1

    def _is_onset(position):
        window = fatalities[position:position + window_size]
        return window[0] > 0 and sum(window) / window_size >= threshold

    # first onset position; without a hit the scan ends at search_end (or 0 if nothing was scanned)
    onset = next(
        (position for position in range(search_end) if _is_onset(position)),
        max(search_end, 0),
    )

    # too short to truncate at all
    if len(data) < minimal_data_size:
        return data

    # if there is no conflict trap do nothing
    if onset == search_end:
        return data

    truncated = data.iloc[onset:, :]
    # if the truncation would result in a too small dataset prevent this
    if len(truncated) < minimal_data_size:
        return data.iloc[-minimal_data_size:, :]

    # drop every entry before the conflict trap
    return truncated
    

## function used to calculate w_max, number of rolling windows etc.
# length of a whole window (containing w input months, a gap of 2 months and 12 actual months)
def rollingWindowLength(w):
    """Return the number of months one rolling window spans: ``w`` plus 2 plus 12."""
    months_after_inputs = 2 + 12
    return w + months_after_inputs

# number of months available for training (after removing the test months)
def number_train_months(numberMonths_available, w):
    """Return the months left for training after the last ``w`` months are reserved as test input."""
    test_input_months = w
    return numberMonths_available - test_input_months

def number_rolling_windows(numberMonths_available, w):
    """Return how many complete rolling windows fit into the available months (never negative)."""
    window_count = numberMonths_available - rollingWindowLength(w) + 1
    return window_count if window_count > 0 else 0


def find_max_W(numberMonths_available, w_min, w_max):
    """Find the largest input length w (at most ``w_max``) that leaves a training window.

    Starting at ``w_max``, w is lowered one by one until the training months
    (all months minus the w test-input months) hold at least one rolling
    window, or until ``w_min`` is reached.

    Raises
    ------
    ValueError
        If all available months do not even hold one rolling window for ``w_min``.
    """
    if number_rolling_windows(numberMonths_available, w_min) == 0:
        raise ValueError('not enough months for one training window with w_min = ' + str(w_min))

    def _train_window_count(candidate_w):
        # rolling windows inside the training part for this candidate
        train_months = number_train_months(numberMonths_available, candidate_w)
        return number_rolling_windows(train_months, candidate_w)

    candidate = w_max
    while _train_window_count(candidate) == 0 and candidate > w_min:
        candidate -= 1

    return candidate


def month_lists_TrainTest(w_min, w_max, month_list_feature_data):
    """Split the month list into training months and the test-input months.

    Returns
    -------
    tuple
        ``(month_list_train, month_list_test, w)``: the leading training months,
        the last ``w`` months used as test input, and the input length ``w``
        chosen by ``find_max_W`` (which may be smaller than ``w_max``).
    """
    total_months = len(month_list_feature_data)

    # w may have to shrink below w_max if there are not enough months
    w = find_max_W(total_months, w_min, w_max)

    train_end = number_train_months(total_months, w)

    return month_list_feature_data[:train_end], month_list_feature_data[-w:], w


def Train_ArrayXY_split(w, month_list, data_feature, s):
    """Build the training arrays from rolling windows over the training months.

    For every rolling window the features of ``w`` consecutive months are
    normalized with ``preprocessing.normalize`` and flattened into one input
    row. The target is the 'ged_sb' value at position ``s - 3`` of the 12
    months that start 3 months after the last input month.

    Returns
    -------
    tuple of numpy arrays
        ``(X, Y)`` with one entry per rolling window.
    """
    X = []
    Y = []

    window_count = number_rolling_windows(len(month_list), w)

    for first_pos in range(window_count):
        last_pos = first_pos + w - 1

        first_input_month = month_list[first_pos]
        last_input_month = month_list[last_pos]
        first_target_month = month_list[last_pos + 3]
        last_target_month = month_list[last_pos + 14]

        # input months (excluding the "unreal" actuals)
        input_rows = data_feature.loc[slice(first_input_month, last_input_month), :]
        # "unreal" actuals: later 'ged_sb' values taken from the feature data itself
        target_rows = data_feature.loc[slice(first_target_month, last_target_month), 'ged_sb']
        target_value = target_rows.iloc[s - 3]

        X.append(preprocessing.normalize(input_rows).flatten())
        Y.append(np.array([target_value]))

    return np.array(X), np.array(Y)

def Test_ArrayXY_split(month_list, data_feature, data_actual, s):
    """Build the single test sample.

    The features of all months between the first and last entry of
    ``month_list`` are normalized with ``preprocessing.normalize`` and
    flattened; the target is row ``s - 3`` of ``data_actual``.

    Returns
    -------
    tuple of numpy arrays
        ``(X, Y)``, each holding exactly one entry.
    """
    # all w feature months used to predict the fatalities
    input_rows = data_feature.loc[slice(month_list[0], month_list[-1]), :]
    # real actuals
    observed_values = data_actual.iloc[s - 3].values

    flattened_inputs = preprocessing.normalize(input_rows).flatten()

    return np.array([flattened_inputs]), np.array([observed_values])

## Estimation of the future fatalities
### Set the year to predict

In [9]:
# Year to predict; must be one of the entries of `actual_years`.
prediction_year = '2021' # 2019, 2020, 2021
# position of that year in the dataset lists; .index raises ValueError for an unknown year
dataset_index = actual_years.index(prediction_year)

### Manipulate country list for the prediction

In [10]:
# Three disjoint country groups, based on the 'ged_sb' column of the feature data:
# only zeros / at least 60 % of the months with fatalities / everything else
zero_fatalities_country_list = []
countries_with_high_percentage_list = []
someNonzero_fatalities_country_list = []

for country_id in country_list:
    feature_data_all = country_feature_group_list[dataset_index].get_group(country_id)
    fatalities = feature_data_all['ged_sb']

    # percentage of the 'ged_sb' values that are greater than 0
    positive_percentage = (fatalities > 0).mean() * 100

    if (fatalities == 0).all():
        matching_group = zero_fatalities_country_list
    elif positive_percentage >= 60:
        matching_group = countries_with_high_percentage_list
    else:
        matching_group = someNonzero_fatalities_country_list

    matching_group.append(country_id)

### Prediction

In [11]:
import random
from random import randint, uniform

# this function is not needed in the prediction (only hyperparam tuning)
def draw_params(seed):
    """Draw one random hyperparameter set (only used for hyperparameter tuning).

    Side effect: calls ``keras.utils.set_random_seed(seed)``, which reseeds the
    global random state before the values are drawn.

    Returns
    -------
    dict
        Keys 'wmax', 'numbHiddenL', 'relNeurons', 'lr', 'featureSubset',
        'batch_size', 'dropoutrate' and 'epochs'. Float values are rounded to
        2 decimals ('relNeurons', 'dropoutrate') or 3 decimals (other floats).
        'dropoutrate' is the integer 0 when dropout is switched off.
    """
    keras.utils.set_random_seed(seed)

    use_dropout = random.choice([0, 1])

    # The values are drawn in exactly this order; a dropout rate is always drawn,
    # even if it is discarded afterwards.
    ranges = {
        'wmax': randint(1, 12),
        'numbHiddenL': randint(1, 6),
        'relNeurons': uniform(0.1, 1.0),
        'lr': uniform(0.001, 0.15), 
        'featureSubset': random.choice(list(feature_subset_dict.keys())),
        'batch_size': randint(1, 6),
        'dropoutrate': uniform(0.1, 0.5),
        'epochs': randint(3, 40)
    }

    if use_dropout == 0:
        ranges['dropoutrate'] = 0

    two_decimal_params = ('relNeurons', 'dropoutrate')

    def _rounded(param_name, param_value):
        # only floats are rounded; ints and strings pass through unchanged
        if not isinstance(param_value, float):
            return param_value
        digits = 2 if param_name in two_decimal_params else 3
        return round(param_value, digits)

    # collect the drawn (and rounded) values in a new dictionary
    return {param_name: _rounded(param_name, param_value)
            for param_name, param_value in ranges.items()}

def get_numberNeurons_all_layers(inputNeurons, outputNeurons, lam):
    """Interpolate the total neuron count between the input and output layer widths.

    ``lam = 0`` gives the smaller of the two widths, ``lam = 1`` the larger
    one; the result is rounded with ``np.round``.
    """
    smaller = min(inputNeurons, outputNeurons)
    larger = max(inputNeurons, outputNeurons)
    span = larger - smaller

    return np.round(lam * span + smaller)


def evaluate_model(hyperParameters, s, feature_dataset, actuals_dataset):
    """Train one network for one country and prediction step and score its forecast.

    Parameters
    ----------
    hyperParameters : dict
        Reads 'featureSubset', 'wmax', 'numbHiddenL', 'numbNeurons',
        'dropoutrate', 'lr', 'batch_size' and 'epochs'. The key 'w' is
        written back into this dict (the caller's object is modified).
    s : int
        Prediction step; targets are taken at position ``s - 3``.
    feature_dataset : pandas.DataFrame
        Feature rows of one country, indexed (at least) by 'month_id'.
    actuals_dataset : pandas.DataFrame
        Actuals of the same country.

    Returns
    -------
    tuple
        ``(loss, epochs, y_true, crps_prediction, empirical_distribution)``:
        training loss per epoch, the matching epoch numbers (a range starting
        at 1), the true value, the CRPS of the prediction and the sorted,
        integer-rounded network output.
    """
    ## fixed parameters--------
    w_min = 1
    number_output_neurons = 200
    var_threshold = 0.05
    mean_fatlities_per_month_threshold = 5
    conflict_window_months = 6
    # 76 is the minimal length of the dataframe (refers to the minimal size of the data
    # for all countries -> country_id 246 len = 76)
    minimal_number_rows = 76
    #--------------

    ## prepare feature dataset
    # only the columns of the chosen FEATURE SUBSET
    subset_columns = feature_subset_dict[hyperParameters['featureSubset']]
    feature_data = feature_dataset.loc[:, subset_columns]

    ## drop features with NEAR ZERO VARIANCE (but never 'ged_sb' -> needed for conflict trap detection)
    columns_to_keep = []
    for col in feature_data.columns:
        if (col == 'ged_sb') or (feature_data[col].var() >= var_threshold):
            columns_to_keep.append(col)
    feature_data = feature_data[columns_to_keep]

    ## remove months before the CONFLICT TRAP (regime change)
    # the trap starts at the first month with fatalities > 0 whose 6-month average reaches
    # 'mean_fatlities_per_month_threshold'; all observations before that month are dropped
    feature_data = drop_before_conflict_trap(feature_data, mean_fatlities_per_month_threshold,
                                             conflict_window_months, minimal_number_rows)

    month_list_feature_data = feature_data.index.get_level_values('month_id').tolist()

    ### data split
    month_list_train, month_list_test, w = month_lists_TrainTest(w_min, hyperParameters['wmax'],
                                                                 month_list_feature_data)

    # training dataset, then test dataset
    X_train, Y_train = Train_ArrayXY_split(w, month_list_train, feature_data, s)
    X_test, Y_test = Test_ArrayXY_split(month_list_test, feature_data, actuals_dataset, s)

    ### prediction with the neural net
    ## define inputs with predefined shape
    input_shape = (len(X_train[0]),)
    inputs = Input(shape=input_shape)

    # Only needed in hyperparameter tuning (kept for reference, not executed):
    #   hyperParameters['numbNeurons'] = get_numberNeurons_all_layers(input_shape[0],
    #                                                                  number_output_neurons,
    #                                                                  hyperParameters['relNeurons'])

    # overwrite the old w with the w that was actually used
    hyperParameters['w'] = w

    ## define, compile and fit the neural net
    model = FeedForwardNN(input_shape=inputs.shape,
                          name='FFwdNN_s' + str(s),
                          neurons_output=number_output_neurons,
                          number_hidden_layers=hyperParameters['numbHiddenL'],
                          number_neurons=hyperParameters['numbNeurons'],
                          dropout_rate=hyperParameters['dropoutrate'])

    model.compile(optimizer=Adam(learning_rate=hyperParameters['lr']), loss=CRPSLoss())

    history = model.fit(X_train, Y_train,
                        batch_size=hyperParameters['batch_size'],
                        epochs=hyperParameters['epochs'],
                        verbose=0,
                        shuffle=False)

    ## prediction: sorted network output, rounded to integers
    prediction = model.predict(X_test)
    empirical_distribution = np.round(np.sort(prediction[0])).astype(int)

    # training loss per epoch (e.g. to generate plots)
    loss = history.history['loss']
    epochs = range(1, len(loss) + 1)

    y_true = Y_test[0][0]
    crps_prediction = pscore(empirical_distribution, y_true).compute()[0]

    return loss, epochs, y_true, crps_prediction, empirical_distribution

In [12]:
import os
from joblib import load
import numpy as np


# Location of the tuned hyperparameters below the user's home directory.
# The directories are passed as separate components so that the path also works on
# systems that do not use '\' as separator (on Windows the resulting path is unchanged).
user_dir = os.path.expanduser('~')
file_path = os.path.join(user_dir, 'iCloudDrive', 'Joblib BA', 'final hyperparam',
                         'FinalrandomctrAllwRand12_50s8_30valid_2021.joblib')

# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load files from a trusted source.
loaded_vars_rndHyperparam = load(file_path)

# the file holds four objects, in this order
country_selected_hyperparam_list, prediction_year_hyper, nonZero_country_list_hyper, zero_fatalities_country_list_hyper = loaded_vars_rndHyperparam

# the hyperparameters must have been tuned for the same prediction year ...
if prediction_year_hyper != prediction_year:
    raise ValueError("Hyperparam prediction year differs from current prediction year.")

# ... and for the same set (and order) of zero-fatality countries
if zero_fatalities_country_list != zero_fatalities_country_list_hyper:
    raise ValueError("List of countries with zero fatalities are not identical.")

def find_country_hyperparam(country_id, hyperparam_list):
    """Return the selected hyperparameters of one country.

    Parameters
    ----------
    country_id : hashable
        Country to look up.
    hyperparam_list : list of dict
        Each entry has the keys 'country' and 'hyperparams'; 'hyperparams' is
        a list of dicts with a 'random_params' entry.

    Returns
    -------
    The 'random_params' of the first element of the country's 'hyperparams'
    list (per the original comment: the element with the minimal val_loss).

    Raises
    ------
    ValueError
        If no entry of ``hyperparam_list`` belongs to ``country_id``.
    """
    for entry in hyperparam_list:
        if entry['country'] == country_id:
            # 0 is the first element in the list of random hyperparams -> minimal val_loss
            return entry['hyperparams'][0]['random_params']

    # Not found in the list that was actually searched. Previously the global
    # country list was checked here, so a missing country silently returned None.
    raise ValueError("Country is not in the Hyperparameter list.")

In [13]:
### prediction
# prediction steps s = 3, ..., 14; results of step s are stored at position s - 3
s_prediction_list = list(range(3, 15))
number_s = len(s_prediction_list)
number_countries = len(country_list)

pred_year_string = 'prediction_' + prediction_year

# list to save the predictions for each country
NNet_prediction_list = [{'country_id': country, pred_year_string: []} for country in country_list]

# loop through all countries
for country_index in tqdm(range(number_countries)):
    country = country_list[country_index]

    ## load datasets
    features = country_feature_group_list[dataset_index].get_group(country)
    actuals = country_actual_group_list[dataset_index].get_group(country)

    is_zero_country = country in zero_fatalities_country_list

    if is_zero_country:
        # no model for countries without any fatalities: the forecast is 200 zeros for every step
        bestS8_hyperparams = None
        distribution = np.array([0]*200)
        initial_distributions = [distribution] * number_s
    else:
        bestS8_hyperparams = find_country_hyperparam(country, country_selected_hyperparam_list)
        initial_distributions = [None] * number_s

    # result container of this country, filled step by step below
    country_result = {'s': [None] * number_s,
                      'distribution': initial_distributions,
                      'actual': [None] * number_s,
                      'CRPS': [None] * number_s,
                      'loss': [None] * number_s,
                      'epochs': None,
                      'hyperparams': bestS8_hyperparams}
    NNet_prediction_list[country_index][pred_year_string].append(country_result)

    for s in s_prediction_list:
        slot = s - 3

        if is_zero_country:
            # score the all-zero forecast against the actual value
            model_y_true = actuals.iloc[slot].values[0]
            model_crps_prediction = pscore(distribution, model_y_true).compute()[0]
        else:
            # train and evaluate one network for this country and step
            hyper_params = bestS8_hyperparams
            model_loss, model_epochs, model_y_true, model_crps_prediction, distribution = evaluate_model(hyper_params, s, features, actuals)

            country_result['distribution'][slot] = distribution
            country_result['loss'][slot] = model_loss
            country_result['epochs'] = model_epochs

        country_result['s'][slot] = s
        country_result['actual'][slot] = model_y_true
        country_result['CRPS'][slot] = model_crps_prediction


  1%|          | 2/191 [00:20<00:10, 18.35it/s]



  2%|▏         | 3/191 [14:59<19:34:11, 374.74s/it]



  5%|▍         | 9/191 [31:02<6:50:55, 135.47s/it] 



  6%|▌         | 11/191 [41:38<9:55:36, 198.53s/it]



  7%|▋         | 13/191 [49:41<10:30:17, 212.46s/it]



  8%|▊         | 15/191 [1:02:34<13:04:16, 267.37s/it]



 11%|█         | 21/191 [1:21:55<6:34:05, 139.09s/it] 



 14%|█▎        | 26/191 [1:31:16<4:59:42, 108.98s/it]



 14%|█▍        | 27/191 [1:39:58<8:09:30, 179.09s/it]



 16%|█▌        | 31/191 [1:53:18<7:55:19, 178.25s/it]



 19%|█▉        | 37/191 [2:11:49<5:37:11, 131.37s/it] 



 20%|█▉        | 38/191 [2:18:05<7:26:49, 175.23s/it]



 21%|██        | 40/191 [2:30:37<10:06:06, 240.84s/it]



 23%|██▎       | 43/191 [2:50:12<11:41:13, 284.28s/it]



 23%|██▎       | 44/191 [3:01:38<15:10:09, 371.49s/it]



 24%|██▎       | 45/191 [3:07:32<14:54:09, 367.46s/it]



 24%|██▍       | 46/191 [3:23:57<20:57:23, 520.30s/it]



 25%|██▍       | 47/191 [3:37:10<23:39:12, 591.33s/it]



 25%|██▌       | 48/191 [4:00:42<32:20:57, 814.39s/it]



 26%|██▌       | 49/191 [4:15:58<33:14:56, 842.94s/it]



 26%|██▌       | 50/191 [4:25:21<29:53:00, 762.98s/it]



 27%|██▋       | 52/191 [4:47:12<27:35:40, 714.68s/it]



 28%|██▊       | 53/191 [4:57:41<26:36:03, 693.94s/it]



 29%|██▉       | 55/191 [5:11:46<20:14:11, 535.67s/it]



 29%|██▉       | 56/191 [5:20:00<19:39:33, 524.25s/it]



 30%|███       | 58/191 [5:32:19<15:22:43, 416.26s/it]



 32%|███▏      | 61/191 [5:40:42<9:19:54, 258.42s/it] 



 32%|███▏      | 62/191 [5:49:28<11:44:22, 327.62s/it]



 33%|███▎      | 63/191 [5:57:39<13:13:14, 371.83s/it]



 34%|███▎      | 64/191 [6:10:05<16:47:02, 475.77s/it]



 35%|███▍      | 66/191 [6:15:26<10:39:48, 307.10s/it]



 36%|███▌      | 69/191 [6:59:17<21:05:35, 622.42s/it]



 37%|███▋      | 71/191 [7:08:41<15:40:26, 470.22s/it]



 38%|███▊      | 73/191 [7:33:07<18:49:14, 574.19s/it]



 39%|███▊      | 74/191 [8:02:07<27:01:03, 831.32s/it]



 40%|███▉      | 76/191 [8:15:21<19:34:12, 612.63s/it]



 41%|████      | 78/191 [8:29:59<15:32:11, 494.97s/it]



 43%|████▎     | 82/191 [8:43:45<9:02:52, 298.83s/it] 



 43%|████▎     | 83/191 [8:57:55<12:42:43, 423.73s/it]



 47%|████▋     | 90/191 [9:16:56<3:43:48, 132.96s/it] 



 48%|████▊     | 92/191 [9:27:32<5:17:33, 192.46s/it]



 49%|████▉     | 94/191 [9:38:34<5:56:37, 220.59s/it]



 52%|█████▏    | 99/191 [9:57:15<3:58:47, 155.73s/it] 



 54%|█████▍    | 103/191 [10:34:55<8:20:18, 341.12s/it] 



 55%|█████▍    | 105/191 [10:51:13<8:41:26, 363.79s/it] 



 56%|█████▌    | 107/191 [11:06:42<9:23:19, 402.37s/it]



 57%|█████▋    | 109/191 [11:19:13<8:19:06, 365.20s/it] 



 58%|█████▊    | 110/191 [11:41:17<13:37:57, 605.89s/it]



 58%|█████▊    | 111/191 [11:52:04<13:42:17, 616.72s/it]



 59%|█████▊    | 112/191 [12:07:23<15:20:51, 699.38s/it]



 60%|█████▉    | 114/191 [12:17:17<10:13:20, 477.93s/it]



 60%|██████    | 115/191 [12:28:50<11:24:31, 540.41s/it]



 61%|██████▏   | 117/191 [12:49:08<10:42:34, 521.00s/it]



 62%|██████▏   | 118/191 [13:06:20<13:38:07, 672.44s/it]



 62%|██████▏   | 119/191 [13:34:03<19:20:46, 967.31s/it]



 63%|██████▎   | 120/191 [13:55:36<20:59:40, 1064.51s/it]



 63%|██████▎   | 121/191 [14:18:51<22:37:18, 1163.40s/it]



 64%|██████▍   | 122/191 [14:28:39<18:59:52, 991.20s/it] 



 64%|██████▍   | 123/191 [14:41:56<17:37:23, 933.00s/it]



 65%|██████▍   | 124/191 [15:09:56<21:31:37, 1156.69s/it]



 65%|██████▌   | 125/191 [15:27:55<20:47:00, 1133.65s/it]



 68%|██████▊   | 130/191 [15:57:00<6:50:09, 403.44s/it]  



 69%|██████▉   | 132/191 [16:18:21<7:24:54, 452.44s/it]



 70%|██████▉   | 133/191 [16:37:57<10:11:28, 632.56s/it]



 70%|███████   | 134/191 [17:07:32<14:45:19, 931.92s/it]



 71%|███████   | 135/191 [17:24:59<14:59:05, 963.32s/it]



 73%|███████▎  | 139/191 [17:37:27<5:07:39, 354.99s/it] 



 74%|███████▍  | 141/191 [17:48:16<4:45:55, 343.11s/it]



 75%|███████▍  | 143/191 [18:05:54<5:04:30, 380.63s/it]



 76%|███████▌  | 145/191 [18:22:58<5:32:26, 433.62s/it]



 77%|███████▋  | 147/191 [18:44:14<5:45:48, 471.56s/it]



 78%|███████▊  | 149/191 [19:23:10<8:07:02, 695.79s/it] 



 79%|███████▉  | 151/191 [19:44:47<7:30:02, 675.07s/it]



 80%|███████▉  | 152/191 [19:58:06<7:37:56, 704.52s/it]



 81%|████████  | 154/191 [20:14:35<5:51:48, 570.49s/it]



 84%|████████▍ | 160/191 [20:43:14<2:16:40, 264.54s/it]



 85%|████████▌ | 163/191 [20:57:45<2:01:16, 259.88s/it]



 86%|████████▌ | 164/191 [21:27:56<4:24:29, 587.75s/it]



 87%|████████▋ | 166/191 [21:57:33<4:30:20, 648.81s/it]



 89%|████████▉ | 170/191 [22:20:26<2:34:34, 441.66s/it]



 90%|████████▉ | 171/191 [22:30:08<2:37:31, 472.58s/it]



 91%|█████████ | 173/191 [22:44:09<2:05:21, 417.88s/it]



 91%|█████████ | 174/191 [22:55:30<2:18:18, 488.14s/it]



 92%|█████████▏| 175/191 [23:04:46<2:15:07, 506.72s/it]



 92%|█████████▏| 176/191 [23:22:46<2:47:11, 668.77s/it]



 93%|█████████▎| 177/191 [23:37:54<2:52:06, 737.61s/it]



 93%|█████████▎| 178/191 [23:47:26<2:29:22, 689.45s/it]



 94%|█████████▎| 179/191 [23:58:35<2:16:40, 683.37s/it]



 95%|█████████▍| 181/191 [24:09:51<1:19:51, 479.12s/it]



 96%|█████████▋| 184/191 [24:40:28<1:04:35, 553.58s/it]



 97%|█████████▋| 185/191 [25:14:10<1:26:16, 862.72s/it]



 97%|█████████▋| 186/191 [25:23:11<1:05:41, 788.34s/it]



 98%|█████████▊| 187/191 [25:52:19<1:08:26, 1026.69s/it]



 98%|█████████▊| 188/191 [26:22:35<1:01:40, 1233.44s/it]



 99%|█████████▉| 189/191 [26:52:41<46:18, 1389.10s/it]  



 99%|█████████▉| 190/191 [27:02:16<19:21, 1161.27s/it]



100%|██████████| 191/191 [27:11:31<00:00, 512.52s/it] 


In [14]:
from joblib import dump, load

# file name of the result dump; it contains the prediction year
joblib_string = 'FinalTask2_NN_' + str(prediction_year) + '_HyperparamctrallIndividual' + '.joblib'

# Persist everything needed to evaluate the run later: the predictions, the country
# list they belong to, the key under which they are stored, the seed and the list
# of zero-fatality countries. The file is written relative to the working directory.
dump([NNet_prediction_list, country_list, pred_year_string, seed, zero_fatalities_country_list],  joblib_string)

['FinalTask2_NN_2021_HyperparamctrallIndividual.joblib']

In [15]:
# Mean CRPS per country (averaged over all prediction steps), only for the
# countries that were predicted with a network (zero-fatality countries excluded).
crps_values_test = [
    np.mean(country_data[pred_year_string][0]['CRPS'][:number_s])
    for country_data in NNet_prediction_list
    if country_data['country_id'] not in zero_fatalities_country_list
]

# average of the per-country means
mean_crps_test = np.mean(crps_values_test)


print('Mean CRPS = ' + str(mean_crps_test) + ' \\'+'\\')
print('')

Mean CRPS = 56.2974998046875 \\

