In [88]:
import numpy as np
import math
from scipy.stats import nbinom
from joblib import Parallel, delayed

def truncNegBin_PDF(y, n, p):
    """Probability mass function of the zero-truncated negative binomial.

    The untruncated mass ``nbinom.pmf(y, n, p)`` is rescaled by
    ``1 - nbinom.pmf(0, n, p)`` so that the masses on ``y = 1, 2, ...`` sum
    to one.  Values of ``y`` below 1 lie outside the truncated support and
    get probability 0.

    Parameters
    ----------
    y : int or array-like of int
        Point(s) at which to evaluate the mass function.
    n, p : float
        Parameters of ``scipy.stats.nbinom`` (``n`` may be non-integer).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Truncated probability mass, same shape as ``y``.
    """
    one_f_zero = float(1 - nbinom.pmf(0, n, p))
    pmf = nbinom.pmf(y, n, p) / one_f_zero
    # Without this mask y = 0 would receive the positive mass
    # f(0) / (1 - f(0)), although 0 is excluded by the truncation.
    # The trailing [()] turns a 0-d result back into a scalar.
    return np.where(np.asarray(y) >= 1, pmf, 0.0)[()]

def truncNegBin_CDF(y, n, p):
    """Cumulative distribution function of the zero-truncated negative binomial.

    For ``y > 0`` the untruncated CDF is shifted by its value at 0 and
    rescaled by ``1 - f(0)``; for any other ``y`` the result is 0.
    ``y`` must be a scalar because it is used in a plain ``if`` test.
    """
    mass_at_zero = nbinom.pmf(0, n, p)
    if not y > 0:
        return 0
    # probability of landing in 1..y under the untruncated distribution
    shifted = nbinom.cdf(y, n, p) - nbinom.cdf(0, n, p)
    return shifted / (1 - mass_at_zero)
    
def truncNegBin_CDF2(a,n,p):
    """Truncated CDF obtained by summing the truncated PMF over 1..a.

    Returns 0 for ``a <= 0``.  ``a`` has to be an integer because it is
    used as the end of a ``range``; ``p`` is converted to ``float`` first.
    """
    p = float(p)
    if a <= 0:
        return 0
    # one truncated mass per support point 1, 2, ..., a
    masses = [truncNegBin_PDF(k, n, p) for k in range(1, a + 1)]
    return np.sum(np.array(masses))

def truncNegBin_PPF(x, n, p, epsilon=1e-6, max_iterations=100):
    """Quantile function (inverse CDF) of the zero-truncated negative binomial.

    Returns the smallest integer ``k >= 1`` whose truncated CDF reaches
    ``x``.  With ``f0 = nbinom.pmf(0, n, p)`` the truncated CDF is
    ``(F(k) - f0) / (1 - f0)``, so the condition "truncated CDF >= x" is the
    same as ``F(k) >= f0 + x * (1 - f0)``.  The quantile is therefore read
    directly from ``nbinom.ppf`` at that shifted level.

    This replaces a bisection over real-valued ``y`` that stopped as soon as
    the (step-shaped) CDF was within ``epsilon`` of ``x`` and could then
    return a value one or more steps away from the true quantile.

    Parameters
    ----------
    x : float or array-like of float
        Probability level(s) in ``[0, 1]``.
    n, p : float
        Parameters of ``scipy.stats.nbinom``.
    epsilon, max_iterations :
        Accepted for backward compatibility only; the closed-form
        computation does not use them.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Quantile(s) as floats: at least 1 for ``x`` in ``[0, 1)``, ``inf``
        for ``x == 1`` and ``nan`` for levels outside ``[0, 1]``.
    """
    x_arr = np.asarray(x, dtype=float)
    f_zero = nbinom.pmf(0, n, p)

    # Algebraically f_zero + x * (1 - f_zero); written this way it is exact
    # at both ends (x = 0 -> f_zero, x = 1 -> 1).
    shifted = x_arr + f_zero * (1.0 - x_arr)

    # The truncated support starts at 1, so never report 0 (or scipy's -1
    # for level 0).
    quantile = np.maximum(nbinom.ppf(shifted, n, p), 1.0)

    in_range = (x_arr >= 0) & (x_arr <= 1)
    # [()] converts a 0-d result back into a scalar for scalar input.
    return np.where(in_range, quantile, np.nan)[()]

def calculate_trunc_nbinom_quantile(quantile, n, p):
    """Return the zero-truncated negative binomial quantile for one level.

    Thin module-level wrapper around ``truncNegBin_PPF`` that is handed to
    ``joblib.delayed`` in the cells below.
    """
    result = truncNegBin_PPF(x=quantile, n=n, p=p)
    return result








# Negative binomial parameters obtained from a mean of 1000 and a variance of 8000.
mean, var = 1000, 8000

p = mean / var
n = (mean * mean) / (var - mean)  # equivalent to r

# Print the untruncated CDF and both truncated CDF variants at y = 10 for comparison.
print(nbinom.cdf(10, n, p))
print(truncNegBin_CDF(10, n, p))
print(truncNegBin_CDF2(10, n, p))

3.664901396367409e-115
3.664901396367399e-115
3.664901396367431e-115


#### Check that the inverse function gives the same result as the naive way of calculating the quantiles

In [100]:
from scipy.stats import nbinom



# Example usage
mean, var = 100, 200

p = mean / var
n = (mean * mean) / (var - mean)  # equivalent to r

x = 0.001
inverse_value = truncNegBin_PPF(x, n, p)
print(inverse_value)

# Naive reference: accumulate the truncated PMF from y = 1 upwards and take
# the first y whose running total reaches x (the +100 is a safety margin so
# that the crossing point lies inside the evaluated range).
y_values = range(1, int(inverse_value) + 100)
probabilities = np.array([truncNegBin_PDF(k, n, p) for k in y_values])
cdf_array = probabilities.cumsum()
quantile = np.argmax(cdf_array >= x) + 1  # +1 because y_values starts at 1
print(quantile)

61.0
61


In [140]:
# calculation of quantiles
# Probability grid 0.001, 0.002, ..., 0.999; rounding to three decimals
# removes the binary floating-point drift produced by np.arange.
quantiles = list(np.round(np.arange(0.001, 0.9999, 0.001), 3))

# One truncated quantile per grid point, dispatched through joblib.
# (A plain list comprehension over truncNegBin_PPF is the slow alternative.)
trunc_nbinom_quantiles = np.array(
    Parallel(n_jobs=-1)(
        delayed(calculate_trunc_nbinom_quantile)(level, n, p) for level in quantiles
    )
)


In [102]:
# Show the last element of the computed quantile array.
trunc_nbinom_quantiles[-1]

148.0

## Hurdle Model

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import os
from scipy.stats import nbinom
from scipy.stats import poisson
import CRPS.CRPS as pscore

# create the feature- and actuals-data list
# set the feature and actuals year lists
feature_years = ['2017','2018','2019','2020']
actual_years = ['2018','2019','2020','2021']

actuals_df_list = []
features_df_list = []

# Directory the notebook is executed from; the parquet files are read from
# its "data" sub-folder.
absolute_path = os.path.abspath('')

for feature_year, actual_year in zip(feature_years, actual_years):
    # Build the paths from separate components so that they work on every
    # operating system (a literal "data\c..." relies on the Windows
    # separator and contains the invalid escape sequence "\c").
    path_features = os.path.join(absolute_path, 'data', 'cm_features_to_oct' + feature_year + '.parquet')
    path_actuals = os.path.join(absolute_path, 'data', 'cm_actuals_' + actual_year + '.parquet')

    # append datasets to the lists
    actuals_df_list.append({'year': actual_year, 'data': pd.read_parquet(path_actuals, engine='pyarrow')})
    features_df_list.append({'year': feature_year, 'data': pd.read_parquet(path_features, engine='pyarrow')})

# concat the feature datasets, so that every data contains the observations starting with january 1990
# (each entry becomes the concatenation of all earlier entries and itself)
for i in range(1,len(features_df_list)):
    features_df_list[i]['data'] = pd.concat([features_df_list[i-1]['data'], features_df_list[i]['data']])

def check_last_nMonths(n, country_id, yearindex):
    """Check whether a country has feature rows for each of the last ``n`` months.

    Reads the module-level ``country_feature_group_list``,
    ``actuals_df_list`` and ``features_df_list`` at position ``yearindex``.

    Returns ``False`` when
    * the country's last ``month_id`` in the feature data is not exactly
      3 months before the first ``month_id`` of the actuals data,
    * the window of ``n`` months would start before the first month of the
      feature data, or
    * any month of that window is missing for the country.
    Otherwise returns ``True``.
    """
    country = country_feature_group_list[yearindex].get_group(country_id)
    country_months = country.index.get_level_values('month_id')

    # reference month: first month of the corresponding actuals dataset
    first_actual_month = actuals_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()[0]

    # the country's feature data has to end 3 months before the actuals start
    if country_months.unique().tolist()[-1] != first_actual_month - 3:
        return False

    all_months = features_df_list[yearindex]['data'].index.get_level_values('month_id').unique().tolist()
    first_month, last_month = all_months[0], all_months[-1]
    window_start = last_month - n + 1

    # the requested window must not reach back beyond the available data
    if window_start < first_month:
        return False

    # every single month of the window has to be present for the country
    return all(month in country_months for month in range(window_start, last_month + 1))

def nBinom_quantiles(featureSeries, w, quantiles):
    """Moment-matched negative binomial / Poisson quantiles of a count series.

    Parameters
    ----------
    featureSeries : pandas.Series or pandas.DataFrame
        With ``w`` equal to ``'None'`` (or ``None``) the object itself is
        used as the sample.  Otherwise the ``'ged_sb'`` column of the last
        ``w`` rows is used.
    w : int, None or the string 'None'
        Window length; ``'None'`` / ``None`` selects the whole series.
    quantiles : array-like of float
        Probability levels handed to the ``ppf`` functions.

    Returns
    -------
    dict
        ``'fatalities'`` (list with one value per level), ``'dist'``
        (``'NBinom'``, ``'Pois'`` or ``'None'``), ``'mean'`` and ``'var'``
        of the sample.
    """
    if w is None or (isinstance(w, str) and w == 'None'):
        observations = featureSeries
    else:
        observations = featureSeries.tail(w).loc[:, 'ged_sb']

    # calculate n (r) and p via average/variance
    mean = observations.mean()
    var = observations.var()

    if var != 0 and var > mean:
        # overdispersed sample: negative binomial matched to mean and variance
        n = (mean**2) / (var - mean)  # equivalent to r
        p = mean / var
        dummy_fatalities_list = nbinom.ppf(quantiles, n, p).tolist()
        dist_string = 'NBinom'

    elif var != 0 and var <= mean:
        # no overdispersion: Poisson with the sample mean
        dummy_fatalities_list = poisson.ppf(quantiles, mean).tolist()
        dist_string = 'Pois'

    else:
        # Zero (or undefined) variance: all-zero prediction.  The length
        # follows the requested levels instead of a hard-coded 999.
        # NOTE(review): this branch is also taken for a constant non-zero
        # sample - confirm that zeros are the intended forecast there.
        dummy_fatalities_list = [0] * int(np.size(quantiles))
        dist_string = 'None'

    return {'fatalities': dummy_fatalities_list, 'dist': dist_string, 'mean': mean, 'var': var}

#--------------------------------------------------------------------------------------------
# Because of the concatenation only the last dataframe is used (later on the
# appended months are dropped for datasets before 2020).

# IF THIS CODE IS USED FOR THE 2024 PREDICTION ADJUST THE features_1990to2020 in the whole file to 1990to2023

features_1990to2020_df = features_df_list[3]['data']
country_list = sorted(features_1990to2020_df.index.get_level_values('country_id').unique().tolist())

# One groupby-by-country object per dataset, for features and actuals alike.
country_feature_group_list = [
    entry['data'].groupby('country_id') for entry in features_df_list
]
country_actual_group_list = [
    actuals_df_list[idx]['data'].groupby('country_id') for idx in range(len(features_df_list))
]

In [None]:
def _zero_truncated_ppf(ppf, f_zero, u):
    """Quantiles of a zero-truncated count distribution.

    ``ppf`` is the quantile function of the untruncated distribution and
    ``f_zero`` its probability mass at 0.  The truncated CDF reaches ``u``
    exactly where the untruncated CDF reaches ``f_zero + u * (1 - f_zero)``.
    """
    # Same value as f_zero + u * (1 - f_zero), but exact at u = 0 and u = 1.
    shifted = u + f_zero * (1.0 - u)
    # The truncated support starts at 1.
    return np.maximum(ppf(shifted), 1.0)


def hurdleModel_quantiles(featureSeries, w=None, quantiles=None):
    """Quantiles of a hurdle model: a point mass at 0 plus a zero-truncated count part.

    The model CDF is ``P(0) + (1 - P(0)) * F_trunc(y)`` for ``y >= 1``.
    A level ``q`` therefore maps to 0 when ``q <= P(0)`` and otherwise to
    the truncated quantile at the rescaled level ``(q - P(0)) / (1 - P(0))``.

    Parameters
    ----------
    featureSeries : pandas.Series or pandas.DataFrame
        With ``w`` equal to ``None`` (or the string ``'None'``) the object
        itself is used as the sample.  Otherwise the ``'ged_sb'`` column of
        the last ``w`` rows is used.
    w : int, None or the string 'None'
        Window length; must be > 0 when given.
    quantiles : array-like of float
        Probability levels.  Required; the ``None`` default only exists so
        that ``w`` can keep its default while the positional order
        ``(featureSeries, w, quantiles)`` stays valid Python.

    Returns
    -------
    dict
        ``'fatalities'`` (list, one value per level, in the order of
        ``quantiles``), ``'dist'`` (``'BernoullitruncNBinom'``,
        ``'BernoullitruncPois'``, ``'Bernoulli'`` or ``'None'``),
        ``'mean'`` and ``'var'`` of the sample.

    Raises
    ------
    ValueError
        If ``quantiles`` is missing or ``w`` is not positive.
    """
    if quantiles is None:
        raise ValueError('quantiles must be provided')

    if w is None or (isinstance(w, str) and w == 'None'):
        observations = featureSeries
    elif w <= 0:
        raise ValueError('w has to be > 0')
    else:
        observations = featureSeries.tail(w).loc[:, 'ged_sb']

    # share of zeros and its complement p_t, i.e. the probability that y > 0
    zero_prob = observations.value_counts().get(0, 0) / observations.count()
    p_t = 1 - zero_prob

    # calculate n (r) and p via average/variance
    # TODO (author's note, translated): mean and var should be computed
    # without the zeros.  They are still taken over all observations here.
    mean = observations.mean()
    var = observations.var()

    levels = np.atleast_1d(np.asarray(quantiles, dtype=float))

    if p_t > 0:
        # component 2, y > 0: pick the truncated count distribution
        if var != 0 and var > mean:
            n = (mean**2) / (var - mean)  # equivalent to r
            p = mean / var
            f_zero = nbinom.pmf(0, n, p)
            count_ppf = lambda s: nbinom.ppf(s, n, p)
            dist_string = 'BernoullitruncNBinom'
        elif var != 0 and var <= mean:
            f_zero = poisson.pmf(0, mean)
            count_ppf = lambda s: poisson.ppf(s, mean)
            dist_string = 'BernoullitruncPois'
        else:
            # zero or undefined variance: no count distribution can be fitted
            # (the original author questioned whether this branch is needed)
            return {'fatalities': [0] * levels.size, 'dist': 'None', 'mean': mean, 'var': var}

        # component 1, y = 0: every level up to P(0) maps to 0
        positive = levels > zero_prob
        fatalities = np.zeros(levels.size)
        if positive.any():
            # rescale the remaining levels to the positive component
            rescaled = np.clip((levels[positive] - zero_prob) / p_t, 0.0, 1.0)
            fatalities[positive] = _zero_truncated_ppf(count_ppf, f_zero, rescaled)
        dummy_fatalities_list = fatalities.tolist()

    else:
        # only zeros observed: the forecast is a point mass at 0
        dummy_fatalities_list = [0] * levels.size
        dist_string = 'Bernoulli'

    return {'fatalities': dummy_fatalities_list, 'dist': dist_string, 'mean': mean, 'var': var}



    

In [133]:
features = country_feature_group_list[0].get_group(246)

w = 20

# Step 3: share of zeros among the last w observations of ged_sb
recent_ged_sb = features.tail(w).loc[:, 'ged_sb']
probability_of_zero = recent_ged_sb.value_counts().get(0, 0) / recent_ged_sb.count()
prob_of_bzero = 1 - probability_of_zero

# Print the result
print("Wahrscheinlichkeit für Nullen:", probability_of_zero)
print("Wahrscheinlichkeit für > Null:", prob_of_bzero)


Wahrscheinlichkeit für Nullen: 0.05
Wahrscheinlichkeit für > Null: 0.95


In [145]:
# Example usage
mean = 100
var = 200

n = (mean**2) / (var - mean) # equivalent to r
p = mean / var

quantiles = np.arange(0.001, 0.9999, 0.001)
quantiles = [round(q, 3) for q in quantiles] # due to binary inaccuracies
orig_len_quant = len(quantiles)

p_t = 0.3

quantiles = [q for q in quantiles if q > p_t]

removed_elements_length = orig_len_quant-len(quantiles)  # Länge der entfernten Elemente

zeros_array = np.zeros(removed_elements_length)  # Numpy-Array mit Nullen

trunc_nbinom_quantiles = Parallel(n_jobs=-1)(delayed(calculate_trunc_nbinom_quantile)(quantile, n, p) for quantile in quantiles) #fast way
trunc_nbinom_quantiles = np.array(trunc_nbinom_quantiles)

dummy_fatalities_list = np.concatenate((zeros_array, trunc_nbinom_quantiles)).tolist()

len(dummy_fatalities_list)

999