In [1]:
import pandas as pd
from scipy.stats import nbinom
import numpy as np
from joblib import Parallel, delayed
import warnings

# cdf of the truncated negative binomial distribution
def truncNbinomCdf(y, n, p, log=True):
    """CDF (or log-CDF) of the zero-truncated negative binomial distribution.

    Evaluates the general formula for a lower-truncated distribution,
    ``(F(y) - F(0)) / (1 - f(0))``, where ``F`` and ``f`` are the CDF and PMF
    of ``scipy.stats.nbinom`` with parameters ``n`` and ``p``.

    Parameters
    ----------
    y : scalar or array-like (list, tuple, ndarray)
        Point(s) at which the CDF is evaluated.
    n : scalar
        First ``nbinom`` shape parameter; must be > 0.
    p : scalar
        Second ``nbinom`` shape parameter; must satisfy 0 < p <= 1.
    log : bool, default True
        If true, return ``log(CDF)`` (``-inf`` wherever the CDF is 0).

    Returns
    -------
    scalar or numpy.ndarray
        A scalar for scalar ``y``, otherwise an array. The CDF is 0 for
        ``y <= 0``. NaN (with the shape of ``y``) is returned when ``n`` or
        ``p`` are outside their valid range.
    """
    ## error/input handling part one
    # n and p have to be greater than zero and p <= 1
    if n <= 0 or p <= 0 or p > 1:
        if np.ndim(y) == 0:
            return np.nan
        return np.full(np.shape(y), np.nan)

    ## calculation
    # Normalise the input once: lists and tuples become arrays, scalars become
    # a one-element array so that the masking below works uniformly.
    y_arr = np.asarray(y)
    is_scalar = y_arr.ndim == 0
    y_arr = np.atleast_1d(y_arr)

    f_zero = nbinom.pmf(0, n, p)  # f(0) of the untruncated distribution

    # general formula for lower truncated distributions
    cdf_y = (nbinom.cdf(y_arr, n, p) - nbinom.cdf(0, n, p)) / (1 - f_zero)

    ## error/input handling part two
    # Numerical instabilities can yield nbinom.cdf(y) < nbinom.cdf(0) and thus a
    # (tiny) negative CDF, which would turn into NaN in the log version; clamp
    # those values to 0. Values with y <= 0 are 0 by definition of a
    # zero-truncated count distribution.
    cdf_y = np.where((cdf_y < 0) | (y_arr <= 0), 0.0, cdf_y)

    if log:
        # log(0) = -inf is an expected result here. Silence numpy's
        # floating-point warnings locally instead of editing the process-wide
        # warning filters, so that the caller's warning configuration is left
        # untouched.
        with np.errstate(divide='ignore', invalid='ignore'):
            result = np.log(cdf_y)
    else:
        result = cdf_y

    # hand back a scalar if a scalar was passed in
    return result[0] if is_scalar else result
    

# log_p: bool; if True, the probabilities p are given as log(p)
def qnbinom_trunc(p, nNbinom, pNbinom, log_p=False):
    """Quantile function of the zero-truncated negative binomial distribution.

    For every probability the smallest ``y >= 1`` whose truncated CDF (as
    computed by ``truncNbinomCdf``) reaches that probability is returned.

    Parameters
    ----------
    p : scalar or array-like
        Probabilities, or log-probabilities if ``log_p`` is true.
    nNbinom, pNbinom : scalar
        Parameters of ``scipy.stats.nbinom``; ``nNbinom > 0`` and
        ``0 < pNbinom <= 1`` are required, otherwise NaN is returned.
    log_p : bool, default False
        If true, ``p`` is interpreted as ``log(probability)``.

    Returns
    -------
    scalar or numpy.ndarray
        A scalar if exactly one probability was given, otherwise an array.
        Special cases: probability < 0 or > 1 -> NaN, probability 0 -> 0,
        probability 1 -> inf. If ``f(0) == 0`` the result of ``nbinom.ppf`` is
        returned unchanged (so probability 0 yields -1 in that case).
    """
    # n and p have to be greater than zero and p <= 1
    if nNbinom <= 0 or pNbinom <= 0 or pNbinom > 1:
        if np.ndim(p) == 0:
            return np.nan
        return np.full(np.size(p), np.nan)

    # if f(0) == 0 no truncation is needed
    if nbinom.pmf(0, nNbinom, pNbinom) == 0:
        # nbinom.ppf expects plain probabilities, so log-probabilities have to
        # be transformed back first.
        # For probability 0 scipy returns -1 instead of 0; this is ignored here
        # because the 0-quantile is not meaningful.
        return nbinom.ppf(np.exp(p) if log_p else p, nNbinom, pNbinom)

    # work on a 1-d float array, whatever was handed over
    probs = np.atleast_1d(np.asarray(p, dtype=float))
    n = len(probs)  # number of quantiles

    # log-probabilities (lower tail)
    if log_p:
        logp = probs
    else:
        # log(0) = -inf and log(negative) = nan are handled explicitly below,
        # so the corresponding numpy warnings are silenced locally.
        with np.errstate(divide='ignore', invalid='ignore'):
            logp = np.log(probs)

    # classify the special cases
    na = np.isnan(logp)          # nan  <-> p < 0  -> nan
    neginf = np.isneginf(logp)   # -inf <-> p = 0  -> 0 (due to truncation)
    zero = logp == 0             # 0    <-> p = 1  -> inf
    aboveZero = logp > 0         # > 0  <-> p > 1  -> nan

    quantiles = np.full(n, np.nan)  # nan stays for `na` and `aboveZero`
    quantiles[neginf] = 0
    quantiles[zero] = np.inf

    # entries that need an actual quantile computation
    mask = ~(na | neginf | zero | aboveZero)
    validLogp = logp[mask]

    if validLogp.size > 0:
        p_max = np.exp(np.max(validLogp))  # largest regular probability

        # mean and variance of the untruncated distribution
        dist_mean = (nNbinom * (1 - pNbinom)) / pNbinom
        dist_var = (nNbinom * (1 - pNbinom)) / (pNbinom**2)

        # starting point for the search range: Chebyshev inequality
        upper = int(dist_mean + np.sqrt(dist_var / (1 - p_max)))

        # Suppress RuntimeWarnings while probing the CDF; the previous warning
        # configuration is restored when the block is left.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=RuntimeWarning)

            # The Chebyshev limit belongs to the untruncated distribution. The
            # truncated CDF is never larger than the untruncated one, so the
            # limit may be too small (typically for small nNbinom). Enlarge it
            # until the truncated CDF reaches p_max; stop once the untruncated
            # CDF is numerically 1, because nothing can be gained beyond that.
            while (truncNbinomCdf(upper, nNbinom, pNbinom, log=False) < p_max
                   and nbinom.cdf(upper, nNbinom, pNbinom) < 1.0):
                upper = 2 * upper + 1

            # lower the upper limit in steps of 1000 (saves computation time)
            while truncNbinomCdf(upper - 1000, nNbinom, pNbinom, log=False) > p_max:
                upper = upper - 1000

        # log CDF on the whole candidate range 1..upper
        yarray = np.arange(1, upper + 1)
        logcdf = truncNbinomCdf(yarray, nNbinom, pNbinom)

        # quantile = number of candidates whose CDF is still below the target,
        # +1 because 0 is truncated
        for i in np.flatnonzero(mask):
            quantiles[i] = np.sum(logcdf < logp[i]) + 1

    # a single quantile is handed back as a scalar
    return quantiles[0] if n == 1 else quantiles

In [2]:
# Compare scipy's untruncated quantile function with the zero-truncated one
# for invalid (-1, 2), boundary (0, 1) and regular (0.9, 0.7) probabilities.
print(nbinom.ppf([-1, 0, 1, 2, 0.9, 0.7], 2, 0.1))
print(qnbinom_trunc([-1, 0, 1, 2, 0.9, 0.7], 2, 0.1))

# Probability 0 as a scalar: scipy prints -1.0, the truncated version 0.0.
print(nbinom.ppf(0,2,0.2))
print(qnbinom_trunc(0,2,0.2))

[nan -1. inf nan 36. 22.]
[nan  0. inf nan 36. 22.]
-1.0
0.0


# Extrem wichtig
Es kommt zu numerischen Instabilitäten (siehe das folgende Beispiel), wenn n (Anzahl der Erfolge) nicht gerundet wird!

In [3]:
""" n = 308.3191118596585
p = 0.09206801011370347 """


# Parameter pair for which scipy's CDF is not monotone at the lower end:
# the recorded output shows cdf(2) = 0.0 but cdf(0) = 5e-324.
n = 9252.0
p = 0.9226650803093397

print(nbinom.cdf(2, n, p))
print(nbinom.cdf(0,n,p))
# The difference is negative; this is why truncNbinomCdf clamps negative
# CDF values to 0 before taking the logarithm.
print(nbinom.cdf(2, n, p) - nbinom.cdf(0,n,p))

0.0
5e-324
-5e-324


In [4]:
# Log-CDF for the unstable parameters defined in the previous cell.
truncNbinomCdf(2, n, p, True) 

-inf

## Bisektionsverfahren (alte Variante)

In [5]:
def truncNegBin_PPF(x, n, p, epsilon=1e-6, max_iterations=100):
    """Quantile of the zero-truncated negative binomial distribution by bisection.

    Searches the integers between 0 and 1,000,000,000 for the smallest ``y``
    with ``truncNbinomCdf(y, n, p, False) >= x``.

    Parameters
    ----------
    x : scalar
        Probability for which the quantile is wanted.
    n, p : scalar
        Parameters of ``scipy.stats.nbinom``.
    epsilon : float, default 1e-6
        Ignored; kept only so that existing calls keep working. Stopping as
        soon as the CDF is within ``epsilon`` of ``x`` is not valid for a step
        function: the CDF is constant between integers and flat in the tail,
        so such a stop can happen far away from the quantile (for example at
        the very first midpoint whenever ``x > 1 - epsilon``).
    max_iterations : int, default 100
        Upper limit for the number of bisection steps; about 30 steps are
        enough for the fixed search range.

    Returns
    -------
    numpy.float64
        The quantile. If ``f(0)`` is numerically negligible the result of
        ``nbinom.ppf(x, n, p)`` is returned instead.
    """
    # if f(0)=0 no truncation is needed
    if (1 - nbinom.pmf(0, n, p)) == 1:
        return nbinom.ppf(x, n, p)

    # Integer bracket: CDF(lower_bound) < x <= CDF(upper_bound).
    lower_bound = 0
    upper_bound = 1000000000  # Adjust this based on the expected range of y

    for _ in range(max_iterations):
        if upper_bound - lower_bound <= 1:
            break  # bracket cannot shrink further; upper_bound is the quantile
        y = (lower_bound + upper_bound) // 2
        if truncNbinomCdf(y, n, p, False) < x:
            lower_bound = y
        else:
            upper_bound = y

    # np.ceil keeps the numpy float return type
    return np.ceil(upper_bound)

def calculate_trunc_nbinom_quantile(quantile, n, p):
    """Return ``truncNegBin_PPF(quantile, n, p)`` with its default settings.

    Three-argument wrapper around the bisection routine; the benchmark cell
    uses it as the per-quantile task for ``joblib.Parallel``.
    """
    result = truncNegBin_PPF(quantile, n, p)
    return result

## Vergleich der Funktionen

In [6]:
import time
from scipy.stats import nbinom
from joblib import Parallel, delayed
import numpy as np

# Probability grid 0.001, 0.002, ..., 0.999. It is built from integers so that
# the number of points does not depend on floating-point step accumulation.
quantiles = np.arange(1, 1000) / 1000
quantiles = np.round(quantiles, 3)

# Example usage: mean and variance of the (untruncated) distribution
mean = 9000000.5
var = 373000000006.5

n = (mean**2) / (var - mean) # equivalent to r
p = mean / var
# NOTE(review): for these values f(0) = p**n is far below the smallest
# representable float, so both truncated implementations presumably take their
# "no truncation needed" shortcut and delegate to nbinom.ppf -- confirm with
# parameters where f(0) > 0 before drawing conclusions from the timings.

# time.perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock whose resolution can be too coarse for this.

# Measure execution time for variable 'a' (scipy, untruncated)
start_time_a = time.perf_counter()
a = nbinom.ppf(quantiles, n, p)
end_time_a = time.perf_counter()
execution_time_a = end_time_a - start_time_a

# Measure execution time for variable 'b' (vectorised truncated quantiles)
start_time_b = time.perf_counter()
b = qnbinom_trunc(quantiles, n, p)
end_time_b = time.perf_counter()
execution_time_b = end_time_b - start_time_b

# Measure execution time for variable 'c' (bisection, one parallel task per quantile)
start_time_c = time.perf_counter()
trunc_nbinom_quantiles = Parallel(n_jobs=-1)(delayed(calculate_trunc_nbinom_quantile)(quantile, n, p) for quantile in quantiles)
c = np.array(trunc_nbinom_quantiles)
end_time_c = time.perf_counter()
execution_time_c = end_time_c - start_time_c

print("Execution time for variable 'a':", execution_time_a, "seconds")
print("Execution time for variable 'b':", execution_time_b, "seconds")
print("Execution time for variable 'c':", execution_time_c, "seconds")

Execution time for variable 'a': 0.007998228073120117 seconds
Execution time for variable 'b': 0.009003162384033203 seconds
Execution time for variable 'c': 3.566498279571533 seconds


In [7]:
# Quantiles from scipy's untruncated nbinom.ppf (computed in the timing cell).
a

array([ 7230094.,  7342474.,  7412149.,  7463582.,  7504723.,  7539207.,
        7569010.,  7595331.,  7618954.,  7640424.,  7660130.,  7678366.,
        7695354.,  7711271.,  7726256.,  7740424.,  7753868.,  7766667.,
        7778887.,  7790583.,  7801805.,  7812594.,  7822985.,  7833012.,
        7842701.,  7852079.,  7861167.,  7869985.,  7878551.,  7886880.,
        7894989.,  7902889.,  7910592.,  7918111.,  7925455.,  7932633.,
        7939653.,  7946524.,  7953253.,  7959847.,  7966312.,  7972653.,
        7978877.,  7984988.,  7990991.,  7996891.,  8002691.,  8008396.,
        8014009.,  8019534.,  8024974.,  8030333.,  8035612.,  8040816.,
        8045945.,  8051004.,  8055994.,  8060918.,  8065777.,  8070574.,
        8075311.,  8079989.,  8084611.,  8089177.,  8093691.,  8098152.,
        8102562.,  8106924.,  8111238.,  8115506.,  8119728.,  8123907.,
        8128042.,  8132136.,  8136189.,  8140203.,  8144177.,  8148114.,
        8152015.,  8155879.,  8159708.,  8163503., 

In [8]:
# Quantiles returned by qnbinom_trunc (computed in the timing cell).
b

array([ 7230094.,  7342474.,  7412149.,  7463582.,  7504723.,  7539207.,
        7569010.,  7595331.,  7618954.,  7640424.,  7660130.,  7678366.,
        7695354.,  7711271.,  7726256.,  7740424.,  7753868.,  7766667.,
        7778887.,  7790583.,  7801805.,  7812594.,  7822985.,  7833012.,
        7842701.,  7852079.,  7861167.,  7869985.,  7878551.,  7886880.,
        7894989.,  7902889.,  7910592.,  7918111.,  7925455.,  7932633.,
        7939653.,  7946524.,  7953253.,  7959847.,  7966312.,  7972653.,
        7978877.,  7984988.,  7990991.,  7996891.,  8002691.,  8008396.,
        8014009.,  8019534.,  8024974.,  8030333.,  8035612.,  8040816.,
        8045945.,  8051004.,  8055994.,  8060918.,  8065777.,  8070574.,
        8075311.,  8079989.,  8084611.,  8089177.,  8093691.,  8098152.,
        8102562.,  8106924.,  8111238.,  8115506.,  8119728.,  8123907.,
        8128042.,  8132136.,  8136189.,  8140203.,  8144177.,  8148114.,
        8152015.,  8155879.,  8159708.,  8163503., 

In [9]:
# Quantiles from the parallel bisection run (computed in the timing cell).
c

array([ 7230094.,  7342474.,  7412149.,  7463582.,  7504723.,  7539207.,
        7569010.,  7595331.,  7618954.,  7640424.,  7660130.,  7678366.,
        7695354.,  7711271.,  7726256.,  7740424.,  7753868.,  7766667.,
        7778887.,  7790583.,  7801805.,  7812594.,  7822985.,  7833012.,
        7842701.,  7852079.,  7861167.,  7869985.,  7878551.,  7886880.,
        7894989.,  7902889.,  7910592.,  7918111.,  7925455.,  7932633.,
        7939653.,  7946524.,  7953253.,  7959847.,  7966312.,  7972653.,
        7978877.,  7984988.,  7990991.,  7996891.,  8002691.,  8008396.,
        8014009.,  8019534.,  8024974.,  8030333.,  8035612.,  8040816.,
        8045945.,  8051004.,  8055994.,  8060918.,  8065777.,  8070574.,
        8075311.,  8079989.,  8084611.,  8089177.,  8093691.,  8098152.,
        8102562.,  8106924.,  8111238.,  8115506.,  8119728.,  8123907.,
        8128042.,  8132136.,  8136189.,  8140203.,  8144177.,  8148114.,
        8152015.,  8155879.,  8159708.,  8163503., 

In [10]:
# Element-wise comparison of the qnbinom_trunc and bisection results.
b == c

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

## Laufzeit der Funktionen überprüfen

In [11]:
def your_function_to_profile(quantiles, n, p):
    """Profiling target: evaluate ``qnbinom_trunc`` for the given arguments."""
    profiled_result = qnbinom_trunc(quantiles, n, p)
    return profiled_result

In [12]:
import cProfile
import pstats

# Profile a single call of the wrapper and dump the raw statistics to the
# file 'profile_results'.
cProfile.run('your_function_to_profile(quantiles, n, p)', 'profile_results')

# Read the profiling results back, order them by the time spent inside each
# function itself (tottime) and print the table.
stats = pstats.Stats('profile_results')
stats.sort_stats('tottime').print_stats()

Tue Jul 11 09:52:22 2023    profile_results

         304 function calls (302 primitive calls) in 0.008 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.006    0.006    0.006    0.006 c:\Users\Tobias\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_discrete_distns.py:364(_ppf)
        1    0.000    0.000    0.007    0.007 c:\Users\Tobias\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\stats\_distn_infrastructure.py:3598(ppf)
        4    0.000    0.000    0.000    0.000 c:\Users\Tobias\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\core\fromnumeric.py:69(_wrapreduction)
        9    0.000    0.000    0.000    0.000 c:\Users\Tobias\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\lib\stride_tricks.py:340(_broadcast_to)
        1    0.000    0.000    0.001    0.001 c:\Users\Tobias\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy

<pstats.Stats at 0x290de57f510>