In [2]:
import numpy as np
import pandas as pd
from scipy.stats import nbinom
from scipy.optimize import minimize_scalar
from scipy.special import nbdtrik
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
def find_p(N, bezug, F_observed):
    """
    Fit the Negative Binomial success probability p for a fixed N.

    Searches p in [0.01, 0.99] so that the model probability of demand
    staying below the supply, P(X <= bezug - 1) for X ~ NB(N, p), is as
    close as possible to the empirically observed uncensored fraction.

    Parameters
    ----------
    N : number-of-successes parameter handed to scipy's ``nbinom``.
    bezug : supply level; the CDF is evaluated at ``bezug - 1``.
    F_observed : observed fraction of uncensored observations.

    Returns
    -------
    The p at which the bounded scalar minimiser stopped.
    """
    def cdf_gap(p):
        # Absolute distance between model CDF and the observed fraction.
        model_prob = nbinom.cdf(bezug - 1, N, p)
        return abs(model_prob - F_observed)

    fit = minimize_scalar(cdf_gap, method='bounded', bounds=(0.01, 0.99))
    return fit.x


def uncensor(N_hat, p_hat, s, max):
    '''
    Estimate the demand behind a censored observation.

    Computes E[X | s <= X < max] for X ~ NB(N_hat, p_hat) and rounds it to
    the nearest integer. This is E[X | X >= s] with the support capped at
    ``max``.

    The expectation is normalised by the probability mass that is actually
    summed (the mass on [s, max)). Normalising a truncated sum by the
    untruncated tail P(X >= s) underestimates the expectation whenever mass
    lies beyond ``max`` and can even produce a value below ``s``.

    Parameters
    ----------
    N_hat, p_hat : Negative Binomial parameters as used by scipy's ``nbinom``.
    s : lower bound of the conditioning event (inclusive).
    max : upper demand limit (exclusive). The name shadows the builtin but
        is kept because callers pass it by keyword.

    Returns
    -------
    int : the rounded conditional expectation, or ``s`` unchanged when the
    mass on [s, max) is negligible or not a number (fallback: don't uncensor).
    '''
    X_vals = np.arange(s, max)
    pmf = nbinom.pmf(X_vals, N_hat, p_hat)
    mass_in_range = np.sum(pmf)
    # `not (a >= b)` instead of `a < b` so that a NaN mass also takes the fallback.
    if not (mass_in_range >= 1e-8):
        return s  # fallback: don't uncensor
    expected = np.sum(X_vals * pmf) / mass_in_range
    return int(np.round(expected))


def compute_NB_result(group_key_data):
    '''
    Given one hierarchical group, calculate the Agrawal demand.

    A Negative Binomial NB(N, p) is fitted to the group by matching two
    moments of the observed data:
      * p (via find_p) so that P(X <= bezug - 1) equals the fraction of
        rows with Zensiert == 0, and
      * N so that the model mean conditional on X <= bezug - 1 equals the
        mean 'Verkauf' of the rows with Zensiert == 0.
    The fitted distribution is then passed to uncensor to get the demand
    estimate for the group's censored rows.

    Parameters
    ----------
    group_key_data : tuple ``((bezug, period), group)`` as yielded by
        ``DataFrame.groupby(['Bezug', 'Period'])``; ``group`` needs the
        columns 'Bezug', 'Verkauf' and 'Zensiert'.

    Returns
    -------
    dict with keys 'Period', 'Bezug' and 'Agrawal_Demand'. The demand is
    NaN when the group has no rows left after dropping missing values.
    '''
    def objective(N, bezug, x_bar, F_observed):
        p = find_p(N, bezug, F_observed)
        prob_uncensored = nbinom.cdf(bezug - 1, N, p)
        # If the probability of demand being uncensored is zero the
        # conditional mean is undefined; return a high objective value.
        if prob_uncensored == 0.0:
            return 10**8
        expect = nbinom.expect(lambda x: x, args=(N, p), lb=0, ub=bezug-1)
        mean_uncensored = expect / prob_uncensored
        return abs(mean_uncensored - x_bar)

    (bezug, period), group = group_key_data
    group = group.dropna(subset=['Bezug', 'Verkauf'])

    # Nothing to fit: avoid the division by zero below.
    if len(group) == 0:
        return {'Period': period, 'Bezug': bezug, 'Agrawal_Demand': np.nan}

    group_uncensored = group[group['Zensiert']==0]
    x_bar = group_uncensored['Verkauf'].mean()
    F_observed = len(group_uncensored)/len(group)

    result = minimize_scalar(lambda N: objective(N,bezug,x_bar,F_observed), bounds=(0.1, 500), method='bounded')
    N_hat = result.x
    p_hat = find_p(N_hat, bezug, F_observed)

    # Upper demand limit for the conditional expectation.
    ub = max(100, bezug*2)
    demand = uncensor(N_hat, p_hat, bezug, max = ub)
    # Emit text and newline in a single write: this function runs on worker
    # threads, and a separate newline write lets lines from different
    # workers get spliced together.
    print(f'Period: {period}, Bezug: {bezug}, N: {N_hat}, p: {p_hat}\n', end='')
    return {'Period': period, 'Bezug': bezug, 'Agrawal_Demand': demand}


def agrawal_parallel_hierarchical(df, max_cpus=None):
    '''
    Group magazine data by Bezug and Period and run the Agrawal algorithm
    on every group using a thread pool.

    Parameters
    ----------
    df : DataFrame with the columns 'Bezug', 'Period', 'Verkauf' and
        'Zensiert'. It is not modified.
    max_cpus : passed as ``max_workers`` to the ThreadPoolExecutor.
        NOTE(review): the workers are threads, not processes, so this is
        not necessarily a CPU count - confirm this is intended.

    Returns
    -------
    A new DataFrame holding the input rows plus the column
    'Agrawal_Demand'. The group estimate is used only for rows with a
    truthy 'Zensiert' in groups whose uncensored fraction is neither 0
    nor 1; every other row gets its 'Verkauf' value.
    '''
    keys = ['Bezug', 'Period']

    #Step 1: calculate Agrawal demand
    # Work on a sorted copy so the caller's frame is left untouched.
    df_sorted = df.sort_values(keys)
    grouped = list(df_sorted.groupby(keys))

    if not grouped:
        # No group to fit (empty input or only missing keys). There is no
        # estimate to merge in: censored rows get NaN, others keep Verkauf.
        df_sorted['Agrawal_Demand'] = np.where(df_sorted['Zensiert'], np.nan, df_sorted['Verkauf'])
        return df_sorted

    with ThreadPoolExecutor(max_workers=max_cpus) as executor:
        results = list(executor.map(compute_NB_result, grouped))
    df_results = pd.DataFrame(results)

    #Step 2: calculate hierarchical group uncensored fraction
    # Reuse the groups from step 1 instead of grouping a second time.
    df_group = pd.DataFrame([
        {
            'Bezug': bezug,
            'Period': period,
            'Uncensored_fraction': (group['Zensiert'] == 0).sum() / len(group),
        }
        for (bezug, period), group in grouped
    ])
    df_results = pd.merge(df_results, df_group, how='outer', on=keys)

    #Step 3: add results to original data
    df_join = pd.merge(df_sorted, df_results, how="outer", on=keys)

    #only use Agrawal estimation for censored data (use real sales value for not censored data)
    df_join['Agrawal_Demand'] = np.where(df_join['Zensiert'], df_join['Agrawal_Demand'], df_join['Verkauf'])

    #only use Agrawal estimation for valid results (uncensored fraction is not 0 or 1)
    degenerate = (df_join['Uncensored_fraction']==0.0)|(df_join['Uncensored_fraction']==1.0)
    df_join['Agrawal_Demand'] = np.where(degenerate, df_join['Verkauf'], df_join['Agrawal_Demand'])

    return df_join.drop(columns=['Uncensored_fraction'])


In [None]:
# Load the test file and estimate the uncensored demand for every
# (Bezug, Period) group.
input_csv = 'original_data/I_20250212_ZQ0.35_ZG0.4_testfile.csv'
df = pd.read_csv(input_csv)
df_agrawal = agrawal_parallel_hierarchical(df)

Period: 2022-004, Bezug: 1.0, N: 499.9999865216339, p: 0.19689626512644393Period: 2023-003, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332

Period: 2022-005, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2022-006, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2023-002, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2023-005, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2023-008, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2022-008, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2022-004, Bezug: 2.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2023-007, Bezug: 1.0, N: 499.9999865216339, p: 0.9899941583562332
Period: 2023-001, Bezug: 1.0, N: 499.9999865216339, p: 0.19689626512644393Period: 2023-001, Bezug: 2.0, N: 1.7631890801979926, p: 0.4328460875295658

Period: 2023-004, Bezug: 1.0, N: 499.9999865216339, p: 0.19689626512644393
Period: 2023-003, Bezug: 2.0, N: 1

### Explore Results

In [None]:
#Analyze poor results (invalid/didn't converge)
# For each result file, report the share of rows (weighted by group length)
# that belong to groups where the fit ended at the search bounds, where the
# uncensored fraction is 0 or 1, or either of the two.
for letter in 'ABCDEFGHI':
    df = pd.read_csv('result_files/'+letter+'_20250212_ZQ0.35_ZG0.4_testfile_Agrawal.csv')
    group_length = df['Group_length']

    # Build each row filter once and reuse it.
    at_bounds = (df['N']>499) & (df['p']>0.98)
    degenerate = (df['Uncensored_fraction']==0.0) | (df['Uncensored_fraction']==1.0)

    total = group_length.sum()
    no_converge = group_length[at_bounds].sum()
    no_all_uncensored = group_length[degenerate].sum()
    bad_results = group_length[degenerate | at_bounds].sum()

    print(f"{letter}: {no_converge}/{total} = {no_converge/total}")
    print(f"{letter}: {no_all_uncensored}/{total} = {no_all_uncensored/total}")
    print(f"{letter}: {bad_results}/{total} = {bad_results/total}")

A: 50525/277894 = 0.1818139290520846
A: 12258/277894 = 0.04411034423197334
A: 56130/277894 = 0.20198349010773892
B: 29761/200524 = 0.1484161496878179
B: 7588/200524 = 0.03784085695477848
B: 31437/200524 = 0.15677425146117174
C: 33825/199383 = 0.16964836520666254
C: 6480/199383 = 0.032500263312318506
C: 34143/199383 = 0.1712432855358782
D: 7629/76163 = 0.10016674763336528
D: 3183/76163 = 0.04179194622060581
D: 9245/76163 = 0.12138439924897916
E: 5871/68555 = 0.08563926774122967
E: 4811/68555 = 0.0701772299613449
E: 7968/68555 = 0.11622784625483189
F: 10561/67764 = 0.1558497137122956
F: 2458/67764 = 0.03627294728764536
F: 11853/67764 = 0.17491588454046397
G: 5772/55162 = 0.10463725028099054
G: 4519/55162 = 0.08192233784126754
G: 10103/55162 = 0.18315144483521265
H: 2106/27177 = 0.07749199690915112
H: 1913/27177 = 0.07039040365014534
H: 3586/27177 = 0.13194981050152702
I: 2424/21595 = 0.11224820560314888
I: 1623/21595 = 0.07515628617735587
I: 3074/21595 = 0.1423477656865015
