# Test 1 Censoring

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

path = 'Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv'
df = pd.read_csv(path)

df["is_stockout"] = (df["Zensiert"] == 1)
df = df.dropna(subset=['Verkauf_MBR'])
df = df.dropna(subset=['Verkauf'])

df.head()
len(df)

277894

# N3

In [3]:
# Grouping N3

import pandas as pd
import numpy as np

def apply_n3_uncensoring(df):
    df = df.copy()
    
    # Mark closed observations
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Compute open group means
    open_means = (
        df[~df['is_closed']]
        .groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
        .mean()
        .rename('open_mean')
        .reset_index()
        .round()
    )
    
    # Merge open means back to df
    df = df.merge(open_means, on=['EHASTRA_EH_NUMMER'], how='left')
    
    # Compute the max(mean, observed) for censored rows only
    mask = df['is_closed'] & df['open_mean'].notna()
    df.loc[mask, 'Verkauf_Uncensored'] = np.maximum(df.loc[mask, 'Verkauf_Uncensored'], df.loc[mask, 'open_mean'])
    
    return df.drop(['is_closed', 'open_mean'], axis=1)

# # Naive N3
# def apply_n3_uncensoring(df):
#     """
#     N3 uncensoring: replace censored values with max(current_value, mean_of_uncensored).
#     """
#     df = df.copy()
    
#     # Calculate mean of all uncensored observations
#     uncensored_mean = df[df['Zensiert'] == 0]['Verkauf'].mean()
    
#     # Replace censored values with max(current, uncensored_mean)
#     df.loc[df['Zensiert'] == 1, 'Verkauf'] = np.maximum(
#         df.loc[df['Zensiert'] == 1, 'Verkauf'], 
#         uncensored_mean
#     )
    
#     return df

df_n3_uncensored = apply_n3_uncensoring(df)

Saving file

In [4]:
df_n3_uncensored.to_csv(path[:-4] + '_n3.csv', index=False)
df_n3_uncensored

Unnamed: 0,EHASTRA_EH_NUMMER,Heftjahr,Heftnummer,Period,Bezug_MBR,Verkauf_MBR,Remission_MBR,Bezug,Verkauf,Remission,...,SBC,XYZ,Laenge,Zensierungsquote,Zensierungsgrad,Zensiert,Entgangener Verkauf,Steigung,is_stockout,Verkauf_Uncensored
0,EHA0017186,2022,4,2022-004,5.0,1.0,4.0,5.0,1.0,4.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,1.0
1,EHA0017186,2022,5,2022-005,5.0,0.0,5.0,5.0,0.0,5.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
2,EHA0017186,2022,6,2022-006,5.0,0.0,5.0,5.0,0.0,5.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
3,EHA0017186,2022,7,2022-007,4.0,0.0,4.0,4.0,0.0,4.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
4,EHA0017186,2022,8,2022-008,3.0,0.0,3.0,3.0,0.0,3.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277889,EHJ9923970,2024,25,2024-025,9.0,7.0,2.0,9.0,7.0,2.0,...,S,X,132,0.2735,0.4,0,0.0,0.0085,False,7.0
277890,EHJ9923970,2024,26,2024-026,8.0,8.0,0.0,9.0,8.0,1.0,...,S,X,132,0.2735,0.4,0,0.0,0.0085,False,8.0
277891,EHJ9923970,2024,27,2024-027,10.0,8.0,2.0,7.0,7.0,0.0,...,S,X,132,0.2735,0.4,1,1.0,0.0085,True,10.0
277892,EHJ9923970,2024,28,2024-028,10.0,9.0,1.0,5.0,5.0,0.0,...,S,X,132,0.2735,0.4,1,4.0,0.0085,True,10.0


# N2

In [5]:
# Grouping N2
def apply_n2_uncensoring(df):
    """
    N2 uncensoring with grouping: replace censored values with mean of uncensored values within each group.
    """
    df = df.copy()

    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Calculate mean of uncensored observations for each group
    uncensored_means = (
        df[df['Zensiert'] == 0]
        .groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
        .mean()
        .rename('uncensored_mean')
        .reset_index()
        .round()
    )
    
    # Merge back to original DataFrame
    df = df.merge(uncensored_means, on=['EHASTRA_EH_NUMMER'], how='left')
    
    # Replace censored values with group mean
    df.loc[df['Zensiert'] == 1, 'Verkauf_Uncensored'] = df.loc[df['Zensiert'] == 1, 'uncensored_mean']
    
    return df.drop('uncensored_mean', axis=1)

df_n2_uncensored = apply_n2_uncensoring(df)

# Naive N2 

# def apply_n2_uncensoring(df):
#     """
#     Simple N2 uncensoring: replace censored values with mean of all uncensored values.
#     """
#     df = df.copy()
    
#     # Calculate mean of all uncensored observations
#     uncensored_mean = df[df['Zensiert'] == 0]['Verkauf'].mean()
    
#     # Replace censored values with this mean
#     df.loc[df['Zensiert'] == 1, 'Verkauf'] = uncensored_mean
    
#     return df

# df_n2_uncensored = apply_n2_uncensoring(df)

Saving File

In [6]:
df_n2_uncensored.to_csv(path[:-4] + '_n2.csv', index=False)

# N1

In [7]:
# Grouping N1

import pandas as pd
import numpy as np

def apply_n1_uncensoring(df):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    # Identify closed observations
    is_closed = (df["Zensiert"] == 1)

    # Compute mean Verkauf per group and broadcast using transform
    group_means = df.groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored'].transform('mean').round()
    
    # Replace closed observations
    df.loc[is_closed, 'Verkauf_Uncensored'] = group_means[is_closed]
    
    return df

df_n1_uncensored = apply_n1_uncensoring(df)

# # Naive N1

# def apply_n1_uncensoring(df):
#     """
#     N1 uncensoring: replace censored values with mean of all values.
#     """
#     df = df.copy()
    
#     # Calculate mean of all observations (censored and uncensored)
#     overall_mean = df['Verkauf'].mean()
    
#     # Replace censored values with this mean
#     df.loc[df['Zensiert'] == 1, 'Verkauf'] = overall_mean
    
#     return df

Saving File

In [8]:
df_n1_uncensored.to_csv(path[:-4] + '_n1.csv', index=False)

# EM

In [9]:
import numpy as np
from scipy.stats import poisson
import pandas as pd

def apply_em_uncensoring(df, max_iter=30, tolerance=1e-6):

    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Direct boolean indexing without extra columns
    stockout_condition = (df['Zensiert'] == 1)
    
    # Process groups with minimal overhead
    for (pos), group in df.groupby(['EHASTRA_EH_NUMMER']):
        group_stockout = stockout_condition.loc[group.index]
        
        if not group_stockout.any():
            continue
            
        sales = group['Verkauf_Uncensored'].values
        is_stockout = group_stockout.values
        
        uncensored = sales[~is_stockout]
        censored = sales[is_stockout]
        
        # Quick lambda initialization
        lambda_est = np.mean(uncensored) if len(uncensored) > 0 else np.mean(sales) * 1.5
        lambda_est = max(lambda_est, 0.1)
        
        # Fast EM loop
        for _ in range(max_iter):
            lambda_old = lambda_est
            
            # Batch E-step
            surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
            exact_prob = poisson.pmf(censored, lambda_est)
            surv_prob = np.maximum(surv_prob, 1e-12)
            
            expected = lambda_est + censored * exact_prob / surv_prob
            expected = np.maximum(expected, censored.astype(float))
            
            # M-step
            lambda_est = max(np.mean(np.concatenate([uncensored, expected])), 0.1)
            
            if abs(lambda_est - lambda_old) < tolerance:
                break
        
        # Final update
        surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
        exact_prob = poisson.pmf(censored, lambda_est)
        surv_prob = np.maximum(surv_prob, 1e-12)
        
        final_expected = lambda_est + censored * exact_prob / surv_prob
        final_expected = np.maximum(final_expected, censored.astype(float))
        
        # Update original dataframe
        stockout_indices = group.index[is_stockout]
        df.loc[stockout_indices, 'Verkauf_Uncensored'] = final_expected.round()
    
    return df

df_em_uncensored = apply_em_uncensoring(df) 

Saving File

In [10]:
df_em_uncensored.to_csv(path[:-4] + '_em.csv', index=False)

# PD (Poisson, Grouped by HF & POS)

In [11]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

def apply_projection_detruncation_fixed1(df, tau=0.5, max_iter=20, tolerance=1e-4):
    """
    Proper Projection Detruncation implementation using Poisson distribution.
    
    Parameters:
    - tau: Parameter that controls the aggressiveness of unconstraining (0 < tau < 1)
           tau = 0.5 gives balanced results similar to EM
           smaller tau values are more aggressive
    - max_iter: Maximum number of iterations
    - tolerance: Convergence tolerance
    """
    df = df.copy()
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    def compute_pd_projection1(obs_val, lambda_est, tau):
        """
        Compute the PD projection for a single observation using Poisson distribution.
        This balances area A (original to new estimate) with area B (new estimate to infinity)
        weighted by parameter tau.
        """
        # For Poisson distribution, we need to find the projection value
        # that balances the two areas according to tau
        
        def objective(k_proj):
            k_proj = int(round(k_proj))  # Ensure integer for discrete distribution
            
            if k_proj < obs_val:
                return float('inf')  # Invalid projection
            
            # Area A: P(obs_val <= X <= k_proj) = P(X <= k_proj) - P(X <= obs_val-1)
            area_A = poisson.cdf(k_proj, lambda_est) - poisson.cdf(obs_val - 1, lambda_est)
            
            # Area B: P(X > k_proj) = 1 - P(X <= k_proj)
            area_B = 1 - poisson.cdf(k_proj, lambda_est)
            
            # PD tries to balance: tau * area_A = (1 - tau) * area_B
            # Rearranging: area_A / area_B = (1 - tau) / tau
            if area_B > 1e-10:  # Avoid division by zero
                ratio = area_A / area_B
                target_ratio = (1 - tau) / tau
                return abs(ratio - target_ratio)
            else:
                return abs(area_A - (1 - tau))
        
        # Search for optimal projection in reasonable range
        # For Poisson, search from obs_val to obs_val + reasonable upper bound
        upper_bound = int(obs_val + max(10, int(3 * np.sqrt(lambda_est))))
        
        best_k = obs_val
        best_objective = float('inf')
        
        # Discrete search since Poisson is discrete
        for k in range(int(obs_val), upper_bound + 1):
            obj_val = objective(k)
            if obj_val < best_objective:
                best_objective = obj_val
                best_k = k
        
        return best_k
    
    # Process groups
    grouped = df.groupby(['EHASTRA_EH_NUMMER', 'Heftnummer'])
    
    # can group by different things, just pass in appropriate arguments here 
    for (pos, hf), group in grouped:
        open_mask = ~group['is_closed']
        closed_mask = group['is_closed']
        
        open_sales = group.loc[open_mask, 'Verkauf_Uncensored'].values
        closed_sales = group.loc[closed_mask, 'Verkauf_Uncensored'].values
        
        if len(closed_sales) == 0:
            continue
        
        # Initialize lambda parameter using all available data
        all_sales = group['Verkauf_Uncensored'].values
        lambda_est = np.mean(all_sales)
        
        # Ensure minimum lambda for numerical stability
        lambda_est = max(lambda_est, 0.1)
        
        closed_indices = group[closed_mask].index.values
        
        # Iterative process
        for iteration in range(max_iter):
            lambda_old = lambda_est
            
            # Project closed observations using PD heuristic
            projected_values = np.array([
                compute_pd_projection1(obs, lambda_est, tau) 
                for obs in closed_sales
            ])
            
            # Re-estimate lambda using open + projected values
            all_values = np.concatenate([open_sales, projected_values])
            lambda_est = np.mean(all_values)
            
            # Ensure minimum lambda
            lambda_est = max(lambda_est, 0.1)
            
            # Check convergence
            if abs(lambda_est - lambda_old) < tolerance:
                break
        
        # Final projection
        final_projections = np.array([
            compute_pd_projection1(obs, lambda_est, tau) 
            for obs in closed_sales
        ])
        
        # Update dataframe (already integers from Poisson)
        df.loc[closed_indices, 'Verkauf_Uncensored'] = final_projections.round()
    
    return df.drop('is_closed', axis=1)

def apply_projection_detruncation_aggressive1(df, max_iter=20, tolerance=1e-4):
    """
    More aggressive PD implementation with tau=0.3 as mentioned in the paper
    """
    return apply_projection_detruncation_fixed1(df, tau=0.3, max_iter=max_iter, tolerance=tolerance)

def apply_projection_detruncation_balanced1(df, max_iter=20, tolerance=1e-4):
    """
    Balanced PD implementation with tau=0.5 (similar to EM results)
    """
    return apply_projection_detruncation_fixed1(df, tau=0.5, max_iter=max_iter, tolerance=tolerance)

df_projection_uncensored = apply_projection_detruncation_balanced1(df)

Saving File

In [12]:
df_projection_uncensored.to_csv(path[:-4] + '_pd.csv', index=False)

# Conrad

In [None]:
from scipy.stats import poisson
import pandas as pd
import numpy as np

def berechnung(links, rechts, n, N, r, x_summe, value_tol=0.00001, max_iterations=1000):
    """
    Till's Code
    """
    iteration = 0
    
    while iteration < max_iterations:
        mu = (links + rechts) / 2
        wert_0 = (x_summe - mu * n) * (1 - poisson.cdf(N-1, mu)) + mu * (n - r) * (1 - poisson.cdf(N-2, mu))
        
        if iteration < 3:
            print(f"Iter {iteration}: mu={mu:.4f}, wert_0={wert_0:.8f}")
        
        if abs(wert_0 - 0) < value_tol:
            print(f"Converged after {iteration} iterations: mu={mu:.4f}")
            return mu
        elif wert_0 - 0 < 0:  # Your condition: je kleiner mu desto kleiner wird der wert
            rechts = mu
        elif wert_0 - 0 > 0:
            links = mu
            
        iteration += 1
    
    print(f"✗ Max iterations reached: mu={mu:.4f}")
    return mu

def test_conrad_example():
    links = 1
    rechts = 100
    n = 13
    N = 10
    r = 7
    x_summe = 58
    
    print("Testing with EXACT German implementation:")
    print(f"links={links}, rechts={rechts}")
    print(f"n={n}, N={N}, r={r}, x_summe={x_summe}")
    
    result = berechnung(links, rechts, n, N, r, x_summe)
    print(f"Result: μ = {result:.4f}")
    print(f"Expected: μ ≈ 10.18")
    
    return result

def create_order_specific_mu_dict(df):

    order_specific_mu_dict = {}
    
    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            N = bezug_val
            
            # Count non-stockouts
            stockouts_mask = (group['Verkauf'] == bezug_val)
            r = n - stockouts_mask.sum()  # r = number of NON-stockouts
            
            # x_summe = sum of UNCENSORED observations only
            uncensored_sales = group[~stockouts_mask]['Verkauf']
            x_summe = uncensored_sales.sum()
            
            # Skip problematic cases
            if n < 3:
                continue
            if r == n:  # No stockouts = no censoring information
                continue
            if r == 0:  # All stockouts = no uncensored observations
                continue
            
            try:
                links = 1
                rechts = 100
                mu_est = berechnung(links, rechts, n, N, r, x_summe)
                if mu_est:
                    key = (year, week, bezug_val)
                    order_specific_mu_dict[key] = mu_est
            except Exception as e:
                print(f"Error in week {week}, Bezug {N}: {e}")
                continue

    print(f"Successfully estimated μ for {len(order_specific_mu_dict)} groups")
    return order_specific_mu_dict

def expected_poisson_tail(mu, N, max_k=200):
    """
    Compute E[X | X >= N] for X ~ Poisson(mu)
    """
    k_vals = np.arange(N, max_k)
    pmf = poisson.pmf(k_vals, mu)
    tail_prob = 1 - poisson.cdf(N - 1, mu)
    if tail_prob < 1e-8:
        return N  # fallback: don't uncensor
    return np.sum(k_vals * pmf) / tail_prob

def uncensor_dataset(df, order_specific_mu_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if not row['is_stockout']:
            continue
            
        # Get the order-quantity-specific demand parameter
        key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
        mu = order_specific_mu_dict.get(key, None)

        if mu is None:
            # no estimate available for this specific (week, order_quantity) — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_poisson_tail(mu, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

# Usage
print("Example:")
test_result = test_conrad_example()

print("\nRunning on actual data:")
order_specific_mu_dict = create_order_specific_mu_dict(df)
df_conrad_uncensored = uncensor_dataset(df, order_specific_mu_dict)

Testing Conrad's example first:
Testing with EXACT German implementation:
links=1, rechts=100
n=13, N=10, r=7, x_summe=58
Iter 0: mu=50.5000, wert_0=-295.50000000
Iter 1: mu=25.7500, wert_0=-122.21964104
Iter 2: mu=13.3750, wert_0=-25.84454646
✓ Converged after 22 iterations: mu=10.1819
Result: μ = 10.1819
Expected: μ ≈ 10.18

Now running on your data:
Iter 0: mu=50.5000, wert_0=-101.00000000
Iter 1: mu=25.7500, wert_0=-51.49999998
Iter 2: mu=13.3750, wert_0=-26.74777675
✓ Converged after 23 iterations: mu=3.9797
Iter 0: mu=50.5000, wert_0=-8620.50000000
Iter 1: mu=25.7500, wert_0=-4338.74999833
Iter 2: mu=13.3750, wert_0=-2197.76755762
✓ Converged after 16 iterations: mu=1.8527
Iter 0: mu=50.5000, wert_0=-13048.50000000
Iter 1: mu=25.7500, wert_0=-6539.24997769
Iter 2: mu=13.3750, wert_0=-3283.86478561
✓ Converged after 26 iterations: mu=1.7449
Iter 0: mu=50.5000, wert_0=-6390.00000000
Iter 1: mu=25.7500, wert_0=-3122.99989831
Iter 2: mu=13.3750, wert_0=-1487.71558458
✓ Converged afte

In [49]:
results = calculate_kpis(df_conrad_uncensored, "Conrad")

Method: Conrad, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.169 (overestimation)
- Weighted MAE (α=0): 1.025
- Weighted MAE (α=0.5): 1.132
- Weighted MAE (α=1): 1.344
- Weighted MAE (α=1.5): 1.673
- Accuracy (exact matches): 0.261
- Overestimation Rate: 0.471
- Underestimation Rate: 0.268
- Gini Coefficient: 0.433
Overstock: 45203


In [14]:
df_conrad_uncensored.to_csv(path[:-4] + '_conrad.csv', index=False)

In [15]:
# # Check mean vs variance by magazine
# print("=== MEAN VS VARIANCE BY MAGAZINE ===")
# good_poisson_count = 0
# total_magazines = 0

# for magazine_id in df['EHASTRA_EH_NUMMER'].unique():
#    mag_data = df[df['EHASTRA_EH_NUMMER'] == magazine_id]['Verkauf']
#    mean_val = mag_data.mean()
#    var_val = mag_data.var()
#    ratio = var_val/mean_val
   
#    if 0.7 <= ratio <= 1.3:
#        good_poisson_count += 1
#    total_magazines += 1
   
#    # print(f"Magazine {magazine_id}: Mean={mean_val:.2f}, Variance={var_val:.2f}, Ratio={ratio:.2f}")

# print(f"\nMagazines with Poisson-like ratio (0.7-1.3): {good_poisson_count}/{total_magazines}")

# # Overall (combined)
# print(f"\nCombined: Mean={df['Verkauf'].mean():.2f}, Variance={df['Verkauf'].var():.2f}, Ratio={df['Verkauf'].var()/df['Verkauf'].mean():.2f}")

# Nahmias

In [16]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def compute_mu_sigma_nahmias(sales, S):
    """
    Compute mu and sigma using Nahmias method for censored normal data.
    
    Parameters:
    -----------
    sales : array-like
        Observed sales data (censored at S)
    S : float
        Censoring limit (inventory level)
    
    Returns:
    --------
    tuple: (mu_hat, sigma_hat) or (None, None) if estimation fails
    """
    sales = np.array(sales)
    sales = sales[~np.isnan(sales)]
    
    if len(sales) < 5:  # Need reasonable sample size
        return None, None
    
    # The uncensored observations are those < S
    observed = sales[sales < S]
    n = len(sales)
    r = len(observed)
    
    if r < 2 or r >= n-1:  # Need at least 2 uncensored, 1 censored
        return None, None
        
    p = r / n  # Proportion uncensored
    
    # # Avoid extreme proportions where method is unstable
    # if p < 0.2 or p > 0.8:
    #     return None, None
    
    x_bar = np.mean(observed)
    s2 = np.var(observed, ddof=1)
    
    if s2 <= 1e-6:
        return None, None
    
    try:
        z = norm.ppf(p)
        
        # Check for extreme z values that cause instability
        if abs(z) > 2:
            return None, None
        
        pdf_z = norm.pdf(z)
        
        denominator = 1 - (z * pdf_z / p) - (pdf_z**2 / p**2)
        
        if denominator <= 0.1:  # Need substantial positive denominator
            return None, None
            
        sigma_hat2 = s2 / denominator
        
        if sigma_hat2 <= 0 or sigma_hat2 > 1e6:
            return None, None
            
        sigma_hat = np.sqrt(sigma_hat2)
        mu_hat = x_bar + sigma_hat * pdf_z / p
        
        # Sanity check on estimates
        if abs(mu_hat) > 1e6 or sigma_hat > 1e3:
            return None, None
        
        return mu_hat, sigma_hat
        
    except Exception:
        return None, None

def create_order_specific_nahmias_dict(df):
    """
    Create μ and σ estimates for each (week, order_quantity) combination using Nahmias method.
    Each stockout level gets its own Normal distribution.
    """
    order_specific_mu_dict = {}
    order_specific_sigma_dict = {}

    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            S = bezug_val
            
            # Count stockouts (sales = order quantity)
            stockouts = (group['Verkauf'] == bezug_val).sum()
            
            if n < 5 or stockouts == 0 or stockouts == n:
                continue  # Not enough variation or unusable

            try:
                mu_est, sigma_est = compute_mu_sigma_nahmias(group['Verkauf'], S)
                if mu_est is not None and sigma_est is not None:
                    # Store with (year, week, order_quantity) key
                    key = (year, week, bezug_val)
                    order_specific_mu_dict[key] = mu_est
                    order_specific_sigma_dict[key] = sigma_est
            except Exception as e:
                print(f"Error in week {week}, Bezug {bezug_val}: {e}")
                continue

    return order_specific_mu_dict, order_specific_sigma_dict

def expected_normal_tail(mu, sigma, S, max_iterations=1000):
    """
    Compute E[X | X >= S] for X ~ Normal(mu, sigma)
    Using numerical integration approximation
    """
    if sigma <= 0:
        return S
    
    # Standardize
    z = (S - mu) / sigma
    
    # If S is way above the mean, just return S (no meaningful tail)
    if z > 6:
        return S
    
    # Use the formula: E[X | X >= S] = mu + sigma * phi(z) / (1 - Phi(z))
    # where phi is PDF and Phi is CDF
    tail_prob = 1 - norm.cdf(z)
    
    if tail_prob < 1e-10:  # Essentially no tail probability
        return S
    
    expected_value = mu + sigma * norm.pdf(z) / tail_prob
    
    return expected_value

def uncensor_dataset_nahmias(df, order_specific_mu_dict, order_specific_sigma_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    
    Uses order-quantity-specific Normal(μ, σ) parameters.
    
    Modifies the DataFrame in place by updating the Verkauf column.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if not row['is_stockout']:
            continue
            
        # Get the order-quantity-specific demand parameters
        key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
        mu = order_specific_mu_dict.get(key, None)
        sigma = order_specific_sigma_dict.get(key, None)

        if mu is None or sigma is None:
            # no estimate available for this specific (week, order_quantity) — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_normal_tail(mu, sigma, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

# Usage
order_specific_mu_dict, order_specific_sigma_dict = create_order_specific_nahmias_dict(df)
df_nahmias_uncensored = uncensor_dataset_nahmias(df, order_specific_mu_dict, order_specific_sigma_dict)

In [17]:
df_nahmias_uncensored.to_csv(path[:-4] + '_nahmias.csv', index=False)

# Testing

In [7]:
import pandas as pd
import numpy as np

def calculate_kpis(dataframe, method_name):
   """
   Calculate KPIs for demand forecasting evaluation
   
   Parameters:
   - dataframe: pandas DataFrame containing 'Verkauf' (predicted) and 'Verkauf_MBR' (ground truth) columns
   - method_name: string name of the method being evaluated
   - censorship_pct: censorship percentage for display
   - reduction_pct: reduction percentage for display
   - alpha: weight parameter for Weighted MAE (default=1)
            α = 0: standard MAE (no weighting)
            α = 1: linear weighting by true demand
            α > 1: over-proportional penalization of larger errors --> 1.5
            α < 1: emphasis on smaller demands --> 0.5
   """
   df = dataframe.copy()
   censored_df = df[df['Zensiert'] == 1]
   
   # Extract predicted and true values
   y_pred = censored_df['Verkauf_Uncensored'].values  # ŷᵢ (estimated demand)
   y_true = censored_df['Verkauf_MBR'].values  # yᵢ (true demand)
   
   n = len(y_pred)
   
   # 1. Bias calculation
   bias = np.sum(y_pred - y_true) / n
   
   # 2. Accuracy (exact matches)
   exact_matches = np.sum(y_pred == y_true)
   accuracy = exact_matches / n
   
   # 3. Overestimation Rate
   overestimations = np.sum(y_pred > y_true)
   overestimation_rate = overestimations / n
   
   # 4. Underestimation Rate
   underestimations = np.sum(y_pred < y_true)
   underestimation_rate = underestimations / n
   
   # 5. Weighted MAE for different alpha values
   alphas = [0, 0.5, 1, 1.5]
   weighted_maes = {}
   
   for a in alphas:
       if a == 0:
           # Standard MAE (no weighting)
           weighted_maes[a] = np.mean(np.abs(y_pred - y_true))
       else:
           # Weighted MAE with α parameter
           weights = np.power(y_true, a)
           # Handle case where y_true might be 0
           weights = np.where(y_true == 0, 0, weights)
           weighted_maes[a] = np.sum(weights * np.abs(y_pred - y_true)) / np.sum(weights) if np.sum(weights) > 0 else 0
   
   # 6. Gini Coefficient
   abs_errors = np.abs(y_pred - y_true)
   mean_abs_error = np.mean(abs_errors)
   n = len(abs_errors)
   sorted_errors = np.sort(abs_errors)
   
   weighted_sum = 0.0
   total_sum = 0.0

   # 7. Overstock, out of curiosity
   overstock = np.sum(np.maximum(0, y_pred - y_true))
   
   for i in range(n):
       weighted_sum += (i + 1) * sorted_errors[i]
       total_sum += sorted_errors[i]
   
   gini_coefficient = (2 * weighted_sum) / (n * total_sum) - (n + 1) / n
   
   # Determine bias direction
   bias_direction = "overestimation" if bias > 0 else "underestimation" if bias < 0 else "neutral"
   
   # Print results in the specified format
   print(f"Method: {method_name}, on {path}")
   print(f"- Bias: {bias:.3f} ({bias_direction})")
   print(f"- Weighted MAE (α=0): {weighted_maes[0]:.3f}")
   print(f"- Weighted MAE (α=0.5): {weighted_maes[0.5]:.3f}")
   print(f"- Weighted MAE (α=1): {weighted_maes[1]:.3f}")
   print(f"- Weighted MAE (α=1.5): {weighted_maes[1.5]:.3f}")
   print(f"- Accuracy (exact matches): {accuracy:.3f}")
   print(f"- Overestimation Rate: {overestimation_rate:.3f}")
   print(f"- Underestimation Rate: {underestimation_rate:.3f}")
   print(f"- Gini Coefficient: {gini_coefficient:.3f}")
   print(f"Overstock: {int(overstock)}")
   
   return {
       'bias': bias,
       'weighted_mae_0': weighted_maes[0],
       'weighted_mae_0.5': weighted_maes[0.5],
       'weighted_mae_1': weighted_maes[1],
       'weighted_mae_1.5': weighted_maes[1.5],
       'accuracy': accuracy,
       'overestimation_rate': overestimation_rate,
       'underestimation_rate': underestimation_rate,
       'gini_coefficient': gini_coefficient,
       'overstock': overstock
   }

In [19]:
df['Verkauf_Uncensored'] = df['Verkauf'].copy()
results = calculate_kpis(df, "Baseline")

Method: Baseline, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.953 (underestimation)
- Weighted MAE (α=0): 0.953
- Weighted MAE (α=0.5): 1.227
- Weighted MAE (α=1): 1.625
- Weighted MAE (α=1.5): 2.143
- Accuracy (exact matches): 0.404
- Overestimation Rate: 0.000
- Underestimation Rate: 0.596
- Gini Coefficient: 0.578
Overstock: 0


In [20]:
results = calculate_kpis(df_n1_uncensored, "N1")

Method: N1, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.480 (underestimation)
- Weighted MAE (α=0): 1.196
- Weighted MAE (α=0.5): 1.464
- Weighted MAE (α=1): 1.847
- Weighted MAE (α=1.5): 2.368
- Accuracy (exact matches): 0.308
- Overestimation Rate: 0.209
- Underestimation Rate: 0.483
- Gini Coefficient: 0.524
Overstock: 27114


In [21]:
results = calculate_kpis(df_n2_uncensored, "N2")

Method: N2, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.281 (underestimation)
- Weighted MAE (α=0): 1.273
- Weighted MAE (α=0.5): 1.495
- Weighted MAE (α=1): 1.813
- Weighted MAE (α=1.5): 2.248
- Accuracy (exact matches): 0.266
- Overestimation Rate: 0.266
- Underestimation Rate: 0.468
- Gini Coefficient: 0.493
Overstock: 37548


In [22]:
results = calculate_kpis(df_n3_uncensored, "N3")

Method: N3, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.106 (overestimation)
- Weighted MAE (α=0): 0.886
- Weighted MAE (α=0.5): 1.046
- Weighted MAE (α=1): 1.267
- Weighted MAE (α=1.5): 1.550
- Accuracy (exact matches): 0.448
- Overestimation Rate: 0.266
- Underestimation Rate: 0.285
- Gini Coefficient: 0.608
Overstock: 37548


In [23]:
results = calculate_kpis(df_em_uncensored, "EM")

Method: EM, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 1.090 (overestimation)
- Weighted MAE (α=0): 1.250
- Weighted MAE (α=0.5): 1.378
- Weighted MAE (α=1): 1.542
- Weighted MAE (α=1.5): 1.736
- Accuracy (exact matches): 0.255
- Overestimation Rate: 0.686
- Underestimation Rate: 0.058
- Gini Coefficient: 0.472
Overstock: 88579


In [24]:
results = calculate_kpis(df_projection_uncensored, "PD")

Method: PD, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.043 (overestimation)
- Weighted MAE (α=0): 1.068
- Weighted MAE (α=0.5): 1.212
- Weighted MAE (α=1): 1.424
- Weighted MAE (α=1.5): 1.701
- Accuracy (exact matches): 0.386
- Overestimation Rate: 0.225
- Underestimation Rate: 0.389
- Gini Coefficient: 0.599
Overstock: 42038


In [25]:
results = calculate_kpis(df_nahmias_uncensored, "Nahmias, Hierarchical")

Method: Nahmias, Hierarchical, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.500 (underestimation)
- Weighted MAE (α=0): 0.872
- Weighted MAE (α=0.5): 1.057
- Weighted MAE (α=1): 1.341
- Weighted MAE (α=1.5): 1.734
- Accuracy (exact matches): 0.346
- Overestimation Rate: 0.171
- Underestimation Rate: 0.483
- Gini Coefficient: 0.485
Overstock: 14069


In [26]:
results = calculate_kpis(df_conrad_uncensored, "Conrad, Hierarchical")

Method: Conrad, Hierarchical, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.356 (overestimation)
- Weighted MAE (α=0): 1.135
- Weighted MAE (α=0.5): 1.269
- Weighted MAE (α=1): 1.531
- Weighted MAE (α=1.5): 1.950
- Accuracy (exact matches): 0.293
- Overestimation Rate: 0.504
- Underestimation Rate: 0.204
- Gini Coefficient: 0.464
Overstock: 56454


In [27]:
df_n3_uncensored.to_csv(path[:-4] + '_n3.csv', index=False)
df_n2_uncensored.to_csv(path[:-4] + '_n2.csv', index=False)
df_n1_uncensored.to_csv(path[:-4] + '_n1.csv', index=False)
df_em_uncensored.to_csv(path[:-4] + '_em.csv', index=False)
df_projection_uncensored.to_csv(path[:-4] + '_pd.csv', index=False)
df_conrad_uncensored.to_csv(path[:-4] + '_conrad.csv', index=False)
df_nahmias_uncensored.to_csv(path[:-4] + '_nahmias.csv', index=False)

# Conrad, No Grouping

In [28]:
from scipy.stats import poisson
import pandas as pd
import numpy as np

def compute_mu(left, right, n, N, r, x_sum, value_tol=0.01, interval_tol=0.01, max_iterations=1000000000000000):
    iteration = 0

    while iteration < max_iterations and (right - left) > interval_tol:
        mu = (left + right) / 2
        value_0 = (x_sum - mu * n) * (1 - poisson.cdf(N - 1, mu)) + mu * (n - r) * (1 - poisson.cdf(N - 2, mu))

        # if iteration % 10 == 0:
        #     # print(f"Iteration {iteration}: mu={mu:.4f}, value_0={value_0:.6f}")

        if abs(value_0) < value_tol:
            return mu

        if value_0 < 0:
            right = mu
        else:
            left = mu

        iteration += 1

    # print(f"Stopped after {iteration} iterations: mu={mu:.4f}, value_0={value_0:.6f}")
    return mu


def create_order_specific_mu_dict(df):
    """
    Create μ estimates for each order_quantity combination using Conrad's method.
    Each stockout level gets its own Poisson distribution across the entire dataset.
    """
    order_specific_mu_dict = {}
    total_skipped = 0

    for bezug_val, group in df.groupby('Bezug'):
        n = len(group)
        N = bezug_val
        r = (group['Verkauf'] == bezug_val).sum()  # stockouts
        x_sum = np.minimum(group['Verkauf'], N).sum()

        if n < 2:
            # total_skipped += 1
            # print(f"Skipping: n={n} too small")
            continue
        if r == 0:
            # total_skipped += 1
            # print(f"Skipping: no stockouts in group")
            continue
        if r == n:
            # total_skipped += 1
            # print(f"Skipping: all stockouts in group")
            continue
        
        # bounds are problematic
        try:
            mu_est = compute_mu(0.1, 4, n, N, r, x_sum)
            if mu_est:
                # Store with just order_quantity key
                key = bezug_val
                order_specific_mu_dict[key] = mu_est
        except Exception as e:
            # print(f"Error in Bezug {N}: {e}")
            continue
    #print(f"total skipped: {total_skipped}")

    return order_specific_mu_dict

def expected_poisson_tail(mu, N, max_k=200):
    """
    Compute E[X | X >= N] for X ~ Poisson(mu)
    """
    k_vals = np.arange(N, max_k)
    pmf = poisson.pmf(k_vals, mu)
    tail_prob = 1 - poisson.cdf(N - 1, mu)
    if tail_prob < 1e-8:
        return N  # fallback: don't uncensor
    return np.sum(k_vals * pmf) / tail_prob

def uncensor_dataset(df, order_specific_mu_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    
    Uses order-quantity-specific Poisson(μ) parameters.
    
    Modifies the DataFrame in place by updating the Verkauf column.
    """
    df = df.copy()

    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if not row['is_stockout']:
            continue
            
        # Get the order-quantity-specific demand parameter
        key = row['Bezug']
        mu = order_specific_mu_dict.get(key, None)

        if mu is None:
            # no estimate available for this specific order_quantity — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_poisson_tail(mu, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

# Usage
order_specific_mu_dict = create_order_specific_mu_dict(df)
df_conrad_uncensored_ng = uncensor_dataset(df, order_specific_mu_dict)
results = calculate_kpis(df_conrad_uncensored_ng, "Conrad, No Grouping, Altered")

Method: Conrad, No Grouping, Altered, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.481 (overestimation)
- Weighted MAE (α=0): 1.132
- Weighted MAE (α=0.5): 1.268
- Weighted MAE (α=1): 1.531
- Weighted MAE (α=1.5): 1.950
- Accuracy (exact matches): 0.334
- Overestimation Rate: 0.526
- Underestimation Rate: 0.140
- Gini Coefficient: 0.490
Overstock: 61050


# Nahmias, No Grouping

In [29]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def compute_mu_sigma_nahmias(sales, S):
    """
    Compute mu and sigma using Nahmias method for censored normal data.
    
    Parameters:
    -----------
    sales : array-like
        Observed sales data (censored at S)
    S : float
        Censoring limit (inventory level)
    
    Returns:
    --------
    tuple: (mu_hat, sigma_hat) or (None, None) if estimation fails
    """
    sales = np.array(sales)
    sales = sales[~np.isnan(sales)]
    
    if len(sales) < 5:  # Need reasonable sample size
        return None, None
    
    # The uncensored observations are those < S
    observed = sales[sales < S]
    n = len(sales)
    r = len(observed)
    
    if r < 2 or r >= n-1:  # Need at least 2 uncensored, 1 censored
        return None, None
        
    p = r / n  # Proportion uncensored
    
    # # Avoid extreme proportions where method is unstable
    # if p < 0.2 or p > 0.8:
    #     return None, None
    
    x_bar = np.mean(observed)
    s2 = np.var(observed, ddof=1)
    
    if s2 <= 1e-6:
        return None, None
    
    try:
        z = norm.ppf(p)
        
        # Check for extreme z values that cause instability
        if abs(z) > 2:
            return None, None
        
        pdf_z = norm.pdf(z)
        
        denominator = 1 - (z * pdf_z / p) - (pdf_z**2 / p**2)
        
        if denominator <= 0.1:  # Need substantial positive denominator
            return None, None
            
        sigma_hat2 = s2 / denominator
        
        if sigma_hat2 <= 0 or sigma_hat2 > 1e6:
            return None, None
            
        sigma_hat = np.sqrt(sigma_hat2)
        mu_hat = x_bar + sigma_hat * pdf_z / p
        
        # Sanity check on estimates
        if abs(mu_hat) > 1e6 or sigma_hat > 1e3:
            return None, None
        
        return mu_hat, sigma_hat
        
    except Exception:
        return None, None

def create_order_specific_nahmias_dict(df):
    """
    Create μ and σ estimates for each order_quantity combination using Nahmias method.
    Each stockout level gets its own Normal distribution across the entire dataset.
    """
    order_specific_mu_dict = {}
    order_specific_sigma_dict = {}

    for bezug_val, group in df.groupby('Bezug'):
        n = len(group)
        S = bezug_val
        
        # Count stockouts (sales = order quantity)
        stockouts = (group['Verkauf'] == bezug_val).sum()
        
        if n < 5 or stockouts == 0 or stockouts == n:
            continue  # Not enough variation or unusable

        try:
            mu_est, sigma_est = compute_mu_sigma_nahmias(group['Verkauf'], S)
            if mu_est is not None and sigma_est is not None:
                # Store with just order_quantity key
                key = bezug_val
                order_specific_mu_dict[key] = mu_est
                order_specific_sigma_dict[key] = sigma_est
        except Exception as e:
            print(f"Error in Bezug {bezug_val}: {e}")
            continue

    return order_specific_mu_dict, order_specific_sigma_dict

def expected_normal_tail(mu, sigma, S, max_iterations=1000):
    """
    Compute E[X | X >= S] for X ~ Normal(mu, sigma)
    Using numerical integration approximation
    """
    if sigma <= 0:
        return S
    
    # Standardize
    z = (S - mu) / sigma
    
    # If S is way above the mean, just return S (no meaningful tail)
    if z > 6:
        return S
    
    # Use the formula: E[X | X >= S] = mu + sigma * phi(z) / (1 - Phi(z))
    # where phi is PDF and Phi is CDF
    tail_prob = 1 - norm.cdf(z)
    
    if tail_prob < 1e-10:  # Essentially no tail probability
        return S
    
    expected_value = mu + sigma * norm.pdf(z) / tail_prob
    
    return expected_value

def uncensor_dataset_nahmias(df, order_specific_mu_dict, order_specific_sigma_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    
    Uses order-quantity-specific Normal(μ, σ) parameters.
    
    Modifies the DataFrame in place by updating the Verkauf column.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if not row['is_stockout']:
            continue
            
        # Get the order-quantity-specific demand parameters
        key = row['Bezug']
        mu = order_specific_mu_dict.get(key, None)
        sigma = order_specific_sigma_dict.get(key, None)

        if mu is None or sigma is None:
            # no estimate available for this specific order_quantity — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_normal_tail(mu, sigma, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

# Usage
order_specific_mu_dict, order_specific_sigma_dict = create_order_specific_nahmias_dict(df)
df_nahmias_uncensored_ng = uncensor_dataset_nahmias(df, order_specific_mu_dict, order_specific_sigma_dict)
results = calculate_kpis(df_nahmias_uncensored_ng, "Nahmias, No Grouping")

Method: Nahmias, No Grouping, on A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.565 (underestimation)
- Weighted MAE (α=0): 0.852
- Weighted MAE (α=0.5): 1.034
- Weighted MAE (α=1): 1.295
- Weighted MAE (α=1.5): 1.639
- Accuracy (exact matches): 0.364
- Overestimation Rate: 0.132
- Underestimation Rate: 0.504
- Gini Coefficient: 0.498
Overstock: 10892


# Grouping by POS Testing 

In [2]:
import pandas as pd
import numpy as np

def calculate_kpis_grouped_by_pos(df, method_name):
    """
    Calculate KPIs per POS (EHASTRA_EH_NUMMER), then average across POS.
    
    Parameters:
    - df: DataFrame with 'Verkauf_Uncensored', 'Verkauf_MBR', 'Zensiert', 'EHASTRA_EH_NUMMER'
    - method_name: Name of the method for display
    
    Returns:
    - Dictionary of averaged KPIs
    """
    kpi_list = []

    grouped = df[df['Zensiert'] == 1].groupby('EHASTRA_EH_NUMMER')

    for _, group in grouped:
        y_pred = group['Verkauf_Uncensored'].values
        y_true = group['Verkauf_MBR'].values

        if len(y_pred) == 0:
            continue

        n = len(y_pred)
        bias = np.sum(y_pred - y_true) / n
        accuracy = np.mean(y_pred == y_true)
        over_rate = np.mean(y_pred > y_true)
        under_rate = np.mean(y_pred < y_true)

        weighted_maes = {}
        alphas = [0, 0.5, 1, 1.5]
        for a in alphas:
            if a == 0:
                weighted_maes[a] = np.mean(np.abs(y_pred - y_true))
            else:
                weights = np.power(y_true, a)
                weights = np.where(y_true == 0, 0, weights)
                weighted_maes[a] = (
                    np.sum(weights * np.abs(y_pred - y_true)) / np.sum(weights)
                    if np.sum(weights) > 0 else 0
                )

        abs_errors = np.abs(y_pred - y_true)
        sorted_errors = np.sort(abs_errors)
        n_err = len(abs_errors)
        weighted_sum = np.sum((np.arange(1, n_err + 1)) * sorted_errors)
        total_sum = np.sum(sorted_errors)
        gini = (2 * weighted_sum) / (n_err * total_sum) - (n_err + 1) / n_err if total_sum > 0 else 0

        overstock = np.sum(np.maximum(0, y_pred - y_true))

        kpi_list.append({
            'bias': bias,
            'weighted_mae_0': weighted_maes[0],
            'weighted_mae_0.5': weighted_maes[0.5],
            'weighted_mae_1': weighted_maes[1],
            'weighted_mae_1.5': weighted_maes[1.5],
            'accuracy': accuracy,
            'overestimation_rate': over_rate,
            'underestimation_rate': under_rate,
            'gini_coefficient': gini,
            'overstock': overstock
        })

    kpi_df = pd.DataFrame(kpi_list)
    avg_kpis = kpi_df.mean().to_dict()

    print(f"Method: {method_name} (averaged over {len(kpi_list)} POS)")
    print(f"- Bias: {avg_kpis['bias']:.3f}")
    print(f"- Weighted MAE (α=0): {avg_kpis['weighted_mae_0']:.3f}")
    print(f"- Weighted MAE (α=0.5): {avg_kpis['weighted_mae_0.5']:.3f}")
    print(f"- Weighted MAE (α=1): {avg_kpis['weighted_mae_1']:.3f}")
    print(f"- Weighted MAE (α=1.5): {avg_kpis['weighted_mae_1.5']:.3f}")
    print(f"- Accuracy (exact matches): {avg_kpis['accuracy']:.3f}")
    print(f"- Overestimation Rate: {avg_kpis['overestimation_rate']:.3f}")
    print(f"- Underestimation Rate: {avg_kpis['underestimation_rate']:.3f}")
    print(f"- Gini Coefficient: {avg_kpis['gini_coefficient']:.3f}")
    print(f"- Overstock: {int(avg_kpis['overstock'])}")

    return avg_kpis


# Clausen & Larsen

In [31]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.optimize import minimize_scalar, minimize
import warnings
warnings.filterwarnings('ignore')

class ClausenLarsenParameterEstimator:
    """
    Clausen & Larsen parameter estimation for uncensoring censored demand data.
    Uses only the exploration/estimation portion of their algorithm.
    """
    
    def __init__(self, distribution='poisson'):
        """
        Initialize the parameter estimator
        
        Parameters:
        - distribution: 'poisson' or 'negative_binomial'
        """
        self.distribution = distribution
        
    def _poisson_log_likelihood_censored(self, lam, uncensored_obs, censored_obs, censoring_thresholds):
        """
        Log-likelihood function for Poisson with censored observations
        
        Parameters:
        - lam: Poisson parameter (λ)
        - uncensored_obs: array of uncensored demand observations
        - censored_obs: array of censored sales observations  
        - censoring_thresholds: array of order quantities that caused censoring
        """
        if lam <= 0:
            return -np.inf
            
        ll = 0
        
        # Uncensored observations: sum of log(f(x_i))
        if len(uncensored_obs) > 0:
            ll += np.sum(stats.poisson.logpmf(uncensored_obs, lam))
        
        # Censored observations: sum of log(1 - F(threshold_i - 1))
        if len(censored_obs) > 0:
            for threshold in censoring_thresholds:
                # P(D >= threshold) = 1 - P(D <= threshold-1)
                prob_less_than_threshold = stats.poisson.cdf(threshold - 1, lam)
                prob_geq_threshold = 1 - prob_less_than_threshold
                
                if prob_geq_threshold > 0:
                    ll += np.log(prob_geq_threshold)
                else:
                    return -np.inf
                    
        return ll
    
    def _negative_binomial_log_likelihood_censored(self, params, uncensored_obs, censored_obs, censoring_thresholds):
        """
        Log-likelihood function for Negative Binomial with censored observations
        
        Parameters:
        - params: [r, p] where r > 0 and 0 < p < 1
        - uncensored_obs: array of uncensored demand observations
        - censored_obs: array of censored sales observations
        - censoring_thresholds: array of order quantities that caused censoring
        """
        r, p = params
        if r <= 0 or p <= 0 or p >= 1:
            return -np.inf
            
        ll = 0
        
        # Uncensored observations
        if len(uncensored_obs) > 0:
            ll += np.sum(stats.nbinom.logpmf(uncensored_obs, r, p))
        
        # Censored observations
        if len(censored_obs) > 0:
            for threshold in censoring_thresholds:
                prob_less_than_threshold = stats.nbinom.cdf(threshold - 1, r, p)
                prob_geq_threshold = 1 - prob_less_than_threshold
                
                if prob_geq_threshold > 0:
                    ll += np.log(prob_geq_threshold)
                else:
                    return -np.inf
                    
        return ll
    
    def estimate_parameters(self, sales_data, censored_indicators, order_quantities=None):
        """
        Estimate distribution parameters using MLE with censored data
        
        Parameters:
        - sales_data: array of observed sales
        - censored_indicators: binary array (1 = censored, 0 = uncensored)
        - order_quantities: array of order quantities (if None, assume sales = order qty for censored obs)
        
        Returns:
        - Estimated parameters
        """
        
        # Separate uncensored and censored observations
        uncensored_mask = (censored_indicators == 0)
        censored_mask = (censored_indicators == 1)
        
        uncensored_obs = sales_data[uncensored_mask]
        censored_obs = sales_data[censored_mask]
        
        # Determine censoring thresholds
        if order_quantities is not None:
            censoring_thresholds = order_quantities[censored_mask]
        else:
            # Assume sales = order quantity for censored observations
            censoring_thresholds = censored_obs.copy()
        
        print(f"    Uncensored observations: {len(uncensored_obs)}")
        print(f"    Censored observations: {len(censored_obs)}")
        
        if len(uncensored_obs) == 0 and len(censored_obs) == 0:
            return None
        
        try:
            if self.distribution == 'poisson':
                return self._estimate_poisson_parameters(uncensored_obs, censored_obs, censoring_thresholds)
            elif self.distribution == 'negative_binomial':
                return self._estimate_negative_binomial_parameters(uncensored_obs, censored_obs, censoring_thresholds)
        except Exception as e:
            print(f"    Parameter estimation error: {e}")
            return None
    
    def _estimate_poisson_parameters(self, uncensored_obs, censored_obs, censoring_thresholds):
        """Estimate Poisson λ parameter"""
        
        # Initial guess
        if len(uncensored_obs) > 0:
            initial_lambda = max(0.1, np.mean(uncensored_obs))
        else:
            initial_lambda = max(0.1, np.mean(censoring_thresholds))
        
        # Set bounds
        max_val = max(
            np.max(uncensored_obs) if len(uncensored_obs) > 0 else 0,
            np.max(censoring_thresholds) if len(censoring_thresholds) > 0 else 0
        )
        bounds = (0.1, max(max_val * 3, 10))
        
        # Optimize
        result = minimize_scalar(
            lambda lam: -self._poisson_log_likelihood_censored(
                lam, uncensored_obs, censored_obs, censoring_thresholds
            ),
            bounds=bounds,
            method='bounded'
        )
        
        if result.success:
            return max(0.1, result.x)
        else:
            return initial_lambda
    
    def _estimate_negative_binomial_parameters(self, uncensored_obs, censored_obs, censoring_thresholds):
        """Estimate Negative Binomial [r, p] parameters"""
        
        # Method of moments for initial estimates
        if len(uncensored_obs) > 0:
            sample_mean = np.mean(uncensored_obs)
            sample_var = np.var(uncensored_obs) if len(uncensored_obs) > 1 else sample_mean * 2
        else:
            sample_mean = np.mean(censoring_thresholds)
            sample_var = sample_mean * 2  # Rough guess
        
        # Convert to negative binomial parameters
        if sample_var > sample_mean:
            p_init = sample_mean / sample_var
            r_init = sample_mean * p_init / (1 - p_init)
        else:
            # Handle case where variance <= mean
            p_init = 0.5
            r_init = sample_mean
        
        # Ensure bounds
        p_init = max(0.01, min(0.99, p_init))
        r_init = max(0.1, r_init)
        
        # Optimize
        result = minimize(
            lambda params: -self._negative_binomial_log_likelihood_censored(
                params, uncensored_obs, censored_obs, censoring_thresholds
            ),
            x0=[r_init, p_init],
            bounds=[(0.1, 100), (0.01, 0.99)],
            method='L-BFGS-B'
        )
        
        if result.success:
            return result.x
        else:
            return [r_init, p_init]
    
    def generate_uncensored_demand(self, censored_sales, estimated_params, censoring_thresholds=None):
        if censoring_thresholds is None:
            censoring_thresholds = censored_sales.copy()
        
        uncensored_demands = []
        max_k = 200  # cap for the tail sum

        for threshold in censoring_thresholds:
            k_vals = np.arange(threshold, threshold + max_k)

            if self.distribution == 'poisson':
                lam = estimated_params
                pmf_vals = stats.poisson.pmf(k_vals, lam)
                numerator = np.sum(k_vals * pmf_vals)
                denominator = 1 - stats.poisson.cdf(threshold - 1, lam)

            elif self.distribution == 'negative_binomial':
                r, p = estimated_params
                pmf_vals = stats.nbinom.pmf(k_vals, r, p)
                numerator = np.sum(k_vals * pmf_vals)
                denominator = 1 - stats.nbinom.cdf(threshold - 1, r, p)

            # Compute expected value conditional on being >= threshold
            if denominator > 0:
                expected_demand = numerator / denominator
            else:
                expected_demand = threshold  # fallback to threshold if weird case

            uncensored_demands.append(expected_demand)
        
        return np.array(uncensored_demands)


class ClausenLarsenUncensoring:
    """
    Main interface for Clausen & Larsen parameter estimation and uncensoring
    """
    
    def __init__(self, distribution='poisson', min_observations=10):
        """
        Initialize the uncensoring model
        
        Parameters:
        - distribution: 'poisson' or 'negative_binomial'
        - min_observations: minimum observations needed per store
        """
        self.distribution = distribution
        self.min_observations = min_observations
        
    def _create_time_key(self, df):
        """Create a proper time ordering key from year and week"""
        return df['Heftjahr'] * 100 + df['Heftnummer']
    
    def fit_and_uncensor(self, df):
        """
        Main method to uncensor demand data using Clausen & Larsen parameter estimation
        """
        df_result = df.copy()
        df_result['Verkauf_Uncensored'] = df_result['Verkauf'].copy()
        
        estimation_results = {}
        
        # Process each point of sale separately
        for pos in df['EHASTRA_EH_NUMMER'].unique():
            print(f"Processing Point of Sale: {pos}")
            
            pos_data = df[df['EHASTRA_EH_NUMMER'] == pos].copy()
            
            # Check if we have enough data
            if len(pos_data) < self.min_observations:
                print(f"  Skipping POS {pos}: insufficient data ({len(pos_data)} obs, need {self.min_observations})")
                continue
            
            # Sort by time
            pos_data['time_key'] = self._create_time_key(pos_data)
            pos_data = pos_data.sort_values('time_key')
            
            censored_count = (pos_data['Zensiert'] == 1).sum()
            uncensored_count = (pos_data['Zensiert'] == 0).sum()
            
            print(f"  Total observations: {len(pos_data)}")
            print(f"  Uncensored: {uncensored_count}, Censored: {censored_count}")
            
            if censored_count == 0:
                print(f"  No censoring detected for POS {pos}")
                continue
            
            try:
                # Initialize parameter estimator
                estimator = ClausenLarsenParameterEstimator(distribution=self.distribution)
                
                # Estimate parameters
                sales_data = pos_data['Verkauf'].values
                censored_indicators = pos_data['Zensiert'].values
                
                estimated_params = estimator.estimate_parameters(
                    sales_data, 
                    censored_indicators
                )
                
                if estimated_params is not None:
                    print(f"  Estimated parameters: {estimated_params}")
                    
                    # Generate uncensored demands for censored observations
                    censored_mask = pos_data['Zensiert'] == 1
                    if censored_mask.sum() > 0:
                        censored_sales = pos_data[censored_mask]['Verkauf'].values
                        
                        uncensored_demands = estimator.generate_uncensored_demand(
                            censored_sales, 
                            estimated_params
                        )
                        
                        # Update results
                        df_result.loc[pos_data[censored_mask].index, 'Verkauf_Uncensored'] = \
                            uncensored_demands
                        
                        avg_increase = (uncensored_demands - censored_sales).mean()
                        total_increase = (uncensored_demands - censored_sales).sum()
                        
                        print(f"  Average demand increase per censored observation: {avg_increase:.2f}")
                        print(f"  Total estimated lost sales: {total_increase:.0f}")
                        
                        # Store results
                        estimation_results[pos] = {
                            'distribution': self.distribution,
                            'estimated_params': estimated_params,
                            'censored_observations': censored_count,
                            'uncensored_observations': uncensored_count,
                            'avg_increase_per_censored': avg_increase,
                            'total_lost_sales': total_increase
                        }
                else:
                    print(f"  Failed to estimate parameters for POS {pos}")
                    
            except Exception as e:
                print(f"  Error processing POS {pos}: {str(e)}")
                continue
        
        return df_result, estimation_results


def uncensor_demand_data_clausen_larsen(df, distribution='poisson', min_observations=10):
    """
    Main function for Clausen & Larsen parameter estimation and uncensoring
    
    Parameters:
    - df: DataFrame with columns 'Verkauf', 'Zensiert', 'Heftnummer', 'eftjahr', 'EHASTRA_EH_NUMMER'
    - distribution: 'poisson' or 'negative_binomial' 
    - min_observations: minimum observations needed per store
    
    Returns:
    - Tuple: (uncensored_df, estimation_results_dict)
    """
    
    print(f"Starting Clausen & Larsen parameter estimation for uncensoring")
    print(f"Distribution: {distribution}")
    print(f"Minimum observations per store: {min_observations}")
    print(f"Data shape: {df.shape}")
    print(f"Unique points of sale: {df['EHASTRA_EH_NUMMER'].nunique()}")
    print(f"Total censored observations: {(df['Zensiert'] == 1).sum()}")
    print()
    
    # Validate required columns
    required_cols = ['Verkauf', 'Zensiert', 'Heftnummer', 'Heftjahr', 'EHASTRA_EH_NUMMER']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Initialize and run the uncensoring model
    model = ClausenLarsenUncensoring(
        distribution=distribution,
        min_observations=min_observations
    )
    
    result_df, estimation_results = model.fit_and_uncensor(df)
    
    print("\nUncensoring complete!")
    print(f"Original total sales: {df['Verkauf'].sum()}")
    print(f"Uncensored total demand: {result_df['Verkauf_Uncensored'].sum()}")
    print(f"Estimated total lost sales: {result_df['Verkauf_Uncensored'].sum() - df['Verkauf'].sum()}")
    
    # Summary statistics
    processed_stores = len(estimation_results)
    if processed_stores > 0:
        avg_params = {}
        if distribution == 'poisson':
            lambdas = [results['estimated_params'] for results in estimation_results.values()]
            avg_params['average_lambda'] = np.mean(lambdas)
        elif distribution == 'negative_binomial':
            r_values = [results['estimated_params'][0] for results in estimation_results.values()]
            p_values = [results['estimated_params'][1] for results in estimation_results.values()]
            avg_params['average_r'] = np.mean(r_values)
            avg_params['average_p'] = np.mean(p_values)
        
        print(f"Processed {processed_stores} stores successfully")
        print(f"Average estimated parameters: {avg_params}")
    
    return result_df, estimation_results

# Example usage:
# uncensored_df, estimation_results = uncensor_demand_data_clausen_larsen(
#     your_df, 
#     distribution='poisson',
#     min_observations=10
# )

In [32]:
df_clausen_uncensored = uncensor_demand_data_clausen_larsen(df, distribution='negative_binomial')

Starting Clausen & Larsen parameter estimation for uncensoring
Distribution: negative_binomial
Minimum observations per store: 10
Data shape: (277894, 21)
Unique points of sale: 3053
Total censored observations: 75700

Processing Point of Sale: EHA0017186
  Total observations: 36
  Uncensored: 35, Censored: 1
    Uncensored observations: 35
    Censored observations: 1
  Estimated parameters: [11.2142254  0.99     ]
  Average demand increase per censored observation: 0.06
  Total estimated lost sales: 0
Processing Point of Sale: EHA0021360
  Total observations: 10
  Uncensored: 10, Censored: 0
  No censoring detected for POS EHA0021360
Processing Point of Sale: EHA0089518
  Total observations: 128
  Uncensored: 85, Censored: 43
    Uncensored observations: 85
    Censored observations: 43
  Estimated parameters: [8.49766471 0.74969728]
  Average demand increase per censored observation: 1.61
  Total estimated lost sales: 69
Processing Point of Sale: EHA0101910
  Total observations: 132

# Grouping by Bezug

In [33]:
def calculate_kpis_by_bezug_average(dataframe, method_name):
    """
    Calculate KPIs for demand forecasting evaluation, averaged over 'Bezug' groups.
    """
    grouped = dataframe.groupby('Bezug')

    results = []

    for _, group in grouped:
        df = group.copy()
        censored_df = df[df['Zensiert'] == 1]

        if censored_df.empty:
            continue

        y_pred = censored_df['Verkauf_Uncensored'].values
        y_true = censored_df['Verkauf_MBR'].values
        n = len(y_pred)

        bias = np.sum(y_pred - y_true) / n
        accuracy = np.sum(y_pred == y_true) / n
        overestimation_rate = np.sum(y_pred > y_true) / n
        underestimation_rate = np.sum(y_pred < y_true) / n

        alphas = [0, 0.5, 1, 1.5]
        weighted_maes = {}
        for a in alphas:
            if a == 0:
                weighted_maes[a] = np.mean(np.abs(y_pred - y_true))
            else:
                weights = np.power(y_true, a)
                weights = np.where(y_true == 0, 0, weights)
                weighted_maes[a] = np.sum(weights * np.abs(y_pred - y_true)) / np.sum(weights) if np.sum(weights) > 0 else 0

        abs_errors = np.abs(y_pred - y_true)
        sorted_errors = np.sort(abs_errors)
        weighted_sum = np.sum((i + 1) * sorted_errors[i] for i in range(n))
        total_sum = np.sum(abs_errors)
        gini = (2 * weighted_sum) / (n * total_sum) - (n + 1) / n if total_sum > 0 else 0
        overstock = np.sum(np.maximum(0, y_pred - y_true))

        results.append({
            'bias': bias,
            'accuracy': accuracy,
            'overestimation_rate': overestimation_rate,
            'underestimation_rate': underestimation_rate,
            'weighted_mae_0': weighted_maes[0],
            'weighted_mae_0.5': weighted_maes[0.5],
            'weighted_mae_1': weighted_maes[1],
            'weighted_mae_1.5': weighted_maes[1.5],
            'gini_coefficient': gini,
            'overstock': overstock
        })

    if not results:
        print(f"No valid (Zensiert == 1) rows found for method {method_name}")
        return {}

    avg_results = pd.DataFrame(results).mean()

    bias = avg_results['bias']
    bias_direction = "overestimation" if bias > 0 else "underestimation" if bias < 0 else "neutral"

    print(f"Method: {method_name} (averaged by Bezug)")
    print(f"- Bias: {bias:.3f} ({bias_direction})")
    print(f"- Weighted MAE (α=0): {avg_results['weighted_mae_0']:.3f}")
    print(f"- Weighted MAE (α=0.5): {avg_results['weighted_mae_0.5']:.3f}")
    print(f"- Weighted MAE (α=1): {avg_results['weighted_mae_1']:.3f}")
    print(f"- Weighted MAE (α=1.5): {avg_results['weighted_mae_1.5']:.3f}")
    print(f"- Accuracy (exact matches): {avg_results['accuracy']:.3f}")
    print(f"- Overestimation Rate: {avg_results['overestimation_rate']:.3f}")
    print(f"- Underestimation Rate: {avg_results['underestimation_rate']:.3f}")
    print(f"- Gini Coefficient: {avg_results['gini_coefficient']:.3f}")
    print(f"Overstock: {int(avg_results['overstock'])}")

    return avg_results.to_dict()


# Grouping by Heftnummer, Heftjahr Testing

In [34]:
def calculate_kpis_by_issue_average(dataframe, method_name):
    """
    Calculate KPIs for demand forecasting evaluation, averaged over (Heftnummer, Heftjahr) groups.
    """
    grouped = dataframe.groupby(['Heftnummer', 'Heftjahr'])

    results = []

    for _, group in grouped:
        df = group.copy()
        censored_df = df[df['Zensiert'] == 1]

        if censored_df.empty:
            continue
        
        y_pred = censored_df['Verkauf_Uncensored'].values
        y_true = censored_df['Verkauf_MBR'].values
        n = len(y_pred)

        bias = np.sum(y_pred - y_true) / n
        accuracy = np.sum(y_pred == y_true) / n
        overestimation_rate = np.sum(y_pred > y_true) / n
        underestimation_rate = np.sum(y_pred < y_true) / n

        alphas = [0, 0.5, 1, 1.5]
        weighted_maes = {}
        for a in alphas:
            if a == 0:
                weighted_maes[a] = np.mean(np.abs(y_pred - y_true))
            else:
                weights = np.power(y_true, a)
                weights = np.where(y_true == 0, 0, weights)
                weighted_maes[a] = np.sum(weights * np.abs(y_pred - y_true)) / np.sum(weights) if np.sum(weights) > 0 else 0

        abs_errors = np.abs(y_pred - y_true)
        sorted_errors = np.sort(abs_errors)
        weighted_sum = np.sum((i + 1) * sorted_errors[i] for i in range(n))
        total_sum = np.sum(abs_errors)
        gini = (2 * weighted_sum) / (n * total_sum) - (n + 1) / n if total_sum > 0 else 0
        overstock = np.sum(np.maximum(0, y_pred - y_true))

        results.append({
            'bias': bias,
            'accuracy': accuracy,
            'overestimation_rate': overestimation_rate,
            'underestimation_rate': underestimation_rate,
            'weighted_mae_0': weighted_maes[0],
            'weighted_mae_0.5': weighted_maes[0.5],
            'weighted_mae_1': weighted_maes[1],
            'weighted_mae_1.5': weighted_maes[1.5],
            'gini_coefficient': gini,
            'overstock': overstock
        })

    if not results:
        print(f"No valid (Zensiert == 1) rows found for method {method_name}")
        return {}

    avg_results = pd.DataFrame(results).mean()

    bias = avg_results['bias']
    bias_direction = "overestimation" if bias > 0 else "underestimation" if bias < 0 else "neutral"

    print(f"Method: {method_name} (averaged by Heftnummer & Heftjahr)")
    print(f"- Bias: {bias:.3f} ({bias_direction})")
    print(f"- Weighted MAE (α=0): {avg_results['weighted_mae_0']:.3f}")
    print(f"- Weighted MAE (α=0.5): {avg_results['weighted_mae_0.5']:.3f}")
    print(f"- Weighted MAE (α=1): {avg_results['weighted_mae_1']:.3f}")
    print(f"- Weighted MAE (α=1.5): {avg_results['weighted_mae_1.5']:.3f}")
    print(f"- Accuracy (exact matches): {avg_results['accuracy']:.3f}")
    print(f"- Overestimation Rate: {avg_results['overestimation_rate']:.3f}")
    print(f"- Underestimation Rate: {avg_results['underestimation_rate']:.3f}")
    print(f"- Gini Coefficient: {avg_results['gini_coefficient']:.3f}")
    print(f"Overstock: {int(avg_results['overstock'])}")

    return avg_results.to_dict()
