# Test 1 Censoring

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

path = 'Test1/I_20250212_ZQ0.35_ZG0.4_testfile.csv'
df = pd.read_csv(path)

df["is_stockout"] = (df["Zensiert"] == 1)
df = df.dropna(subset=['Verkauf_MBR'])
df = df.dropna(subset=['Verkauf'])

df.head()
len(df)

21595

# N3

In [19]:
# Grouping N3

import pandas as pd
import numpy as np

def apply_n3_uncensoring(df):
    df = df.copy()
    
    # Mark closed observations
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Compute open group means
    open_means = (
        df[~df['is_closed']]
        .groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
        .mean()
        .rename('open_mean')
        .reset_index()
        .round()
    )
    
    # Merge open means back to df
    df = df.merge(open_means, on=['EHASTRA_EH_NUMMER'], how='left')
    
    # Compute the max(mean, observed) for censored rows only
    mask = df['is_closed'] & df['open_mean'].notna()
    df.loc[mask, 'Verkauf_Uncensored'] = np.maximum(df.loc[mask, 'Verkauf_Uncensored'], df.loc[mask, 'open_mean'])
    
    return df.drop(['is_closed', 'open_mean'], axis=1)

# # Naive N3
# def apply_n3_uncensoring(df):
#     """
#     N3 uncensoring: replace censored values with max(current_value, mean_of_uncensored).
#     """
#     df = df.copy()
    
#     # Calculate mean of all uncensored observations
#     uncensored_mean = df[df['Zensiert'] == 0]['Verkauf'].mean()
    
#     # Replace censored values with max(current, uncensored_mean)
#     df.loc[df['Zensiert'] == 1, 'Verkauf'] = np.maximum(
#         df.loc[df['Zensiert'] == 1, 'Verkauf'], 
#         uncensored_mean
#     )
    
#     return df

df_n3_uncensored = apply_n3_uncensoring(df)

Saving file

In [4]:
df_n3_uncensored.to_csv(path[:-4] + '_n3.csv', index=False)
df_n3_uncensored

Unnamed: 0,EHASTRA_EH_NUMMER,Heftjahr,Heftnummer,Period,Bezug_MBR,Verkauf_MBR,Remission_MBR,Bezug,Verkauf,Remission,...,SBC,XYZ,Laenge,Zensierungsquote,Zensierungsgrad,Zensiert,Entgangener Verkauf,Steigung,is_stockout,Verkauf_Uncensored
0,EHA0017186,2022,4,2022-004,5.0,1.0,4.0,5.0,1.0,4.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,1.0
1,EHA0017186,2022,5,2022-005,5.0,0.0,5.0,5.0,0.0,5.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
2,EHA0017186,2022,6,2022-006,5.0,0.0,5.0,5.0,0.0,5.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
3,EHA0017186,2022,7,2022-007,4.0,0.0,4.0,4.0,0.0,4.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
4,EHA0017186,2022,8,2022-008,3.0,0.0,3.0,3.0,0.0,3.0,...,I,Z,36,0.3160,0.4,0,0.0,0.0085,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277889,EHJ9923970,2024,25,2024-025,9.0,7.0,2.0,9.0,7.0,2.0,...,S,X,132,0.2735,0.4,0,0.0,0.0085,False,7.0
277890,EHJ9923970,2024,26,2024-026,8.0,8.0,0.0,9.0,8.0,1.0,...,S,X,132,0.2735,0.4,0,0.0,0.0085,False,8.0
277891,EHJ9923970,2024,27,2024-027,10.0,8.0,2.0,7.0,7.0,0.0,...,S,X,132,0.2735,0.4,1,1.0,0.0085,True,10.0
277892,EHJ9923970,2024,28,2024-028,10.0,9.0,1.0,5.0,5.0,0.0,...,S,X,132,0.2735,0.4,1,4.0,0.0085,True,10.0


# N2

In [61]:
# Grouping N2 with fallback imputation
def apply_n2_uncensoring(df):
    """
    N2 uncensoring with grouping: replace censored values with mean of uncensored values within each group.
    If no uncensored values exist in a group, fall back to original censored values.
    """
    df = df.copy()

    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Calculate mean of uncensored observations for each group
    uncensored_means = (
        df[df['Zensiert'] == 0]
        .groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
        .mean()
        .rename('uncensored_mean')
        .reset_index()
    )
    
    # Round the means
    uncensored_means['uncensored_mean'] = uncensored_means['uncensored_mean'].round()
    
    # Merge back to original DataFrame
    df = df.merge(uncensored_means, on=['EHASTRA_EH_NUMMER'], how='left')
    
    # Replace censored values with group mean, but only where uncensored_mean exists
    mask_censored_with_mean = (df['Zensiert'] == 1) & (df['uncensored_mean'].notna())
    df.loc[mask_censored_with_mean, 'Verkauf_Uncensored'] = df.loc[mask_censored_with_mean, 'uncensored_mean']
    
    # For groups with no uncensored observations (uncensored_mean is NaN), 
    # censored values remain as their original Verkauf values (no change needed)
    
    # Optional: Print diagnostic info
    groups_with_no_uncensored = df[df['uncensored_mean'].isna()]['EHASTRA_EH_NUMMER'].nunique()
    total_groups = df['EHASTRA_EH_NUMMER'].nunique()
    print(f"N2 imputation: {groups_with_no_uncensored}/{total_groups} groups had no uncensored data, using original values")
    
    return df.drop('uncensored_mean', axis=1)

df_n2_uncensored = apply_n2_uncensoring(df)

N2 imputation: 0/1734 groups had no uncensored data, using original values


Saving File

In [6]:
df_n2_uncensored.to_csv(path[:-4] + '_n2.csv', index=False)

# N1

In [21]:
# Grouping N1

import pandas as pd
import numpy as np

def apply_n1_uncensoring(df):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    # Identify closed observations
    is_closed = (df["Zensiert"] == 1)

    # Compute mean Verkauf per group and broadcast using transform
    group_means = df.groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored'].transform('mean').round()
    
    # Replace closed observations
    df.loc[is_closed, 'Verkauf_Uncensored'] = group_means[is_closed]
    
    return df

df_n1_uncensored = apply_n1_uncensoring(df)

# # Naive N1

# def apply_n1_uncensoring(df):
#     """
#     N1 uncensoring: replace censored values with mean of all values.
#     """
#     df = df.copy()
    
#     # Calculate mean of all observations (censored and uncensored)
#     overall_mean = df['Verkauf'].mean()
    
#     # Replace censored values with this mean
#     df.loc[df['Zensiert'] == 1, 'Verkauf'] = overall_mean
    
#     return df

Saving File

In [8]:
df_n1_uncensored.to_csv(path[:-4] + '_n1.csv', index=False)

# EM

In [22]:
import numpy as np
from scipy.stats import poisson
import pandas as pd

def apply_em_uncensoring(df, max_iter=30, tolerance=1e-6):

    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Direct boolean indexing without extra columns
    stockout_condition = (df['Zensiert'] == 1)
    
    # Process groups with minimal overhead
    for (pos), group in df.groupby(['EHASTRA_EH_NUMMER']):
        group_stockout = stockout_condition.loc[group.index]
        
        if not group_stockout.any():
            continue
            
        sales = group['Verkauf_Uncensored'].values
        is_stockout = group_stockout.values
        
        uncensored = sales[~is_stockout]
        censored = sales[is_stockout]
        
        # Quick lambda initialization
        lambda_est = np.mean(uncensored) if len(uncensored) > 0 else np.mean(sales) * 1.5
        lambda_est = max(lambda_est, 0.1)
        
        # Fast EM loop
        for _ in range(max_iter):
            lambda_old = lambda_est
            
            # Batch E-step
            surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
            exact_prob = poisson.pmf(censored, lambda_est)
            surv_prob = np.maximum(surv_prob, 1e-12)
            
            expected = lambda_est + censored * exact_prob / surv_prob
            expected = np.maximum(expected, censored.astype(float))
            
            # M-step
            lambda_est = max(np.mean(np.concatenate([uncensored, expected])), 0.1)
            
            if abs(lambda_est - lambda_old) < tolerance:
                break
        
        # Final update
        surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
        exact_prob = poisson.pmf(censored, lambda_est)
        surv_prob = np.maximum(surv_prob, 1e-12)
        
        final_expected = lambda_est + censored * exact_prob / surv_prob
        final_expected = np.maximum(final_expected, censored.astype(float))
        
        # Update original dataframe
        stockout_indices = group.index[is_stockout]
        df.loc[stockout_indices, 'Verkauf_Uncensored'] = final_expected.round()
    
    return df

df_em_uncensored = apply_em_uncensoring(df) 

Saving File

In [10]:
df_em_uncensored.to_csv(path[:-4] + '_em.csv', index=False)

# PD (Poisson, Grouped by HF & POS)

In [23]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

def apply_projection_detruncation_fixed1(df, tau=0.5, max_iter=20, tolerance=1e-4):
    """
    Proper Projection Detruncation implementation using Poisson distribution.
    
    Parameters:
    - tau: Parameter that controls the aggressiveness of unconstraining (0 < tau < 1)
           tau = 0.5 gives balanced results similar to EM
           smaller tau values are more aggressive
    - max_iter: Maximum number of iterations
    - tolerance: Convergence tolerance
    """
    df = df.copy()
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    def compute_pd_projection1(obs_val, lambda_est, tau):
        """
        Compute the PD projection for a single observation using Poisson distribution.
        This balances area A (original to new estimate) with area B (new estimate to infinity)
        weighted by parameter tau.
        """
        # For Poisson distribution, we need to find the projection value
        # that balances the two areas according to tau
        
        def objective(k_proj):
            k_proj = int(round(k_proj))  # Ensure integer for discrete distribution
            
            if k_proj < obs_val:
                return float('inf')  # Invalid projection
            
            # Area A: P(obs_val <= X <= k_proj) = P(X <= k_proj) - P(X <= obs_val-1)
            area_A = poisson.cdf(k_proj, lambda_est) - poisson.cdf(obs_val - 1, lambda_est)
            
            # Area B: P(X > k_proj) = 1 - P(X <= k_proj)
            area_B = 1 - poisson.cdf(k_proj, lambda_est)
            
            # PD tries to balance: tau * area_A = (1 - tau) * area_B
            # Rearranging: area_A / area_B = (1 - tau) / tau
            if area_B > 1e-10:  # Avoid division by zero
                ratio = area_A / area_B
                target_ratio = (1 - tau) / tau
                return abs(ratio - target_ratio)
            else:
                return abs(area_A - (1 - tau))
        
        # Search for optimal projection in reasonable range
        # For Poisson, search from obs_val to obs_val + reasonable upper bound
        upper_bound = int(obs_val + max(10, int(3 * np.sqrt(lambda_est))))
        
        best_k = obs_val
        best_objective = float('inf')
        
        # Discrete search since Poisson is discrete
        for k in range(int(obs_val), upper_bound + 1):
            obj_val = objective(k)
            if obj_val < best_objective:
                best_objective = obj_val
                best_k = k
        
        return best_k
    
    # Process groups
    grouped = df.groupby(['EHASTRA_EH_NUMMER', 'Heftnummer'])
    
    # can group by different things, just pass in appropriate arguments here 
    for (pos, hf), group in grouped:
        open_mask = ~group['is_closed']
        closed_mask = group['is_closed']
        
        open_sales = group.loc[open_mask, 'Verkauf_Uncensored'].values
        closed_sales = group.loc[closed_mask, 'Verkauf_Uncensored'].values
        
        if len(closed_sales) == 0:
            continue
        
        # Initialize lambda parameter using all available data
        all_sales = group['Verkauf_Uncensored'].values
        lambda_est = np.mean(all_sales)
        
        # Ensure minimum lambda for numerical stability
        lambda_est = max(lambda_est, 0.1)
        
        closed_indices = group[closed_mask].index.values
        
        # Iterative process
        for iteration in range(max_iter):
            lambda_old = lambda_est
            
            # Project closed observations using PD heuristic
            projected_values = np.array([
                compute_pd_projection1(obs, lambda_est, tau) 
                for obs in closed_sales
            ])
            
            # Re-estimate lambda using open + projected values
            all_values = np.concatenate([open_sales, projected_values])
            lambda_est = np.mean(all_values)
            
            # Ensure minimum lambda
            lambda_est = max(lambda_est, 0.1)
            
            # Check convergence
            if abs(lambda_est - lambda_old) < tolerance:
                break
        
        # Final projection
        final_projections = np.array([
            compute_pd_projection1(obs, lambda_est, tau) 
            for obs in closed_sales
        ])
        
        # Update dataframe (already integers from Poisson)
        df.loc[closed_indices, 'Verkauf_Uncensored'] = final_projections.round()
    
    return df.drop('is_closed', axis=1)

def apply_projection_detruncation_aggressive1(df, max_iter=20, tolerance=1e-4):
    """
    More aggressive PD implementation with tau=0.3 as mentioned in the paper
    """
    return apply_projection_detruncation_fixed1(df, tau=0.3, max_iter=max_iter, tolerance=tolerance)

def apply_projection_detruncation_balanced1(df, max_iter=20, tolerance=1e-4):
    """
    Balanced PD implementation with tau=0.5 (similar to EM results)
    """
    return apply_projection_detruncation_fixed1(df, tau=0.5, max_iter=max_iter, tolerance=tolerance)

df_projection_uncensored = apply_projection_detruncation_balanced1(df)

Saving File

In [12]:
df_projection_uncensored.to_csv(path[:-4] + '_pd.csv', index=False)

# Conrad

In [24]:
from scipy.stats import poisson
import pandas as pd
import numpy as np

def berechnung(links, rechts, n, N, r, x_summe, value_tol=0.00001, max_iterations=1000):
    """
    Till's Code
    """
    iteration = 0
    
    while iteration < max_iterations:
        mu = (links + rechts) / 2
        wert_0 = (x_summe - mu * n) * (1 - poisson.cdf(N-1, mu)) + mu * (n - r) * (1 - poisson.cdf(N-2, mu))
        
        if iteration < 3:
            print(f"Iter {iteration}: mu={mu:.4f}, wert_0={wert_0:.8f}")
        
        if abs(wert_0 - 0) < value_tol:
            print(f"Converged after {iteration} iterations: mu={mu:.4f}, wert_0={wert_0:.8f}")
            return mu
        elif wert_0 - 0 < 0:  # Your condition: je kleiner mu desto kleiner wird der wert
            rechts = mu
        elif wert_0 - 0 > 0:
            links = mu
            
        iteration += 1
    
    print(f"✗ Max iterations reached: mu={mu:.4f}")
    return mu

def test_conrad_example():
    links = 1
    rechts = 100
    n = 13
    N = 10
    r = 7
    x_summe = 58
    
    print(f"links={links}, rechts={rechts}")
    print(f"n={n}, N={N}, r={r}, x_summe={x_summe}")
    
    result = berechnung(links, rechts, n, N, r, x_summe)
    print(f"Result: μ = {result:.4f}")
    print(f"Expected: μ ≈ 10.18")
    
    return result

def create_order_specific_mu_dict(df):

    order_specific_mu_dict = {}
    
    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            N = bezug_val
            
            # Count non-stockouts
            stockouts_mask = (group['Verkauf'] == bezug_val)
            r = n - stockouts_mask.sum()  # r = number of NON-stockouts
            
            # x_summe = sum of UNCENSORED observations only
            uncensored_sales = group[~stockouts_mask]['Verkauf']
            x_summe = uncensored_sales.sum()
            
            # Skip problematic cases
            if n < 3:
                continue
            if r == n:  # No stockouts = no censoring information
                continue
            if r == 0:  # All stockouts = no uncensored observations
                continue
            
            try:
                links = 1
                rechts = 100
                mu_est = berechnung(links, rechts, n, N, r, x_summe)
                if mu_est:
                    key = (year, week, bezug_val)
                    order_specific_mu_dict[key] = mu_est
            except Exception as e:
                print(f"Error in week {week}, Bezug {N}: {e}")
                continue

    print(f"Successfully estimated μ for {len(order_specific_mu_dict)} groups")
    return order_specific_mu_dict

def expected_poisson_tail(mu, N, max_k=200):
    """
    Compute E[X | X >= N] for X ~ Poisson(mu)
    """
    k_vals = np.arange(N, max_k)
    pmf = poisson.pmf(k_vals, mu)
    tail_prob = 1 - poisson.cdf(N - 1, mu)
    if tail_prob < 1e-8:
        return N  # fallback: don't uncensor
    return np.sum(k_vals * pmf) / tail_prob

def uncensor_dataset(df, order_specific_mu_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if not row['is_stockout']:
            continue
            
        # Get the order-quantity-specific demand parameter
        key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
        mu = order_specific_mu_dict.get(key, None)

        if mu is None:
            # no estimate available for this specific (week, order_quantity) — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_poisson_tail(mu, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

# Usage
print("Example:")
test_result = test_conrad_example()

print("\nRunning on actual data:")
order_specific_mu_dict = create_order_specific_mu_dict(df)
df_conrad_uncensored = uncensor_dataset(df, order_specific_mu_dict)

Example:
links=1, rechts=100
n=13, N=10, r=7, x_summe=58
Iter 0: mu=50.5000, wert_0=-295.50000000
Iter 1: mu=25.7500, wert_0=-122.21964104
Iter 2: mu=13.3750, wert_0=-25.84454646
Converged after 22 iterations: mu=10.1819, wert_0=0.00000504
Result: μ = 10.1819
Expected: μ ≈ 10.18

Running on actual data:
Iter 0: mu=50.5000, wert_0=-892.00000000
Iter 1: mu=25.7500, wert_0=-446.49999930
Iter 2: mu=13.3750, wert_0=-223.70554057
Converged after 24 iterations: mu=3.7998, wert_0=-0.00000516
Iter 0: mu=50.5000, wert_0=-13891.00000000
Iter 1: mu=25.7500, wert_0=-7010.49997988
Iter 2: mu=13.3750, wert_0=-3569.55337015
Converged after 26 iterations: mu=1.0588, wert_0=0.00000437
Iter 0: mu=50.5000, wert_0=-13911.00000000
Iter 1: mu=25.7500, wert_0=-6931.49984518
Iter 2: mu=13.3750, wert_0=-3438.90530262
Converged after 25 iterations: mu=1.3697, wert_0=-0.00000102
Iter 0: mu=50.5000, wert_0=-4094.00000000
Iter 1: mu=25.7500, wert_0=-2014.99966598
Iter 2: mu=13.3750, wert_0=-972.27435005
Converged a

In [16]:
results = calculate_kpis(df_conrad_uncensored, "Conrad")

Method: Conrad, on Test1/I_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.028 (overestimation)
- Weighted MAE (α=0): 1.013
- Weighted MAE (α=0.5): 1.171
- Weighted MAE (α=1): 1.488
- Weighted MAE (α=1.5): 2.005
- Accuracy (exact matches): 0.290
- Overestimation Rate: 0.406
- Underestimation Rate: 0.304
- Gini Coefficient: 0.456
Overstock: 2656


In [14]:
df_conrad_uncensored.to_csv(path[:-4] + '_conrad.csv', index=False)

In [15]:
# # Check mean vs variance by magazine
# print("=== MEAN VS VARIANCE BY MAGAZINE ===")
# good_poisson_count = 0
# total_magazines = 0

# for magazine_id in df['EHASTRA_EH_NUMMER'].unique():
#    mag_data = df[df['EHASTRA_EH_NUMMER'] == magazine_id]['Verkauf']
#    mean_val = mag_data.mean()
#    var_val = mag_data.var()
#    ratio = var_val/mean_val
   
#    if 0.7 <= ratio <= 1.3:
#        good_poisson_count += 1
#    total_magazines += 1
   
#    # print(f"Magazine {magazine_id}: Mean={mean_val:.2f}, Variance={var_val:.2f}, Ratio={ratio:.2f}")

# print(f"\nMagazines with Poisson-like ratio (0.7-1.3): {good_poisson_count}/{total_magazines}")

# # Overall (combined)
# print(f"\nCombined: Mean={df['Verkauf'].mean():.2f}, Variance={df['Verkauf'].var():.2f}, Ratio={df['Verkauf'].var()/df['Verkauf'].mean():.2f}")

# Nahmias

In [25]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def nahmias_estimation(sales, S):
    """
    Nahmias method for censored normal data
    """
    sales = np.array(sales)
    n = len(sales)
    
    observed = sales[sales < S]
    r = len(observed)
    p = r / n
    
    if r < 2 or r >= n-1 or p <= 0 or p >= 1:
        return None, None
    
    try:
        x_bar = np.mean(observed)
        s2 = np.var(observed, ddof=1)
        z = norm.ppf(p)
        
        sigma_hat2 = s2 / (1 - (z * norm.pdf(z) / p) - (norm.pdf(z)**2 / p**2))
        sigma_hat = np.sqrt(sigma_hat2)
        mu_hat = x_bar + sigma_hat * norm.pdf(z) / p
        
        if not np.isfinite(mu_hat) or not np.isfinite(sigma_hat) or sigma_hat <= 0:
            return None, None
            
        return mu_hat, sigma_hat
        
    except Exception:
        return None, None

def create_order_specific_nahmias_dict(df):
    """
    Create μ and σ estimates for each (week, order_quantity) combination
    """
    order_specific_mu_dict = {}
    order_specific_sigma_dict = {}

    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            S = bezug_val
            
            if n < 5:
                continue
            
            sales = group['Verkauf'].values
            
            try:
                mu_est, sigma_est = nahmias_estimation(sales, S)
                if mu_est is not None and sigma_est is not None:
                    key = (year, week, bezug_val)
                    order_specific_mu_dict[key] = mu_est
                    order_specific_sigma_dict[key] = sigma_est
            except Exception as e:
                continue

    print(f"Successfully estimated μ,σ for {len(order_specific_mu_dict)} groups")
    return order_specific_mu_dict, order_specific_sigma_dict

def expected_normal_tail(mu, sigma, S):
    """
    Compute E[X | X >= S] for X ~ Normal(mu, sigma)
    """
    if sigma <= 0:
        return S
    
    z = (S - mu) / sigma
    
    if z > 6:
        return S
    
    tail_prob = 1 - norm.cdf(z)
    
    if tail_prob < 1e-10:
        return S
    
    expected_value = mu + sigma * norm.pdf(z) / tail_prob
    
    return expected_value

def uncensor_dataset_nahmias(df, order_specific_mu_dict, order_specific_sigma_dict):
    """
    Uncensor dataset using Nahmias estimates
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        if not row['is_stockout']:
            continue
            
        key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
        mu = order_specific_mu_dict.get(key, None)
        sigma = order_specific_sigma_dict.get(key, None)

        if mu is None or sigma is None:
            continue

        est_demand = expected_normal_tail(mu, sigma, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

def test_nahmias():
    """Test implementation"""
    mu_true = 100
    sigma_true = 30
    S = 110
    n = 100
    
    np.random.seed(42)
    demand = np.random.normal(mu_true, sigma_true, n)
    sales = np.minimum(demand, S)
    
    print("Testing Nahmias implementation:")
    print(f"True μ: {mu_true}, True σ: {sigma_true}")
    print(f"S (censoring limit): {S}")
    print(f"Sample size: {n}")
    
    mu_hat, sigma_hat = nahmias_estimation(sales, S)
    
    naive_mean = np.mean(sales)
    naive_std = np.std(sales, ddof=1)
    
    print(f"True mean: {mu_true}")
    print(f"Naive mean (sales): {naive_mean:.2f}")
    print(f"Corrected estimator (Nahmias): {mu_hat:.2f}")
    print(f"True Std.Dev.: {sigma_true}")
    print(f"Corrected Std.Dev.: {sigma_hat:.2f}")
    print(f"Naive Std.Dev. (sales): {naive_std:.2f}")
    
    return mu_hat, sigma_hat

# Usage
print("Testing implementation first:")
test_mu, test_sigma = test_nahmias()

print("\nRunning on data:")
order_specific_mu_dict, order_specific_sigma_dict = create_order_specific_nahmias_dict(df)
df_nahmias_uncensored = uncensor_dataset_nahmias(df, order_specific_mu_dict, order_specific_sigma_dict)

Testing implementation first:
Testing Nahmias implementation:
True μ: 100, True σ: 30
S (censoring limit): 110
Sample size: 100
True mean: 100
Naive mean (sales): 91.54
Corrected estimator (Nahmias): 97.74
True Std.Dev.: 30
Corrected Std.Dev.: 28.40
Naive Std.Dev. (sales): 20.61

Running on data:
Successfully estimated μ,σ for 212 groups


In [18]:
results = calculate_kpis(df_nahmias_uncensored, "Nahmias")

Method: Nahmias, on Test1/I_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.479 (underestimation)
- Weighted MAE (α=0): 0.866
- Weighted MAE (α=0.5): 1.085
- Weighted MAE (α=1): 1.457
- Weighted MAE (α=1.5): 2.017
- Accuracy (exact matches): 0.351
- Overestimation Rate: 0.164
- Underestimation Rate: 0.485
- Gini Coefficient: 0.491
Overstock: 987


In [55]:
import numpy as np
from scipy.stats import norm
 
# 1. Define parameters
mu_true = 100  # True demand
sigma_true = 30
S = 110  # Inventory level (censoring limit)
n = 100  # Sample size
 
# 2. Simulate demand
demand = np.random.normal(mu_true, sigma_true, n)
sales = np.minimum(demand, S)  # Censored sales
 
# 3. Proportion not censored
observed = sales[sales < S]
r = len(observed)
p = r / n  # Proportion observed
 
# 4. Estimator according to Nahmias (simplified version)
x_bar = np.mean(observed)
s2 = np.var(observed, ddof=1)
z = norm.ppf(p)  # Standard normal inverse CDF
 
# Estimator for Sigma²
sigma_hat2 = s2 / (1 - (z * norm.pdf(z) / p) - (norm.pdf(z)**2 / p**2))
sigma_hat = np.sqrt(sigma_hat2)
 
# Estimator for Mu
mu_hat = x_bar + sigma_hat * norm.pdf(z) / p
 
# 5. Comparison: naive estimator without correction
naive_mean = np.mean(sales)
 
# 6. Output results
print(f"True mean: {mu_true}")
print(f"Naive mean (sales): {naive_mean:.2f}")
print(f"Corrected estimator (Nahmias): {mu_hat:.2f}")
print(f"True Std.Dev.: {sigma_true}")
print(f"Corrected Std.Dev.: {sigma_hat:.2f}")
naive_std = np.std(sales, ddof=1)
print(f"Naive Std.Dev. (sales): {naive_std:.2f}")


True mean: 100
Naive mean (sales): 93.57
Corrected estimator (Nahmias): 100.17
True Std.Dev.: 30
Corrected Std.Dev.: 27.11
Naive Std.Dev. (sales): 19.06


In [17]:
df_nahmias_uncensored.to_csv(path[:-4] + '_nahmias.csv', index=False)

# Applying empirical

# Testing

In [55]:
import pandas as pd
import numpy as np

def build_empirical(df_counts):
    '''
    Given a value counts dataframe, return a dataframe indexed by demand and containing columns count, pmf value, and cmf value
    '''
    demand_distribution = pd.DataFrame(df_counts)
    demand_distribution.sort_index(inplace=True)
    demand_distribution['pmf'] = demand_distribution['count']/demand_distribution['count'].sum()
    demand_distribution['cmf'] = demand_distribution['pmf'].cumsum()
    return demand_distribution

def update_empirical(empirical, demand):
    '''
    Update empirical demand distribution with new uncensored demand
    '''
    if demand in empirical.index: 
        empirical.loc[demand, 'count'] += 1
    else:
        empirical.loc[demand, 'count'] = 1
        empirical.sort_index(inplace=True)
    
    empirical['pmf'] = empirical['count']/empirical['count'].sum()
    empirical['cmf'] = empirical['pmf'].cumsum()
    return empirical

def calculate_inventory_kpis(dataframe, method_name, critical_ratio=0.9):
    """
    Calculate KPIs for inventory management evaluation by first generating suggested order quantities
    
    Parameters:
    - dataframe: pandas DataFrame containing uncensored demand predictions with columns:
        * 'EHASTRA_EH_NUMMER': position identifier
        * 'Period': time period  
        * 'Heftjahr': year
        * '{method_name}_Demand': uncensored demand predictions
        * 'Verkauf_MBR': true demand values
    - method_name: string name of the method being evaluated
    - critical_ratio: target quantile (q) for pinball loss (default=0.9)
    """
    df = dataframe.copy()
    # Try different naming conventions for the demand column
    possible_demand_cols = [
        method_name + '_Demand',                    # e.g., 'N2_Demand'
        'Verkauf_Uncensored',                      # Default uncensored column
        method_name + '_Uncensored',               # e.g., 'Baseline_Uncensored'
        method_name.lower() + '_demand',           # lowercase version
        method_name.upper() + '_DEMAND'            # uppercase version
    ]
    
    # Also search for columns containing the method name
    pattern_matches = [col for col in df.columns if method_name.lower() in col.lower() and ('demand' in col.lower() or 'uncensored' in col.lower())]
    
    all_candidates = possible_demand_cols + pattern_matches
    
    demand_col = None
    for candidate in all_candidates:
        if candidate in df.columns:
            demand_col = candidate
            break
    
    if demand_col is None:
        raise ValueError(f"Could not find demand column for method {method_name}. "
                        f"Tried: {possible_demand_cols + pattern_matches}. "
                        f"Available columns: {df.columns.tolist()}")
    
    # Step 1: Generate suggested order quantities for 2024 using inventory optimization
    inventory_positions = []
    
    for (pos,), group in df.groupby(['EHASTRA_EH_NUMMER']):
        # Initialize empirical demand distribution from 2022-2023 data (training)
        df_train = group[group['Heftjahr'] < 2024]
        if df_train.empty: 
            continue
            
        # Build initial empirical distribution
        empirical = build_empirical(df_train[demand_col].value_counts())
        
        # Process 2024 test data sequentially
        df_test = group[group['Heftjahr'] == 2024]
        df_test = df_test.sort_values(['Period'])
        
        for row in df_test.itertuples():
            # Calculate optimal inventory using newsvendor model
            # Find the smallest demand value where CMF >= critical_ratio
            optimal_q = (empirical['cmf'] >= critical_ratio).idxmax()
            
            inventory_positions.append({
                'EHASTRA_EH_NUMMER': pos, 
                'Period': row.Period,
                'Heftjahr': row.Heftjahr,
                'Suggested_Q': optimal_q,
                'True_Demand': getattr(row, 'Verkauf_MBR'),
                'Predicted_Demand': getattr(row, demand_col)
            })
            
            # Update empirical distribution with new observed demand
            empirical = update_empirical(empirical, getattr(row, demand_col))
    
    # Step 2: Calculate KPIs on 2024 test data
    df_inventory = pd.DataFrame(inventory_positions)
    df_inventory = df_inventory.dropna(subset=['True_Demand'])
    
    if df_inventory.empty:
        print(f"No valid test data for method {method_name}")
        return None
    
    # Extract suggested and true values for 2024 only
    y_hat = df_inventory['Suggested_Q'].values  # ŷᵢ (suggested order quantity)
    y_true = df_inventory['True_Demand'].values # yᵢ (true demand)
    
    n = len(y_hat)
    q = critical_ratio  # target quantile (critical ratio)
    
    # 1. Pinball Loss (Quantile Loss)
    # L_q(y, ŷ) = q·(y - ŷ) if y ≥ ŷ, else (1-q)·(ŷ - y) if y < ŷ
    pinball_losses = np.where(
        y_true >= y_hat,
        q * (y_true - y_hat),           # Underestimation penalty
        (1 - q) * (y_hat - y_true)      # Overestimation penalty
    )
    avg_pinball_loss = np.mean(pinball_losses)
    
    # 2. Out-of-Stock (OOS) Rate
    # Number of periods with OOS / Number of periods receiving that issue
    # OOS occurs when true demand > suggested quantity
    oos_periods = np.sum(y_true > y_hat)
    oos_rate = oos_periods / n
    
    # 3. Alpha Service Level (complement of OOS rate)
    # 1 - OOS rate
    alpha_service_level = 1 - oos_rate
    
    # 4. Beta Fill Rate
    # Total Sales / Total Demand
    # Sales = min(suggested Q from optimization, true demand)
    sales = np.minimum(y_hat, y_true)
    total_sales = np.sum(sales)
    total_demand = np.sum(y_true)
    
    beta_fill_rate = total_sales / total_demand if total_demand > 0 else 0
    
    # Additional metrics for context
    total_overstock = np.sum(np.maximum(0, y_hat - y_true))
    total_understock = np.sum(np.maximum(0, y_true - y_hat))
    
    # Print results
    print(f"Method: {method_name}")
    print(f"- Pinball Loss (q={q}): {avg_pinball_loss:.3f}")
    print(f"- Average OOS Rate: {oos_rate:.3f}")
    print(f"- Average Alpha Service Level: {alpha_service_level:.3f}")
    print(f"- Average Beta Fill Rate: {beta_fill_rate:.3f}")
    print(f"- Total Overstock: {int(total_overstock)}")
    print(f"- Total Understock: {int(total_understock)}")
    print(f"- Number of observations: {n}")
    
    return {
        'method_name': method_name,
        'pinball_loss': avg_pinball_loss,
        'oos_rate': oos_rate,
        'alpha_service_level': alpha_service_level,
        'beta_fill_rate': beta_fill_rate,
        'total_overstock': total_overstock,
        'total_understock': total_understock,
        'n_observations': n,
        'critical_ratio': q
    }

# Example usage:
# results = calculate_inventory_kpis(df_n2_uncensored, "N2", critical_ratio=0.9)

In [56]:
def apply_baseline_uncensoring(df):
    df_result = df.copy()
    df_result['Verkauf_Uncensored'] = df_result['Verkauf'] 
    return df_result

df_baseline_uncensored = apply_baseline_uncensoring(df)

In [57]:
results = calculate_inventory_kpis(df_baseline_uncensored, "Baseline")

Method: Baseline
- Pinball Loss (q=0.9): 0.450
- Average OOS Rate: 0.124
- Average Alpha Service Level: 0.876
- Average Beta Fill Rate: 0.944
- Total Overstock: 12081
- Total Understock: 1203
- Number of observations: 5094


In [58]:
results = calculate_inventory_kpis(df_n1_uncensored, "N1")

Method: N1
- Pinball Loss (q=0.9): 0.473
- Average OOS Rate: 0.156
- Average Alpha Service Level: 0.844
- Average Beta Fill Rate: 0.932
- Total Overstock: 10842
- Total Understock: 1472
- Number of observations: 5094


In [59]:
results = calculate_inventory_kpis(df_n2_uncensored, "N2")

Method: N2
- Pinball Loss (q=0.9): 0.474
- Average OOS Rate: 0.158
- Average Alpha Service Level: 0.842
- Average Beta Fill Rate: 0.932
- Total Overstock: 10836
- Total Understock: 1478
- Number of observations: 5094


In [62]:
results = calculate_inventory_kpis(df_n3_uncensored, "N3")

Method: N3
- Pinball Loss (q=0.9): 0.448
- Average OOS Rate: 0.124
- Average Alpha Service Level: 0.876
- Average Beta Fill Rate: 0.945
- Total Overstock: 12073
- Total Understock: 1194
- Number of observations: 5094


In [63]:
results = calculate_inventory_kpis(df_em_uncensored, "EM")

Method: EM
- Pinball Loss (q=0.9): 0.425
- Average OOS Rate: 0.098
- Average Alpha Service Level: 0.902
- Average Beta Fill Rate: 0.956
- Total Overstock: 13157
- Total Understock: 944
- Number of observations: 5094


In [64]:
results = calculate_inventory_kpis(df_projection_uncensored, "PD")

Method: PD
- Pinball Loss (q=0.9): 0.447
- Average OOS Rate: 0.116
- Average Alpha Service Level: 0.884
- Average Beta Fill Rate: 0.951
- Total Overstock: 13107
- Total Understock: 1071
- Number of observations: 5094


In [65]:
results = calculate_inventory_kpis(df_nahmias_uncensored, "Nahmias, Hierarchical")

Method: Nahmias, Hierarchical
- Pinball Loss (q=0.9): 0.448
- Average OOS Rate: 0.114
- Average Alpha Service Level: 0.886
- Average Beta Fill Rate: 0.950
- Total Overstock: 13001
- Total Understock: 1091
- Number of observations: 5094


In [66]:
results = calculate_inventory_kpis(df_conrad_uncensored, "Conrad, Hierarchical")

Method: Conrad, Hierarchical
- Pinball Loss (q=0.9): 0.446
- Average OOS Rate: 0.098
- Average Alpha Service Level: 0.902
- Average Beta Fill Rate: 0.954
- Total Overstock: 13777
- Total Understock: 996
- Number of observations: 5094
