# Testing for One Magazine

At the end of this notebook, there is also a script that runs over all of the magazines.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the single-magazine test file; `path` and `df` are reused by later cells.
path = 'Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv'
df = pd.read_csv(path)

# Boolean flag derived from the 'Zensiert' indicator column.
df["is_stockout"] = df["Zensiert"].eq(1)
# Keep only rows where both 'Verkauf_MBR' and 'Verkauf' are present.
df = df.dropna(subset=['Verkauf_MBR', 'Verkauf'])

Calculate KPIs for one magazine and one method.

In [6]:
import pandas as pd
import numpy as np

def calculate_kpis(dataframe, method_name, source=None):
    """
    Calculate and print KPIs for one uncensoring method.

    Only rows with ``Zensiert == 1`` are evaluated; for those rows the estimate
    in 'Verkauf_Uncensored' is compared with the value in 'Verkauf_MBR'.

    Parameters:
    - dataframe: pandas DataFrame containing 'Zensiert', 'Verkauf_Uncensored'
                 (estimated demand) and 'Verkauf_MBR' (ground truth) columns
    - method_name: string name of the method being evaluated (printout only)
    - source: optional label of the evaluated data set (printout only).
              Defaults to the notebook-level variable `path` when it exists,
              otherwise to 'unknown source'. Pass it explicitly when looping
              over several files so the printout names the right one.

    The weighted MAE is reported for four fixed exponents, with weights
    w_i = y_true_i ** α:
            α = 0: standard MAE (no weighting)
            α = 1: linear weighting by true demand
            α = 1.5: over-proportional weight on large true demands
            α = 0.5: more emphasis on smaller demands

    Returns:
    dict with the keys 'bias', 'weighted_mae_0', 'weighted_mae_0.5',
    'weighted_mae_1', 'weighted_mae_1.5', 'accuracy', 'overestimation_rate',
    'underestimation_rate', 'gini_coefficient' and 'overstock'.
    If no row is censored, every metric is NaN except 'overstock' (0.0).
    """
    censored_df = dataframe.loc[dataframe['Zensiert'] == 1]

    # Extract predicted and true values
    y_pred = censored_df['Verkauf_Uncensored'].to_numpy()  # ŷᵢ (estimated demand)
    y_true = censored_df['Verkauf_MBR'].to_numpy()  # yᵢ (true demand)
    n = len(y_pred)

    if source is None:
        # Backward-compatible default: the label the notebook defines globally.
        source = globals().get('path', 'unknown source')

    if n == 0:
        # Nothing to evaluate: report that explicitly instead of dividing by zero.
        print(f"Method: {method_name}, on {source}")
        print("- No censored observations to evaluate")
        return {
            'bias': np.nan,
            'weighted_mae_0': np.nan,
            'weighted_mae_0.5': np.nan,
            'weighted_mae_1': np.nan,
            'weighted_mae_1.5': np.nan,
            'accuracy': np.nan,
            'overestimation_rate': np.nan,
            'underestimation_rate': np.nan,
            'gini_coefficient': np.nan,
            'overstock': 0.0,
        }

    errors = y_pred - y_true
    abs_errors = np.abs(errors)

    # 1. Bias calculation (mean signed error)
    bias = np.sum(errors) / n

    # 2. Accuracy (exact matches)
    accuracy = np.sum(y_pred == y_true) / n

    # 3. Overestimation Rate
    overestimation_rate = np.sum(y_pred > y_true) / n

    # 4. Underestimation Rate
    underestimation_rate = np.sum(y_pred < y_true) / n

    # 5. Weighted MAE for different alpha values
    weighted_maes = {}
    for a in [0, 0.5, 1, 1.5]:
        if a == 0:
            # Standard MAE (no weighting); handled separately so that rows
            # with a true demand of 0 still count.
            weighted_maes[a] = np.mean(abs_errors)
        else:
            # Rows with a true demand of 0 get weight 0.
            weights = np.where(y_true == 0, 0, np.power(y_true, a))
            weight_total = np.sum(weights)
            weighted_maes[a] = np.sum(weights * abs_errors) / weight_total if weight_total > 0 else 0

    # 6. Gini coefficient of the absolute errors (rank-weighted sum formula)
    sorted_errors = np.sort(abs_errors)
    total_error = np.sum(sorted_errors)
    if total_error == 0:
        # All estimates are exact: the errors are perfectly equal.
        gini_coefficient = np.float64(0.0)
    else:
        ranks = np.arange(1, n + 1)
        gini_coefficient = (2 * np.dot(ranks, sorted_errors)) / (n * total_error) - (n + 1) / n

    # 7. Overstock, out of curiosity: summed positive part of the errors
    overstock = np.sum(np.maximum(0, errors))

    # Determine bias direction
    bias_direction = "overestimation" if bias > 0 else "underestimation" if bias < 0 else "neutral"

    # Print results in the specified format
    print(f"Method: {method_name}, on {source}")
    print(f"- Bias: {bias:.3f} ({bias_direction})")
    print(f"- Weighted MAE (α=0): {weighted_maes[0]:.3f}")
    print(f"- Weighted MAE (α=0.5): {weighted_maes[0.5]:.3f}")
    print(f"- Weighted MAE (α=1): {weighted_maes[1]:.3f}")
    print(f"- Weighted MAE (α=1.5): {weighted_maes[1.5]:.3f}")
    print(f"- Accuracy (exact matches): {accuracy:.3f}")
    print(f"- Overestimation Rate: {overestimation_rate:.3f}")
    print(f"- Underestimation Rate: {underestimation_rate:.3f}")
    print(f"- Gini Coefficient: {gini_coefficient:.3f}")
    print(f"Overstock: {int(overstock)}")

    return {
        'bias': bias,
        'weighted_mae_0': weighted_maes[0],
        'weighted_mae_0.5': weighted_maes[0.5],
        'weighted_mae_1': weighted_maes[1],
        'weighted_mae_1.5': weighted_maes[1.5],
        'accuracy': accuracy,
        'overestimation_rate': overestimation_rate,
        'underestimation_rate': underestimation_rate,
        'gini_coefficient': gini_coefficient,
        'overstock': overstock
    }

# N3

In [42]:
def apply_n3_uncensoring(df):
    """
    N3: replace censored values with max(current value, average of non-censored values) within each POS.

    Parameters:
    - df: DataFrame with the columns 'EHASTRA_EH_NUMMER' (POS key), 'Verkauf'
          and 'Zensiert' (1 = censored, 0 = not censored)

    Returns:
    A copy of `df` with an added 'Verkauf_Uncensored' column. The input frame
    is not modified and its index is preserved. Censored rows of a POS that
    has no non-censored rows keep their original 'Verkauf' value.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    # Rounded mean of the non-censored sales per POS, indexed by POS key.
    open_means = (
        df.loc[df['Zensiert'] == 0]
        .groupby('EHASTRA_EH_NUMMER')['Verkauf']
        .mean()
        .round()
    )

    # Look the mean up per row instead of merging: this keeps the caller's
    # index and does not need a temporary helper column in the frame.
    pos_mean = df['EHASTRA_EH_NUMMER'].map(open_means)

    mask = (df['Zensiert'] == 1) & pos_mean.notna()
    df.loc[mask, 'Verkauf_Uncensored'] = np.maximum(
        df.loc[mask, 'Verkauf'].to_numpy(),
        pos_mean[mask].to_numpy(),
    )

    return df

# Testing on one Magazine: uncensor the loaded frame with N3, then print and return its KPIs
df_n3_uncensored = apply_n3_uncensoring(df)
calculate_kpis(df_n3_uncensored, "N3")

Method: N3, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 0.106 (overestimation)
- Weighted MAE (α=0): 0.886
- Weighted MAE (α=0.5): 1.046
- Weighted MAE (α=1): 1.267
- Weighted MAE (α=1.5): 1.550
- Accuracy (exact matches): 0.448
- Overestimation Rate: 0.266
- Underestimation Rate: 0.285
- Gini Coefficient: 0.608
Overstock: 37548


{'bias': np.float64(0.10611624834874504),
 'weighted_mae_0': np.float64(0.8859048877146631),
 'weighted_mae_0.5': np.float64(1.04554339869066),
 'weighted_mae_1': np.float64(1.2670694449212179),
 'weighted_mae_1.5': np.float64(1.5496538874583028),
 'accuracy': np.float64(0.44837516512549536),
 'overestimation_rate': np.float64(0.2664993394980185),
 'underestimation_rate': np.float64(0.2851254953764861),
 'gini_coefficient': np.float64(0.6080318989078881),
 'overstock': np.float64(37548.0)}

# N2

In [41]:
def apply_n2_uncensoring(df):
    """
    N2: replace censored values with mean of uncensored values within each POS.

    Parameters:
    - df: DataFrame with the columns 'EHASTRA_EH_NUMMER' (POS key), 'Verkauf'
          and 'Zensiert' (1 = censored, 0 = not censored)

    Returns:
    A copy of `df` with an added 'Verkauf_Uncensored' column. The input frame
    is not modified and its index is preserved. Censored rows of a POS that
    has no uncensored rows keep their original 'Verkauf' value.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    # Rounded mean of the uncensored observations per POS, indexed by POS key.
    uncensored_means = (
        df.loc[df['Zensiert'] == 0]
        .groupby('EHASTRA_EH_NUMMER')['Verkauf']
        .mean()
        .round()
    )

    # Look the mean up per row instead of merging: this keeps the caller's
    # index and does not need a temporary helper column in the frame.
    pos_mean = df['EHASTRA_EH_NUMMER'].map(uncensored_means)

    mask = (df['Zensiert'] == 1) & pos_mean.notna()
    df.loc[mask, 'Verkauf_Uncensored'] = pos_mean[mask].to_numpy()

    return df

# Testing on one Magazine: uncensor the loaded frame with N2, then print and return its KPIs
df_n2_uncensored = apply_n2_uncensoring(df)
calculate_kpis(df_n2_uncensored, "N2")

Method: N2, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.281 (underestimation)
- Weighted MAE (α=0): 1.273
- Weighted MAE (α=0.5): 1.495
- Weighted MAE (α=1): 1.813
- Weighted MAE (α=1.5): 2.248
- Accuracy (exact matches): 0.266
- Overestimation Rate: 0.266
- Underestimation Rate: 0.468
- Gini Coefficient: 0.493
Overstock: 37548


{'bias': np.float64(-0.2808586525759577),
 'weighted_mae_0': np.float64(1.272879788639366),
 'weighted_mae_0.5': np.float64(1.4952776743238234),
 'weighted_mae_1': np.float64(1.8129855938118613),
 'weighted_mae_1.5': np.float64(2.248353854270422),
 'accuracy': np.float64(0.2657859973579921),
 'overestimation_rate': np.float64(0.2664993394980185),
 'underestimation_rate': np.float64(0.4677146631439894),
 'gini_coefficient': np.float64(0.4931204543748027),
 'overstock': np.float64(37548.0)}

# N1

In [None]:
def apply_n1_uncensoring(df):
    """
    N1: replace censored values with mean of all values within each POS.

    The per-POS mean is taken over every row of the POS (censored and
    uncensored alike) and rounded before it is written to the censored rows.
    Returns a copy of `df` with an added 'Verkauf_Uncensored' column.
    """
    result = df.copy()
    result['Verkauf_Uncensored'] = result['Verkauf'].copy()

    # Rows flagged as censored are the only ones that get overwritten
    censored_rows = result["Zensiert"].eq(1)

    # Rounded POS mean, broadcast back to one value per row
    per_pos = result.groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
    rounded_means = per_pos.transform('mean').round()

    result.loc[censored_rows, 'Verkauf_Uncensored'] = rounded_means[censored_rows]

    return result

# Testing on one Magazine: uncensor the loaded frame with N1, then print and return its KPIs
df_n1_uncensored = apply_n1_uncensoring(df)
calculate_kpis(df_n1_uncensored, "N1")

Method: N1, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.480 (underestimation)
- Weighted MAE (α=0): 1.196
- Weighted MAE (α=0.5): 1.464
- Weighted MAE (α=1): 1.847
- Weighted MAE (α=1.5): 2.368
- Accuracy (exact matches): 0.308
- Overestimation Rate: 0.209
- Underestimation Rate: 0.483
- Gini Coefficient: 0.524
Overstock: 27114


{'bias': np.float64(-0.47998678996036986),
 'weighted_mae_0': np.float64(1.1963408190224571),
 'weighted_mae_0.5': np.float64(1.4640724794304336),
 'weighted_mae_1': np.float64(1.8471759751925212),
 'weighted_mae_1.5': np.float64(2.3677733207398495),
 'accuracy': np.float64(0.3079656538969617),
 'overestimation_rate': np.float64(0.2088110964332893),
 'underestimation_rate': np.float64(0.483223249669749),
 'gini_coefficient': np.float64(0.5237275079941357),
 'overstock': np.float64(27114.0)}

# EM

In [None]:
def apply_em_uncensoring(df, max_iter=30, tolerance=1e-6):
    """
    EM: replace censored values with their expected value under a per-POS Poisson model.

    For every POS the Poisson rate is initialised with the mean of the
    uncensored sales and then refined by alternating between
    (E-step) the conditional mean of each censored row given that demand was
    at least the observed value, and (M-step) the mean over uncensored sales
    and those conditional means. Censored rows finally receive the rounded
    conditional mean, which is never below the observed value.

    Parameters:
    - df: DataFrame with the columns 'EHASTRA_EH_NUMMER' (POS key), 'Verkauf'
          and 'Zensiert' (1 = censored)
    - max_iter: Maximum number of iterations
    - tolerance: Convergence tolerance on the change of the Poisson rate

    Returns:
    A copy of `df` with an added 'Verkauf_Uncensored' column. A POS without
    censored rows or without uncensored rows keeps its original values.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having been executed first.
    from scipy.stats import poisson

    def conditional_mean(censored, rate):
        """E[X | X >= censored] for X ~ Poisson(rate), floored at the observed value."""
        surv_prob = np.maximum(1 - poisson.cdf(censored - 1, rate), 1e-12)
        expected = rate + censored * poisson.pmf(censored, rate) / surv_prob
        return np.maximum(expected, censored.astype(float))

    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    # Grouping by a plain column name yields scalar keys in every pandas version.
    for pos, group in df.groupby('EHASTRA_EH_NUMMER'):
        is_stockout = (group['Zensiert'] == 1).to_numpy()
        stockout_indices = group.index[is_stockout]
        try:
            if not is_stockout.any():
                continue

            sales = group['Verkauf_Uncensored'].values
            uncensored = sales[~is_stockout]
            censored = sales[is_stockout]

            # Skip if no uncensored data - but keep original values
            if len(uncensored) == 0:
                continue

            # Initial rate from the uncensored sales, floored away from zero
            lambda_est = max(np.mean(uncensored), 0.1)

            for _ in range(max_iter):
                lambda_old = lambda_est

                # E-step: expected demand of the censored rows
                expected = conditional_mean(censored, lambda_est)

                # M-step: re-estimate the rate from all rows
                lambda_est = max(np.mean(np.concatenate([uncensored, expected])), 0.1)

                if abs(lambda_est - lambda_old) < tolerance:
                    break

            # Final update with the converged rate
            final_expected = conditional_mean(censored, lambda_est)
            df.loc[stockout_indices, 'Verkauf_Uncensored'] = final_expected.round()

        except Exception as e:
            # Best effort per POS: report the problem and fall back to the
            # original Verkauf values for this POS.
            print(f"EM error for POS {pos}: {e}")
            df.loc[stockout_indices, 'Verkauf_Uncensored'] = df.loc[stockout_indices, 'Verkauf']
            continue

    # Replace any NaN values with original Verkauf
    nan_mask = df['Verkauf_Uncensored'].isna()
    df.loc[nan_mask, 'Verkauf_Uncensored'] = df.loc[nan_mask, 'Verkauf']

    return df

# Testing on one Magazine: uncensor the loaded frame with EM, then print and return its KPIs
df_em_uncensored = apply_em_uncensoring(df)
calculate_kpis(df_em_uncensored, "EM")

Method: EM, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: 1.090 (overestimation)
- Weighted MAE (α=0): 1.250
- Weighted MAE (α=0.5): 1.378
- Weighted MAE (α=1): 1.542
- Weighted MAE (α=1.5): 1.736
- Accuracy (exact matches): 0.255
- Overestimation Rate: 0.686
- Underestimation Rate: 0.058
- Gini Coefficient: 0.472
Overstock: 88579


{'bias': np.float64(1.0898678996036988),
 'weighted_mae_0': np.float64(1.2503963011889037),
 'weighted_mae_0.5': np.float64(1.3777440993516805),
 'weighted_mae_1': np.float64(1.5419341366013297),
 'weighted_mae_1.5': np.float64(1.73585172117765),
 'accuracy': np.float64(0.25501981505944515),
 'overestimation_rate': np.float64(0.6864861294583884),
 'underestimation_rate': np.float64(0.05849405548216645),
 'gini_coefficient': np.float64(0.4715153631344364),
 'overstock': np.float64(88579.0)}

# PD

In [None]:
def apply_pd_uncensoring(df, tau=0.5, max_iter=20, tolerance=1e-4):
    """
    PD: project censored values upwards under a per-POS Poisson model.

    For a censored observation `obs` and rate `lambda`, the projection is the
    integer k >= obs (searched up to obs + max(10, 3*sqrt(lambda))) whose ratio
    of area A = P(obs <= X <= k) to area B = P(X > k) is closest to
    (1 - tau) / tau. The rate starts at the mean of the uncensored sales of
    the POS and is re-estimated from uncensored sales plus projections until
    it changes by less than `tolerance` or `max_iter` is reached.

    Parameters:
    - df: DataFrame with the columns 'EHASTRA_EH_NUMMER' (POS key), 'Verkauf'
          and 'Zensiert' (1 = censored)
    - tau: balance between the two areas, must satisfy 0 < tau <= 1
    - max_iter: maximum number of rate re-estimations per POS
    - tolerance: convergence tolerance on the change of the rate

    Returns:
    A copy of `df` with an added float 'Verkauf_Uncensored' column. A POS
    without censored rows, without valid uncensored rows, or with a
    non-positive initial rate keeps its original values.

    Raises:
    ValueError: if `tau` is outside (0, 1].
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having been executed first.
    from scipy.stats import poisson

    if not 0 < tau <= 1:
        raise ValueError(f"tau must be in (0, 1], got {tau}")
    target_ratio = (1 - tau) / tau

    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy().astype(float)

    def project(obs_val, lambda_est):
        """Return the PD projection of one censored observation as a float."""
        # Invalid inputs: keep the observed value (0.0 if it is missing).
        if pd.isna(obs_val) or pd.isna(lambda_est) or lambda_est <= 0:
            return 0.0 if pd.isna(obs_val) else float(obs_val)

        start = int(round(obs_val))
        upper_bound = start + max(10, int(3 * np.sqrt(lambda_est)))
        candidates = np.arange(start, upper_bound + 1)

        # Evaluate all candidates with a single vectorised cdf call.
        cdf_at_k = poisson.cdf(candidates, lambda_est)
        area_a = cdf_at_k - poisson.cdf(start - 1, lambda_est)  # P(obs <= X <= k)
        area_b = 1 - cdf_at_k                                   # P(X > k)

        # Where the upper tail has vanished the ratio is unusable; compare
        # area A with (1 - tau) directly instead.
        has_tail = area_b > 1e-10
        objective = np.abs(area_a - (1 - tau))
        objective[has_tail] = np.abs(area_a[has_tail] / area_b[has_tail] - target_ratio)

        # argmin returns the first (smallest) k among ties.
        return float(candidates[np.argmin(objective)])

    for pos, group in df.groupby('EHASTRA_EH_NUMMER'):
        try:
            closed_mask = (group['Zensiert'] == 1)
            if not closed_mask.any():
                continue

            open_sales = group.loc[~closed_mask, 'Verkauf_Uncensored'].values
            closed_sales = group.loc[closed_mask, 'Verkauf_Uncensored'].values

            # SKIP POS WITHOUT VALID UNCENSORED DATA
            if len(open_sales) == 0 or np.all(pd.isna(open_sales)):
                continue

            # Initialize lambda parameter
            lambda_est = np.mean(open_sales[~pd.isna(open_sales)])

            # SKIP IF LAMBDA IS INVALID
            if pd.isna(lambda_est) or lambda_est <= 0:
                print(f"Skipping POS {pos}: invalid lambda {lambda_est}")
                continue

            lambda_est = max(lambda_est, 0.1)
            closed_indices = group.index[closed_mask.to_numpy()]

            # Iterative process
            for _ in range(max_iter):
                lambda_old = lambda_est

                # Project closed observations
                projected_values = np.array([project(obs, lambda_est) for obs in closed_sales])

                # Re-estimate lambda
                lambda_est = max(np.mean(np.concatenate([open_sales, projected_values])), 0.1)

                # Check convergence
                if abs(lambda_est - lambda_old) < tolerance:
                    break

            # Final projection with the converged rate
            final_projections = np.array([project(obs, lambda_est) for obs in closed_sales])
            df.loc[closed_indices, 'Verkauf_Uncensored'] = final_projections.round()

        except Exception as e:
            # Best effort per POS: report the problem and keep original values.
            print(f"PD error for POS {pos}: {e}")
            continue

    # FINAL FALLBACK: Any remaining NaN values get original Verkauf
    nan_mask = df['Verkauf_Uncensored'].isna()
    df.loc[nan_mask, 'Verkauf_Uncensored'] = df.loc[nan_mask, 'Verkauf']

    return df

# Testing on one Magazine: uncensor the loaded frame with PD, then print and return its KPIs
df_pd_uncensored = apply_pd_uncensoring(df)
calculate_kpis(df_pd_uncensored, "PD")

Skipping POS EHA0117626: invalid lambda 0.0
Skipping POS EHA0596336: invalid lambda 0.0
Skipping POS EHA1379010: invalid lambda 0.0
Skipping POS EHA1938990: invalid lambda 0.0
Skipping POS EHA1963214: invalid lambda 0.0
Skipping POS EHA2821288: invalid lambda 0.0
Skipping POS EHA3018615: invalid lambda 0.0
Skipping POS EHA4382546: invalid lambda 0.0
Skipping POS EHA4589487: invalid lambda 0.0
Skipping POS EHA5656077: invalid lambda 0.0
Skipping POS EHA5910032: invalid lambda 0.0
Skipping POS EHA6424189: invalid lambda 0.0
Skipping POS EHA8286327: invalid lambda 0.0
Skipping POS EHA8883331: invalid lambda 0.0
Skipping POS EHA9741568: invalid lambda 0.0
Skipping POS EHB1999879: invalid lambda 0.0
Skipping POS EHB2211765: invalid lambda 0.0
Skipping POS EHB2488004: invalid lambda 0.0
Skipping POS EHB2549204: invalid lambda 0.0
Skipping POS EHB2795775: invalid lambda 0.0
Skipping POS EHB3436424: invalid lambda 0.0
Skipping POS EHB5921220: invalid lambda 0.0
Skipping POS EHB6471169: invalid

{'bias': np.float64(0.3127873183619551),
 'weighted_mae_0': np.float64(1.0919947159841479),
 'weighted_mae_0.5': np.float64(1.1744353414255522),
 'weighted_mae_1': np.float64(1.309103703957983),
 'weighted_mae_1.5': np.float64(1.500775294129802),
 'accuracy': np.float64(0.41702774108322327),
 'overestimation_rate': np.float64(0.2731043593130779),
 'underestimation_rate': np.float64(0.3098678996036988),
 'gini_coefficient': np.float64(0.6440586597096092),
 'overstock': np.float64(53171.0)}

# Conrad

In [19]:
from scipy.stats import poisson
import pandas as pd
import numpy as np

def berechnung(links, rechts, n, N, r, x_summe, value_tol=0.00001, max_iterations=1000):
    """
    Bisection search for the Poisson mean mu (Till's code).

    The interval [links, rechts] is halved until the target expression
        (x_summe - mu*n) * P(X >= N) + mu * (n - r) * P(X >= N-1),  X ~ Poisson(mu)
    is within `value_tol` of zero. A negative value moves the right bound to
    the midpoint, a positive value the left bound. The first three iterations
    and the outcome are printed.

    Returns the last midpoint, whether or not the tolerance was reached
    within `max_iterations`.
    """
    for iteration in range(max_iterations):
        mu = (links + rechts) / 2
        tail_from_N = 1 - poisson.cdf(N-1, mu)
        tail_from_N_minus_1 = 1 - poisson.cdf(N-2, mu)
        wert_0 = (x_summe - mu * n) * tail_from_N + mu * (n - r) * tail_from_N_minus_1

        if iteration < 3:
            print(f"Iter {iteration}: mu={mu:.4f}, wert_0={wert_0:.8f}")

        if abs(wert_0) < value_tol:
            print(f"Converged after {iteration} iterations: mu={mu:.4f}, wert_0={wert_0:.8f}")
            return mu

        # Shrink the bracket towards the side on which the sign changes
        if wert_0 < 0:
            rechts = mu
        elif wert_0 > 0:
            links = mu

    print(f"✗ Max iterations reached: mu={mu:.4f}")
    return mu

def test_conrad_example():
    """Run `berechnung` on one fixed example and print the result next to the expected value."""
    links, rechts = 1, 100
    n, N, r = 13, 10, 7
    x_summe = 58

    print(f"links={links}, rechts={rechts}")
    print(f"n={n}, N={N}, r={r}, x_summe={x_summe}")

    result = berechnung(links, rechts, n, N, r, x_summe)

    print(f"Result: μ = {result:.4f}")
    print(f"Expected: μ ≈ 10.18")

    return result

def create_order_specific_mu_dict(df):
    """
    Estimate one Poisson mean per (Heftjahr, Heftnummer, Bezug) group.

    For each group, `berechnung` is called with the group size n, N = Bezug,
    r = number of rows with Zensiert != 1 and x_summe = summed 'Verkauf' of
    those rows. Groups with fewer than 3 rows, without any stockout, or with
    only stockouts are skipped.

    Returns a dict mapping (year, week, bezug_val) to the estimated mean.
    """
    order_specific_mu_dict = {}

    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            N = bezug_val

            is_stockout = group['Zensiert'] == 1
            r = n - is_stockout.sum()  # number of NON-stockouts
            x_summe = group.loc[~is_stockout, 'Verkauf'].sum()  # uncensored sales only

            # Too small, no censoring information, or no uncensored observation
            if n < 3 or r == n or r == 0:
                continue

            try:
                # Search bracket for mu is fixed to [1, 100]
                mu_est = berechnung(1, 100, n, N, r, x_summe)
                if mu_est:
                    order_specific_mu_dict[(year, week, bezug_val)] = mu_est
            except Exception as e:
                print(f"Error in week {week}, Bezug {N}: {e}")
                continue

    print(f"Successfully estimated μ for {len(order_specific_mu_dict)} groups")
    return order_specific_mu_dict

def expected_poisson_tail(mu, N, max_k=200):
    """
    Compute E[X | X >= N] for X ~ Poisson(mu)

    Uses the closed form  mu * P(X >= N-1) / P(X >= N), so the result does not
    depend on where a finite sum is cut off.

    Parameters:
    - mu: Poisson mean
    - N: lower bound of the conditioning event
    - max_k: kept for backward compatibility only; unused, because the closed
             form needs no summation limit

    Returns:
    The conditional mean, or N itself when P(X >= N) < 1e-8 (fallback: don't
    uncensor).
    """
    tail_prob = poisson.sf(N - 1, mu)  # P(X >= N)
    if tail_prob < 1e-8:
        return N  # fallback: don't uncensor
    # sum_{k>=N} k * pmf(k) = mu * P(X >= N-1)
    return mu * poisson.sf(N - 2, mu) / tail_prob

# Internal function 
def apply_conrad_uncensoring_1(df, order_specific_mu_dict):
    """
    Write the Conrad estimate into 'Verkauf_Uncensored' for every censored row.

    Rows with Zensiert == 0 keep their 'Verkauf'. For all other rows the mean
    stored under (Heftjahr, Heftnummer, Bezug) is looked up; if one exists,
    the row receives the rounded E[X | X >= Bezug], otherwise it keeps its
    'Verkauf'. Returns a modified copy of `df`.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        if row['Zensiert'] != 0:
            # Order-quantity-specific demand parameter, if one was estimated
            mu = order_specific_mu_dict.get((row['Heftjahr'], row['Heftnummer'], row['Bezug']))
            if mu is not None:
                df.at[idx, 'Verkauf_Uncensored'] = np.round(expected_poisson_tail(mu, row['Bezug']))

    return df

# Wrapper function for script
def apply_conrad_uncensoring(df):
    """
    Wrapper called by the main processing loop: estimate the per-group means
    from `df` itself, then uncensor `df` with them.
    """
    return apply_conrad_uncensoring_1(df, create_order_specific_mu_dict(df))

# Two-step variant, kept for reference. It uses the internal function, which
# takes the precomputed mu dictionary (the wrapper accepts only `df`):
# print("\nRunning on actual data:")
# order_specific_mu_dict = create_order_specific_mu_dict(df)
# df_conrad_uncensored = apply_conrad_uncensoring_1(df, order_specific_mu_dict)

# Testing on one Magazine: uncensor the loaded frame with Conrad, then print and return its KPIs
df_conrad_uncensored = apply_conrad_uncensoring(df)
calculate_kpis(df_conrad_uncensored, "Conrad")

Iter 0: mu=50.5000, wert_0=-101.00000000
Iter 1: mu=25.7500, wert_0=-51.49999998
Iter 2: mu=13.3750, wert_0=-26.74777675
Converged after 23 iterations: mu=3.9797, wert_0=0.00000935
Iter 0: mu=50.5000, wert_0=-8620.50000000
Iter 1: mu=25.7500, wert_0=-4338.74999833
Iter 2: mu=13.3750, wert_0=-2197.76755762
Converged after 16 iterations: mu=1.8527, wert_0=-0.00000916
Iter 0: mu=50.5000, wert_0=-13048.50000000
Iter 1: mu=25.7500, wert_0=-6539.24997769
Iter 2: mu=13.3750, wert_0=-3283.86478561
Converged after 26 iterations: mu=1.7449, wert_0=-0.00000848
Iter 0: mu=50.5000, wert_0=-6390.00000000
Iter 1: mu=25.7500, wert_0=-3122.99989831
Iter 2: mu=13.3750, wert_0=-1487.71558458
Converged after 25 iterations: mu=3.1311, wert_0=0.00000396
Iter 0: mu=50.5000, wert_0=-2776.00000000
Iter 1: mu=25.7500, wert_0=-1340.49959102
Iter 2: mu=13.3750, wert_0=-619.00666414
Converged after 27 iterations: mu=4.7264, wert_0=-0.00000398
Iter 0: mu=50.5000, wert_0=-3322.50000000
Iter 1: mu=25.7500, wert_0=-16

{'bias': np.float64(0.16947159841479525),
 'weighted_mae_0': np.float64(1.024795244385733),
 'weighted_mae_0.5': np.float64(1.1315752643692643),
 'weighted_mae_1': np.float64(1.3438662135411303),
 'weighted_mae_1.5': np.float64(1.6733425191740245),
 'accuracy': np.float64(0.2608322324966975),
 'overestimation_rate': np.float64(0.4714795244385733),
 'underestimation_rate': np.float64(0.2676882430647292),
 'gini_coefficient': np.float64(0.43335970760648257),
 'overstock': np.float64(45203.0)}

# Nahmias

In [26]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def nahmias_estimation(sales, S):
    """
    Nahmias method for censored normal data

    Observations below `S` are treated as observed, all others as censored at
    `S`. With p = r/n the observed share, z = Phi^-1(p) and m = phi(z)/p, the
    estimates are
        sigma_hat^2 = s^2 / (1 - z*m - m^2)
        mu_hat      = x_bar + sigma_hat * m
    where x_bar and s^2 are the mean and sample variance of the observed values.

    Parameters:
    - sales: sequence of sales values
    - S: censoring limit

    Returns:
    (mu_hat, sigma_hat), or (None, None) when no estimate is possible: empty
    input, fewer than 2 observed values, fewer than 2 censored values, or a
    non-finite / non-positive result.
    """
    sales = np.asarray(sales)
    n = len(sales)
    if n == 0:
        # Nothing to estimate from; avoids dividing by zero below.
        return None, None

    observed = sales[sales < S]
    r = len(observed)

    # Need at least two observed values (for the variance) and at least two
    # censored ones; this also guarantees 0 < p < 1.
    if r < 2 or r >= n - 1:
        return None, None
    p = r / n

    try:
        x_bar = np.mean(observed)
        s2 = np.var(observed, ddof=1)
        z = norm.ppf(p)
        mills = norm.pdf(z) / p

        # Variance ratio of the truncated to the untruncated distribution
        shrink = 1 - z * mills - mills ** 2
        if not shrink > 0:
            return None, None

        sigma_hat = np.sqrt(s2 / shrink)
        mu_hat = x_bar + sigma_hat * mills
    except (ArithmeticError, ValueError):
        return None, None

    if not np.isfinite(mu_hat) or not np.isfinite(sigma_hat) or sigma_hat <= 0:
        return None, None

    return mu_hat, sigma_hat

def create_order_specific_nahmias_dict(df):
    """
    Create μ and σ estimates for each (Heftjahr, Heftnummer, Bezug) combination.

    Groups with fewer than 5 rows are skipped, as are groups for which
    `nahmias_estimation` returns no estimate or raises.

    NOTE(review): censoring is inferred inside `nahmias_estimation` from
    `Verkauf < Bezug`; the 'Zensiert' flag is not used here, unlike in the
    Conrad helper. The recorded KPIs for "Nahmias" are identical to those of
    the baseline for the same file, so possibly no group yields an estimate —
    confirm against the data.

    Returns (mu_dict, sigma_dict), both keyed by (year, week, bezug_val).
    """
    order_specific_mu_dict = {}
    order_specific_sigma_dict = {}

    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            if len(group) < 5:
                continue

            try:
                # The order quantity acts as the censoring limit S
                mu_est, sigma_est = nahmias_estimation(group['Verkauf'].values, bezug_val)
            except Exception:
                # Best effort: a failing group is skipped without a message
                continue

            if mu_est is None or sigma_est is None:
                continue

            key = (year, week, bezug_val)
            order_specific_mu_dict[key] = mu_est
            order_specific_sigma_dict[key] = sigma_est

    # print(f"Successfully estimated μ,σ for {len(order_specific_mu_dict)} groups")
    return order_specific_mu_dict, order_specific_sigma_dict

def expected_normal_tail(mu, sigma, S):
    """
    Compute E[X | X >= S] for X ~ Normal(mu, sigma)

    Returns S unchanged when sigma is not positive, when S lies more than
    6 standard deviations above mu, or when the tail probability is below 1e-10.
    """
    fallback = S
    if sigma <= 0:
        return fallback

    z = (S - mu) / sigma

    # Far in the upper tail the probability is not evaluated at all
    tail_prob = None if z > 6 else 1 - norm.cdf(z)
    if tail_prob is None or tail_prob < 1e-10:
        return fallback

    return mu + sigma * norm.pdf(z) / tail_prob

def apply_nahmias_uncensoring_1(df, order_specific_mu_dict, order_specific_sigma_dict):
    """
    Write the Nahmias estimate into 'Verkauf_Uncensored' for every censored row.

    Rows with Zensiert == 0 keep their 'Verkauf'. For all other rows μ and σ
    are looked up under (Heftjahr, Heftnummer, Bezug); if both exist, the row
    receives the rounded E[X | X >= Bezug], otherwise it keeps its 'Verkauf'.
    Returns a modified copy of `df`.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        if row['Zensiert'] != 0:
            key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
            mu = order_specific_mu_dict.get(key)
            sigma = order_specific_sigma_dict.get(key)

            if mu is not None and sigma is not None:
                df.at[idx, 'Verkauf_Uncensored'] = np.round(expected_normal_tail(mu, sigma, row['Bezug']))

    return df

def test_nahmias():
    """
    Check `nahmias_estimation` on simulated data: normal demand with known
    mean and standard deviation, censored from above at S. Prints the
    corrected estimates next to the naive ones and returns (mu_hat, sigma_hat).
    """
    mu_true, sigma_true = 100, 30
    S = 110
    n = 100

    # Seed the global NumPy generator so the simulated sample is reproducible
    np.random.seed(42)
    sales = np.minimum(np.random.normal(mu_true, sigma_true, n), S)

    print("Testing Nahmias implementation:")
    print(f"True μ: {mu_true}, True σ: {sigma_true}")
    print(f"S (censoring limit): {S}")
    print(f"Sample size: {n}")

    mu_hat, sigma_hat = nahmias_estimation(sales, S)

    # Naive estimates ignore the censoring altogether
    naive_mean, naive_std = np.mean(sales), np.std(sales, ddof=1)

    print(f"True mean: {mu_true}")
    print(f"Naive mean (sales): {naive_mean:.2f}")
    print(f"Corrected estimator (Nahmias): {mu_hat:.2f}")
    print(f"True Std.Dev.: {sigma_true}")
    print(f"Corrected Std.Dev.: {sigma_hat:.2f}")
    print(f"Naive Std.Dev. (sales): {naive_std:.2f}")

    return mu_hat, sigma_hat

def apply_nahmias_uncensoring(df):
    """
    Wrapper called by the main processing loop: estimate the per-group μ and σ
    from `df` itself, then uncensor `df` with them.
    """
    mu_by_group, sigma_by_group = create_order_specific_nahmias_dict(df)
    return apply_nahmias_uncensoring_1(df, mu_by_group, sigma_by_group)


# Testing on one Magazine: uncensor the loaded frame with Nahmias, then print and return its KPIs
df_nahmias_uncensoring = apply_nahmias_uncensoring(df)
calculate_kpis(df_nahmias_uncensoring, "Nahmias")

Method: Nahmias, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.500 (underestimation)
- Weighted MAE (α=0): 0.872
- Weighted MAE (α=0.5): 1.057
- Weighted MAE (α=1): 1.341
- Weighted MAE (α=1.5): 1.734
- Accuracy (exact matches): 0.346
- Overestimation Rate: 0.171
- Underestimation Rate: 0.483
- Gini Coefficient: 0.485
Overstock: 14069


{'bias': np.float64(-0.5001188903566711),
 'weighted_mae_0': np.float64(0.8718229854689564),
 'weighted_mae_0.5': np.float64(1.057487934658794),
 'weighted_mae_1': np.float64(1.3411657684253888),
 'weighted_mae_1.5': np.float64(1.7343575685465282),
 'accuracy': np.float64(0.346446499339498),
 'overestimation_rate': np.float64(0.17093791281373844),
 'underestimation_rate': np.float64(0.48261558784676356),
 'gini_coefficient': np.float64(0.4853685393289462),
 'overstock': np.float64(14069.0)}

In [None]:
import pandas as pd
import numpy as np

def apply_baseline_uncensoring(df):
    """
    Baseline: no uncensoring. Returns a copy of `df` whose
    'Verkauf_Uncensored' column simply repeats 'Verkauf'.
    """
    return df.assign(Verkauf_Uncensored=df['Verkauf'])

# df_baseline_uncensored = apply_baseline_uncensoring(df)
# calculate_kpis(df_baseline_uncensored, "baseline")

Method: baseline_A, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.500 (underestimation)
- Weighted MAE (α=0): 0.872
- Weighted MAE (α=0.5): 1.057
- Weighted MAE (α=1): 1.341
- Weighted MAE (α=1.5): 1.734
- Accuracy (exact matches): 0.346
- Overestimation Rate: 0.171
- Underestimation Rate: 0.483
- Gini Coefficient: 0.485
Overstock: 14069
Processed magazine A
Method: baseline_B, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.613 (underestimation)
- Weighted MAE (α=0): 0.893
- Weighted MAE (α=0.5): 1.165
- Weighted MAE (α=1): 1.730
- Weighted MAE (α=1.5): 2.944
- Accuracy (exact matches): 0.359
- Overestimation Rate: 0.128
- Underestimation Rate: 0.513
- Gini Coefficient: 0.512
Overstock: 6785
Processed magazine B
Method: baseline_C, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.454 (underestimation)
- Weighted MAE (α=0): 0.742
- Weighted MAE (α=0.5): 0.812
- Weighted MAE (α=1): 0.905
- Weighted MAE (α=1.5): 1.030
- Accuracy (exact matches): 0.364
- Overes

In [130]:
import pandas as pd
import numpy as np

def calculate_kpis(dataframe, method_name, source=None):
    """
    Compute and print the uncensoring KPIs on the censored (stockout) rows.

    Only rows with ``Zensiert == 1`` are evaluated. ``Verkauf_Uncensored`` is the
    estimated demand (ŷᵢ) and ``Verkauf_MBR`` is the true demand (yᵢ).

    Parameters:
    - dataframe: pandas DataFrame with 'Zensiert', 'Verkauf_Uncensored' and
      'Verkauf_MBR' columns
    - method_name: string name of the method being evaluated (display only)
    - source: optional label of the evaluated file for the printed header.
      When omitted, the notebook-level ``path`` variable is used if it exists,
      which keeps existing two-argument calls working.

    Weighted MAE uses weights yᵢ^α and is reported for α in {0, 0.5, 1, 1.5}:
             α = 0: standard MAE (no weighting)
             α = 1: linear weighting by true demand
             α > 1: over-proportional penalization of larger demands --> 1.5
             α < 1: emphasis on smaller demands --> 0.5

    Returns:
    - dict with the keys 'bias', 'weighted_mae_0', 'weighted_mae_0.5',
      'weighted_mae_1', 'weighted_mae_1.5', 'accuracy', 'overestimation_rate',
      'underestimation_rate', 'gini_coefficient' and 'overstock'. All values are
      NaN when the frame contains no censored rows.
    """
    kpi_names = [
        'bias', 'weighted_mae_0', 'weighted_mae_0.5', 'weighted_mae_1',
        'weighted_mae_1.5', 'accuracy', 'overestimation_rate',
        'underestimation_rate', 'gini_coefficient', 'overstock',
    ]

    # Fall back to the notebook-global `path` only when no label was passed in.
    if source is None:
        source = globals().get('path', 'unknown source')

    censored_df = dataframe[dataframe['Zensiert'] == 1]
    y_pred = censored_df['Verkauf_Uncensored'].to_numpy(dtype=float)  # ŷᵢ (estimated demand)
    y_true = censored_df['Verkauf_MBR'].to_numpy(dtype=float)  # yᵢ (true demand)
    n = len(y_pred)

    if n == 0:
        # Every KPI is a ratio over n; report "undefined" instead of dividing by zero.
        print(f"Method: {method_name}, on {source}")
        print("- No censored rows (Zensiert == 1); KPIs are undefined")
        return {name: np.nan for name in kpi_names}

    errors = y_pred - y_true
    abs_errors = np.abs(errors)

    # 1. Bias, 2. accuracy (exact matches), 3./4. over- and underestimation rate
    bias = np.mean(errors)
    accuracy = np.mean(y_pred == y_true)
    overestimation_rate = np.mean(y_pred > y_true)
    underestimation_rate = np.mean(y_pred < y_true)

    # 5. Weighted MAE for the reported alpha values
    weighted_maes = {}
    for a in [0, 0.5, 1, 1.5]:
        if a == 0:
            weighted_maes[a] = np.mean(abs_errors)
            continue
        weights = np.where(y_true == 0, 0, np.power(y_true, a))
        weight_total = np.sum(weights)
        # All-zero true demand leaves nothing to weight by; report 0 in that case.
        weighted_maes[a] = np.sum(weights * abs_errors) / weight_total if weight_total > 0 else 0

    # 6. Gini coefficient of the absolute errors (rank-weighted formula)
    sorted_errors = np.sort(abs_errors)
    total_sum = np.sum(sorted_errors)
    if total_sum == 0:
        # Perfect predictions: the errors are perfectly equal, so define Gini as 0
        # (same convention as calculate_kpis_for_pos_fast) instead of 0/0 = NaN.
        gini_coefficient = 0.0
    else:
        ranks = np.arange(1, n + 1)
        gini_coefficient = (2 * np.sum(ranks * sorted_errors)) / (n * total_sum) - (n + 1) / n

    # 7. Overstock, out of curiosity: total amount by which demand was overestimated
    overstock = np.sum(np.maximum(0, errors))

    bias_direction = "overestimation" if bias > 0 else "underestimation" if bias < 0 else "neutral"
    # int() cannot convert NaN (e.g. missing ground truth), so print it verbatim.
    overstock_text = str(int(overstock)) if np.isfinite(overstock) else "nan"

    print(f"Method: {method_name}, on {source}")
    print(f"- Bias: {bias:.3f} ({bias_direction})")
    print(f"- Weighted MAE (α=0): {weighted_maes[0]:.3f}")
    print(f"- Weighted MAE (α=0.5): {weighted_maes[0.5]:.3f}")
    print(f"- Weighted MAE (α=1): {weighted_maes[1]:.3f}")
    print(f"- Weighted MAE (α=1.5): {weighted_maes[1.5]:.3f}")
    print(f"- Accuracy (exact matches): {accuracy:.3f}")
    print(f"- Overestimation Rate: {overestimation_rate:.3f}")
    print(f"- Underestimation Rate: {underestimation_rate:.3f}")
    print(f"- Gini Coefficient: {gini_coefficient:.3f}")
    print(f"Overstock: {overstock_text}")

    return {
        'bias': bias,
        'weighted_mae_0': weighted_maes[0],
        'weighted_mae_0.5': weighted_maes[0.5],
        'weighted_mae_1': weighted_maes[1],
        'weighted_mae_1.5': weighted_maes[1.5],
        'accuracy': accuracy,
        'overestimation_rate': overestimation_rate,
        'underestimation_rate': underestimation_rate,
        'gini_coefficient': gini_coefficient,
        'overstock': overstock
    }

# Baseline

In [None]:
def apply_baseline_uncensoring(df):
    """Return a copy of ``df`` whose ``Verkauf_Uncensored`` column mirrors ``Verkauf``.

    This is the do-nothing reference method: censored sales are taken at face
    value as the demand estimate.
    """
    baseline = df.copy()
    baseline['Verkauf_Uncensored'] = baseline['Verkauf']
    return baseline

# Evaluate the baseline on the single magazine loaded into `df`
baseline_df = apply_baseline_uncensoring(df)
calculate_kpis(dataframe=baseline_df, method_name="baseline")

Method: baseline, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.953 (underestimation)
- Weighted MAE (α=0): 0.953
- Weighted MAE (α=0.5): 1.227
- Weighted MAE (α=1): 1.625
- Weighted MAE (α=1.5): 2.143
- Accuracy (exact matches): 0.404
- Overestimation Rate: 0.000
- Underestimation Rate: 0.596
- Gini Coefficient: 0.578
Overstock: 0


{'bias': np.float64(-0.9526287978863937),
 'weighted_mae_0': np.float64(0.9526287978863937),
 'weighted_mae_0.5': np.float64(1.2273189859478337),
 'weighted_mae_1': np.float64(1.6253447072419989),
 'weighted_mae_1.5': np.float64(2.142739203429853),
 'accuracy': np.float64(0.4039233817701453),
 'overestimation_rate': np.float64(0.0),
 'underestimation_rate': np.float64(0.5960766182298547),
 'gini_coefficient': np.float64(0.5780015815264463),
 'overstock': np.float64(0.0)}

In [152]:
import pandas as pd
import numpy as np

def apply_baseline_uncensoring(df):
    """Copy ``df`` and add ``Verkauf_Uncensored`` as an exact copy of ``Verkauf``.

    Serves as the reference point for the real uncensoring methods: the
    observed sales are used unchanged as the demand estimate.
    """
    uncensored = df.copy()
    uncensored['Verkauf_Uncensored'] = df['Verkauf']
    return uncensored

# Process all magazines A through I
magazines = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
all_kpis = []

# calculate_kpis prints the notebook-level `path` in its header. Remember the
# single-magazine value so it can be restored once the loop is done.
single_magazine_path = globals().get('path')

for magazine in magazines:
    # Same directory spelling as the rest of the notebook ('Test1'); the
    # lowercase variant only resolves on case-insensitive filesystems.
    filename = f'Test1/{magazine}_20250212_ZQ0.35_ZG0.4_testfile.csv'
    path = filename  # keeps the printed "on <file>" header in sync with the data

    # Load into a dedicated name so the single-magazine `df` is not overwritten,
    # and drop rows without sales / ground truth like the single-magazine cell does.
    magazine_df = pd.read_csv(filename).dropna(subset=['Verkauf_MBR', 'Verkauf'])

    # Apply baseline uncensoring
    baseline_df = apply_baseline_uncensoring(magazine_df)

    # Calculate KPIs
    kpis = calculate_kpis(baseline_df, f"baseline_{magazine}")
    all_kpis.append(kpis)

    print(f"Processed magazine {magazine}")

if single_magazine_path is not None:
    path = single_magazine_path

# Calculate average KPIs (calculate_kpis always returns a dict of KPI values)
print("\nCalculating averages...")
kpi_df = pd.DataFrame(all_kpis)
average_kpis = kpi_df.mean()
print("\nAverage KPIs:")
print(average_kpis)

print(f"\nProcessed {len(magazines)} magazines successfully.")

Method: baseline_A, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.953 (underestimation)
- Weighted MAE (α=0): 0.953
- Weighted MAE (α=0.5): 1.227
- Weighted MAE (α=1): 1.625
- Weighted MAE (α=1.5): 2.143
- Accuracy (exact matches): 0.404
- Overestimation Rate: 0.000
- Underestimation Rate: 0.596
- Gini Coefficient: 0.578
Overstock: 0
Processed magazine A
Method: baseline_B, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.974 (underestimation)
- Weighted MAE (α=0): 0.974
- Weighted MAE (α=0.5): 1.318
- Weighted MAE (α=1): 1.974
- Weighted MAE (α=1.5): 3.271
- Accuracy (exact matches): 0.400
- Overestimation Rate: 0.000
- Underestimation Rate: 0.600
- Gini Coefficient: 0.582
Overstock: 0
Processed magazine B
Method: baseline_C, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.732 (underestimation)
- Weighted MAE (α=0): 0.732
- Weighted MAE (α=0.5): 0.832
- Weighted MAE (α=1): 0.971
- Weighted MAE (α=1.5): 1.155
- Accuracy (exact matches): 0.434
- Overestimatio

# Bayesian (See other notebooks)

Not implemented in the loop script below due to time constraints. Run the Bayesian-specific notebook, then join its output CSV with the CSV file produced for N1, N2, N3, EM, PD, Conrad and Nahmias.

# Agrawal (See other notebooks)

Not implemented in the loop script below due to time constraints. Run the Agrawal-specific notebook, then join its output CSV with the CSV file produced for N1, N2, N3, EM, PD, Conrad and Nahmias.

# Script for creating csv of all features for all mags

Loops through all magazines, applies each uncensoring method (N1, N2, N3, EM, PD, Conrad, Nahmias, plus the baseline), and evaluates the KPIs per POS.

In [71]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path

def calculate_kpis_for_pos_fast(pos_uncensored_data, pos_censored_data, method_name, magazine_name, pos_id):
    """
    KPI calculation for one POS using pre-filtered data

    Parameters:
    - pos_uncensored_data: Pre-filtered uncensored data for this POS
      (provides the predictions in 'Verkauf_Uncensored')
    - pos_censored_data: Pre-filtered censored data for this POS
      (provides the ground truth in 'Verkauf_MBR')
    - method_name: string name of the method being evaluated (not used in the calculation)
    - magazine_name: string name of the magazine (not used in the calculation)
    - pos_id: EHASTRA_EH_NUMMER for this POS (only used in error messages)

    Returns:
    - dict of KPI name -> value. All values are NaN when fewer than 3 censored
      rows are available.

    Raises:
    - ValueError: if the two inputs do not contain the same number of rows.
      The KPIs compare the rows position by position, so a mismatch would
      otherwise either fail inside numpy or silently broadcast a single row.
    """
    kpi_names = (
        'bias', 'weighted_mae_0', 'weighted_mae_0.5', 'weighted_mae_1',
        'weighted_mae_1.5', 'accuracy', 'overestimation_rate',
        'underestimation_rate', 'gini_coefficient', 'overstock',
    )

    # Too few censored observations (this also covers the empty case).
    if len(pos_censored_data) < 3:
        return {name: np.nan for name in kpi_names}

    # Extract values directly (already filtered)
    y_pred = pos_uncensored_data['Verkauf_Uncensored'].values
    y_true = pos_censored_data['Verkauf_MBR'].values

    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"POS {pos_id}: {len(y_pred)} uncensored rows vs {len(y_true)} censored rows"
        )

    n = len(y_pred)

    errors = y_pred - y_true
    abs_errors = np.abs(errors)

    # 1. Bias, 2. accuracy (exact matches), 3./4. over- and underestimation rate
    bias = np.mean(errors)
    accuracy = np.mean(y_pred == y_true)
    overestimation_rate = np.mean(y_pred > y_true)
    underestimation_rate = np.mean(y_pred < y_true)

    # 5. Weighted MAE with weights y_true ** alpha (alpha = 0 is the plain MAE)
    weighted_maes = {}
    for a in [0, 0.5, 1, 1.5]:
        if a == 0:
            weighted_maes[a] = np.mean(abs_errors)
            continue
        weights = np.where(y_true == 0, 0, np.power(y_true, a))
        weight_total = np.sum(weights)
        # All-zero true demand leaves nothing to weight by; report 0 in that case.
        weighted_maes[a] = np.sum(weights * abs_errors) / weight_total if weight_total > 0 else 0

    # 6. Gini coefficient of the absolute errors (rank-weighted formula)
    sorted_errors = np.sort(abs_errors)
    total_sum = np.sum(sorted_errors)
    if total_sum > 0:
        ranks = np.arange(1, n + 1)
        gini_coefficient = (2 * np.sum(ranks * sorted_errors)) / (n * total_sum) - (n + 1) / n
    else:
        gini_coefficient = 0

    # 7. Overstock: total amount by which the prediction exceeds the truth
    overstock = np.sum(np.maximum(0, errors))

    return {
        'bias': bias,
        'weighted_mae_0': weighted_maes[0],
        'weighted_mae_0.5': weighted_maes[0.5],
        'weighted_mae_1': weighted_maes[1],
        'weighted_mae_1.5': weighted_maes[1.5],
        'accuracy': accuracy,
        'overestimation_rate': overestimation_rate,
        'underestimation_rate': underestimation_rate,
        'gini_coefficient': gini_coefficient,
        'overstock': overstock,
    }

def calculate_pos_features(df, magazine_name):
    """
    Calculate features for each POS (EHASTRA_EH_NUMMER)

    POS whose 'Verkauf' values are all missing are skipped.

    Parameters:
    df (DataFrame): Input dataframe with 'EHASTRA_EH_NUMMER' and 'Verkauf' columns;
        'Laenge' and 'Zensiert' are used when present
    magazine_name (str): Name of the magazine

    Returns:
    DataFrame: One row per POS with EHASTRA_EH_NUMMER, magazine and the calculated
        features. The columns are present even when no POS qualifies.
    """
    feature_columns = [
        'EHASTRA_EH_NUMMER', 'magazine', 'data_sparsity', 'stockout_rate',
        'verkauf_variance', 'verkauf_mean', 'coefficient_of_variation',
        'unique_quantities', 'variance_to_mean_ratio', 'verkauf_autocorr', 'adi',
    ]

    # Column availability is the same for every group, so check it once.
    has_laenge = 'Laenge' in df.columns
    has_zensiert = 'Zensiert' in df.columns

    results = []

    for pos_id, pos_data in df.groupby('EHASTRA_EH_NUMMER'):

        verkauf = pos_data['Verkauf'].dropna()

        if len(verkauf) == 0:
            continue

        features = {
            'EHASTRA_EH_NUMMER': pos_id,
            'magazine': magazine_name
        }

        # 1. Data Sparsity: 'Laenge' of the POS relative to the rows actually present
        actual_data_points = len(pos_data)
        laenge = pos_data['Laenge'].iloc[0] if has_laenge else actual_data_points
        features['data_sparsity'] = laenge / actual_data_points

        # 2. Stockout Rate: share of rows flagged as censored
        features['stockout_rate'] = (pos_data['Zensiert'] == 1).mean() if has_zensiert else 0

        # 3. Verkauf Variance (NaN for a single observation)
        features['verkauf_variance'] = verkauf.var()

        # 4. Verkauf Mean
        mean_sales = verkauf.mean()
        features['verkauf_mean'] = mean_sales

        # 5. Coefficient of Variation (0 when the mean is not positive)
        features['coefficient_of_variation'] = verkauf.std() / mean_sales if mean_sales > 0 else 0

        # 6. Number of unique order quantities
        features['unique_quantities'] = verkauf.nunique()

        # 7. Variance to Mean Ratio (0 when the mean is not positive)
        features['variance_to_mean_ratio'] = (
            features['verkauf_variance'] / mean_sales if mean_sales > 0 else 0
        )

        # 8. Verkauf autocorrelation (lag 1). An undefined correlation stays NaN;
        #    too few points or a failed computation are reported as 0.
        if len(verkauf) > 1:
            try:
                features['verkauf_autocorr'] = verkauf.autocorr(lag=1)
            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt still stops the run.
                features['verkauf_autocorr'] = 0
        else:
            features['verkauf_autocorr'] = 0

        # 9. ADI (Average Inter-demand Interval): periods per period with demand
        non_zero_periods = (verkauf > 0).sum()
        if non_zero_periods > 0:
            features['adi'] = len(verkauf) / non_zero_periods
        else:
            features['adi'] = len(verkauf)

        results.append(features)

    if not results:
        # Keep the schema so callers can still index by 'EHASTRA_EH_NUMMER'.
        return pd.DataFrame(columns=feature_columns)

    return pd.DataFrame(results)

def parse_filename(filename):
    """
    Split an underscore-separated dataset filename into its leading fields.

    Returns the first four fields as (magazine, date, zq, zg), or four ``None``
    values when the name has fewer than four underscore-separated parts.

    Example: A_20250212_ZQ0.35_ZG0.4_testfile.csv -> ('A', '20250212', 'ZQ0.35', 'ZG0.4')
    """
    fields = filename.split('_')
    if len(fields) < 4:
        return None, None, None, None
    magazine, date, zq, zg = fields[:4]
    return magazine, date, zq, zg

def process_all_datasets(data_directory, output_filename='combined_magazine_kpis_features_step1.csv'):
    """
    Process specific datasets and create a combined CSV with KPIs and POS features
    Each row represents one POS/Magazine combination with methods as column prefixes
    Only processes files with ZQ0.35_ZG0.4 pattern

    For every matching file, POS with fewer than 3 censored rows are dropped, the
    POS features are computed once, and each uncensoring method is applied to the
    remaining rows. Per-POS KPIs are stored as '<method>_<kpi>' columns. Errors in
    a method or a file are printed and that method/file is skipped.

    Parameters:
    - data_directory: path to directory containing CSV files
    - output_filename: name of the output file

    Returns:
    - the combined DataFrame (also written to output_filename), or None when no
      file matched or nothing could be processed
    """
    
    # Method mapping (column prefix -> uncensoring function).
    # NOTE(review): the apply_* functions are not defined in this cell; they must
    # already exist in the notebook namespace when this function is called.
    method_functions = {
        'N1': apply_n1_uncensoring,
        'N2': apply_n2_uncensoring,
        'N3': apply_n3_uncensoring,
        'EM': apply_em_uncensoring,
        'PD': apply_pd_uncensoring,
        'Nahmias': apply_nahmias_uncensoring,
        'Conrad': apply_conrad_uncensoring,
        'Baseline': apply_baseline_uncensoring
        #'Bayesian': apply_bayesian_uncensoring,
        #'Agrawal': apply_agrawal_uncensoring
    }
    
    # Find only the specific files with ZQ0.35_ZG0.4 pattern.
    # The glob result is used as returned (not sorted), so the processing order
    # and the row order of the output follow whatever order glob yields.
    pattern = os.path.join(data_directory, "*_20250212_ZQ0.35_ZG0.4_testfile.csv")
    csv_files = glob.glob(pattern)
    
    if not csv_files:
        print(f"No CSV files found matching pattern *_20250212_ZQ0.35_ZG0.4_testfile.csv in {data_directory}")
        return
    
    print(f"Found {len(csv_files)} CSV files to process (ZQ0.35_ZG0.4 only)")
    
    # One dict per POS/magazine combination, collected across all files
    all_results = []
    
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        magazine, date, zq, zg = parse_filename(filename)
        
        if not magazine:
            print(f"Could not parse filename: {filename}")
            continue
            
        print(f"Processing {filename}...")
        
        try:
            # Read the original CSV file
            original_df = pd.read_csv(csv_file)
            
            # Filter out POS locations with less than 3 censored data points
            censored_counts = original_df[original_df['Zensiert'] == 1].groupby('EHASTRA_EH_NUMMER').size()
            valid_pos_ids = censored_counts[censored_counts >= 3].index
            original_df = original_df[original_df['EHASTRA_EH_NUMMER'].isin(valid_pos_ids)]
            
            if len(original_df) == 0:
                print(f"  No POS locations with >= 3 censored data points in {filename}")
                continue
            
            # Calculate POS features once for this magazine (they don't depend on uncensoring method)
            pos_features = calculate_pos_features(original_df, magazine)
            
            # Unique POS IDs remaining after the filter above
            pos_ids = original_df['EHASTRA_EH_NUMMER'].unique()
            
            # Pre-filter censored data by POS to avoid repeated filtering
            censored_mask = original_df['Zensiert'] == 1
            censored_data = original_df[censored_mask]
            pos_censored_groups = censored_data.groupby('EHASTRA_EH_NUMMER')
            
            # Convert pos_features to dictionary for faster lookup
            pos_features_dict = pos_features.set_index('EHASTRA_EH_NUMMER').to_dict('index')
            
            # Results for this file, keyed by POS id; every method adds its columns
            pos_results = {}
            
            # Process each uncensoring method
            for method_name, method_function in method_functions.items():
                print(f"  Applying {method_name} uncensoring...")
      
                try:
                    # Apply the uncensoring method to the entire dataset
                    uncensored_df = method_function(original_df)
                    
                    # Build the censored mask from the method's own output rather than
                    # reusing `censored_mask` from the original frame.
                    # NOTE(review): presumably because a method may return a frame whose
                    # index/rows differ from original_df - confirm.
                    censored_mask_for_this_method = uncensored_df['Zensiert'] == 1
                    uncensored_censored = uncensored_df[censored_mask_for_this_method]
                    uncensored_groups = uncensored_censored.groupby('EHASTRA_EH_NUMMER')
                    
                    # Process all POS locations for this method
                    for pos_id in pos_ids:
                        if pos_id in pos_censored_groups.groups and pos_id in pos_features_dict:
                            # Ground-truth rows come from the original frame, predictions
                            # from the method's output.
                            # NOTE(review): the KPI function compares them by position, so
                            # this assumes both groups hold the same rows in the same
                            # order - verify for methods that reorder or drop rows.
                            pos_censored = pos_censored_groups.get_group(pos_id)
                            pos_uncensored = uncensored_groups.get_group(pos_id) if pos_id in uncensored_groups.groups else None
                            
                            if pos_uncensored is not None and len(pos_uncensored) > 0:
                                # Calculate KPIs using pre-filtered data
                                kpis = calculate_kpis_for_pos_fast(pos_uncensored, pos_censored, method_name, magazine, pos_id)
                                
                                # Initialize POS entry if it doesn't exist
                                if pos_id not in pos_results:
                                    pos_results[pos_id] = {
                                        'EHASTRA_EH_NUMMER': pos_id,
                                        'magazine': magazine,
                                        'date': date,
                                        'zq_parameter': zq,
                                        'zg_parameter': zg,
                                        'filename': filename,
                                        **pos_features_dict[pos_id]  # Add POS features
                                    }
                                
                                # Add method-specific KPIs with method prefix
                                for kpi_name, kpi_value in kpis.items():
                                    column_name = f"{method_name}_{kpi_name}"
                                    pos_results[pos_id][column_name] = kpi_value
                
                except Exception as e:
                    # Best effort: a failing method is reported and skipped, so its
                    # columns may be missing (or partially filled) for this file.
                    print(f"    Error applying {method_name}: {str(e)}")
                    continue
            
            # Add all POS results from this file to main results
            all_results.extend(list(pos_results.values()))
            
        except Exception as e:
            # Best effort: a failing file is reported and skipped entirely.
            print(f"Error processing {filename}: {str(e)}")
            continue
    
    if all_results:
        # Create final DataFrame
        final_df = pd.DataFrame(all_results)
        
        # Save to CSV (path is taken as given, not joined with data_directory)
        output_path = output_filename
        final_df.to_csv(output_path, index=False)
        
        print(f"\nProcessing complete!")
        print(f"Combined dataset saved as: {output_path}")
        print(f"Total rows: {len(final_df)}")
        print(f"Magazines processed: {final_df['magazine'].nunique()}")
        print(f"POS locations: {final_df['EHASTRA_EH_NUMMER'].nunique()}")
        
        return final_df
    else:
        print("No data was processed successfully.")
        return None

# Usage: run every method on all matching files in the data directory
data_directory = "Test1"  # Change this to your directory path
result_df = process_all_datasets(
    data_directory,
    output_filename='NaiveEMPDConradNahmias_KPIs_FeaturesNOTFINAL.csv',
)

Found 9 CSV files to process (ZQ0.35_ZG0.4 only)
Processing E_20250212_ZQ0.35_ZG0.4_testfile.csv...
  Applying N1 uncensoring...
  Applying N2 uncensoring...
  Applying N3 uncensoring...
  Applying EM uncensoring...
  Applying PD uncensoring...
  Applying Nahmias uncensoring...
  Applying Conrad uncensoring...
Iter 0: mu=50.5000, wert_0=-5853.50000000
Iter 1: mu=25.7500, wert_0=-2957.74999892
Iter 2: mu=13.3750, wert_0=-1509.80515427
Converged after 25 iterations: mu=1.6521, wert_0=0.00000583
Iter 0: mu=50.5000, wert_0=-7726.00000000
Iter 1: mu=25.7500, wert_0=-3864.99998666
Iter 2: mu=13.3750, wert_0=-1934.04674838
Converged after 27 iterations: mu=1.8359, wert_0=0.00000993
Iter 0: mu=50.5000, wert_0=-5820.50000000
Iter 1: mu=25.7500, wert_0=-2875.24991842
Iter 2: mu=13.3750, wert_0=-1401.16527554
Converged after 26 iterations: mu=2.4215, wert_0=-0.00000623
Iter 0: mu=50.5000, wert_0=-4282.00000000
Iter 1: mu=25.7500, wert_0=-2103.99961009
Iter 2: mu=13.3750, wert_0=-1011.27380692
Con

KeyboardInterrupt: 

# Merging with Agrawal, Bayesian

In [None]:
import pandas as pd

def _merge_method_results(base_df, method_df, dup_suffix):
    """Left-join one method's result table onto base_df via (POS, Magazine)."""
    merged = base_df.merge(
        method_df,
        left_on=['EHASTRA_EH_NUMMER', 'magazine'],
        right_on=['POS', 'Magazine'],
        how='left',
        # Keep the names of the existing columns; only clashing columns coming
        # from the method table receive the suffix.
        suffixes=('', dup_suffix),
    )
    if len(merged) != len(base_df):
        # A left join only grows when the method table repeats a (POS, Magazine) key.
        print(f"  Warning: row count changed from {len(base_df)} to {len(merged)} "
              f"(duplicate POS/Magazine keys in the merged table)")
    # The right-hand key columns duplicate EHASTRA_EH_NUMMER / magazine.
    return merged.drop(columns=['POS', 'Magazine'], errors='ignore')


def merge_all_methods_data(csv_path, bayesian_csv_path, agrawal_csv_path, output_csv_path):
    """
    Simply add Bayesian and Agrawal columns to the pivoted dataset

    Both tables are left-joined on (POS -> EHASTRA_EH_NUMMER, Magazine -> magazine),
    so every row of the pivoted dataset is kept. Columns of the pivoted dataset
    always keep their names; a clashing column from the Bayesian or Agrawal table
    is suffixed with '_bayesian_dup' or '_agrawal_dup' respectively.

    Parameters:
    - csv_path: path to the CSV with method-prefixed columns
    - bayesian_csv_path: path to Bayesian results CSV
    - agrawal_csv_path: path to Agrawal results CSV
    - output_csv_path: path for the merged output CSV

    Returns:
    - the merged DataFrame (also written to output_csv_path)
    """

    # Read all datasets
    print("Reading datasets...")
    results_df = pd.read_csv(csv_path)
    bayesian_df = pd.read_csv(bayesian_csv_path)
    agrawal_df = pd.read_csv(agrawal_csv_path)

    print(f"Pivoted data shape: {results_df.shape}")
    print(f"Bayesian data shape: {bayesian_df.shape}")
    print(f"Agrawal data shape: {agrawal_df.shape}")

    print("Merging Bayesian data...")
    merged_df = _merge_method_results(results_df, bayesian_df, '_bayesian_dup')

    print("Merging Agrawal data...")
    merged_df = _merge_method_results(merged_df, agrawal_df, '_agrawal_dup')

    print(f"Final merged dataset shape: {merged_df.shape}")

    # Check merge success: a non-null value in the first method column means the row matched
    bayesian_cols = [col for col in merged_df.columns if col.startswith('Bayesian_')]
    agrawal_cols = [col for col in merged_df.columns if col.startswith('Agrawal_')]

    if bayesian_cols:
        bayesian_matches = merged_df[bayesian_cols[0]].notna().sum()
        print(f"Successful Bayesian merges: {bayesian_matches}/{len(merged_df)}")

    if agrawal_cols:
        agrawal_matches = merged_df[agrawal_cols[0]].notna().sum()
        print(f"Successful Agrawal merges: {agrawal_matches}/{len(merged_df)}")

    # Save the merged dataset
    merged_df.to_csv(output_csv_path, index=False)
    print(f"Final dataset saved to: {output_csv_path}")

    # Show column summary. Method columns are identified by their prefix, so a
    # method token appearing in the middle of another column name is not counted.
    method_prefixes = (
        'N1_', 'N2_', 'N3_', 'EM_', 'PD_', 'Nahmias_', 'Conrad_', 'Baseline_', 'Bayesian_', 'Agrawal_'
    )
    all_method_cols = [col for col in merged_df.columns if col.startswith(method_prefixes)]

    print(f"Total method-specific columns: {len(all_method_cols)}")

    return merged_df

# Example usage: combine the per-method KPI file with the Bayesian and Agrawal results
if __name__ == "__main__":
    # Input files
    pivoted_file = "NaiveEMPDConradNahmias_KPIs_FeaturesNOTFINAL.csv"
    bayesian_file = "bayesian_results.csv"  # Update with actual filename
    agrawal_file = "agrawal_results.csv"    # Update with actual filename
    # Output file
    output_file = "all_methods_KPIs_Features_MergedNOTFINAL.csv"

    merged_df = merge_all_methods_data(
        csv_path=pivoted_file,
        bayesian_csv_path=bayesian_file,
        agrawal_csv_path=agrawal_file,
        output_csv_path=output_file,
    )

    print(f"\nFinal dataset: {merged_df.shape[0]} rows, {merged_df.shape[1]} columns")
    print("All methods combined - ready for analysis!")

Reading datasets...
Pivoted data shape: (10685, 103)
Bayesian data shape: (10685, 14)
Agrawal data shape: (10685, 14)
Merging Bayesian data...
Merging Agrawal data...
Final merged dataset shape: (10685, 127)
Successful Bayesian merges: 10685/10685
Successful Agrawal merges: 10685/10685
Final dataset saved to: all_methods_KPIs_Features_0.csv
Total method-specific columns: 110

Final dataset: 10685 rows, 127 columns
All methods combined - ready for analysis!


# Adding best method

In [67]:
import pandas as pd
import numpy as np

def add_best_method_column_pivoted(df, primary_kpi, tiebreaker_kpi):
    """
    Add a column showing which method performs best for each POS+Magazine combination

    The method name is the part of a column name in front of '_<kpi>'. Columns
    starting with 'best_' are ignored. For every row the method with the best
    primary KPI wins; ties are resolved with the tiebreaker KPI, and if no
    tiebreaker value is available the alphabetically first tied method is used.

    "Best" means:
    - 'bias': closest to zero (bias is signed, so a strongly negative bias is
      not better than a small positive one)
    - the weighted MAEs, 'gini_coefficient', 'overstock', 'overestimation_rate'
      and 'underestimation_rate': smallest value
    - 'accuracy' and any KPI not listed above: largest value

    Parameters:
    - df: DataFrame with pivoted KPI results ('<method>_<kpi>' columns); not modified
    - primary_kpi: The main KPI to optimize (e.g., 'weighted_mae_0')
    - tiebreaker_kpi: KPI to use for breaking ties (e.g., 'accuracy')

    Returns:
    - Copy of df with new column 'best_method_{primary_kpi}' (missing where a
      row has no primary KPI value for any method)

    Raises:
    - ValueError: if no column exists for the primary or the tiebreaker KPI
    """
    minimize_kpis = {
        'weighted_mae_0', 'weighted_mae_0.5', 'weighted_mae_1', 'weighted_mae_1.5',
        'gini_coefficient', 'overstock', 'overestimation_rate', 'underestimation_rate'
    }
    zero_target_kpis = {'bias'}

    def _loss(kpi, value):
        """Map a KPI value to a score where smaller always means better."""
        if kpi in zero_target_kpis:
            return abs(value)
        if kpi in minimize_kpis:
            return value
        return -value

    def _method_columns(kpi):
        """Return {method name: column name} for all columns of the given KPI."""
        suffix = f'_{kpi}'
        return {
            col[:-len(suffix)]: col
            for col in df.columns
            if col.endswith(suffix) and not col.startswith('best_')
        }

    primary_cols = _method_columns(primary_kpi)
    tiebreaker_cols = _method_columns(tiebreaker_kpi)

    if not primary_cols:
        raise ValueError(f"No columns found ending with '_{primary_kpi}'")
    if not tiebreaker_cols:
        raise ValueError(f"No columns found ending with '_{tiebreaker_kpi}'")

    print(f"Found {len(primary_cols)} methods for {primary_kpi}: {list(primary_cols)}")
    print(f"Found {len(tiebreaker_cols)} methods for {tiebreaker_kpi}: {list(tiebreaker_cols)}")

    best_methods = []

    for _, row in df.iterrows():
        # Score every method that has a primary KPI value in this row
        primary_losses = {
            method: _loss(primary_kpi, row[col])
            for method, col in primary_cols.items()
            if pd.notna(row[col])
        }

        if not primary_losses:
            best_methods.append(None)
            continue

        best_loss = min(primary_losses.values())
        candidates = [method for method, loss in primary_losses.items() if loss == best_loss]

        if len(candidates) == 1:
            best_methods.append(candidates[0])
            continue

        # Several methods share the best primary value: consult the tiebreaker KPI
        tiebreaker_losses = {
            method: _loss(tiebreaker_kpi, row[tiebreaker_cols[method]])
            for method in candidates
            if method in tiebreaker_cols and pd.notna(row[tiebreaker_cols[method]])
        }

        if tiebreaker_losses:
            best_tiebreak = min(tiebreaker_losses.values())
            # Still tied: the first method in column order wins
            best_method = next(
                method for method, loss in tiebreaker_losses.items() if loss == best_tiebreak
            )
        else:
            # If no tiebreaker values available, pick the first method alphabetically
            best_method = sorted(candidates)[0]

        best_methods.append(best_method)

    result_df = df.copy()
    result_df[f'best_method_{primary_kpi}'] = best_methods

    return result_df

def add_multiple_best_method_columns_pivoted(df, kpi_configs):
    """
    Add one 'best method' column per (primary KPI, tiebreaker KPI) pair.
    Works with pivoted data; the input DataFrame is copied, not modified.

    Parameters:
    - df: DataFrame with pivoted KPI results
    - kpi_configs: List of tuples [(primary_kpi, tiebreaker_kpi), ...]

    Returns:
    - Copy of df extended by the columns added for each configuration

    Example:
    kpi_configs = [
        ('weighted_mae_0', 'accuracy'),
        ('weighted_mae_1', 'accuracy'),
        ('accuracy', 'weighted_mae_0'),
        ('bias', 'weighted_mae_0')
    ]
    """
    enriched = df.copy()

    for config in kpi_configs:
        primary_kpi, tiebreaker_kpi = config
        print(f"\nAdding best method column for {primary_kpi} (tiebreaker: {tiebreaker_kpi})")
        enriched = add_best_method_column_pivoted(enriched, primary_kpi, tiebreaker_kpi)

    return enriched

def analyze_best_methods(df, kpi_name):
    """
    Print how often each method is the best one for a given KPI.

    Reads the column 'best_method_<kpi_name>' and prints three views of it:
    the overall count per method, a magazine-by-method count table, and each
    method's share of all rows in percent (rounded to two decimals).

    Parameters:
    - df: DataFrame holding a 'magazine' column and the best-method column
    - kpi_name: KPI suffix used to build the column name 'best_method_<kpi_name>'

    Returns:
    - None. When the column is missing, a notice is printed and nothing else.
    """
    best_method_col = f'best_method_{kpi_name}'

    # Guard clause: nothing to analyze without the best-method column
    column_present = best_method_col in df.columns
    if not column_present:
        print(f"Column {best_method_col} not found in DataFrame")
        return

    print(f"\n=== Best Method Analysis for {kpi_name.upper()} ===")

    # How many rows each method wins overall
    print(f"\nOverall best method distribution:")
    winners = df[best_method_col]
    win_counts = winners.value_counts()
    print(win_counts)

    # Cross-tabulate wins: one row per magazine, one column per method
    print(f"\nBest methods by magazine:")
    group_keys = ['magazine', best_method_col]
    group_sizes = df.groupby(group_keys).size()
    magazine_table = group_sizes.unstack(fill_value=0)
    print(magazine_table)

    # Share of rows won by each method; the denominator is the full row count
    print(f"\nBest method percentages:")
    total_rows = len(df)
    scaled = win_counts / total_rows * 100
    shares = scaled.round(2)
    for method, pct in shares.items():
        print(f"  {method}: {pct}%")

# Usage: driver for the best-method labelling step.
# Loads the merged KPI/feature table, adds one best-method column per KPI
# configuration, writes the result to CSV, and prints summaries.
# The recorded cell output shows that this guarded branch runs when the cell
# is executed in the notebook.
if __name__ == "__main__":
    # Load existing csv file.
    # NOTE: this rebinds the notebook-level name `df`, which the first cell of
    # the notebook also uses for the single-magazine test data.
    df = pd.read_csv("all_methods_KPIs_Features_MergedNOTFINAL.csv")
    
    print(f"Loaded dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Magazines: {df['magazine'].unique()}")
    
    # Define KPI configurations for analysis.
    # Each entry is a (primary_kpi, tiebreaker_kpi) pair, consumed in list order
    # by add_multiple_best_method_columns_pivoted.
    kpi_configs = [
        ('weighted_mae_0', 'accuracy'),      # Best for unweighted MAE with accuracy tiebreaker
        ('weighted_mae_1', 'accuracy'),      # Best for weighted MAE (α=1) with accuracy tiebreaker
        ('weighted_mae_1.5', 'accuracy'),    # Best for weighted MAE (α=1.5) with accuracy tiebreaker
        ('accuracy', 'weighted_mae_0'),      # Best for accuracy with MAE tiebreaker
        ('bias', 'weighted_mae_0'),          # Best for bias with MAE tiebreaker
        ('overstock', 'accuracy'),           # Best for overstock with accuracy tiebreaker
    ]
    
    # Add all best method columns (one helper call per configuration)
    print("Adding best method columns...")
    df_with_best = add_multiple_best_method_columns_pivoted(df, kpi_configs)
    
    # Save results; index=False keeps the row index out of the file
    output_filename = "all_methods_KPIs_Features_with_BestNOTFINAL.csv"
    df_with_best.to_csv(output_filename, index=False)
    print(f"\nResults saved to: {output_filename}")
    
    # Analyze results for key KPIs.
    # 'weighted_mae_1.5' and 'overstock' are configured above but are not
    # listed here, so no summary is printed for them.
    key_kpis = ['weighted_mae_0', 'weighted_mae_1', 'accuracy', 'bias']
    
    for kpi in key_kpis:
        # Skip KPIs whose 'best_method_<kpi>' column is absent from the result
        if f'best_method_{kpi}' in df_with_best.columns:
            analyze_best_methods(df_with_best, kpi)
    
    print(f"\nFinal dataset: {len(df_with_best)} rows, {len(df_with_best.columns)} columns")
    
    # Show sample of the new columns: every column whose name starts with
    # 'best_method_' is counted, and the first three are shown next to the
    # two identifier columns (both must exist in the frame).
    best_method_cols = [col for col in df_with_best.columns if col.startswith('best_method_')]
    print(f"\nBest method columns added: {len(best_method_cols)}")
    print("Sample of best method assignments:")
    sample_cols = ['EHASTRA_EH_NUMMER', 'magazine'] + best_method_cols[:3]
    print(df_with_best[sample_cols].head())

Loaded dataset with 10685 rows and 127 columns
Magazines: ['E' 'C' 'H' 'D' 'B' 'I' 'G' 'A' 'F']
Adding best method columns...

Adding best method column for weighted_mae_0 (tiebreaker: accuracy)
Found 10 methods for weighted_mae_0: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']
Found 10 methods for accuracy: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']

Adding best method column for weighted_mae_1 (tiebreaker: accuracy)
Found 10 methods for weighted_mae_1: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']
Found 10 methods for accuracy: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']

Adding best method column for weighted_mae_1.5 (tiebreaker: accuracy)
Found 10 methods for weighted_mae_1.5: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']
Found 10 methods for accuracy: ['N1', 'N2', 'N3', 'EM',

# Merging Sofortremission

In [70]:
import pandas as pd

# Inputs: the KPI table with best-method labels and the Sofortremission features
best_methods_df = pd.read_csv("all_methods_KPIs_Features_with_BestNOTFINAL.csv")
sofortremission_df = pd.read_csv("sofortremissionFeatures3007.csv")

# Bring the join key into a common format: the Sofortremission file lacks the
# "EH" prefix on EHASTRA_EH_NUMMER, so prepend it to the stringified values
sofortremission_df = sofortremission_df.assign(
    EHASTRA_EH_NUMMER='EH' + sofortremission_df['EHASTRA_EH_NUMMER'].astype(str)
)

# Left join keeps every row of best_methods_df; 'magazine' is matched against
# 'VDZ', which is dropped afterwards. The result is ordered by magazine.
merged_df = (
    best_methods_df
    .merge(
        sofortremission_df,
        how='left',
        left_on=['EHASTRA_EH_NUMMER', 'magazine'],
        right_on=['EHASTRA_EH_NUMMER', 'VDZ'],
    )
    .drop(columns='VDZ')
    .sort_values(by='magazine')
)

# Persist the merged table without the row index
merged_df.to_csv("Uncensoring_DT_Input.csv", index=False)

print(f"Done! {len(merged_df)} rows, {len(merged_df.columns)} columns")

Done! 10685 rows, 140 columns
