In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

path = 'Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv'
df = pd.read_csv(path)

df["is_stockout"] = (df["Zensiert"] == 1)
df = df.dropna(subset=['Verkauf_MBR'])
df = df.dropna(subset=['Verkauf'])

df.head()
len(df)

FileNotFoundError: [Errno 2] No such file or directory: 'Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv'

# N3

In [136]:
# Grouping N3

import pandas as pd
import numpy as np

def apply_n3_uncensoring(df):
    df = df.copy()
    
    # Mark closed observations
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Compute open group means
    open_means = (
        df[~df['is_closed']]
        .groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
        .mean()
        .rename('open_mean')
        .reset_index()
        .round()
    )
    
    # Merge open means back to df
    df = df.merge(open_means, on=['EHASTRA_EH_NUMMER'], how='left')
    
    # Compute the max(mean, observed) for censored rows only
    mask = df['is_closed'] & df['open_mean'].notna()
    df.loc[mask, 'Verkauf_Uncensored'] = np.maximum(df.loc[mask, 'Verkauf_Uncensored'], df.loc[mask, 'open_mean'])
    
    return df.drop(['is_closed', 'open_mean'], axis=1)

# N2

In [137]:
# Grouping N2
def apply_n2_uncensoring(df):
    """
    N2 uncensoring with grouping: replace censored values with mean of uncensored values within each group.
    """
    df = df.copy()

    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Calculate mean of uncensored observations for each group
    uncensored_means = (
        df[df['Zensiert'] == 0]
        .groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored']
        .mean()
        .rename('uncensored_mean')
        .reset_index()
        .round()
    )
    
    # Merge back to original DataFrame
    df = df.merge(uncensored_means, on=['EHASTRA_EH_NUMMER'], how='left')
    
    # Replace censored values with group mean
    df.loc[df['Zensiert'] == 1, 'Verkauf_Uncensored'] = df.loc[df['Zensiert'] == 1, 'uncensored_mean']
    
    return df.drop('uncensored_mean', axis=1)

# N1

In [None]:
# Grouping N1

import pandas as pd
import numpy as np

def apply_n1_uncensoring(df):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    # Identify closed observations
    is_closed = (df["Zensiert"] == 1)

    # Compute mean Verkauf per group and broadcast using transform
    group_means = df.groupby(['EHASTRA_EH_NUMMER'])['Verkauf_Uncensored'].transform('mean').round()
    
    # Replace closed observations
    df.loc[is_closed, 'Verkauf_Uncensored'] = group_means[is_closed]
    
    return df

# EM

In [3]:
import numpy as np
from scipy.stats import poisson
import pandas as pd

def apply_em_uncensoring(df, max_iter=30, tolerance=1e-6):

    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Direct boolean indexing without extra columns
    stockout_condition = (df['Zensiert'] == 1)
    
    # Process groups with minimal overhead
    for (pos), group in df.groupby(['EHASTRA_EH_NUMMER']):
        group_stockout = stockout_condition.loc[group.index]
        
        if not group_stockout.any():
            continue
            
        sales = group['Verkauf_Uncensored'].values
        is_stockout = group_stockout.values
        
        uncensored = sales[~is_stockout]
        censored = sales[is_stockout]
        
        # Quick lambda initialization
        lambda_est = np.mean(uncensored) if len(uncensored) > 0 else np.mean(sales) * 1.5
        lambda_est = max(lambda_est, 0.1)
        
        # Fast EM loop
        for _ in range(max_iter):
            lambda_old = lambda_est
            
            # Batch E-step
            surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
            exact_prob = poisson.pmf(censored, lambda_est)
            surv_prob = np.maximum(surv_prob, 1e-12)
            
            expected = lambda_est + censored * exact_prob / surv_prob
            expected = np.maximum(expected, censored.astype(float))
            
            # M-step
            lambda_est = max(np.mean(np.concatenate([uncensored, expected])), 0.1)
            
            if abs(lambda_est - lambda_old) < tolerance:
                break
        
        # Final update
        surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
        exact_prob = poisson.pmf(censored, lambda_est)
        surv_prob = np.maximum(surv_prob, 1e-12)
        
        final_expected = lambda_est + censored * exact_prob / surv_prob
        final_expected = np.maximum(final_expected, censored.astype(float))
        
        # Update original dataframe
        stockout_indices = group.index[is_stockout]
        df.loc[stockout_indices, 'Verkauf_Uncensored'] = final_expected.round()
    
    return df

# new_df = apply_em_uncensoring(df)
# new_df.to_csv("test_df.csv")

# PD

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

# grouped by POS only

def apply_projection_detruncation_fixed1(df, tau=0.5, max_iter=20, tolerance=1e-4):
    """
    Proper Projection Detruncation implementation using Poisson distribution.
    
    Parameters:
    - tau: Parameter that controls the aggressiveness of unconstraining (0 < tau < 1)
           tau = 0.5 gives balanced results similar to EM
           smaller tau values are more aggressive
    - max_iter: Maximum number of iterations
    - tolerance: Convergence tolerance
    """
    df = df.copy()
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    def compute_pd_projection1(obs_val, lambda_est, tau):
        """
        Compute the PD projection for a single observation using Poisson distribution.
        This balances area A (original to new estimate) with area B (new estimate to infinity)
        weighted by parameter tau.
        """
        # For Poisson distribution, we need to find the projection value
        # that balances the two areas according to tau
        
        def objective(k_proj):
            k_proj = int(round(k_proj))  # Ensure integer for discrete distribution
            
            if k_proj < obs_val:
                return float('inf')  # Invalid projection
            
            # Area A: P(obs_val <= X <= k_proj) = P(X <= k_proj) - P(X <= obs_val-1)
            area_A = poisson.cdf(k_proj, lambda_est) - poisson.cdf(obs_val - 1, lambda_est)
            
            # Area B: P(X > k_proj) = 1 - P(X <= k_proj)
            area_B = 1 - poisson.cdf(k_proj, lambda_est)
            
            # PD tries to balance: tau * area_A = (1 - tau) * area_B
            # Rearranging: area_A / area_B = (1 - tau) / tau
            if area_B > 1e-10:  # Avoid division by zero
                ratio = area_A / area_B
                target_ratio = (1 - tau) / tau
                return abs(ratio - target_ratio)
            else:
                return abs(area_A - (1 - tau))
        
        # Search for optimal projection in reasonable range
        # For Poisson, search from obs_val to obs_val + reasonable upper bound
        upper_bound = int(obs_val + max(10, int(3 * np.sqrt(lambda_est))))
        
        best_k = obs_val
        best_objective = float('inf')
        
        # Discrete search since Poisson is discrete
        for k in range(int(obs_val), upper_bound + 1):
            obj_val = objective(k)
            if obj_val < best_objective:
                best_objective = obj_val
                best_k = k
        
        return best_k
    
    # Process groups - now only grouping by POS
    grouped = df.groupby('EHASTRA_EH_NUMMER')
    
    for pos, group in grouped:
        open_mask = ~group['is_closed']
        closed_mask = group['is_closed']
        
        open_sales = group.loc[open_mask, 'Verkauf_Uncensored'].values
        closed_sales = group.loc[closed_mask, 'Verkauf_Uncensored'].values
        
        if len(closed_sales) == 0:
            continue
        
        # Initialize lambda parameter using all available data
        all_sales = group['Verkauf_Uncensored'].values
        lambda_est = np.mean(all_sales)
        
        # Ensure minimum lambda for numerical stability
        lambda_est = max(lambda_est, 0.1)
        
        closed_indices = group[closed_mask].index.values
        
        # Iterative process
        for iteration in range(max_iter):
            lambda_old = lambda_est
            
            # Project closed observations using PD heuristic
            projected_values = np.array([
                compute_pd_projection1(obs, lambda_est, tau) 
                for obs in closed_sales
            ])
            
            # Re-estimate lambda using open + projected values
            all_values = np.concatenate([open_sales, projected_values])
            lambda_est = np.mean(all_values)
            
            # Ensure minimum lambda
            lambda_est = max(lambda_est, 0.1)
            
            # Check convergence
            if abs(lambda_est - lambda_old) < tolerance:
                break
        
        # Final projection
        final_projections = np.array([
            compute_pd_projection1(obs, lambda_est, tau) 
            for obs in closed_sales
        ])
        
        # Update dataframe (already integers from Poisson)
        df.loc[closed_indices, 'Verkauf_Uncensored'] = final_projections.round()
    
    return df.drop('is_closed', axis=1)

def apply_projection_detruncation_aggressive1(df, max_iter=20, tolerance=1e-4):
    """
    More aggressive PD implementation with tau=0.3 as mentioned in the paper
    """
    return apply_projection_detruncation_fixed1(df, tau=0.3, max_iter=max_iter, tolerance=tolerance)

def apply_pd_uncensoring(df, max_iter=20, tolerance=1e-4):
    """
    Balanced PD implementation with tau=0.5 (similar to EM results)
    """
    return apply_projection_detruncation_fixed1(df, tau=0.5, max_iter=max_iter, tolerance=tolerance)

# df_pd_uncensored = apply_pd_uncensoring(df)

# Conrad

In [5]:
from scipy.stats import poisson
import pandas as pd
import numpy as np

# def compute_mu(left, right, n, N, r, x_sum, tolerance=0.001):
#     """
#     Binary search for mu using Conrad's method, with proper stopping and iteration limits.
#     """
#     iteration = 0

#     while (right - left) > tolerance:
#         mu = (left + right) / 2  # midpoint of current search interval

#         # Equation 2 from Conrad
#         value_0 = (x_sum - mu * n) * (1 - poisson.cdf(N - 1, mu)) + mu * (n - r) * (1 - poisson.cdf(N - 2, mu))

#         if abs(value_0) < tolerance:
#             return mu
#         elif value_0 < 0:
#             right = mu
#         else:
#             left = mu

#         iteration += 1

#     return (left + right) / 2  # final estimate if not converged early

def compute_mu(left, right, n, N, r, x_sum, value_tol=0.01, interval_tol=0.01, max_iterations=1000000000000000):
    iteration = 0

    while iteration < max_iterations and (right - left) > interval_tol:
        mu = (left + right) / 2
        value_0 = (x_sum - mu * n) * (1 - poisson.cdf(N - 1, mu)) + mu * (n - r) * (1 - poisson.cdf(N - 2, mu))

        if iteration % 10 == 0:
            print(f"Iteration {iteration}: mu={mu:.4f}, value_0={value_0:.6f}")

        if abs(value_0) < value_tol:
            print(f"Converged after {iteration} iterations: mu={mu:.4f}")
            return mu

        if value_0 < 0:
            right = mu
        else:
            left = mu

        iteration += 1

    print(f"Stopped after {iteration} iterations: mu={mu:.4f}, value_0={value_0:.6f}")
    return mu


def create_order_specific_mu_dict(df):
    """
    Create μ estimates for each (week, order_quantity) combination using Conrad's method.
    Each stockout level gets its own Poisson distribution.
    """
    order_specific_mu_dict = {}
    total_skipped = 0

    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            N = bezug_val
            r = (group['Verkauf'] == bezug_val).sum()  # stockouts
            x_sum = np.minimum(group['Verkauf'], N).sum()

            if n < 2:
                # total_skipped += 1
                # print(f"Skipping: n={n} too small")
                continue
            if r == 0:
                # total_skipped += 1
                # print(f"Skipping: no stockouts in group")
                continue
            if r == n:
                # total_skipped += 1
                # print(f"Skipping: all stockouts in group")
                continue
            
            # bounds are problematic
            try:
                mu_est = compute_mu(0.1, 4, n, N, r, x_sum)
                if mu_est:
                    # Store with (year, week, order_quantity) key
                    key = (year, week, bezug_val)
                    order_specific_mu_dict[key] = mu_est
            except Exception as e:
                print(f"Error in week {week}, Bezug {N}: {e}")
                continue
    #print(f"total skipped: {total_skipped}")

    return order_specific_mu_dict

def expected_poisson_tail(mu, N, max_k=200):
    """
    Compute E[X | X >= N] for X ~ Poisson(mu)
    """
    k_vals = np.arange(N, max_k)
    pmf = poisson.pmf(k_vals, mu)
    tail_prob = 1 - poisson.cdf(N - 1, mu)
    if tail_prob < 1e-8:
        return N  # fallback: don't uncensor
    return np.sum(k_vals * pmf) / tail_prob

def apply_conrad_uncensoring_1(df, order_specific_mu_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    
    Uses order-quantity-specific Poisson(μ) parameters.
    
    Modifies the DataFrame in place by updating the Verkauf column.
    """
    df = df.copy()

    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if row['Zensiert'] == 0:
            continue
            
        # Get the order-quantity-specific demand parameter
        key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
        mu = order_specific_mu_dict.get(key, None)

        if mu is None:
            # no estimate available for this specific (week, order_quantity) — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_poisson_tail(mu, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

def apply_conrad_uncensoring(df):
    """
    WRAPPER FUNCTION: This is what gets called by the main processing loop
    """
    # Step 1: Create mu dictionary from the dataset
    order_specific_mu_dict = create_order_specific_mu_dict(df)
    
    # Step 2: Apply uncensoring using the mu dictionary  
    return apply_conrad_uncensoring_1(df, order_specific_mu_dict)

# Usage
# order_specific_mu_dict = create_order_specific_mu_dict(df)
# df_conrad_uncensored = apply_conrad_uncensoring_1(df, order_specific_mu_dict)
# results = calculate_kpis(df_conrad_uncensored, "Conrad, Altered")

In [166]:
import pandas as pd
import numpy as np

def apply_baseline_uncensoring(df):
    df_result = df.copy()
    df_result['Verkauf_Uncensored'] = df_result['Verkauf'] 
    return df_result

# Process all magazines A through I
magazines = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
all_kpis = []

for magazine in magazines:
    filename = f'test1/{magazine}_20250212_ZQ0.35_ZG0.4_testfile.csv'
    
    # Load data
    df = pd.read_csv(filename)
    
    # Apply baseline uncensoring
    baseline_df = apply_conrad_uncensoring(df)
    
    # Calculate KPIs
    kpis = calculate_kpis(baseline_df, f"baseline_{magazine}")
    all_kpis.append(kpis)
    
    print(f"Processed magazine {magazine}")

# Calculate average KPIs
print("\nCalculating averages...")

# Convert to DataFrame if KPIs are dictionaries
if isinstance(all_kpis[0], dict):
    kpi_df = pd.DataFrame(all_kpis)
    average_kpis = kpi_df.mean()
    print("\nAverage KPIs:")
    print(average_kpis)
else:
    # If KPIs are single values
    average_kpis = np.mean(all_kpis)
    print(f"\nAverage KPI: {average_kpis}")

print(f"\nProcessed {len(magazines)} magazines successfully.")

Iteration 0: mu=2.0500, value_0=-95.529164
Stopped after 9 iterations: mu=1.0064, value_0=0.306352
Iteration 0: mu=2.0500, value_0=157.670785
Stopped after 9 iterations: mu=3.0174, value_0=-1.387527
Iteration 0: mu=2.0500, value_0=264.010846
Stopped after 9 iterations: mu=3.9924, value_0=238.080823
Iteration 0: mu=2.0500, value_0=114.205064
Stopped after 9 iterations: mu=3.9924, value_0=259.426858
Iteration 0: mu=2.0500, value_0=32.023764
Stopped after 9 iterations: mu=3.9924, value_0=128.245570
Iteration 0: mu=2.0500, value_0=11.102363
Stopped after 9 iterations: mu=3.9924, value_0=93.032315
Iteration 0: mu=2.0500, value_0=3.522657
Stopped after 9 iterations: mu=3.9924, value_0=58.636756
Iteration 0: mu=2.0500, value_0=0.773547
Stopped after 9 iterations: mu=3.9924, value_0=24.982560
Iteration 0: mu=2.0500, value_0=0.264131
Stopped after 9 iterations: mu=3.9924, value_0=16.629564
Iteration 0: mu=2.0500, value_0=0.040009
Stopped after 9 iterations: mu=3.9924, value_0=4.793517
Iteration

# Nahmias

In [6]:
import numpy as np
import pandas as pd
from scipy.stats import norm

def compute_mu_sigma_nahmias(sales, S):
    """
    Compute mu and sigma using Nahmias method for censored normal data.
    
    Parameters:
    -----------
    sales : array-like
        Observed sales data (censored at S)
    S : float
        Censoring limit (inventory level)
    
    Returns:
    --------
    tuple: (mu_hat, sigma_hat) or (None, None) if estimation fails
    """
    sales = np.array(sales)
    sales = sales[~np.isnan(sales)]
    
    if len(sales) < 5:  # Need reasonable sample size
        return None, None
    
    # The uncensored observations are those < S
    observed = sales[sales < S]
    n = len(sales)
    r = len(observed)
    
    if r < 2 or r >= n-1:  # Need at least 2 uncensored, 1 censored
        return None, None
        
    p = r / n  # Proportion uncensored
    
    # # Avoid extreme proportions where method is unstable
    # if p < 0.2 or p > 0.8:
    #     return None, None
    
    x_bar = np.mean(observed)
    s2 = np.var(observed, ddof=1)
    
    if s2 <= 1e-6:
        return None, None
    
    try:
        z = norm.ppf(p)
        
        # Check for extreme z values that cause instability
        if abs(z) > 2:
            return None, None
        
        pdf_z = norm.pdf(z)
        
        denominator = 1 - (z * pdf_z / p) - (pdf_z**2 / p**2)
        
        if denominator <= 0.1:  # Need substantial positive denominator
            return None, None
            
        sigma_hat2 = s2 / denominator
        
        if sigma_hat2 <= 0 or sigma_hat2 > 1e6:
            return None, None
            
        sigma_hat = np.sqrt(sigma_hat2)
        mu_hat = x_bar + sigma_hat * pdf_z / p
        
        # Sanity check on estimates
        if abs(mu_hat) > 1e6 or sigma_hat > 1e3:
            return None, None
        
        return mu_hat, sigma_hat
        
    except Exception:
        return None, None

def create_order_specific_nahmias_dict(df):
    """
    Create μ and σ estimates for each (week, order_quantity) combination using Nahmias method.
    Each stockout level gets its own Normal distribution.
    """
    order_specific_mu_dict = {}
    order_specific_sigma_dict = {}

    for (year, week), week_data in df.groupby(['Heftjahr', 'Heftnummer']):
        for bezug_val, group in week_data.groupby('Bezug'):
            n = len(group)
            S = bezug_val
            
            # Count stockouts (sales = order quantity)
            stockouts = (group['Verkauf'] == bezug_val).sum()
            
            if n < 5 or stockouts == 0 or stockouts == n:
                continue  # Not enough variation or unusable

            try:
                mu_est, sigma_est = compute_mu_sigma_nahmias(group['Verkauf'], S)
                if mu_est is not None and sigma_est is not None:
                    # Store with (year, week, order_quantity) key
                    key = (year, week, bezug_val)
                    order_specific_mu_dict[key] = mu_est
                    order_specific_sigma_dict[key] = sigma_est
            except Exception as e:
                print(f"Error in week {week}, Bezug {bezug_val}: {e}")
                continue

    return order_specific_mu_dict, order_specific_sigma_dict

def expected_normal_tail(mu, sigma, S, max_iterations=1000):
    """
    Compute E[X | X >= S] for X ~ Normal(mu, sigma)
    Using numerical integration approximation
    """
    if sigma <= 0:
        return S
    
    # Standardize
    z = (S - mu) / sigma
    
    # If S is way above the mean, just return S (no meaningful tail)
    if z > 6:
        return S
    
    # Use the formula: E[X | X >= S] = mu + sigma * phi(z) / (1 - Phi(z))
    # where phi is PDF and Phi is CDF
    tail_prob = 1 - norm.cdf(z)
    
    if tail_prob < 1e-10:  # Essentially no tail probability
        return S
    
    expected_value = mu + sigma * norm.pdf(z) / tail_prob
    
    return expected_value

def apply_nahmias_uncensoring_1(df, order_specific_mu_dict, order_specific_sigma_dict):
    """
    Given a DataFrame with Verkauf, Bezug, is_stockout, Heftjahr, Heftnummer,
    replace Verkauf with E[X | X >= Bezug] when censored.
    
    Uses order-quantity-specific Normal(μ, σ) parameters.
    
    Modifies the DataFrame in place by updating the Verkauf column.
    """
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()

    for idx, row in df.iterrows():
        # Skip if no stockout occurred
        if row['Zensiert'] == 0:
            continue
            
        # Get the order-quantity-specific demand parameters
        key = (row['Heftjahr'], row['Heftnummer'], row['Bezug'])
        mu = order_specific_mu_dict.get(key, None)
        sigma = order_specific_sigma_dict.get(key, None)

        if mu is None or sigma is None:
            # no estimate available for this specific (week, order_quantity) — keep original value
            continue

        # POS sold out — uncensor using the specific distribution for this order quantity
        est_demand = expected_normal_tail(mu, sigma, row['Bezug'])
        df.at[idx, 'Verkauf_Uncensored'] = np.round(est_demand)

    return df

def apply_nahmias_uncensoring(df):
    """
    WRAPPER FUNCTION: This is what gets called by the main processing loop
    """
    # Step 1: Create mu dictionary from the dataset
    order_specific_mu_dict, compute_mu_sigma_nahmias = create_order_specific_nahmias_dict(df)
    
    # Step 2: Apply uncensoring using the mu dictionary  
    return apply_nahmias_uncensoring_1(df, order_specific_mu_dict, compute_mu_sigma_nahmias)


# Usage
# order_specific_mu_dict, order_specific_sigma_dict = create_order_specific_nahmias_dict(df)
# df_nahmias_uncensored = apply_nahmias_uncensoring_1(df, order_specific_mu_dict, order_specific_sigma_dict)

In [164]:
import pandas as pd
import numpy as np

def apply_baseline_uncensoring(df):
    df_result = df.copy()
    df_result['Verkauf_Uncensored'] = df_result['Verkauf'] 
    return df_result

# Process all magazines A through I
magazines = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
all_kpis = []

for magazine in magazines:
    filename = f'test1/{magazine}_20250212_ZQ0.35_ZG0.4_testfile.csv'
    
    # Load data
    df = pd.read_csv(filename)
    
    # Apply baseline uncensoring
    baseline_df = apply_nahmias_uncensoring(df)
    
    # Calculate KPIs
    kpis = calculate_kpis(baseline_df, f"baseline_{magazine}")
    all_kpis.append(kpis)
    
    print(f"Processed magazine {magazine}")

# Calculate average KPIs
print("\nCalculating averages...")

# Convert to DataFrame if KPIs are dictionaries
if isinstance(all_kpis[0], dict):
    kpi_df = pd.DataFrame(all_kpis)
    average_kpis = kpi_df.mean()
    print("\nAverage KPIs:")
    print(average_kpis)
else:
    # If KPIs are single values
    average_kpis = np.mean(all_kpis)
    print(f"\nAverage KPI: {average_kpis}")

print(f"\nProcessed {len(magazines)} magazines successfully.")

Method: baseline_A, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.500 (underestimation)
- Weighted MAE (α=0): 0.872
- Weighted MAE (α=0.5): 1.057
- Weighted MAE (α=1): 1.341
- Weighted MAE (α=1.5): 1.734
- Accuracy (exact matches): 0.346
- Overestimation Rate: 0.171
- Underestimation Rate: 0.483
- Gini Coefficient: 0.485
Overstock: 14069
Processed magazine A
Method: baseline_B, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.613 (underestimation)
- Weighted MAE (α=0): 0.893
- Weighted MAE (α=0.5): 1.165
- Weighted MAE (α=1): 1.730
- Weighted MAE (α=1.5): 2.944
- Accuracy (exact matches): 0.359
- Overestimation Rate: 0.128
- Underestimation Rate: 0.513
- Gini Coefficient: 0.512
Overstock: 6785
Processed magazine B
Method: baseline_C, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.454 (underestimation)
- Weighted MAE (α=0): 0.742
- Weighted MAE (α=0.5): 0.812
- Weighted MAE (α=1): 0.905
- Weighted MAE (α=1.5): 1.030
- Accuracy (exact matches): 0.364
- Overes

In [130]:
import pandas as pd
import numpy as np

def calculate_kpis(dataframe, method_name):
   """
   Calculate KPIs for demand forecasting evaluation
   
   Parameters:
   - dataframe: pandas DataFrame containing 'Verkauf' (predicted) and 'Verkauf_MBR' (ground truth) columns
   - method_name: string name of the method being evaluated
   - censorship_pct: censorship percentage for display
   - reduction_pct: reduction percentage for display
   - alpha: weight parameter for Weighted MAE (default=1)
            α = 0: standard MAE (no weighting)
            α = 1: linear weighting by true demand
            α > 1: over-proportional penalization of larger errors --> 1.5
            α < 1: emphasis on smaller demands --> 0.5
   """
   df = dataframe.copy()
   censored_df = df[df['Zensiert'] == 1]
   
   # Extract predicted and true values
   y_pred = censored_df['Verkauf_Uncensored'].values  # ŷᵢ (estimated demand)
   y_true = censored_df['Verkauf_MBR'].values  # yᵢ (true demand)
   
   n = len(y_pred)
   
   # 1. Bias calculation
   bias = np.sum(y_pred - y_true) / n
   
   # 2. Accuracy (exact matches)
   exact_matches = np.sum(y_pred == y_true)
   accuracy = exact_matches / n
   
   # 3. Overestimation Rate
   overestimations = np.sum(y_pred > y_true)
   overestimation_rate = overestimations / n
   
   # 4. Underestimation Rate
   underestimations = np.sum(y_pred < y_true)
   underestimation_rate = underestimations / n
   
   # 5. Weighted MAE for different alpha values
   alphas = [0, 0.5, 1, 1.5]
   weighted_maes = {}
   
   for a in alphas:
       if a == 0:
           # Standard MAE (no weighting)
           weighted_maes[a] = np.mean(np.abs(y_pred - y_true))
       else:
           # Weighted MAE with α parameter
           weights = np.power(y_true, a)
           # Handle case where y_true might be 0
           weights = np.where(y_true == 0, 0, weights)
           weighted_maes[a] = np.sum(weights * np.abs(y_pred - y_true)) / np.sum(weights) if np.sum(weights) > 0 else 0
   
   # 6. Gini Coefficient
   abs_errors = np.abs(y_pred - y_true)
   mean_abs_error = np.mean(abs_errors)
   n = len(abs_errors)
   sorted_errors = np.sort(abs_errors)
   
   weighted_sum = 0.0
   total_sum = 0.0

   # 7. Overstock, out of curiosity
   overstock = np.sum(np.maximum(0, y_pred - y_true))
   
   for i in range(n):
       weighted_sum += (i + 1) * sorted_errors[i]
       total_sum += sorted_errors[i]
   
   gini_coefficient = (2 * weighted_sum) / (n * total_sum) - (n + 1) / n
   
   # Determine bias direction
   bias_direction = "overestimation" if bias > 0 else "underestimation" if bias < 0 else "neutral"
   
   # Print results in the specified format
   print(f"Method: {method_name}, on {path}")
   print(f"- Bias: {bias:.3f} ({bias_direction})")
   print(f"- Weighted MAE (α=0): {weighted_maes[0]:.3f}")
   print(f"- Weighted MAE (α=0.5): {weighted_maes[0.5]:.3f}")
   print(f"- Weighted MAE (α=1): {weighted_maes[1]:.3f}")
   print(f"- Weighted MAE (α=1.5): {weighted_maes[1.5]:.3f}")
   print(f"- Accuracy (exact matches): {accuracy:.3f}")
   print(f"- Overestimation Rate: {overestimation_rate:.3f}")
   print(f"- Underestimation Rate: {underestimation_rate:.3f}")
   print(f"- Gini Coefficient: {gini_coefficient:.3f}")
   print(f"Overstock: {int(overstock)}")
   
   return {
       'bias': bias,
       'weighted_mae_0': weighted_maes[0],
       'weighted_mae_0.5': weighted_maes[0.5],
       'weighted_mae_1': weighted_maes[1],
       'weighted_mae_1.5': weighted_maes[1.5],
       'accuracy': accuracy,
       'overestimation_rate': overestimation_rate,
       'underestimation_rate': underestimation_rate,
       'gini_coefficient': gini_coefficient,
       'overstock': overstock
   }

# Baseline

In [8]:
def apply_baseline_uncensoring(df):
    df_result = df.copy()
    df_result['Verkauf_Uncensored'] = df_result['Verkauf'] 
    return df_result

# baseline_df = apply_baseline_uncensoring(df)
#calculate_kpis(baseline_df, "baseline")

In [152]:
import pandas as pd
import numpy as np

def apply_baseline_uncensoring(df):
    df_result = df.copy()
    df_result['Verkauf_Uncensored'] = df_result['Verkauf'] 
    return df_result

# Process all magazines A through I
magazines = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
all_kpis = []

for magazine in magazines:
    filename = f'test1/{magazine}_20250212_ZQ0.35_ZG0.4_testfile.csv'
    
    # Load data
    df = pd.read_csv(filename)
    
    # Apply baseline uncensoring
    baseline_df = apply_baseline_uncensoring(df)
    
    # Calculate KPIs
    kpis = calculate_kpis(baseline_df, f"baseline_{magazine}")
    all_kpis.append(kpis)
    
    print(f"Processed magazine {magazine}")

# Calculate average KPIs
print("\nCalculating averages...")

# Convert to DataFrame if KPIs are dictionaries
if isinstance(all_kpis[0], dict):
    kpi_df = pd.DataFrame(all_kpis)
    average_kpis = kpi_df.mean()
    print("\nAverage KPIs:")
    print(average_kpis)
else:
    # If KPIs are single values
    average_kpis = np.mean(all_kpis)
    print(f"\nAverage KPI: {average_kpis}")

print(f"\nProcessed {len(magazines)} magazines successfully.")

Method: baseline_A, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.953 (underestimation)
- Weighted MAE (α=0): 0.953
- Weighted MAE (α=0.5): 1.227
- Weighted MAE (α=1): 1.625
- Weighted MAE (α=1.5): 2.143
- Accuracy (exact matches): 0.404
- Overestimation Rate: 0.000
- Underestimation Rate: 0.596
- Gini Coefficient: 0.578
Overstock: 0
Processed magazine A
Method: baseline_B, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.974 (underestimation)
- Weighted MAE (α=0): 0.974
- Weighted MAE (α=0.5): 1.318
- Weighted MAE (α=1): 1.974
- Weighted MAE (α=1.5): 3.271
- Accuracy (exact matches): 0.400
- Overestimation Rate: 0.000
- Underestimation Rate: 0.600
- Gini Coefficient: 0.582
Overstock: 0
Processed magazine B
Method: baseline_C, on Test1/A_20250212_ZQ0.35_ZG0.4_testfile.csv
- Bias: -0.732 (underestimation)
- Weighted MAE (α=0): 0.732
- Weighted MAE (α=0.5): 0.832
- Weighted MAE (α=1): 0.971
- Weighted MAE (α=1.5): 1.155
- Accuracy (exact matches): 0.434
- Overestimatio

# Rewriting N1, N2, N3, EM, PD

In [9]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

def apply_n1_uncensoring(df):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Use transform to get group means - NO MERGE!
    group_means = df.groupby('EHASTRA_EH_NUMMER')['Verkauf'].transform('mean')
    
    # Replace censored with group means
    censored_mask = (df['Zensiert'] == 1)
    df.loc[censored_mask, 'Verkauf_Uncensored'] = group_means[censored_mask]
    
    return df

def apply_n2_uncensoring(df):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Calculate uncensored means per group - NO MERGE!
    uncensored_means = (
        df[df['Zensiert'] == 0]
        .groupby('EHASTRA_EH_NUMMER')['Verkauf']
        .transform('mean')
    )
    
    # Map to all rows using the POS ID
    df['group_uncensored_mean'] = df.groupby('EHASTRA_EH_NUMMER')['EHASTRA_EH_NUMMER'].transform(
        lambda group: df[df['Zensiert'] == 0].groupby('EHASTRA_EH_NUMMER')['Verkauf'].mean().get(group.iloc[0], 0)
    )
    
    # Replace censored values
    censored_mask = (df['Zensiert'] == 1)
    df.loc[censored_mask, 'Verkauf_Uncensored'] = df.loc[censored_mask, 'group_uncensored_mean']
    
    return df.drop('group_uncensored_mean', axis=1)

def apply_n3_uncensoring(df):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Calculate uncensored means per group - NO MERGE!
    uncensored_group_means = (
        df[df['Zensiert'] == 0]
        .groupby('EHASTRA_EH_NUMMER')['Verkauf']
        .mean()
    )
    
    # Map means to each row based on POS ID
    df['group_mean'] = df['EHASTRA_EH_NUMMER'].map(uncensored_group_means).fillna(0)
    
    # For censored: max(observed, group_mean)
    censored_mask = (df['Zensiert'] == 1)
    df.loc[censored_mask, 'Verkauf_Uncensored'] = np.maximum(
        df.loc[censored_mask, 'Verkauf'],
        df.loc[censored_mask, 'group_mean']
    )
    
    return df.drop('group_mean', axis=1)

def apply_em_uncensoring(df, max_iter=30, tolerance=1e-6):
    df = df.copy()
    df['Verkauf_Uncensored'] = df['Verkauf'].copy()
    
    # Direct boolean indexing without extra columns
    stockout_condition = (df['Zensiert'] == 1)
    
    # Process groups with minimal overhead
    for (pos,), group in df.groupby(['EHASTRA_EH_NUMMER']):
        try:
            group_stockout = stockout_condition.loc[group.index]
            
            if not group_stockout.any():
                continue
            
            sales = group['Verkauf_Uncensored'].values
            is_stockout = group_stockout.values
            
            uncensored = sales[~is_stockout]
            censored = sales[is_stockout]
            
            # Skip if no uncensored data - but keep original values
            if len(uncensored) == 0:
                continue
            
            # Quick lambda initialization
            lambda_est = np.mean(uncensored) if len(uncensored) > 0 else np.mean(sales) * 1.5
            lambda_est = max(lambda_est, 0.1)
            
            # Fast EM loop
            for iteration in range(max_iter):
                lambda_old = lambda_est
                
                # Batch E-step
                surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
                exact_prob = poisson.pmf(censored, lambda_est)
                surv_prob = np.maximum(surv_prob, 1e-12)
                
                expected = lambda_est + censored * exact_prob / surv_prob
                expected = np.maximum(expected, censored.astype(float))
                
                # M-step
                lambda_est = max(np.mean(np.concatenate([uncensored, expected])), 0.1)
                
                if abs(lambda_est - lambda_old) < tolerance:
                    break
            
            # Final update
            surv_prob = 1 - poisson.cdf(censored - 1, lambda_est)
            exact_prob = poisson.pmf(censored, lambda_est)
            surv_prob = np.maximum(surv_prob, 1e-12)
            
            final_expected = lambda_est + censored * exact_prob / surv_prob
            final_expected = np.maximum(final_expected, censored.astype(float))
            
            # Update original dataframe
            stockout_indices = group.index[is_stockout]
            df.loc[stockout_indices, 'Verkauf_Uncensored'] = final_expected.round()
            
        except Exception as e:
            print(f"EM error for POS {pos}: {e}")
            # Fill with original Verkauf values for this POS when error occurs
            group_stockout_indices = group.index[stockout_condition.loc[group.index]]
            df.loc[group_stockout_indices, 'Verkauf_Uncensored'] = df.loc[group_stockout_indices, 'Verkauf']
            continue
    
    # ADD THIS: Final NaN handling - replace any NaN values with original Verkauf
    nan_mask = df['Verkauf_Uncensored'].isna()
    df.loc[nan_mask, 'Verkauf_Uncensored'] = df.loc[nan_mask, 'Verkauf']
    
    return df

def apply_pd_uncensoring(df, tau=0.5, max_iter=20, tolerance=1e-4):
    """PD with skip for invalid POS groups and fallback to original Verkauf"""
    df = df.copy()
    df["is_closed"] = (df["Zensiert"] == 1)
    df['Verkauf_Uncensored'] = df['Verkauf'].copy().astype(float)

    def compute_pd_projection1(obs_val, lambda_est, tau):
        """Your PD projection with NaN protection"""
        try:
            # Check for NaN inputs
            if pd.isna(obs_val) or pd.isna(lambda_est) or lambda_est <= 0:
                return float(obs_val) if not pd.isna(obs_val) else 0.0
            
            obs_val = int(round(obs_val))
            
            def objective(k_proj):
                k_proj = int(round(k_proj))
                
                if k_proj < obs_val:
                    return float('inf')
                
                # Area A: P(obs_val <= X <= k_proj)
                area_A = poisson.cdf(k_proj, lambda_est) - poisson.cdf(obs_val - 1, lambda_est)
                
                # Area B: P(X > k_proj)
                area_B = 1 - poisson.cdf(k_proj, lambda_est)
                
                if area_B > 1e-10:
                    ratio = area_A / area_B
                    target_ratio = (1 - tau) / tau
                    return abs(ratio - target_ratio)
                else:
                    return abs(area_A - (1 - tau))
            
            # Search for optimal projection
            upper_bound = int(obs_val + max(10, int(3 * np.sqrt(lambda_est))))
            
            best_k = obs_val
            best_objective = float('inf')
            
            for k in range(int(obs_val), upper_bound + 1):
                obj_val = objective(k)
                if obj_val < best_objective:
                    best_objective = obj_val
                    best_k = k
            
            return float(best_k)
            
        except Exception:
            # Silent fallback to original value
            return float(obs_val) if not pd.isna(obs_val) else 0.0
    
    # Process groups
    grouped = df.groupby('EHASTRA_EH_NUMMER')
    
    for pos, group in grouped:
        try:
            open_mask = ~group['is_closed']
            closed_mask = group['is_closed']
            
            open_sales = group.loc[open_mask, 'Verkauf_Uncensored'].values
            closed_sales = group.loc[closed_mask, 'Verkauf_Uncensored'].values
            
            # SKIP POS WITHOUT VALID UNCENSORED DATA
            if len(open_sales) == 0 or np.all(pd.isna(open_sales)):
                print(f"Skipping POS {pos}: no valid uncensored data")
                continue
            
            # Initialize lambda parameter
            lambda_est = np.mean(open_sales[~pd.isna(open_sales)])
            
            # SKIP IF LAMBDA IS INVALID
            if pd.isna(lambda_est) or lambda_est <= 0:
                print(f"Skipping POS {pos}: invalid lambda {lambda_est}")
                continue
            
            lambda_est = max(lambda_est, 0.1)
            closed_indices = group[closed_mask].index.values
            
            # Iterative process
            for iteration in range(max_iter):
                lambda_old = lambda_est
                
                # Project closed observations
                projected_values = np.array([
                    compute_pd_projection1(obs, lambda_est, tau) 
                    for obs in closed_sales
                ])
                
                # Re-estimate lambda
                all_values = np.concatenate([open_sales, projected_values])
                lambda_est = np.mean(all_values)
                lambda_est = max(lambda_est, 0.1)
                
                # Check convergence
                if abs(lambda_est - lambda_old) < tolerance:
                    break
            
            # Final projection
            final_projections = np.array([
                compute_pd_projection1(obs, lambda_est, tau) 
                for obs in closed_sales
            ])
            
            # Update dataframe
            df.loc[closed_indices, 'Verkauf_Uncensored'] = final_projections.round()
            
        except Exception as e:
            print(f"PD error for POS {pos}: {e}")
            # FALLBACK: Keep original values for this POS
            continue
    
    # FINAL FALLBACK: Any remaining NaN values get original Verkauf
    nan_mask = df['Verkauf_Uncensored'].isna()
    df.loc[nan_mask, 'Verkauf_Uncensored'] = df.loc[nan_mask, 'Verkauf']
    
    return df.drop('is_closed', axis=1)

# Script for creating csv of all features for all mags

In [1]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path

def calculate_kpis_for_pos_fast(pos_uncensored_data, pos_censored_data, method_name, magazine_name, pos_id):
    """
    Fast KPI calculation using pre-filtered data
    
    Parameters:
    - pos_uncensored_data: Pre-filtered uncensored data for this POS
    - pos_censored_data: Pre-filtered censored data for this POS  
    - method_name: string name of the method being evaluated
    - magazine_name: string name of the magazine
    - pos_id: EHASTRA_EH_NUMMER for this POS
    """
    
    if len(pos_censored_data) == 0:
        # Return empty KPIs if no censored data for this POS
        return {
            'bias': 0,
            'weighted_mae_0': 0,
            'weighted_mae_0.5': 0,
            'weighted_mae_1': 0,
            'weighted_mae_1.5': 0,
            'accuracy': 0,
            'overestimation_rate': 0,
            'underestimation_rate': 0,
            'gini_coefficient': 0,
            'overstock': 0,
            'n_censored_observations': 0
        }
    
    # Extract values directly (already filtered)
    y_pred = pos_uncensored_data['Verkauf_Uncensored'].values
    y_true = pos_censored_data['Verkauf_MBR'].values
    
    n = len(y_pred)
    
    # Vectorized calculations for speed
    errors = y_pred - y_true
    abs_errors = np.abs(errors)
    
    # 1. Bias calculation
    bias = np.mean(errors)
    
    # 2. Accuracy (exact matches)
    accuracy = np.mean(y_pred == y_true)
    
    # 3. Overestimation Rate
    overestimation_rate = np.mean(y_pred > y_true)
    
    # 4. Underestimation Rate  
    underestimation_rate = np.mean(y_pred < y_true)
    
    # 5. Weighted MAE for different alpha values (vectorized)
    alphas = [0, 0.5, 1, 1.5]
    weighted_maes = {}
    
    for a in alphas:
        if a == 0:
            weighted_maes[a] = np.mean(abs_errors)
        else:
            weights = np.power(y_true, a)
            weights = np.where(y_true == 0, 0, weights)
            weighted_maes[a] = np.sum(weights * abs_errors) / np.sum(weights) if np.sum(weights) > 0 else 0
    
    # 6. Gini Coefficient (optimized)
    sorted_errors = np.sort(abs_errors)
    total_sum = np.sum(sorted_errors)
    
    if total_sum > 0:
        indices = np.arange(1, n + 1)
        weighted_sum = np.sum(indices * sorted_errors)
        gini_coefficient = (2 * weighted_sum) / (n * total_sum) - (n + 1) / n
    else:
        gini_coefficient = 0
    
    # 7. Overstock
    overstock = np.sum(np.maximum(0, errors))
    
    return {
        'bias': bias,
        'weighted_mae_0': weighted_maes[0],
        'weighted_mae_0.5': weighted_maes[0.5],
        'weighted_mae_1': weighted_maes[1],
        'weighted_mae_1.5': weighted_maes[1.5],
        'accuracy': accuracy,
        'overestimation_rate': overestimation_rate,
        'underestimation_rate': underestimation_rate,
        'gini_coefficient': gini_coefficient,
        'overstock': overstock,
        'n_censored_observations': n
    }

def calculate_pos_features(df, magazine_name):
    """
    Calculate features for each POS (EHASTRA_EH_NUMMER)
    
    Parameters:
    df (DataFrame): Input dataframe with 'EHASTRA_EH_NUMMER', 'Verkauf', and 'Laenge' columns
    magazine_name (str): Name of the magazine
    
    Returns:
    DataFrame: DataFrame with EHASTRA_EH_NUMMER and calculated features
    """
    
    results = []
    
    for pos_id, pos_data in df.groupby('EHASTRA_EH_NUMMER'):
        
        verkauf = pos_data['Verkauf'].dropna()
        
        if len(verkauf) == 0:
            continue
        
        features = {
            'EHASTRA_EH_NUMMER': pos_id,
            'magazine': magazine_name
        }
        
        # 1. Data Sparsity
        actual_data_points = len(pos_data)
        laenge = pos_data['Laenge'].iloc[0] if 'Laenge' in pos_data.columns else actual_data_points
        
        features['data_sparsity'] = laenge / actual_data_points if actual_data_points > 0 else 0
        
        # 2. Stockout Rate
        if 'Zensiert' in pos_data.columns:
            features['stockout_rate'] = (pos_data['Zensiert'] == 1).sum() / len(pos_data['Zensiert'])
        else:
            features['stockout_rate'] = 0
        
        # 3. Verkauf Variance
        features['verkauf_variance'] = verkauf.var()
        
        # 4. Verkauf Mean
        features['verkauf_mean'] = verkauf.mean()
        
        # 5. Coefficient of Variation
        if features['verkauf_mean'] > 0:
            features['coefficient_of_variation'] = verkauf.std() / features['verkauf_mean']
        else:
            features['coefficient_of_variation'] = 0
        
        # 6. Number of unique order quantities
        features['unique_quantities'] = verkauf.nunique()
        
        # 7. Variance to Mean Ratio
        if features['verkauf_mean'] > 0:
            features['variance_to_mean_ratio'] = features['verkauf_variance'] / features['verkauf_mean']
        else:
            features['variance_to_mean_ratio'] = 0
        
        # 8. Verkauf autocorrelation
        if len(verkauf) > 1:
            try:
                autocorr = verkauf.autocorr(lag=1)
                features['verkauf_autocorr'] = autocorr if not pd.isna(autocorr) else 0
            except:
                features['verkauf_autocorr'] = 0
        else:
            features['verkauf_autocorr'] = 0
        
        # 9. ADI (Average Inter-demand Interval)
        non_zero_periods = (verkauf > 0).sum()
        if non_zero_periods > 0:
            features['adi'] = len(verkauf) / non_zero_periods
        else:
            features['adi'] = len(verkauf)
        
        results.append(features)
    
    return pd.DataFrame(results)

def parse_filename(filename):
    """
    Parse filename to extract magazine letter and parameters
    
    Example: A_20250212_ZQ0.35_ZG0.4_testfile.csv -> ('A', '20250212', 'ZQ0.35', 'ZG0.4')
    """
    parts = filename.split('_')
    if len(parts) >= 4:
        magazine = parts[0]
        date = parts[1]
        zq = parts[2]
        zg = parts[3]
        return magazine, date, zq, zg
    return None, None, None, None

def process_all_datasets(data_directory, output_filename='combined_kpis_features_pivoted.csv'):
    """
    Process specific datasets and create a combined CSV with KPIs and POS features
    Each row represents one POS/Magazine combination with methods as column prefixes
    Only processes files with ZQ0.35_ZG0.4 pattern
    
    Parameters:
    - data_directory: path to directory containing CSV files
    - output_filename: name of the output file
    """
    
    # Method mapping
    method_functions = {
        'N1': apply_n1_uncensoring,
        'N2': apply_n2_uncensoring,
        'N3': apply_n3_uncensoring,
        'EM': apply_em_uncensoring,
        'PD': apply_pd_uncensoring,
        'Nahmias': apply_nahmias_uncensoring,
        'Conrad': apply_conrad_uncensoring,
        #'Nahmias_NG': apply_nahmias_ng_uncensoring,
        #'Conrad_NG': apply_conrad_ng_uncensoring,
        'Baseline': apply_baseline_uncensoring
    }
    
    # Find only the specific files with ZQ0.35_ZG0.4 pattern
    pattern = os.path.join(data_directory, "*_20250212_ZQ0.35_ZG0.4_testfile.csv")
    csv_files = glob.glob(pattern)
    
    if not csv_files:
        print(f"No CSV files found matching pattern *_20250212_ZQ0.35_ZG0.4_testfile.csv in {data_directory}")
        return
    
    print(f"Found {len(csv_files)} CSV files to process (ZQ0.35_ZG0.4 only)")
    
    all_results = []
    
    for csv_file in csv_files:
        filename = os.path.basename(csv_file)
        magazine, date, zq, zg = parse_filename(filename)
        
        if not magazine:
            print(f"Could not parse filename: {filename}")
            continue
            
        print(f"Processing {filename}...")
        
        try:
            # Read the original CSV file
            original_df = pd.read_csv(csv_file)
            
            # Filter out POS locations with less than 3 censored data points
            censored_counts = original_df[original_df['Zensiert'] == 1].groupby('EHASTRA_EH_NUMMER').size()
            valid_pos_ids = censored_counts[censored_counts >= 3].index
            original_df = original_df[original_df['EHASTRA_EH_NUMMER'].isin(valid_pos_ids)]
            
            if len(original_df) == 0:
                print(f"  No POS locations with >= 3 censored data points in {filename}")
                continue
            
            # Calculate POS features once for this magazine (they don't depend on uncensoring method)
            pos_features = calculate_pos_features(original_df, magazine)
            
            # Get unique POS IDs and convert to numpy array for faster iteration
            pos_ids = original_df['EHASTRA_EH_NUMMER'].unique()
            
            # Pre-filter censored data by POS to avoid repeated filtering
            censored_mask = original_df['Zensiert'] == 1
            censored_data = original_df[censored_mask]
            pos_censored_groups = censored_data.groupby('EHASTRA_EH_NUMMER')
            
            # Convert pos_features to dictionary for faster lookup
            pos_features_dict = pos_features.set_index('EHASTRA_EH_NUMMER').to_dict('index')
            
            # Initialize results dictionary for each POS
            pos_results = {}
            
            # Process each uncensoring method
            for method_name, method_function in method_functions.items():
                print(f"  Applying {method_name} uncensoring...")
                
                try:
                    # Apply the uncensoring method to the entire dataset
                    uncensored_df = method_function(original_df)
                    
                    # Pre-calculate uncensored values for censored data only
                    uncensored_censored = uncensored_df[censored_mask]
                    uncensored_groups = uncensored_censored.groupby('EHASTRA_EH_NUMMER')
                    
                    # Process all POS locations for this method
                    for pos_id in pos_ids:
                        if pos_id in pos_censored_groups.groups and pos_id in pos_features_dict:
                            # Get pre-filtered censored data for this POS
                            pos_censored = pos_censored_groups.get_group(pos_id)
                            pos_uncensored = uncensored_groups.get_group(pos_id) if pos_id in uncensored_groups.groups else None
                            
                            if pos_uncensored is not None and len(pos_uncensored) > 0:
                                # Calculate KPIs using pre-filtered data
                                kpis = calculate_kpis_for_pos_fast(pos_uncensored, pos_censored, method_name, magazine, pos_id)
                                
                                # Initialize POS entry if it doesn't exist
                                if pos_id not in pos_results:
                                    pos_results[pos_id] = {
                                        'EHASTRA_EH_NUMMER': pos_id,
                                        'magazine': magazine,
                                        'date': date,
                                        'zq_parameter': zq,
                                        'zg_parameter': zg,
                                        'filename': filename,
                                        **pos_features_dict[pos_id]  # Add POS features
                                    }
                                
                                # Add method-specific KPIs with method prefix
                                for kpi_name, kpi_value in kpis.items():
                                    column_name = f"{method_name}_{kpi_name}"
                                    pos_results[pos_id][column_name] = kpi_value
                
                except Exception as e:
                    print(f"    Error applying {method_name}: {str(e)}")
                    continue
            
            # Add all POS results from this file to main results
            all_results.extend(list(pos_results.values()))
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue
    
    if all_results:
        # Create final DataFrame
        final_df = pd.DataFrame(all_results)
        
        # Save to CSV
        output_path = os.path.join(data_directory, output_filename)
        final_df.to_csv(output_path, index=False)
        
        print(f"\nProcessing complete!")
        print(f"Combined dataset saved as: {output_path}")
        print(f"Total rows: {len(final_df)}")
        print(f"Magazines processed: {final_df['magazine'].nunique()}")
        print(f"POS locations: {final_df['EHASTRA_EH_NUMMER'].nunique()}")
        
        return final_df
    else:
        print("No data was processed successfully.")
        return None

# Usage
data_directory = "Test1"  # Change this to your actual directory path
result_df = process_all_datasets(data_directory, 'combined_magazine_kpis_features_attempt1.csv')

NameError: name 'apply_n1_uncensoring' is not defined

# Cleaning & Merging with Agrawal + Bayesian

In [184]:
import pandas as pd

def merge_all_methods_data(pivoted_csv_path, bayesian_csv_path, agrawal_csv_path, output_csv_path):
    """
    Simply add Bayesian and Agrawal columns to the pivoted dataset
    
    Parameters:
    - pivoted_csv_path: path to the pivoted CSV with method-prefixed columns
    - bayesian_csv_path: path to Bayesian results CSV
    - agrawal_csv_path: path to Agrawal results CSV  
    - output_csv_path: path for the merged output CSV
    """
    
    # Read all datasets
    print("Reading datasets...")
    pivoted_df = pd.read_csv(pivoted_csv_path)
    bayesian_df = pd.read_csv(bayesian_csv_path)
    agrawal_df = pd.read_csv(agrawal_csv_path)
    
    print(f"Pivoted data shape: {pivoted_df.shape}")
    print(f"Bayesian data shape: {bayesian_df.shape}")
    print(f"Agrawal data shape: {agrawal_df.shape}")
    
    # Merge Bayesian data (POS -> EHASTRA_EH_NUMMER, Magazine -> magazine)
    print("Merging Bayesian data...")
    merged_df = pivoted_df.merge(
        bayesian_df, 
        left_on=['EHASTRA_EH_NUMMER', 'magazine'], 
        right_on=['POS', 'Magazine'], 
        how='left'
    )
    
    # Drop the duplicate columns from Bayesian
    columns_to_drop = ['POS', 'Magazine']
    for col in columns_to_drop:
        if col in merged_df.columns:
            merged_df = merged_df.drop(col, axis=1)
    
    # Merge Agrawal data (POS -> EHASTRA_EH_NUMMER, Magazine -> magazine)  
    print("Merging Agrawal data...")
    merged_df = merged_df.merge(
        agrawal_df, 
        left_on=['EHASTRA_EH_NUMMER', 'magazine'], 
        right_on=['POS', 'Magazine'], 
        how='left',
        suffixes=('', '_agrawal_dup')
    )
    
    # Drop the duplicate columns from Agrawal
    columns_to_drop = ['POS', 'Magazine']
    for col in columns_to_drop:
        if col in merged_df.columns:
            merged_df = merged_df.drop(col, axis=1)
    
    print(f"Final merged dataset shape: {merged_df.shape}")
    
    # Check merge success
    bayesian_cols = [col for col in merged_df.columns if col.startswith('Bayesian_')]
    agrawal_cols = [col for col in merged_df.columns if col.startswith('Agrawal_')]
    
    if bayesian_cols:
        bayesian_matches = merged_df[bayesian_cols[0]].notna().sum()
        print(f"Successful Bayesian merges: {bayesian_matches}/{len(merged_df)}")
    
    if agrawal_cols:
        agrawal_matches = merged_df[agrawal_cols[0]].notna().sum() 
        print(f"Successful Agrawal merges: {agrawal_matches}/{len(merged_df)}")
    
    # Save the merged dataset
    merged_df.to_csv(output_csv_path, index=False)
    print(f"Final dataset saved to: {output_csv_path}")
    
    # Show column summary
    all_method_cols = [col for col in merged_df.columns if any(method in col for method in [
        'N1_', 'N2_', 'N3_', 'EM_', 'PD_', 'Nahmias_', 'Conrad_', 'Baseline_', 'Bayesian_', 'Agrawal_'
    ])]
    
    print(f"Total method-specific columns: {len(all_method_cols)}")
    
    return merged_df

# Example usage
if __name__ == "__main__":
    # Replace with your actual file paths
    pivoted_file = "combined_magazine_kpis_features_attempt1.csv"
    bayesian_file = "bayesian_results.csv"  # Update with actual filename
    agrawal_file = "agrawal_results.csv"    # Update with actual filename
    output_file = "final_merged_all_methods.csv"
    
    # Merge all datasets
    merged_df = merge_all_methods_data(
        pivoted_file, 
        bayesian_file, 
        agrawal_file, 
        output_file
    )
    
    print(f"\nFinal dataset: {len(merged_df)} rows, {len(merged_df.columns)} columns")
    print("All methods combined - ready for analysis!")

Reading datasets...
Pivoted data shape: (10685, 103)
Bayesian data shape: (10685, 14)
Agrawal data shape: (10685, 14)
Merging Bayesian data...
Merging Agrawal data...
Final merged dataset shape: (10685, 127)
Successful Bayesian merges: 10685/10685
Successful Agrawal merges: 10685/10685
Final dataset saved to: final_merged_all_methods.csv
Total method-specific columns: 110

Final dataset: 10685 rows, 127 columns
All methods combined - ready for analysis!


# Adding best method

In [None]:
import pandas as pd
import numpy as np

def add_best_method_column_pivoted(df, primary_kpi, tiebreaker_kpi):
    """
    Add a column showing which method performs best for each POS+Magazine combination
    
    Parameters:
    - primary_kpi: The main KPI to optimize (e.g., 'weighted_mae_0')
    - tiebreaker_kpi: KPI to use for breaking ties (e.g., 'accuracy')
    
    Returns:
    - DataFrame with new column 'best_method_{primary_kpi}'
    
    """
    
    # Define which KPIs should be minimized vs maximized
    minimize_kpis = [
        'bias', 'weighted_mae_0', 'weighted_mae_0.5', 'weighted_mae_1', 'weighted_mae_1.5',
        'gini_coefficient', 'overstock', 'overestimation_rate', 'underestimation_rate'
    ]
    
    maximize_kpis = [
        'accuracy'
    ]
    
    
    # Determine optimization direction for primary and tiebreaker KPIs
    primary_minimize = primary_kpi in minimize_kpis
    tiebreaker_minimize = tiebreaker_kpi in minimize_kpis
    
    primary_kpi_cols = [col for col in df.columns if col.endswith(f'_{primary_kpi}') and not col.startswith('best_')]
    tiebreaker_kpi_cols = [col for col in df.columns if col.endswith(f'_{tiebreaker_kpi}') and not col.startswith('best_')]
    
    if not primary_kpi_cols:
        raise ValueError(f"No columns found ending with '_{primary_kpi}'")
    if not tiebreaker_kpi_cols:
        raise ValueError(f"No columns found ending with '_{tiebreaker_kpi}'")
    
    print(f"Found {len(primary_kpi_cols)} methods for {primary_kpi}: {[col.split('_')[0] for col in primary_kpi_cols]}")
    print(f"Found {len(tiebreaker_kpi_cols)} methods for {tiebreaker_kpi}: {[col.split('_')[0] for col in tiebreaker_kpi_cols]}")
    
    # Create the new column name
    new_column_name = f'best_method_{primary_kpi}'
    
    df_copy = df.copy()
    best_methods = []
    
    # For each row, find the best method
    for idx, row in df_copy.iterrows():
        # Get primary KPI values for all methods
        primary_values = {}
        for col in primary_kpi_cols:
            method_name = col.split('_')[0]  # Extract method name (e.g., 'N1' from 'N1_weighted_mae_0')
            if pd.notna(row[col]):
                primary_values[method_name] = row[col]
        
        if not primary_values:
            best_methods.append(None)
            continue
        
        # Find the best primary KPI value
        if primary_minimize:
            best_primary_value = min(primary_values.values())
            best_methods_primary = [method for method, value in primary_values.items() if value == best_primary_value]
        else:
            best_primary_value = max(primary_values.values())
            best_methods_primary = [method for method, value in primary_values.items() if value == best_primary_value]
        
        # If there's only one best method, use it
        if len(best_methods_primary) == 1:
            best_method = best_methods_primary[0]
        else:
            # Use tiebreaker KPI to resolve ties
            tiebreaker_values = {}
            for method in best_methods_primary:
                tiebreaker_col = f"{method}_{tiebreaker_kpi}"
                if tiebreaker_col in df.columns and pd.notna(row[tiebreaker_col]):
                    tiebreaker_values[method] = row[tiebreaker_col]
            
            if not tiebreaker_values:
                # If no tiebreaker values available, pick the first method alphabetically
                best_method = sorted(best_methods_primary)[0]
            else:
                if tiebreaker_minimize:
                    best_tiebreaker_value = min(tiebreaker_values.values())
                    best_method = [method for method, value in tiebreaker_values.items() if value == best_tiebreaker_value][0]
                else:
                    best_tiebreaker_value = max(tiebreaker_values.values())
                    best_method = [method for method, value in tiebreaker_values.items() if value == best_tiebreaker_value][0]
        
        best_methods.append(best_method)
    
    # Add the best method column
    df_copy[new_column_name] = best_methods
    
    return df_copy

def add_multiple_best_method_columns_pivoted(df, kpi_configs):
    """
    Add multiple best method columns for different KPI combinations
    Works with pivoted data
    
    Parameters:
    - df: DataFrame with pivoted KPI results
    - kpi_configs: List of tuples [(primary_kpi, tiebreaker_kpi), ...]
    
    Example:
    kpi_configs = [
        ('weighted_mae_0', 'accuracy'),
        ('weighted_mae_1', 'accuracy'),
        ('accuracy', 'weighted_mae_0'),
        ('bias', 'weighted_mae_0')
    ]
    """
    result_df = df.copy()
    
    for primary_kpi, tiebreaker_kpi in kpi_configs:
        print(f"\nAdding best method column for {primary_kpi} (tiebreaker: {tiebreaker_kpi})")
        result_df = add_best_method_column_pivoted(result_df, primary_kpi, tiebreaker_kpi)
    
    return result_df

def analyze_best_methods(df, kpi_name):
    """
    Analyze and display summary statistics for best methods
    """
    best_method_col = f'best_method_{kpi_name}'
    
    if best_method_col not in df.columns:
        print(f"Column {best_method_col} not found in DataFrame")
        return
    
    print(f"\n=== Best Method Analysis for {kpi_name.upper()} ===")
    
    # Overall best method counts
    print(f"\nOverall best method distribution:")
    method_counts = df[best_method_col].value_counts()
    print(method_counts)
    
    # Best methods by magazine
    print(f"\nBest methods by magazine:")
    magazine_summary = df.groupby(['magazine', best_method_col]).size().unstack(fill_value=0)
    print(magazine_summary)
    
    # Calculate percentages
    print(f"\nBest method percentages:")
    percentages = (method_counts / len(df) * 100).round(2)
    for method, pct in percentages.items():
        print(f"  {method}: {pct}%")

# Usage
if __name__ == "__main__":
    # Load the pivoted CSV file
    df = pd.read_csv("final_merged_all_methods.csv")
    
    print(f"Loaded dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Magazines: {df['magazine'].unique()}")
    
    # Define KPI configurations for analysis
    kpi_configs = [
        ('weighted_mae_0', 'accuracy'),      # Best for unweighted MAE with accuracy tiebreaker
        ('weighted_mae_1', 'accuracy'),      # Best for weighted MAE (α=1) with accuracy tiebreaker
        ('weighted_mae_1.5', 'accuracy'),    # Best for weighted MAE (α=1.5) with accuracy tiebreaker
        ('accuracy', 'weighted_mae_0'),      # Best for accuracy with MAE tiebreaker
        ('bias', 'weighted_mae_0'),          # Best for bias with MAE tiebreaker
        ('overstock', 'accuracy'),           # Best for overstock with accuracy tiebreaker
    ]
    
    # Add all best method columns
    print("Adding best method columns...")
    df_with_best = add_multiple_best_method_columns_pivoted(df, kpi_configs)
    
    # Save results
    output_filename = "final_merged_with_best_methods.csv"
    df_with_best.to_csv(output_filename, index=False)
    print(f"\nResults saved to: {output_filename}")
    
    # Analyze results for key KPIs
    key_kpis = ['weighted_mae_0', 'weighted_mae_1', 'accuracy', 'bias']
    
    for kpi in key_kpis:
        if f'best_method_{kpi}' in df_with_best.columns:
            analyze_best_methods(df_with_best, kpi)
    
    print(f"\nFinal dataset: {len(df_with_best)} rows, {len(df_with_best.columns)} columns")
    
    # Show sample of the new columns
    best_method_cols = [col for col in df_with_best.columns if col.startswith('best_method_')]
    print(f"\nBest method columns added: {len(best_method_cols)}")
    print("Sample of best method assignments:")
    sample_cols = ['EHASTRA_EH_NUMMER', 'magazine'] + best_method_cols[:3]
    print(df_with_best[sample_cols].head())

Loaded dataset with 10685 rows and 127 columns
Magazines: ['E' 'C' 'H' 'D' 'B' 'I' 'G' 'A' 'F']
Adding best method columns...

Adding best method column for weighted_mae_0 (tiebreaker: accuracy)
Found 10 methods for weighted_mae_0: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']
Found 10 methods for accuracy: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']

Adding best method column for weighted_mae_1 (tiebreaker: accuracy)
Found 10 methods for weighted_mae_1: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']
Found 10 methods for accuracy: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']

Adding best method column for weighted_mae_1.5 (tiebreaker: accuracy)
Found 10 methods for weighted_mae_1.5: ['N1', 'N2', 'N3', 'EM', 'PD', 'Nahmias', 'Conrad', 'Baseline', 'Bayesian', 'Agrawal']
Found 10 methods for accuracy: ['N1', 'N2', 'N3', 'EM',

# Merging Sofortremission

In [186]:
import pandas as pd

# Read the files
best_methods_df = pd.read_csv("final_merged_with_best_methods.csv")
sofortremission_df = pd.read_csv("sofortremissionFeatures3007.csv")

# Fix the EHASTRA_EH_NUMMER format - add "EH" prefix to sofortremission
sofortremission_df['EHASTRA_EH_NUMMER'] = 'EH' + sofortremission_df['EHASTRA_EH_NUMMER'].astype(str)

# Merge
merged_df = best_methods_df.merge(
    sofortremission_df, 
    left_on=['EHASTRA_EH_NUMMER', 'magazine'], 
    right_on=['EHASTRA_EH_NUMMER', 'VDZ'], 
    how='left'
).drop('VDZ', axis=1)

# Sort by magazine
merged_df = merged_df.sort_values('magazine')

# Save
merged_df.to_csv("Input_DT_All_Mags.csv", index=False)

print(f"Done! {len(merged_df)} rows, {len(merged_df.columns)} columns")

Done! 10685 rows, 140 columns
