In [1]:
import pandas as pd
import pyarrow.dataset as ds
import numpy as np

## Dataset Import

In [90]:
# Import FIP Dataset

s3_path_fip = (
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "publish/data-product/financial_inventory_projection_report_network_update/"
)

dataset = ds.dataset(
    s3_path_fip,
    format="parquet",
    partitioning="hive" 
)

table = dataset.to_table(
    filter=(
        (ds.field("corporate_brand") == "REVLIMID") &
        (ds.field("snapshot_date") > "2025-07-01") &       
        (ds.field("date") == "202612") &                    # filter only 2026 YE FIP data
        ~(
            (ds.field("snapshot_date") == "2026-01-23") &
            (ds.field("snapshot_type") == "friday")         # drop 23rd Fri snapshot data to avoid double counting
        )
    )
)

df_fip = table.to_pandas()
df_fip.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date
0,1458497,2061,202612,865.2,2859.01014,concost dp,ST,3.30445,rr,REVLIMID,PACK,COMMERCIAL,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302101,PHARMA,friday,2025-07-18
1,1424858,2023,202612,0.0,0.0,concost dp,ST,24.20032,rr,REVLIMID,FIN,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18
2,1428613,2021,202612,0.0,0.0,concost dp,ST,50.58223,rr,REVLIMID,FIN,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18
3,1431792,2021,202612,59.0,6391.59036,concost dp,ST,108.33204,rr,REVLIMID,FIN,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18
4,1431878,2023,202612,0.0,0.0,concost dp,ST,117.51409,rr,REVLIMID,FIN,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18


In [91]:
df_fip.shape

(34823, 19)

In [92]:
# s3_path_plants = (
#     "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
#     "refined/data-asset/fin_inv_proj/"
#     "bms_internal_vs_external_plants/"
#     "bms_internal_vs_external_plants.parquet"
# )

# df_plants = pd.read_parquet(s3_path_plants)
# # df_plants.head()

In [93]:
# import boto3

# s3 = boto3.client("s3")

# bucket = "m3-intel-hub-dp-us-east-1-517292-prod"
# prefix = "refined/data-asset/fin_inv_proj/sap_material_master/"

# response = s3.list_objects_v2(
#     Bucket=bucket,
#     Prefix=prefix
# )

# if "Contents" in response:
#     for obj in response["Contents"]:
#         print(obj["Key"], obj["Size"])
# else:
#     print("No objects found or no access.")

## Data Prep

In [94]:
# Create has_non_zero flag at material–plant level

df_fip["has_non_zero"] = (
    df_fip
    .groupby(["material", "plant"])["total_cost"]
    .transform(lambda x: (x != 0).any())
    .astype(int)
)

# Apply the filter
base1 = df_fip.loc[df_fip["has_non_zero"] == 1].drop(columns="has_non_zero")

In [131]:
# Add material entry flag -- change to lookup table while in production

base = base1.copy()
first_seen = (
    base.groupby(["material", "plant"])["snapshot_date"]
      .transform("min")
)

base["sku_status"] = np.where(
    base["snapshot_date"] == first_seen,
    "NEW",
    "EXISTING"
)


In [132]:
print(df_fip.shape)
base.shape

(34823, 20)


(28169, 20)

In [133]:
# fip copy df for data prep
df = base.copy()
df["snapshot_date"] = pd.to_datetime(df["snapshot_date"])

# snapshot lookup table
snapshot_calendar = (
    df[["snapshot_type", "snapshot_date"]]
    .drop_duplicates()
    .sort_values("snapshot_date")
    .reset_index(drop=True)
)

# global previous snapshot (date-wise)
snapshot_calendar["prev_snapshot_any"] = snapshot_calendar["snapshot_date"].shift(1)

# BD13 → BD13 previous snapshot
snapshot_calendar["prev_snapshot_bd13"] = (
    snapshot_calendar
    .where(snapshot_calendar["snapshot_type"] == "bd13")
    .groupby("snapshot_type")["snapshot_date"]
    .shift(1)
)

# final prev snapshot logic
snapshot_calendar["prev_snapshot_date"] = np.where(
    snapshot_calendar["snapshot_type"] == "bd13",
    snapshot_calendar["prev_snapshot_bd13"],   # BD13 → BD13
    snapshot_calendar["prev_snapshot_any"]     # FRIDAY → previous snapshot
)

snapshot_calendar

Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_any,prev_snapshot_bd13,prev_snapshot_date
0,friday,2025-07-18,NaT,NaT,NaT
1,friday,2025-07-25,2025-07-18,NaT,2025-07-18
2,friday,2025-08-01,2025-07-25,NaT,2025-07-25
3,friday,2025-08-08,2025-08-01,NaT,2025-08-01
4,friday,2025-08-15,2025-08-08,NaT,2025-08-08
5,friday,2025-08-22,2025-08-15,NaT,2025-08-15
6,bd13,2025-08-26,2025-08-22,NaT,NaT
7,friday,2025-08-29,2025-08-26,NaT,2025-08-26
8,friday,2025-09-05,2025-08-29,NaT,2025-08-29
9,friday,2025-09-12,2025-09-05,NaT,2025-09-05


In [134]:
# Attach previous snapshot date to each row

df = df.merge(
    snapshot_calendar[["snapshot_type", "snapshot_date", "prev_snapshot_date"]],
    on=["snapshot_type", "snapshot_date"],
    how="left"
)

In [135]:
df.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,...,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,sku_status,prev_snapshot_date
0,1458497,2061,202612,865.2,2859.01014,concost dp,ST,3.30445,rr,REVLIMID,...,COMMERCIAL,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302101,PHARMA,friday,2025-07-18,NEW,NaT
1,1428613,2021,202612,0.0,0.0,concost dp,ST,50.58223,rr,REVLIMID,...,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18,NEW,NaT
2,1431792,2021,202612,59.0,6391.59036,concost dp,ST,108.33204,rr,REVLIMID,...,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18,NEW,NaT
3,1428615,2053,202612,86.0,9672.68746,concost dp,ST,112.47311,rr,REVLIMID,...,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18,NEW,NaT
4,1431577,2058,202612,17.0,1862.65005,concost dp,ST,109.56765,rr,REVLIMID,...,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,CAPSULE,3302101,PHARMA,friday,2025-07-18,NEW,NaT


In [136]:
# Prepare current and previous frames

# Current snapshot frame
current_df = df.copy()

current_df = current_df.rename(columns={
    "quantity": "quantity_curr",
    "cost_per_unit": "cost_per_unit_curr",
    "total_cost": "total_cost_curr",
})

# Previous snapshot frame
previous_df = df.rename(columns={
    "snapshot_date": "snapshot_date_prev",
    "quantity": "quantity_prev",
    "cost_per_unit": "cost_per_unit_prev",
    "total_cost": "total_cost_prev",
})[
    [
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
        "quantity_prev",
        "cost_per_unit_prev",
        "total_cost_prev",
    ]
]


# Join current to previous snapshot
rca_base = current_df.merge(
    previous_df,
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    how="left"
)

In [137]:
df.shape

(28169, 21)

In [138]:
rca_base.tail(5)

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,sku_status,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev
28164,1465291,2061,202612,286.0,24878.69956,concost dp,ST,86.98846,rr,REVLIMID,...,3302101,PHARMA,friday,2026-01-30,EXISTING,2026-01-23,NaT,,,
28165,1428611,1731,202612,6.0,292.602,concost dp,ST,48.767,rr,REVLIMID,...,3302101,PHARMA,friday,2026-01-30,EXISTING,2026-01-23,NaT,,,
28166,1442928,2052,202612,158.0,16352.83094,concost dp,ST,103.49893,rr,REVLIMID,...,3302101,PHARMA,friday,2026-01-30,EXISTING,2026-01-23,NaT,,,
28167,1428582,2091,202612,73.0,5438.38101,concost dp,ST,74.49837,rr,REVLIMID,...,3302101,PHARMA,friday,2026-01-30,EXISTING,2026-01-23,NaT,,,
28168,1431690,2122,202612,108.0,14420.80044,concost dp,ST,133.52593,rr,REVLIMID,...,3302101,PHARMA,friday,2026-01-30,EXISTING,2026-01-23,NaT,,,


In [139]:
# Flags for material-plant presence / absence between snapshots

# Not present in previous snapshot 
rca_base["is_new_in_current_snapshot"] = (
    rca_base["prev_snapshot_date"].notna() &
    rca_base["quantity_prev"].isna()
)

# Present in previous but missing in current snapshot
rca_base["is_new_in_current_snapshot"] = False


# Identify valid previous snapshots (from calendar logic)
valid_prev_snapshots = (
    snapshot_calendar["prev_snapshot_date"]
        .dropna()
        .unique()
)

# Restrict previous_df BEFORE the anti-join
previous_df_valid = previous_df[
    previous_df["snapshot_date_prev"].isin(valid_prev_snapshots)
]

# Present in previous but missing in current snapshot
# Identify rows present in previous snapshot but missing in current
prev_only = previous_df_valid.merge(
    current_df[
        [
            "material",
            "plant",
            "date",
            "snapshot_type",
            "prev_snapshot_date",
        ]
    ],
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    how="left",
    indicator=True
).query("_merge == 'left_only'")


prev_only["is_dropped_in_current_snapshot"] = True

# Add missing columns for consistency
for col in rca_base.columns:
    if col not in prev_only.columns:
        prev_only[col] = np.nan

rca_base["is_dropped_in_current_snapshot"] = False

final_rca_frame = pd.concat(
    [rca_base, prev_only[rca_base.columns]],
    ignore_index=True
)

  final_rca_frame = pd.concat(


In [140]:
for col in [
    "is_new_in_current_snapshot",
    "is_dropped_in_current_snapshot",
]:
    final_rca_frame[col] = (
        final_rca_frame[col]
            .replace({1: True, 0: False})   # normalize numeric bools
            .fillna(False)                  # handle NaNs
            .astype("boolean")              # now safe
    )


  .fillna(False)                  # handle NaNs


In [141]:
final_rca_frame.shape

(29180, 27)

In [142]:
final_rca_frame.tail()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,snapshot_type,snapshot_date,sku_status,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev,is_new_in_current_snapshot,is_dropped_in_current_snapshot
29175,1435113,2061,202612,,,,,,,,...,bd13,NaT,,NaT,2026-01-23,1.0,19.77565,19.77565,False,True
29176,1428449,1731,202612,,,,,,,,...,bd13,NaT,,NaT,2026-01-23,0.0,141.11771,0.0,False,True
29177,1431641,2093,202612,,,,,,,,...,bd13,NaT,,NaT,2026-01-23,0.0,92.70567,0.0,False,True
29178,1431805,2093,202612,,,,,,,,...,bd13,NaT,,NaT,2026-01-23,660.0,86.09325,56821.545,False,True
29179,1468416,2061,202612,,,,,,,,...,bd13,NaT,,NaT,2026-01-23,0.0,558.63,0.0,False,True


In [143]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'sku_status', 'prev_snapshot_date',
       'snapshot_date_prev', 'quantity_prev', 'cost_per_unit_prev',
       'total_cost_prev', 'is_new_in_current_snapshot',
       'is_dropped_in_current_snapshot'],
      dtype='object')

In [144]:
snapshot_mapping_check = (
    final_rca_frame
    .loc[:, ["snapshot_type", "snapshot_date", "prev_snapshot_date"]]
    .drop_duplicates()
    .sort_values(["snapshot_type", "snapshot_date"])
)

snapshot_mapping_check

Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_date
5190,bd13,2025-08-26,NaT
8653,bd13,2025-09-18,2025-08-26
12884,bd13,2025-10-17,2025-09-18
17189,bd13,2025-11-19,2025-10-17
21462,bd13,2025-12-17,2025-11-19
26505,bd13,2026-01-23,2025-12-17
28181,bd13,NaT,NaT
0,friday,2025-07-18,NaT
860,friday,2025-07-25,2025-07-18
1726,friday,2025-08-01,2025-07-25


Error handling

1. Division by zero & invalid math -
    Previous quantity = 0
    Previous cost = 0
    Previous FIP = 0
    Volatility = 0
2. Min rolling window
   less than min periods of 3
   Newly introduced SKU
   Flat history causing volatility = 0
3. double counting due to duplicates
4. missing prev snapshot data - nulls
   SKU appears for first time
   SKU disappears and reappears
   Previous quantity / cost not available
5. Extreme values in the history
    Very large quantities or costs in the past pushing the present numbers. The outliers in the past affects the current numbers. Exlcude the historical outliers
   

# Calculations

In [145]:
# Step 1: Compute raw change metrics (always runs)


# Base deltas

final_rca_frame["delta_quantity"] = (
    final_rca_frame["quantity_curr"] - final_rca_frame["quantity_prev"]
)

final_rca_frame["delta_cost_per_unit"] = (
    final_rca_frame["cost_per_unit_curr"] - final_rca_frame["cost_per_unit_prev"]
)


# Impact decomposition

# Quantity Impact 
final_rca_frame["quantity_impact"] = (
    final_rca_frame["delta_quantity"] *
    final_rca_frame["cost_per_unit_prev"]
)

# Cost Impact
final_rca_frame["cost_impact"] = (
    final_rca_frame["delta_cost_per_unit"] * final_rca_frame["quantity_prev"]
)

# Intercation
final_rca_frame["interaction_impact"] = (
    final_rca_frame["delta_quantity"] *
    final_rca_frame["delta_cost_per_unit"]
)

# Total Change
final_rca_frame["total_fip_change"] = (
    final_rca_frame["quantity_impact"] +
    final_rca_frame["cost_impact"] +
    final_rca_frame["interaction_impact"]
)

# Core Metrics

# delta_quantity_pct 
final_rca_frame["delta_quantity_pct"] = (
    final_rca_frame["delta_quantity"] /
    final_rca_frame["quantity_prev"]
)

# delta_cost_per_unit_pct
final_rca_frame["delta_cost_per_unit_pct"] = (
    final_rca_frame["delta_cost_per_unit"] /
    final_rca_frame["cost_per_unit_prev"]
)

# Contribution shares (absolute, normalized)
impact_abs_sum_qc = (
    final_rca_frame["quantity_impact"].abs() +
    final_rca_frame["cost_impact"].abs()
)

impact_abs_sum_all = (
    impact_abs_sum_qc +
    final_rca_frame["interaction_impact"].abs()
)


# quantity_impact_pct_of_total 
final_rca_frame["quantity_impact_pct_of_total"] = np.where(
    impact_abs_sum_qc > 0,
    final_rca_frame["quantity_impact"].abs() / impact_abs_sum_qc,
    0
)

# cost_impact_pct_of_total 
final_rca_frame["cost_impact_pct_of_total"] = np.where(
    impact_abs_sum_qc > 0,
    final_rca_frame["cost_impact"].abs() / impact_abs_sum_qc,
    0
)

# interaction_pct
final_rca_frame["interaction_pct"] = np.where(
    impact_abs_sum_all > 0,
    final_rca_frame["interaction_impact"].abs() / impact_abs_sum_all,
    0
)


# Dominance Score

final_rca_frame["abs_qty_impact"] = final_rca_frame["quantity_impact"].abs()
final_rca_frame["abs_cost_impact"] = final_rca_frame["cost_impact"].abs()

final_rca_frame["dominance_score"] = np.where(
    (final_rca_frame["abs_qty_impact"] + final_rca_frame["abs_cost_impact"]) == 0,
    0.0,  # avoid divide-by-zero → treat as neutral
    (final_rca_frame["abs_qty_impact"] - final_rca_frame["abs_cost_impact"]) /
    (final_rca_frame["abs_qty_impact"] + final_rca_frame["abs_cost_impact"])
)

In [146]:
final_rca_frame.head()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,interaction_impact,total_fip_change,delta_quantity_pct,delta_cost_per_unit_pct,quantity_impact_pct_of_total,cost_impact_pct_of_total,interaction_pct,abs_qty_impact,abs_cost_impact,dominance_score
0,1458497,2061,202612,865.2,2859.01014,concost dp,ST,3.30445,rr,REVLIMID,...,,,,,0.0,0.0,0.0,,,
1,1428613,2021,202612,0.0,0.0,concost dp,ST,50.58223,rr,REVLIMID,...,,,,,0.0,0.0,0.0,,,
2,1431792,2021,202612,59.0,6391.59036,concost dp,ST,108.33204,rr,REVLIMID,...,,,,,0.0,0.0,0.0,,,
3,1428615,2053,202612,86.0,9672.68746,concost dp,ST,112.47311,rr,REVLIMID,...,,,,,0.0,0.0,0.0,,,
4,1431577,2058,202612,17.0,1862.65005,concost dp,ST,109.56765,rr,REVLIMID,...,,,,,0.0,0.0,0.0,,,


In [147]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'sku_status', 'prev_snapshot_date',
       'snapshot_date_prev', 'quantity_prev', 'cost_per_unit_prev',
       'total_cost_prev', 'is_new_in_current_snapshot',
       'is_dropped_in_current_snapshot', 'delta_quantity',
       'delta_cost_per_unit', 'quantity_impact', 'cost_impact',
       'interaction_impact', 'total_fip_change', 'delta_quantity_pct',
       'delta_cost_per_unit_pct', 'quantity_impact_pct_of_total',
       'cost_impact_pct_of_total', 'interaction_pct', 'abs_qty_impact',
       'abs_cost_impact', 'dominance_score'],
      dtype='object')

In [148]:
final_rca_frame = final_rca_frame.sort_values(
    ["material", "plant", "snapshot_date"]
)

final_rca_frame["hist_count"] = (
    final_rca_frame
    .groupby(["material", "plant"])
    .cumcount()
)

final_rca_frame["has_sufficient_6periods"] = (
    final_rca_frame["hist_count"] >= 6 )


In [149]:
# Step 2 - Noise vs Signal determination 


# Sorting

final_rca_frame = final_rca_frame.sort_values(
    ["material", "plant", "snapshot_date"]
)

#  1) Business materiality (value-based)   
# How big the change is in value terms, relative to prior fip.
final_rca_frame["relative_fip_impact"] = np.where(
    final_rca_frame["total_cost_prev"] > 0,
    final_rca_frame["total_fip_change"].abs() /
    final_rca_frame["total_cost_prev"],
    np.nan
)


# 2) Historical volatility (material–plant aware) -- exclude current date? -- shift(1)- picking up other?

# Quantity volatility (rolling, 12 snapshots) - How noisy this SKU normally is.
# quantity_volatility_12w = std(delta_quantity_pct)
final_rca_frame["quantity_volatility_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=6).std()) 
)


# Cost volatility (rolling, 12 snapshots)
final_rca_frame["cost_volatility_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_cost_per_unit_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=6).std())
)


# 3) Normalized (volatility-aware) changes

final_rca_frame["normalized_quantity_change"] = (
    final_rca_frame["delta_quantity_pct"] /
    final_rca_frame["quantity_volatility_12w"]
)

final_rca_frame["normalized_cost_change"] = (
    final_rca_frame["delta_cost_per_unit_pct"] /
    final_rca_frame["cost_volatility_12w"]
)

# Interpretation:
# ≈ 1 → normal
# ≫ 1 → unusually large vs history


In [150]:
# 4) Temporal persistence (directional consistency)

final_rca_frame["quantity_change_sign"] = np.sign(
    final_rca_frame["delta_quantity_pct"]
) 

def persistence_score(series):
    score = []
    current = 0
    prev = 0
    for v in series:
        if v == 0 or pd.isna(v):
            current = 0
        elif v == prev:
            current += 1
        else:
            current = 1
        score.append(current)
        prev = v
    return score

final_rca_frame["quantity_persistence_score"] = (
    final_rca_frame
    .groupby(["material", "plant"])["quantity_change_sign"]
    .transform(persistence_score)
)


In [151]:
#oultier v2

# --------------------------------------------
# CONFIG

ROLLING_WINDOW = 12
MIN_PERIODS = 6
EXTREME_PCT_CHANGE = 0.5     # 50% absolute change
LOW_LEVEL_RATIO = 0.01       # 1% of typical level
ROBUST_Z_THRESHOLD = 3

# --------------------------------------------
# 1) Rolling median of quantity (baseline scale)
# --------------------------------------------
final_rca_frame["quantity_rolling_median_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["quantity_curr"]
    .transform(lambda x: x.shift(1).rolling(ROLLING_WINDOW, min_periods=MIN_PERIODS).median())
)

# --------------------------------------------
# 2) Structural extreme change (scale-based)
# --------------------------------------------
final_rca_frame["is_structural_extreme_qty"] = (
    (final_rca_frame["delta_quantity_pct"].abs() >= EXTREME_PCT_CHANGE) &
    (
        (final_rca_frame["quantity_curr"] <=
         LOW_LEVEL_RATIO * final_rca_frame["quantity_rolling_median_12w"]) |
        (final_rca_frame["quantity_curr"] >=
         (1 / LOW_LEVEL_RATIO) * final_rca_frame["quantity_rolling_median_12w"])
    )
)

# --------------------------------------------
# 3) Create CLEAN delta series (exclude known outliers)
# --------------------------------------------
final_rca_frame["clean_delta_quantity_pct"] = final_rca_frame["delta_quantity_pct"]

final_rca_frame.loc[
    final_rca_frame["is_structural_extreme_qty"],
    "clean_delta_quantity_pct"
] = np.nan

# --------------------------------------------
# 4) Robust MAD-based z-score (on clean history only)
# --------------------------------------------
def robust_zscore(series):
    median = np.nanmedian(series)
    mad = np.nanmedian(np.abs(series - median))
    if mad == 0 or np.isnan(mad):
        return pd.Series(np.nan, index=series.index)
    return (series - median) / (1.4826 * mad)

final_rca_frame["robust_quantity_z"] = (
    final_rca_frame
    .groupby(["material", "plant"])["clean_delta_quantity_pct"]
    .transform(lambda x: robust_zscore(x.shift(1)))
)

# --------------------------------------------
# 5) Statistical outlier (robust)
# --------------------------------------------
final_rca_frame["is_statistical_outlier_qty"] = (
    final_rca_frame["robust_quantity_z"].abs() > ROBUST_Z_THRESHOLD
)

# --------------------------------------------
# 6) FINAL quantity outlier flag
# --------------------------------------------
final_rca_frame["is_quantity_outlier"] = (
    final_rca_frame["is_structural_extreme_qty"] |
    final_rca_frame["is_statistical_outlier_qty"]
)

# --------------------------------------------
# 7) Cleanup (optional but recommended)
# --------------------------------------------
bool_cols = [
    "is_structural_extreme_qty",
    "is_statistical_outlier_qty",
    "is_quantity_outlier",
]

final_rca_frame[bool_cols] = (
    final_rca_frame[bool_cols]
    .fillna(False)
    .astype("boolean")
)


  median = np.nanmedian(series)
  mad = np.nanmedian(np.abs(series - median))


In [152]:
# # Outlier new

# # rolling median
# final_rca_frame["quantity_rolling_median_12w"] = (
#     final_rca_frame
#     .groupby(["material", "plant"])["quantity_curr"]
#     .transform(lambda x: x.shift(1).rolling(12, min_periods=6).median())
# )

# EXTREME_PCT_CHANGE = 0.5        # 50% change
# LOW_LEVEL_RATIO = 0.01          # 1% of typical level
# Z_SCORE_THRESHOLD = 3

# final_rca_frame["is_structural_extreme_qty"] = (
#     (final_rca_frame["delta_quantity_pct"].abs() >= EXTREME_PCT_CHANGE) &
#     (
#         final_rca_frame["quantity_curr"] <=
#         LOW_LEVEL_RATIO * final_rca_frame["quantity_rolling_median_12w"]
#     )
# )

# final_rca_frame["is_statistical_outlier_qty"] = (
#     (final_rca_frame["quantity_volatility_12w"] > 0) &
#     (final_rca_frame["normalized_quantity_change"].abs() > Z_SCORE_THRESHOLD)
# )

# final_rca_frame["is_quantity_outlier"] = (
#     final_rca_frame["is_structural_extreme_qty"] |
#     final_rca_frame["is_statistical_outlier_qty"]
# )


In [153]:
# # 5) Outlier detection (z-score based - per material–plant) -old calc

# final_rca_frame["quantity_zscore"] = (
#     final_rca_frame["delta_quantity_pct"] /       # check
#     final_rca_frame["quantity_volatility_12w"]
# )

# final_rca_frame["is_quantity_outlier"] = (
#     final_rca_frame["quantity_zscore"].abs() > 3  # threshold
# )


In [154]:
# # 6) Change-point detection

# final_rca_frame["rolling_avg_quantity_12w"] = (
#     final_rca_frame
#     .groupby(["material", "plant"])["delta_quantity_pct"]
#     .transform(lambda x: x.shift(1).rolling(12, min_periods=3).mean())
# )

# final_rca_frame["change_point_detected"] = (
#     (final_rca_frame["quantity_persistence_score"] >= 2) &
#     (final_rca_frame["rolling_avg_quantity_12w"].abs() > 
#      final_rca_frame["quantity_volatility_12w"])
# )

In [155]:
# Change Point

final_rca_frame["rolling_avg_quantity_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=6).mean())
)
# --------------------------------------------
# Robust rolling MAD of delta quantity
# --------------------------------------------

final_rca_frame["delta_qty_rolling_mad_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(
        lambda x: x.shift(1)
                  .rolling(12, min_periods=6)
                  .apply(lambda s: np.nanmedian(np.abs(s - np.nanmedian(s))), raw=True)
    )
)

# --------------------------------------------
# Robust trend strength
# --------------------------------------------
final_rca_frame["robust_trend_strength"] = (
    final_rca_frame["rolling_avg_quantity_12w"].abs() /
    (1.4826 * final_rca_frame["delta_qty_rolling_mad_12w"])
)

# --------------------------------------------
# Short-window confirmation
# --------------------------------------------
final_rca_frame["rolling_avg_quantity_6w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(6, min_periods=3).mean())
)

final_rca_frame["trend_confirmed"] = (
    np.sign(final_rca_frame["rolling_avg_quantity_12w"]) ==
    np.sign(final_rca_frame["rolling_avg_quantity_6w"])
)

# --------------------------------------------
# FINAL robust change-point detection
# --------------------------------------------
final_rca_frame["change_point_detected"] = (
    (final_rca_frame["quantity_persistence_score"] >= 2) &
    (final_rca_frame["robust_trend_strength"] > 2) &
    (final_rca_frame["trend_confirmed"])
)


In [156]:
# Final Noise Vs Signal Flag

MATERIALITY_THRESHOLD = 0.5
MIN_PERSISTENCE = 2

final_rca_frame["is_noise"] = (
    final_rca_frame["is_quantity_outlier"] &
    (final_rca_frame["quantity_persistence_score"] < MIN_PERSISTENCE) &
    (~final_rca_frame["change_point_detected"])
)

final_rca_frame["is_large_noise"] = (
    final_rca_frame["is_noise"] &
    (final_rca_frame["relative_fip_impact"] > 0.5)
)

final_rca_frame.loc[
    ~final_rca_frame["has_sufficient_6periods"],
    "is_noise"
] = False                                             # insufficient data to signal

final_rca_frame["is_signal"] = ~final_rca_frame["is_noise"]


In [160]:
final_rca_frame

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,rolling_avg_quantity_12w,delta_qty_rolling_mad_12w,robust_trend_strength,rolling_avg_quantity_6w,trend_confirmed,change_point_detected,is_noise,is_large_noise,is_signal,abs_fip_change
811,1423267,2023,202612,0.0,0.000,concost dp,ST,106.07239,rr,REVLIMID,...,,,,,False,False,False,False,True,
1106,1423267,2023,202612,0.0,0.000,concost dp,ST,106.07239,rr,REVLIMID,...,,,,,False,False,False,False,True,0.0
2079,1423267,2023,202612,0.0,0.000,concost dp,ST,106.07239,rr,REVLIMID,...,,,,,False,False,False,False,True,0.0
3098,1423267,2023,202612,0.0,0.000,concost dp,ST,106.07239,rr,REVLIMID,...,,,,,False,False,False,False,True,0.0
4264,1423267,2023,202612,0.0,0.000,concost dp,ST,106.07239,rr,REVLIMID,...,,,,,False,False,False,False,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18906,1484325X0,2061,202612,11200.0,0.112,concost dp,ST,0.00001,sap,REVLIMID,...,,,,,False,False,False,False,True,
19763,1484325X0,2061,202612,11200.0,0.112,concost dp,ST,0.00001,sap,REVLIMID,...,,,,,False,False,False,False,True,0.0
20216,1484325X0,2061,202612,11200.0,0.112,concost dp,ST,0.00001,sap,REVLIMID,...,,,,,False,False,False,False,True,0.0
28286,1484325X0,2061,202612,,,,,,,,...,,,,,False,False,False,False,True,


In [163]:
final_rca_frame["abs_fip_change"] = final_rca_frame["total_fip_change"].abs()

final_rca_frame["snapshot_signal_rank"] = (
    final_rca_frame
    .where(final_rca_frame["is_signal"])
    .groupby(["snapshot_date", "corporate_brand"])["abs_fip_change"]
    .rank(method="first", ascending=False)
)
final_rca_frame["is_top10_contributor"] = (
    final_rca_frame["snapshot_signal_rank"] <= 10
)

In [164]:
final_rca_frame.head()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,robust_trend_strength,rolling_avg_quantity_6w,trend_confirmed,change_point_detected,is_noise,is_large_noise,is_signal,abs_fip_change,snapshot_signal_rank,is_top10_contributor
811,1423267,2023,202612,0.0,0.0,concost dp,ST,106.07239,rr,REVLIMID,...,,,False,False,False,False,True,,,False
1106,1423267,2023,202612,0.0,0.0,concost dp,ST,106.07239,rr,REVLIMID,...,,,False,False,False,False,True,0.0,174.0,False
2079,1423267,2023,202612,0.0,0.0,concost dp,ST,106.07239,rr,REVLIMID,...,,,False,False,False,False,True,0.0,190.0,False
3098,1423267,2023,202612,0.0,0.0,concost dp,ST,106.07239,rr,REVLIMID,...,,,False,False,False,False,True,0.0,469.0,False
4264,1423267,2023,202612,0.0,0.0,concost dp,ST,106.07239,rr,REVLIMID,...,,,False,False,False,False,True,0.0,274.0,False


In [165]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'sku_status', 'prev_snapshot_date',
       'snapshot_date_prev', 'quantity_prev', 'cost_per_unit_prev',
       'total_cost_prev', 'is_new_in_current_snapshot',
       'is_dropped_in_current_snapshot', 'delta_quantity',
       'delta_cost_per_unit', 'quantity_impact', 'cost_impact',
       'interaction_impact', 'total_fip_change', 'delta_quantity_pct',
       'delta_cost_per_unit_pct', 'quantity_impact_pct_of_total',
       'cost_impact_pct_of_total', 'interaction_pct', 'abs_qty_impact',
       'abs_cost_impact', 'dominance_score', 'hist_count',
       'has_sufficient_6periods', 'relative_fip_impact

In [166]:
v = final_rca_frame[(final_rca_frame["corporate_brand"]=='REVLIMID') & (final_rca_frame ["snapshot_date"] > "2025-11-09") 
    &   (final_rca_frame ["material"] == '1456877')]
display(v)

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,robust_trend_strength,rolling_avg_quantity_6w,trend_confirmed,change_point_detected,is_noise,is_large_noise,is_signal,abs_fip_change,snapshot_signal_rank,is_top10_contributor
16594,1456877,2091,202612,3882051.0,36775880.0,concost dp,G,9.47331,rr,REVLIMID,...,,,False,False,True,True,False,36139800.0,,False
17585,1456877,2091,202612,36188.53,342825.2,concost dp,G,9.47331,rr,REVLIMID,...,12.32449,18.938963,True,True,False,False,True,342825.2,5.0,True
18554,1456877,2091,202612,6580000.0,62334380.0,concost dp,G,9.47331,rr,REVLIMID,...,,18.938963,False,False,False,False,True,,,False
19183,1456877,2091,202612,6570763.0,62246870.0,concost dp,G,9.47331,rr,REVLIMID,...,,18.938963,False,False,False,False,True,87506.99,4.0,True
19926,1456877,2091,202612,1260.0,11936.37,concost dp,G,9.47331,rr,REVLIMID,...,,14.203871,False,False,False,False,True,62234940.0,1.0,True
21401,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,,13.953919,False,False,True,True,False,674393000.0,,False
21694,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,12699.84,14138.703919,True,True,False,False,True,674062100.0,1.0,True
22802,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,5639.922,14616.049141,True,False,False,False,True,,,False
23947,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,5639.922,14616.049141,True,False,False,False,True,0.0,685.0,False
24056,1456877,2091,202612,71190000.0,1482382000.0,concost dp,G,20.82289,rr,REVLIMID,...,9856.024,11692.839312,True,False,False,False,True,807976600.0,1.0,True


In [168]:
cols = ['material', 'plant', 'date', 'quantity_curr','quantity_prev', 'total_cost_curr','cost_per_unit_curr',
    "snapshot_date",'change_point_detected', 'is_noise', 'is_large_noise', 'is_signal','is_top10_contributor']

v1 = final_rca_frame.loc[
    (final_rca_frame["snapshot_date"] > "2025-11-09") &
    (final_rca_frame["material"] == "1456877"),
    cols
]
v1

Unnamed: 0,material,plant,date,quantity_curr,quantity_prev,total_cost_curr,cost_per_unit_curr,snapshot_date,change_point_detected,is_noise,is_large_noise,is_signal,is_top10_contributor
16594,1456877,2091,202612,3882051.0,67143.9,36775880.0,9.47331,2025-11-14,False,True,True,False,False
17585,1456877,2091,202612,36188.53,0.0,342825.2,9.47331,2025-11-19,True,False,False,True,True
18554,1456877,2091,202612,6580000.0,,62334380.0,9.47331,2025-11-21,False,False,False,True,False
19183,1456877,2091,202612,6570763.0,6580000.0,62246870.0,9.47331,2025-11-28,False,False,False,True,True
19926,1456877,2091,202612,1260.0,6570763.0,11936.37,9.47331,2025-12-05,False,False,False,True,True
21401,1456877,2091,202612,71190000.0,1260.0,674404900.0,9.47331,2025-12-12,False,True,True,False,False
21694,1456877,2091,202612,71190000.0,36188.53,674404900.0,9.47331,2025-12-17,True,False,False,True,True
22802,1456877,2091,202612,71190000.0,,674404900.0,9.47331,2025-12-19,False,False,False,True,False
23947,1456877,2091,202612,71190000.0,71190000.0,674404900.0,9.47331,2025-12-26,False,False,False,True,False
24056,1456877,2091,202612,71190000.0,71190000.0,1482382000.0,20.82289,2026-01-02,False,False,False,True,True


In [89]:
# 1478550 @1037

In [54]:
# v.to_csv('revlimid_output_1.csv')

In [None]:
# cols_to_remove = [
#     'development_lifecycle_status',
#        'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
#        'corp_brand_id', 'network_or_business_unit'
# ]

# df_final = final_rca_frame.drop(columns=cols_to_remove, errors="ignore")



# v = final_rca_frame[(final_rca_frame["corporate_brand"]=='REVLIMID') & (final_rca_frame ["snapshot_date"] == "2026-01-09") 
#     &   (final_rca_frame ["material"] == '1456877')]