In [1]:
import pandas as pd
import pyarrow.dataset as ds
import numpy as np

## Dataset Import

In [34]:
# Import FIP Dataset

s3_path_fip = (
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "publish/data-product/financial_inventory_projection_report_network_update/"
)

dataset = ds.dataset(
    s3_path_fip,
    format="parquet",
    partitioning="hive" 
)

table = dataset.to_table(
    filter=(
        (ds.field("snapshot_date") >= "2025-10-01") &       # data after dec 2025
        (ds.field("date") == "202612") &                    # filter only 2026 YE FIP data
        ~(
            (ds.field("snapshot_date") == "2026-01-23") &
            (ds.field("snapshot_type") == "friday")         # drop 23rd Fri snapshot data to avoid double counting
        )
    )
)

df_fip = table.to_pandas()
#df_fip.head()

In [35]:
df_fip.shape

(610453, 19)

In [36]:
# s3_path_plants = (
#     "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
#     "refined/data-asset/fin_inv_proj/"
#     "bms_internal_vs_external_plants/"
#     "bms_internal_vs_external_plants.parquet"
# )

# df_plants = pd.read_parquet(s3_path_plants)
# # df_plants.head()

In [37]:
# import boto3

# s3 = boto3.client("s3")

# bucket = "m3-intel-hub-dp-us-east-1-517292-prod"
# prefix = "refined/data-asset/fin_inv_proj/sap_material_master/"

# response = s3.list_objects_v2(
#     Bucket=bucket,
#     Prefix=prefix
# )

# if "Contents" in response:
#     for obj in response["Contents"]:
#         print(obj["Key"], obj["Size"])
# else:
#     print("No objects found or no access.")

## Data Prep

In [38]:
# Create has_non_zero flag at material–plant level

df_fip["has_non_zero"] = (
    df_fip
    .groupby(["material", "plant"])["total_cost"]
    .transform(lambda x: (x != 0).any())
    .astype(int)
)

# Apply the filter
base = df_fip.loc[df_fip["has_non_zero"] == 1].drop(columns="has_non_zero")


In [39]:
print(df_fip.shape)
base.shape

(610453, 20)


(396796, 19)

In [40]:
# fip copy df for data prep
df = base.copy()
df["snapshot_date"] = pd.to_datetime(df["snapshot_date"])


# snapshot lookup table
snapshot_calendar = (
    df[["snapshot_type", "snapshot_date"]]
    .drop_duplicates()
    .sort_values(["snapshot_type", "snapshot_date"])
)


# attach prev snapshot to snapshot calendar
snapshot_calendar["prev_snapshot_date"] = (
    snapshot_calendar
    .groupby("snapshot_type")["snapshot_date"]
    .shift(1)
)
snapshot_calendar      # comparing bd13 - bd13 snapshots and friday-friday snapshots. no bd13-fri snapshots

Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_date
41747,bd13,2025-10-17,NaT
155968,bd13,2025-11-19,2025-10-17
302706,bd13,2025-12-17,2025-11-19
534117,bd13,2026-01-23,2025-12-17
0,friday,2025-10-03,NaT
19023,friday,2025-10-10,2025-10-03
64447,friday,2025-10-24,2025-10-10
87218,friday,2025-10-31,2025-10-24
110108,friday,2025-11-07,2025-10-31
133045,friday,2025-11-14,2025-11-07


In [41]:
# Attach previous snapshot date to each row

df = df.merge(
    snapshot_calendar,
    on=["snapshot_type", "snapshot_date"],
    how="left"
)

In [42]:
df.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date
0,CRU CRU D-40-Ref,1037,202612,0.0,,missing,,,rr,,,,,,,,,friday,2025-10-03,NaT
1,1382955Z0,1734,202612,0.175,,missing,,,rr,,HALB,CLINICAL,API,API / DRUG SUBSTANCE,,00000nan,PHARMA,friday,2025-10-03,NaT
2,1234398,1760,202612,473.619466,146053.880553,concost dp,KG,308.37812,rr,Ipilimumab (Yervoy),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,03301503,BIOLOGICS,friday,2025-10-03,NaT
3,1457248,2061,202612,561.0,,missing,,,rr,All Other Pharmaceut,UNBW,,,,,00201790,PHARMA,friday,2025-10-03,NaT
4,1457516,2061,202612,34000.0,,missing,,,rr,All Other Pharmaceut,UNBW,,,,,00201790,PHARMA,friday,2025-10-03,NaT


In [43]:
# Prepare current and previous frames

# Current snapshot frame
current_df = df.copy()

current_df = current_df.rename(columns={
    "quantity": "quantity_curr",
    "cost_per_unit": "cost_per_unit_curr",
    "total_cost": "total_cost_curr",
})

# Previous snapshot frame
previous_df = df.rename(columns={
    "snapshot_date": "snapshot_date_prev",
    "quantity": "quantity_prev",
    "cost_per_unit": "cost_per_unit_prev",
    "total_cost": "total_cost_prev",
})[
    [
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
        "quantity_prev",
        "cost_per_unit_prev",
        "total_cost_prev",
    ]
]


# Join current to previous snapshot
rca_base = current_df.merge(
    previous_df,
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    how="left"
)

In [44]:
df.shape

(396796, 20)

In [45]:
rca_base.tail(5)

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev
397033,1279436,2057,202612,104.0,70117.17648,concost dp,ST,674.20362,rr,Ipilimumab (Yervoy),...,"INJECTION, SOLUTION, CONCENTRATE",3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,104.0,674.20362,70117.17648
397034,1273903,2058,202612,184.0,13544.378,concost dp,ST,73.61075,rr,Ipilimumab (Yervoy),...,"INJECTION, SOLUTION, CONCENTRATE",3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,794.0,73.61075,58446.9355
397035,1430657,1610,202612,135.0,7252.3755,concost dp,ST,53.7213,rr,Ipilimumab (Yervoy),...,INJECTION,3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,135.0,53.7213,7252.3755
397036,1431873,2061,202612,0.0,0.0,concost dp,ST,151.54812,rr,ZEPOSIA (ozanimod),...,CAPSULE,212057,PHARMA,friday,2026-01-30,2026-01-16,2026-01-16,170.0,151.54812,25763.1804
397037,1445072,1731,202612,0.0,0.0,concost dp,ST,87.84526,rr,ZEPOSIA (ozanimod),...,CAPSULE,212057,PHARMA,friday,2026-01-30,2026-01-16,2026-01-16,0.0,87.84526,0.0


In [46]:
# Flags for material-plant presence / absence between snapshots

# Not present in previous snapshot 
rca_base["is_new_in_current_snapshot"] = (
    rca_base["prev_snapshot_date"].notna() &
    rca_base["quantity_prev"].isna()
)

# Present in previous but missing in current snapshot
rca_base["is_new_in_current_snapshot"] = False


# Identify valid previous snapshots (from calendar logic)
valid_prev_snapshots = (
    snapshot_calendar["prev_snapshot_date"]
        .dropna()
        .unique()
)

# Restrict previous_df BEFORE the anti-join
previous_df_valid = previous_df[
    previous_df["snapshot_date_prev"].isin(valid_prev_snapshots)
]

# Present in previous but missing in current snapshot
# Identify rows present in previous snapshot but missing in current
prev_only = previous_df_valid.merge(
    current_df[
        [
            "material",
            "plant",
            "date",
            "snapshot_type",
            "prev_snapshot_date",
        ]
    ],
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    how="left",
    indicator=True
).query("_merge == 'left_only'")



prev_only["is_dropped_in_current_snapshot"] = True

# Add missing columns for consistency
for col in rca_base.columns:
    if col not in prev_only.columns:
        prev_only[col] = np.nan

rca_base["is_dropped_in_current_snapshot"] = False

final_rca_frame = pd.concat(
    [rca_base, prev_only[rca_base.columns]],
    ignore_index=True
)

  final_rca_frame = pd.concat(


In [47]:
for col in [
    "is_new_in_current_snapshot",
    "is_dropped_in_current_snapshot",
]:
    final_rca_frame[col] = (
        final_rca_frame[col]
            .replace({1: True, 0: False})   # normalize numeric bools
            .fillna(False)                  # handle NaNs
            .astype("boolean")              # now safe
    )


  .fillna(False)                  # handle NaNs


In [48]:
final_rca_frame.shape

(403701, 26)

In [49]:
final_rca_frame.tail()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev,is_new_in_current_snapshot,is_dropped_in_current_snapshot
403696,1449263,1731,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,635.25,415.59545,264007.009613,False,True
403697,1465848,2061,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,520.0,9.15934,4762.8568,False,True
403698,1229703,1513,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,0.0,52.901448,0.0,False,True
403699,3333228,2061,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,0.0,,0.0,False,True
403700,1406520,2057,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,3.0,33.25831,99.77493,False,True


In [50]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'prev_snapshot_date', 'snapshot_date_prev',
       'quantity_prev', 'cost_per_unit_prev', 'total_cost_prev',
       'is_new_in_current_snapshot', 'is_dropped_in_current_snapshot'],
      dtype='object')

In [51]:
snapshot_mapping_check = (
    final_rca_frame
    .loc[:, ["snapshot_type", "snapshot_date", "prev_snapshot_date"]]
    .drop_duplicates()
    .sort_values(["snapshot_type", "snapshot_date"])
)

snapshot_mapping_check


Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_date
36278,bd13,2025-10-17,NaT
136071,bd13,2025-11-19,2025-10-17
237204,bd13,2025-12-17,2025-11-19
358109,bd13,2026-01-23,2025-12-17
397455,bd13,NaT,NaT
0,friday,2025-10-03,NaT
16387,friday,2025-10-10,2025-10-03
56126,friday,2025-10-24,2025-10-10
76116,friday,2025-10-31,2025-10-24
96191,friday,2025-11-07,2025-10-31


Error handling

1. Division by zero & invalid math -
    Previous quantity = 0
    Previous cost = 0
    Previous FIP = 0
    Volatility = 0
2. Min rolling window
   less than min periods of 3
   Newly introduced SKU
   Flat history causing volatility = 0
3. double counting due to duplicates
4. missing prev snapshot data - nulls
   SKU appears for first time
   SKU disappears and reappears
   Previous quantity / cost not available
5. Extreme values in the history
    Very large quantities or costs in the past pushing the present numbers. The outliers in the past affects the current numbers. Exlcude the historical outliers
   

# Calculations

In [52]:
# Step 1: Compute raw change metrics (always runs)


# Base deltas

final_rca_frame["delta_quantity"] = (
    final_rca_frame["quantity_curr"] - final_rca_frame["quantity_prev"]
)

final_rca_frame["delta_cost_per_unit"] = (
    final_rca_frame["cost_per_unit_curr"] - final_rca_frame["cost_per_unit_prev"]
)


# Impact decomposition

# Quantity Impact 
final_rca_frame["quantity_impact"] = (
    final_rca_frame["delta_quantity"] *
    final_rca_frame["cost_per_unit_prev"]
)

# Cost Impact
final_rca_frame["cost_impact"] = (
    final_rca_frame["delta_cost_per_unit"] * final_rca_frame["quantity_prev"]
)

# Intercation
final_rca_frame["interaction_impact"] = (
    final_rca_frame["delta_quantity"] *
    final_rca_frame["delta_cost_per_unit"]
)

# Total Change
final_rca_frame["total_fip_change"] = (
    final_rca_frame["quantity_impact"] +
    final_rca_frame["cost_impact"] +
    final_rca_frame["interaction_impact"]
)

# Core Metrics

# delta_quantity_pct 
final_rca_frame["delta_quantity_pct"] = (
    final_rca_frame["delta_quantity"] /
    final_rca_frame["quantity_prev"]
)

# delta_cost_per_unit_pct
final_rca_frame["delta_cost_per_unit_pct"] = (
    final_rca_frame["delta_cost_per_unit"] /
    final_rca_frame["cost_per_unit_prev"]
)

# Contribution shares (absolute, normalized)
impact_abs_sum_qc = (
    final_rca_frame["quantity_impact"].abs() +
    final_rca_frame["cost_impact"].abs()
)

impact_abs_sum_all = (
    impact_abs_sum_qc +
    final_rca_frame["interaction_impact"].abs()
)


# quantity_impact_pct_of_total 
final_rca_frame["quantity_impact_pct_of_total"] = np.where(
    impact_abs_sum_qc > 0,
    final_rca_frame["quantity_impact"].abs() / impact_abs_sum_qc,
    0
)

# cost_impact_pct_of_total 
final_rca_frame["cost_impact_pct_of_total"] = np.where(
    impact_abs_sum_qc > 0,
    final_rca_frame["cost_impact"].abs() / impact_abs_sum_qc,
    0
)

# interaction_pct
final_rca_frame["interaction_pct"] = np.where(
    impact_abs_sum_all > 0,
    final_rca_frame["interaction_impact"].abs() / impact_abs_sum_all,
    0
)


In [53]:
final_rca_frame.head()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,delta_cost_per_unit,quantity_impact,cost_impact,interaction_impact,total_fip_change,delta_quantity_pct,delta_cost_per_unit_pct,quantity_impact_pct_of_total,cost_impact_pct_of_total,interaction_pct
0,CRU CRU D-40-Ref,1037,202612,0.0,,missing,,,rr,,...,,,,,,,,0.0,0.0,0.0
1,1382955Z0,1734,202612,0.175,,missing,,,rr,,...,,,,,,,,0.0,0.0,0.0
2,1234398,1760,202612,473.619466,146053.880553,concost dp,KG,308.37812,rr,Ipilimumab (Yervoy),...,,,,,,,,0.0,0.0,0.0
3,1457248,2061,202612,561.0,,missing,,,rr,All Other Pharmaceut,...,,,,,,,,0.0,0.0,0.0
4,1457516,2061,202612,34000.0,,missing,,,rr,All Other Pharmaceut,...,,,,,,,,0.0,0.0,0.0


In [54]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'prev_snapshot_date', 'snapshot_date_prev',
       'quantity_prev', 'cost_per_unit_prev', 'total_cost_prev',
       'is_new_in_current_snapshot', 'is_dropped_in_current_snapshot',
       'delta_quantity', 'delta_cost_per_unit', 'quantity_impact',
       'cost_impact', 'interaction_impact', 'total_fip_change',
       'delta_quantity_pct', 'delta_cost_per_unit_pct',
       'quantity_impact_pct_of_total', 'cost_impact_pct_of_total',
       'interaction_pct'],
      dtype='object')

In [55]:
# Step 2 - Noise vs Signal determination 


# Sorting

final_rca_frame = final_rca_frame.sort_values(
    ["material", "plant", "snapshot_date"]
)

#  1) Business materiality (value-based)   
# How big the change is in value terms, relative to prior fip.
final_rca_frame["relative_fip_impact"] = np.where(
    final_rca_frame["total_cost_prev"] > 0,
    final_rca_frame["total_fip_change"].abs() /
    final_rca_frame["total_cost_prev"],
    np.nan
)


# 2) Historical volatility (material–plant aware) -- exclude current date? -- shift(1)- picking up other?

# Quantity volatility (rolling, 12 snapshots) - How noisy this SKU normally is.
# quantity_volatility_12w = std(delta_quantity_pct)
final_rca_frame["quantity_volatility_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=3).std()) 
)


# Cost volatility (rolling, 12 snapshots)
final_rca_frame["cost_volatility_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_cost_per_unit_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=3).std())
)


# 3) Normalized (volatility-aware) changes

final_rca_frame["normalized_quantity_change"] = (
    final_rca_frame["delta_quantity_pct"] /
    final_rca_frame["quantity_volatility_12w"]
)

final_rca_frame["normalized_cost_change"] = (
    final_rca_frame["delta_cost_per_unit_pct"] /
    final_rca_frame["cost_volatility_12w"]
)

# Interpretation:
# ≈ 1 → normal
# ≫ 1 → unusually large vs history


In [56]:
# 4) Temporal persistence (directional consistency)

final_rca_frame["quantity_change_sign"] = np.sign(
    final_rca_frame["delta_quantity_pct"]
) 

def persistence_score(series):
    score = []
    current = 0
    prev = 0
    for v in series:
        if v == 0 or pd.isna(v):
            current = 0
        elif v == prev:
            current += 1
        else:
            current = 1
        score.append(current)
        prev = v
    return score

final_rca_frame["quantity_persistence_score"] = (
    final_rca_frame
    .groupby(["material", "plant"])["quantity_change_sign"]
    .transform(persistence_score)
)


In [57]:
#oultier v2



# --------------------------------------------
# CONFIG
# --------------------------------------------
ROLLING_WINDOW = 12
MIN_PERIODS = 6
EXTREME_PCT_CHANGE = 0.5     # 50% absolute change
LOW_LEVEL_RATIO = 0.01       # 1% of typical level
ROBUST_Z_THRESHOLD = 3

# --------------------------------------------
# 1) Rolling median of quantity (baseline scale)
# --------------------------------------------
final_rca_frame["quantity_rolling_median_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["quantity_curr"]
    .transform(lambda x: x.shift(1).rolling(ROLLING_WINDOW, min_periods=MIN_PERIODS).median())
)

# --------------------------------------------
# 2) Structural extreme change (scale-based)
# --------------------------------------------
final_rca_frame["is_structural_extreme_qty"] = (
    (final_rca_frame["delta_quantity_pct"].abs() >= EXTREME_PCT_CHANGE) &
    (
        (final_rca_frame["quantity_curr"] <=
         LOW_LEVEL_RATIO * final_rca_frame["quantity_rolling_median_12w"]) |
        (final_rca_frame["quantity_curr"] >=
         (1 / LOW_LEVEL_RATIO) * final_rca_frame["quantity_rolling_median_12w"])
    )
)

# --------------------------------------------
# 3) Create CLEAN delta series (exclude known outliers)
# --------------------------------------------
final_rca_frame["clean_delta_quantity_pct"] = final_rca_frame["delta_quantity_pct"]

final_rca_frame.loc[
    final_rca_frame["is_structural_extreme_qty"],
    "clean_delta_quantity_pct"
] = np.nan

# --------------------------------------------
# 4) Robust MAD-based z-score (on clean history only)
# --------------------------------------------
def robust_zscore(series):
    median = np.nanmedian(series)
    mad = np.nanmedian(np.abs(series - median))
    if mad == 0 or np.isnan(mad):
        return pd.Series(np.nan, index=series.index)
    return (series - median) / (1.4826 * mad)

final_rca_frame["robust_quantity_z"] = (
    final_rca_frame
    .groupby(["material", "plant"])["clean_delta_quantity_pct"]
    .transform(lambda x: robust_zscore(x.shift(1)))
)

# --------------------------------------------
# 5) Statistical outlier (robust)
# --------------------------------------------
final_rca_frame["is_statistical_outlier_qty"] = (
    final_rca_frame["robust_quantity_z"].abs() > ROBUST_Z_THRESHOLD
)

# --------------------------------------------
# 6) FINAL quantity outlier flag
# --------------------------------------------
final_rca_frame["is_quantity_outlier"] = (
    final_rca_frame["is_structural_extreme_qty"] |
    final_rca_frame["is_statistical_outlier_qty"]
)

# --------------------------------------------
# 7) Cleanup (optional but recommended)
# --------------------------------------------
bool_cols = [
    "is_structural_extreme_qty",
    "is_statistical_outlier_qty",
    "is_quantity_outlier",
]

final_rca_frame[bool_cols] = (
    final_rca_frame[bool_cols]
    .fillna(False)
    .astype("boolean")
)


  median = np.nanmedian(series)
  mad = np.nanmedian(np.abs(series - median))


In [58]:
# # Outlier new

# # rolling median
# final_rca_frame["quantity_rolling_median_12w"] = (
#     final_rca_frame
#     .groupby(["material", "plant"])["quantity_curr"]
#     .transform(lambda x: x.shift(1).rolling(12, min_periods=6).median())
# )

# EXTREME_PCT_CHANGE = 0.5        # 50% change
# LOW_LEVEL_RATIO = 0.01          # 1% of typical level
# Z_SCORE_THRESHOLD = 3

# final_rca_frame["is_structural_extreme_qty"] = (
#     (final_rca_frame["delta_quantity_pct"].abs() >= EXTREME_PCT_CHANGE) &
#     (
#         final_rca_frame["quantity_curr"] <=
#         LOW_LEVEL_RATIO * final_rca_frame["quantity_rolling_median_12w"]
#     )
# )

# final_rca_frame["is_statistical_outlier_qty"] = (
#     (final_rca_frame["quantity_volatility_12w"] > 0) &
#     (final_rca_frame["normalized_quantity_change"].abs() > Z_SCORE_THRESHOLD)
# )

# final_rca_frame["is_quantity_outlier"] = (
#     final_rca_frame["is_structural_extreme_qty"] |
#     final_rca_frame["is_statistical_outlier_qty"]
# )


In [59]:
# # 5) Outlier detection (z-score based - per material–plant) -old calc

# final_rca_frame["quantity_zscore"] = (
#     final_rca_frame["delta_quantity_pct"] /       # check
#     final_rca_frame["quantity_volatility_12w"]
# )

# final_rca_frame["is_quantity_outlier"] = (
#     final_rca_frame["quantity_zscore"].abs() > 3  # threshold
# )


In [60]:
# # 6) Change-point detection

# final_rca_frame["rolling_avg_quantity_12w"] = (
#     final_rca_frame
#     .groupby(["material", "plant"])["delta_quantity_pct"]
#     .transform(lambda x: x.shift(1).rolling(12, min_periods=3).mean())
# )

# final_rca_frame["change_point_detected"] = (
#     (final_rca_frame["quantity_persistence_score"] >= 2) &
#     (final_rca_frame["rolling_avg_quantity_12w"].abs() > 
#      final_rca_frame["quantity_volatility_12w"])
# )

In [61]:
# Change Point

final_rca_frame["rolling_avg_quantity_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=3).mean())
)
# --------------------------------------------
# Robust rolling MAD of delta quantity
# --------------------------------------------

final_rca_frame["delta_qty_rolling_mad_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(
        lambda x: x.shift(1)
                  .rolling(12, min_periods=6)
                  .apply(lambda s: np.nanmedian(np.abs(s - np.nanmedian(s))), raw=True)
    )
)

# --------------------------------------------
# Robust trend strength
# --------------------------------------------
final_rca_frame["robust_trend_strength"] = (
    final_rca_frame["rolling_avg_quantity_12w"].abs() /
    (1.4826 * final_rca_frame["delta_qty_rolling_mad_12w"])
)

# --------------------------------------------
# Short-window confirmation
# --------------------------------------------
final_rca_frame["rolling_avg_quantity_6w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(6, min_periods=3).mean())
)

final_rca_frame["trend_confirmed"] = (
    np.sign(final_rca_frame["rolling_avg_quantity_12w"]) ==
    np.sign(final_rca_frame["rolling_avg_quantity_6w"])
)

# --------------------------------------------
# FINAL robust change-point detection
# --------------------------------------------
final_rca_frame["change_point_detected"] = (
    (final_rca_frame["quantity_persistence_score"] >= 2) &
    (final_rca_frame["robust_trend_strength"] > 2) &
    (final_rca_frame["trend_confirmed"])
)


In [62]:
# Final Noise Vs Signal Flag

MATERIALITY_THRESHOLD = 0.5
MIN_PERSISTENCE = 2

final_rca_frame["is_noise"] = (
    final_rca_frame["is_quantity_outlier"] &
    (final_rca_frame["quantity_persistence_score"] < MIN_PERSISTENCE) &
    (~final_rca_frame["change_point_detected"])
)

final_rca_frame["is_large_noise"] = (
    final_rca_frame["is_noise"] &
    (final_rca_frame["relative_fip_impact"] > 0.5)
)

final_rca_frame["is_signal"] = ~final_rca_frame["is_noise"]


In [63]:
final_rca_frame.head()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,is_quantity_outlier,rolling_avg_quantity_12w,delta_qty_rolling_mad_12w,robust_trend_strength,rolling_avg_quantity_6w,trend_confirmed,change_point_detected,is_noise,is_large_noise,is_signal
32546,#101631,#139,202612,21538.0,2238389.0,Oracle EBS Cost,TBD,103.927442,rr,,...,False,,,,,False,False,False,False,True
41835,#101631,#139,202612,0.0,0.0,Oracle EBS Cost,TBD,103.927442,rr,,...,False,,,,,False,False,False,False,True
57842,#101631,#139,202612,0.0,0.0,Oracle EBS Cost,TBD,103.927442,rr,,...,False,,,,,False,False,False,False,True
91639,#101631,#139,202612,0.0,0.0,Oracle EBS Cost,TBD,103.927442,rr,,...,False,,,,,False,False,False,False,True
98387,#101631,#139,202612,0.0,0.0,Oracle EBS Cost,TBD,103.927442,rr,,...,False,,,,,False,False,False,False,True


In [64]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'prev_snapshot_date', 'snapshot_date_prev',
       'quantity_prev', 'cost_per_unit_prev', 'total_cost_prev',
       'is_new_in_current_snapshot', 'is_dropped_in_current_snapshot',
       'delta_quantity', 'delta_cost_per_unit', 'quantity_impact',
       'cost_impact', 'interaction_impact', 'total_fip_change',
       'delta_quantity_pct', 'delta_cost_per_unit_pct',
       'quantity_impact_pct_of_total', 'cost_impact_pct_of_total',
       'interaction_pct', 'relative_fip_impact', 'quantity_volatility_12w',
       'cost_volatility_12w', 'normalized_quantity_change',
       'normalized_cost_change', 'q

In [67]:
v = final_rca_frame[(final_rca_frame["corporate_brand"]=='REVLIMID') & (final_rca_frame ["snapshot_date"] > "2025-11-09") 
    &   (final_rca_frame ["material"] == '1456877')]
display(v)

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,is_quantity_outlier,rolling_avg_quantity_12w,delta_qty_rolling_mad_12w,robust_trend_strength,rolling_avg_quantity_6w,trend_confirmed,change_point_detected,is_noise,is_large_noise,is_signal
119357,1456877,2091,202612,3882051.0,36775880.0,concost dp,G,9.47331,rr,REVLIMID,...,True,,,,,False,False,True,True,False
140387,1456877,2091,202612,36188.53,342825.2,concost dp,G,9.47331,rr,REVLIMID,...,False,18.938963,,,18.938963,True,False,False,False,True
166710,1456877,2091,202612,6580000.0,62334380.0,concost dp,G,9.47331,rr,REVLIMID,...,True,18.938963,,,18.938963,True,False,False,False,True
178980,1456877,2091,202612,6570763.0,62246870.0,concost dp,G,9.47331,rr,REVLIMID,...,False,14.377967,,,14.377967,True,False,False,False,True
198312,1456877,2091,202612,1260.0,11936.37,concost dp,G,9.47331,rr,REVLIMID,...,False,11.502093,,,11.502093,True,False,False,False,True
236573,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,True,9.418443,0.348192,18.24468,11.302131,True,False,True,True,False
246384,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,True,8079.358665,0.69498,7841.163,11311.102131,True,True,False,False,True
274689,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,False,7315.213554,0.848096,5817.789,11692.978308,True,False,False,False,True
295530,1456877,2091,202612,71190000.0,674404900.0,concost dp,G,9.47331,rr,REVLIMID,...,False,6502.412048,0.69498,6310.708,9744.14859,True,False,False,False,True
300747,1456877,2091,202612,71190000.0,1482382000.0,concost dp,G,20.82289,rr,REVLIMID,...,False,5852.170843,0.348192,11336.38,9744.03276,True,False,False,False,True


In [72]:
cols = ['material', 'plant', 'date', 'quantity_curr','quantity_prev', 'total_cost_curr','cost_per_unit_curr',
    "snapshot_date",'change_point_detected', 'is_noise', 'is_large_noise', 'is_signal']

v1 = final_rca_frame.loc[
    (final_rca_frame["snapshot_date"] > "2025-11-09") &
    (final_rca_frame["material"] == "1456877"),
    cols
]
v1

Unnamed: 0,material,plant,date,quantity_curr,quantity_prev,total_cost_curr,cost_per_unit_curr,snapshot_date,change_point_detected,is_noise,is_large_noise,is_signal
119357,1456877,2091,202612,3882051.0,67143.9,36775880.0,9.47331,2025-11-14,False,True,True,False
140387,1456877,2091,202612,36188.53,0.0,342825.2,9.47331,2025-11-19,False,False,False,True
166710,1456877,2091,202612,6580000.0,3882051.0,62334380.0,9.47331,2025-11-21,False,False,False,True
178980,1456877,2091,202612,6570763.0,6580000.0,62246870.0,9.47331,2025-11-28,False,False,False,True
198312,1456877,2091,202612,1260.0,6570763.0,11936.37,9.47331,2025-12-05,False,False,False,True
236573,1456877,2091,202612,71190000.0,1260.0,674404900.0,9.47331,2025-12-12,False,True,True,False
246384,1456877,2091,202612,71190000.0,36188.53,674404900.0,9.47331,2025-12-17,True,False,False,True
274689,1456877,2091,202612,71190000.0,71190000.0,674404900.0,9.47331,2025-12-19,False,False,False,True
295530,1456877,2091,202612,71190000.0,71190000.0,674404900.0,9.47331,2025-12-26,False,False,False,True
300747,1456877,2091,202612,71190000.0,71190000.0,1482382000.0,20.82289,2026-01-02,False,False,False,True


In [None]:
1478550 @1037

In [54]:
# v.to_csv('revlimid_output_1.csv')

In [None]:
# cols_to_remove = [
#     'development_lifecycle_status',
#        'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
#        'corp_brand_id', 'network_or_business_unit'
# ]

# df_final = final_rca_frame.drop(columns=cols_to_remove, errors="ignore")



# v = final_rca_frame[(final_rca_frame["corporate_brand"]=='REVLIMID') & (final_rca_frame ["snapshot_date"] == "2026-01-09") 
#     &   (final_rca_frame ["material"] == '1456877')]