In [1]:
import pandas as pd
import pyarrow.dataset as ds
import numpy as np

## Dataset Import

In [16]:
# Import FIP Dataset

s3_path_fip = (
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "publish/data-product/financial_inventory_projection_report_network_update/"
)

dataset = ds.dataset(
    s3_path_fip,
    format="parquet",
    partitioning="hive" 
)

table = dataset.to_table(
    filter=(
        #(ds.field("corporate_brand") == "REVLIMID") & # data after dec 2025
        (ds.field("date") == "202612") &  
        (ds.field("snapshot_date") >= "2025-07-01") &      # filter only 2026 YE FIP data
        ~(
            (ds.field("snapshot_date") == "2026-01-23") &
            (ds.field("snapshot_type") == "friday")         # drop 23rd Fri snapshot data to avoid double counting
        )
    )
)

df_fip = table.to_pandas()
#df_fip.head()

In [17]:
df_fip.shape

(894509, 19)

In [6]:
s3_path_plants = (
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "refined/data-asset/fin_inv_proj/"
    "bms_internal_vs_external_plants/"
    "bms_internal_vs_external_plants.parquet"
)

df_plants = pd.read_parquet(s3_path_plants)
df_plants.head()

Unnamed: 0,Plant,Plant Name,Street,City,Country,Region,Plant Type
0,1,Werk 0001,Berliner Alle 103,Berlin,DE,,Internal
1,3,Plant 0003 (is-ht-sw),123 main street,palo alto,US,CA,Internal
2,901,3RD PARTIES PLANT,,,GR,,Internal
3,1700,,,,DE,,Internal
4,1001,Plainsboro NJ ER Squibb,777 Scudders Mill Road,Plainsboro,US,NJ,Internal


In [15]:
import pandas as pd
import s3fs

fs = s3fs.S3FileSystem()  # uses SageMaker execution role

parquet_files = fs.glob(
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "refined/data-asset/fin_inv_proj/"
    "sap_material_master/data/*.parquet"
)

print(len(parquet_files))  # should be > 0

df_mm = pd.read_parquet(
    parquet_files,
    engine="pyarrow",
    dtype_backend="pyarrow",
    filesystem=fs
)

df_mm.head()


160


Unnamed: 0,plant,plant_name,material_number,material_description,enterprise_material_id,enterprise_material_desc,enterprise_material_active_indicator,company_code,profit_center,deletion_flag_for_all_material_data_of_a_valuation_type,...,copkg_comp_kit_num,device_type_name,administrable_dosage_form_child,administrable_dosage_form_parent,combination_type_name,commercial_status,req_ndc_ind_txt,src_system,modified_x20on,modified_x20by
0,1036,SK biotek Ireland Ltd,1008763,BEARING 8049-30900,,,,17,10319,,...,,,,,,,,,,
1,1036,SK biotek Ireland Ltd,1008764,COUPLING F GRIDS 40 T,,,,17,10319,,...,,,,,,,,,,
2,1036,SK biotek Ireland Ltd,1008765,BEARING 8050-30960,,,,17,10319,,...,,,,,,,,,,
3,1036,SK biotek Ireland Ltd,1008766,SHAFT,,,,17,10319,,...,,,,,,,,,,
4,1036,SK biotek Ireland Ltd,1008767,SHAFT SLEEVE,,,,17,10319,,...,,,,,,,,,,


In [10]:
import boto3

s3 = boto3.client("s3")

bucket = "m3-intel-hub-dp-us-east-1-517292-prod"
prefix = "refined/data-asset/fin_inv_proj/sap_material_master/"

response = s3.list_objects_v2(
    Bucket=bucket,
    Prefix=prefix
)

if "Contents" in response:
    for obj in response["Contents"]:
        print(obj["Key"], obj["Size"])
else:
    print("No objects found or no access.")

refined/data-asset/fin_inv_proj/sap_material_master/ 0
refined/data-asset/fin_inv_proj/sap_material_master/data/00000-71-ebeb135c-bebd-4485-b2c6-b22f08d78270-0-00001.parquet 640698
refined/data-asset/fin_inv_proj/sap_material_master/data/00000-73-100b23aa-a143-4e19-be49-3e0ccb929619-0-00001.parquet 640698
refined/data-asset/fin_inv_proj/sap_material_master/data/00000-73-5daae73f-fb49-420d-8be1-b0761df11eb9-0-00001.parquet 640698
refined/data-asset/fin_inv_proj/sap_material_master/data/00000-73-7e2f65b5-7b6c-4672-9efd-c8cc62d26177-0-00001.parquet 640698
refined/data-asset/fin_inv_proj/sap_material_master/data/00000-73-ea0b6c06-932d-4c1d-abb7-2bb9815ad969-0-00001.parquet 640698
refined/data-asset/fin_inv_proj/sap_material_master/data/00001-72-ebeb135c-bebd-4485-b2c6-b22f08d78270-0-00001.parquet 624995
refined/data-asset/fin_inv_proj/sap_material_master/data/00001-74-100b23aa-a143-4e19-be49-3e0ccb929619-0-00001.parquet 624995
refined/data-asset/fin_inv_proj/sap_material_master/data/00001-

## Data Prep

In [57]:
# Create has_non_zero flag at material–plant level

df_fip["has_non_zero"] = (
    df_fip
    .groupby(["material", "plant"])["total_cost"]
    .transform(lambda x: (x != 0).any())
    .astype(int)
)

# Apply the filter
base = df_fip.loc[df_fip["has_non_zero"] == 1].drop(columns="has_non_zero")


In [58]:
print(df_fip.shape)
base.shape

(385660, 20)


(92589, 19)

In [59]:
# fip copy df for data prep
df = base.copy()
df["snapshot_date"] = pd.to_datetime(df["snapshot_date"])


# snapshot lookup table
snapshot_calendar = (
    df[["snapshot_type", "snapshot_date"]]
    .drop_duplicates()
    .sort_values(["snapshot_type", "snapshot_date"])
)


# attach prev snapshot to snapshot calendar
snapshot_calendar["prev_snapshot_date"] = (
    snapshot_calendar
    .groupby("snapshot_type")["snapshot_date"]
    .shift(1)
)
snapshot_calendar      # comparing bd13 - bd13 snapshots and friday-friday snapshots. no bd13-fri snapshots

Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_date
78159,bd13,2025-12-17,NaT
309446,bd13,2026-01-23,2025-12-17
332,friday,2025-12-05,NaT
39169,friday,2025-12-12,2025-12-05
117174,friday,2025-12-19,2025-12-12
156266,friday,2025-12-26,2025-12-19
195322,friday,2026-01-02,2025-12-26
232451,friday,2026-01-09,2026-01-02
271376,friday,2026-01-16,2026-01-09
347624,friday,2026-01-30,2026-01-16


In [60]:
# Attach previous snapshot date to each row

df = df.merge(
    snapshot_calendar,
    on=["snapshot_type", "snapshot_date"],
    how="left"
)

In [61]:
df.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date
0,1278101,1721,202612,0.0,0.0,concost dp,TH,121.374929,sap,Entecavir,HALB,COMMERCIAL,DRUG PRODUCT,INTERMEDIATE,TABLET,2201397,PHARMA,friday,2025-12-05,NaT
1,1436539,2071,202612,47.0,67.21,concost dp,ST,1.43,sap,BREYANZI ((lisocabt),PACK,,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302160,CTO,friday,2025-12-05,NaT
2,1436602,2071,202612,3877.0,736.63,concost dp,ST,0.19,sap,BREYANZI ((lisocabt),PACK,COMMERCIAL,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302160,CTO,friday,2025-12-05,NaT
3,1436617,2071,202612,167.0,1.67,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,3302160,CTO,friday,2025-12-05,NaT
4,1436657,2071,202612,20.0,0.2,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,3302160,CTO,friday,2025-12-05,NaT


In [62]:
# Prepare current and previous frames

# Current snapshot frame
current_df = df.copy()

current_df = current_df.rename(columns={
    "quantity": "quantity_curr",
    "cost_per_unit": "cost_per_unit_curr",
    "total_cost": "total_cost_curr",
})

# Previous snapshot frame
previous_df = df.rename(columns={
    "snapshot_date": "snapshot_date_prev",
    "quantity": "quantity_prev",
    "cost_per_unit": "cost_per_unit_prev",
    "total_cost": "total_cost_prev",
})[
    [
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
        "quantity_prev",
        "cost_per_unit_prev",
        "total_cost_prev",
    ]
]


# Join current to previous snapshot
rca_base = current_df.merge(
    previous_df,
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    how="left"
)

In [63]:
df.shape

(92589, 20)

In [12]:
rca_base.tail(5)

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev
92584,1426743,1718,202612,84.8,6244.069072,concost dp,ST,73.63289,rr,SPRYCEL (dasatinib),...,TABLET,3301502,PHARMA,friday,2026-01-30,2026-01-16,2026-01-16,84.8,73.63289,6244.069072
92585,1279436,2057,202612,104.0,70117.17648,concost dp,ST,674.20362,rr,Ipilimumab (Yervoy),...,"INJECTION, SOLUTION, CONCENTRATE",3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,104.0,674.20362,70117.17648
92586,1273903,2058,202612,184.0,13544.378,concost dp,ST,73.61075,rr,Ipilimumab (Yervoy),...,"INJECTION, SOLUTION, CONCENTRATE",3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,794.0,73.61075,58446.9355
92587,1430657,1610,202612,135.0,7252.3755,concost dp,ST,53.7213,rr,Ipilimumab (Yervoy),...,INJECTION,3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,135.0,53.7213,7252.3755
92588,1431873,2061,202612,0.0,0.0,concost dp,ST,151.54812,rr,ZEPOSIA (ozanimod),...,CAPSULE,212057,PHARMA,friday,2026-01-30,2026-01-16,2026-01-16,170.0,151.54812,25763.1804


In [64]:
# Flags for material-plant presence / absence between snapshots

# Not present in previous snapshot 
rca_base["is_new_in_current_snapshot"] = (
    rca_base["prev_snapshot_date"].notna() &
    rca_base["quantity_prev"].isna()
)

# Present in previous but missing in current snapshot
rca_base["is_new_in_current_snapshot"] = False


# Identify valid previous snapshots (from calendar logic)
valid_prev_snapshots = (
    snapshot_calendar["prev_snapshot_date"]
        .dropna()
        .unique()
)

# Restrict previous_df BEFORE the anti-join
previous_df_valid = previous_df[
    previous_df["snapshot_date_prev"].isin(valid_prev_snapshots)
]

# Present in previous but missing in current snapshot
# Identify rows present in previous snapshot but missing in current
prev_only = previous_df_valid.merge(
    current_df[
        [
            "material",
            "plant",
            "date",
            "snapshot_type",
            "prev_snapshot_date",
        ]
    ],
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    how="left",
    indicator=True
).query("_merge == 'left_only'")



prev_only["is_dropped_in_current_snapshot"] = True

# Add missing columns for consistency
for col in rca_base.columns:
    if col not in prev_only.columns:
        prev_only[col] = np.nan

rca_base["is_dropped_in_current_snapshot"] = False

final_rca_frame = pd.concat(
    [rca_base, prev_only[rca_base.columns]],
    ignore_index=True
)

  final_rca_frame = pd.concat(


In [65]:
for col in [
    "is_new_in_current_snapshot",
    "is_dropped_in_current_snapshot",
]:
    final_rca_frame[col] = (
        final_rca_frame[col]
            .replace({1: True, 0: False})   # normalize numeric bools
            .fillna(False)                  # handle NaNs
            .astype("boolean")              # now safe
    )


  .fillna(False)                  # handle NaNs


In [66]:
final_rca_frame.shape

(93764, 26)

In [16]:
final_rca_frame.tail()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev,is_new_in_current_snapshot,is_dropped_in_current_snapshot
93759,1431416,2057,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,5.0,1035.09143,5175.45715,False,True
93760,1449263,1731,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,635.25,415.59545,264007.009613,False,True
93761,1465848,2061,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,520.0,9.15934,4762.8568,False,True
93762,1229703,1513,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,0.0,52.901448,0.0,False,True
93763,1406520,2057,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-16,3.0,33.25831,99.77493,False,True


In [17]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'prev_snapshot_date', 'snapshot_date_prev',
       'quantity_prev', 'cost_per_unit_prev', 'total_cost_prev',
       'is_new_in_current_snapshot', 'is_dropped_in_current_snapshot'],
      dtype='object')

In [67]:
snapshot_mapping_check = (
    final_rca_frame
    .loc[:, ["snapshot_type", "snapshot_date", "prev_snapshot_date"]]
    .drop_duplicates()
    .sort_values(["snapshot_type", "snapshot_date"])
)

snapshot_mapping_check


Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_date
18732,bd13,2025-12-17,NaT
74561,bd13,2026-01-23,2025-12-17
92664,bd13,NaT,NaT
0,friday,2025-12-05,NaT
9349,friday,2025-12-12,2025-12-05
28119,friday,2025-12-19,2025-12-12
37514,friday,2025-12-26,2025-12-19
46864,friday,2026-01-02,2025-12-26
56184,friday,2026-01-09,2026-01-02
65565,friday,2026-01-16,2026-01-09


# Calculations

In [68]:
# Step 1: Compute raw change metrics (always runs)


# Base deltas

final_rca_frame["delta_quantity"] = (
    final_rca_frame["quantity_curr"] - final_rca_frame["quantity_prev"]
)

final_rca_frame["delta_cost_per_unit"] = (
    final_rca_frame["cost_per_unit_curr"] - final_rca_frame["cost_per_unit_prev"]
)


# Impact decomposition

# Quantity Impact 
final_rca_frame["quantity_impact"] = (
    final_rca_frame["delta_quantity"] *
    final_rca_frame["cost_per_unit_prev"]
)

# Cost Impact
final_rca_frame["cost_impact"] = (
    final_rca_frame["delta_cost_per_unit"] * final_rca_frame["quantity_prev"]
)

# Intercation
final_rca_frame["interaction_impact"] = (
    final_rca_frame["delta_quantity"] *
    final_rca_frame["delta_cost_per_unit"]
)

# Total Change
final_rca_frame["total_fip_change"] = (
    final_rca_frame["quantity_impact"] +
    final_rca_frame["cost_impact"] +
    final_rca_frame["interaction_impact"]
)

# Core Metrics

# delta_quantity_pct 
final_rca_frame["delta_quantity_pct"] = (
    final_rca_frame["delta_quantity"] /
    final_rca_frame["quantity_prev"]
)

# delta_cost_per_unit_pct
final_rca_frame["delta_cost_per_unit_pct"] = (
    final_rca_frame["delta_cost_per_unit"] /
    final_rca_frame["cost_per_unit_prev"]
)

# Contribution shares (absolute, normalized)
impact_abs_sum_qc = (
    final_rca_frame["quantity_impact"].abs() +
    final_rca_frame["cost_impact"].abs()
)

impact_abs_sum_all = (
    impact_abs_sum_qc +
    final_rca_frame["interaction_impact"].abs()
)


# quantity_impact_pct_of_total 
final_rca_frame["quantity_impact_pct_of_total"] = np.where(
    impact_abs_sum_qc > 0,
    final_rca_frame["quantity_impact"].abs() / impact_abs_sum_qc,
    0
)

# cost_impact_pct_of_total 
final_rca_frame["cost_impact_pct_of_total"] = np.where(
    impact_abs_sum_qc > 0,
    final_rca_frame["cost_impact"].abs() / impact_abs_sum_qc,
    0
)

# interaction_pct
final_rca_frame["interaction_pct"] = np.where(
    impact_abs_sum_all > 0,
    final_rca_frame["interaction_impact"].abs() / impact_abs_sum_all,
    0
)


In [69]:
final_rca_frame.head()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,delta_cost_per_unit,quantity_impact,cost_impact,interaction_impact,total_fip_change,delta_quantity_pct,delta_cost_per_unit_pct,quantity_impact_pct_of_total,cost_impact_pct_of_total,interaction_pct
0,1278101,1721,202612,0.0,0.0,concost dp,TH,121.374929,sap,Entecavir,...,,,,,,,,0.0,0.0,0.0
1,1436539,2071,202612,47.0,67.21,concost dp,ST,1.43,sap,BREYANZI ((lisocabt),...,,,,,,,,0.0,0.0,0.0
2,1436602,2071,202612,3877.0,736.63,concost dp,ST,0.19,sap,BREYANZI ((lisocabt),...,,,,,,,,0.0,0.0,0.0
3,1436617,2071,202612,167.0,1.67,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),...,,,,,,,,0.0,0.0,0.0
4,1436657,2071,202612,20.0,0.2,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),...,,,,,,,,0.0,0.0,0.0


In [70]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'prev_snapshot_date', 'snapshot_date_prev',
       'quantity_prev', 'cost_per_unit_prev', 'total_cost_prev',
       'is_new_in_current_snapshot', 'is_dropped_in_current_snapshot',
       'delta_quantity', 'delta_cost_per_unit', 'quantity_impact',
       'cost_impact', 'interaction_impact', 'total_fip_change',
       'delta_quantity_pct', 'delta_cost_per_unit_pct',
       'quantity_impact_pct_of_total', 'cost_impact_pct_of_total',
       'interaction_pct'],
      dtype='object')

In [71]:
# Step 2 - Noise vs Signal determination 


# Sorting

final_rca_frame = final_rca_frame.sort_values(
    ["material", "plant", "snapshot_date"]
)

#  1) Business materiality (value-based)   
# How big the change is in value terms, relative to prior fip.
final_rca_frame["relative_fip_impact"] = np.where(
    final_rca_frame["total_cost_prev"] > 0,
    final_rca_frame["total_fip_change"].abs() /
    final_rca_frame["total_cost_prev"],
    np.nan
)


# 2) Historical volatility (material–plant aware) -- exclude current date? -- shift(1)- picking up other?

# Quantity volatility (rolling, 12 snapshots) - How noisy this SKU normally is.
# quantity_volatility_12w = std(delta_quantity_pct)
final_rca_frame["quantity_volatility_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=3).std()) 
)


# Cost volatility (rolling, 12 snapshots)
final_rca_frame["cost_volatility_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_cost_per_unit_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=3).std())
)


# 3) Normalized (volatility-aware) changes

final_rca_frame["normalized_quantity_change"] = (
    final_rca_frame["delta_quantity_pct"] /
    final_rca_frame["quantity_volatility_12w"]
)

final_rca_frame["normalized_cost_change"] = (
    final_rca_frame["delta_cost_per_unit_pct"] /
    final_rca_frame["cost_volatility_12w"]
)

# Interpretation:
# ≈ 1 → normal
# ≫ 1 → unusually large vs history


In [72]:
# 4) Temporal persistence (directional consistency)

final_rca_frame["quantity_change_sign"] = np.sign(
    final_rca_frame["delta_quantity_pct"]
) 

def persistence_score(series):
    score = []
    current = 0
    prev = 0
    for v in series:
        if v == 0 or pd.isna(v):
            current = 0
        elif v == prev:
            current += 1
        else:
            current = 1
        score.append(current)
        prev = v
    return score

final_rca_frame["quantity_persistence_score"] = (
    final_rca_frame
    .groupby(["material", "plant"])["quantity_change_sign"]
    .transform(persistence_score)
)


In [73]:
# 5) Outlier detection (z-score based - per material–plant)

final_rca_frame["quantity_zscore"] = (
    final_rca_frame["delta_quantity_pct"] /       # check
    final_rca_frame["quantity_volatility_12w"]
)

final_rca_frame["is_quantity_outlier"] = (
    final_rca_frame["quantity_zscore"].abs() > 3  # threshold
)


In [74]:
# 6) Change-point detection

final_rca_frame["rolling_avg_quantity_12w"] = (
    final_rca_frame
    .groupby(["material", "plant"])["delta_quantity_pct"]
    .transform(lambda x: x.shift(1).rolling(12, min_periods=3).mean())
)

final_rca_frame["change_point_detected"] = (
    (final_rca_frame["quantity_persistence_score"] >= 2) &
    (final_rca_frame["rolling_avg_quantity_12w"].abs() > 
     final_rca_frame["quantity_volatility_12w"])
)

In [75]:
# Final Noise Vs Signal Flag

MATERIALITY_THRESHOLD = 0.5
MIN_PERSISTENCE = 2

final_rca_frame["is_noise"] = (
    final_rca_frame["is_quantity_outlier"] &
    (final_rca_frame["quantity_persistence_score"] < MIN_PERSISTENCE) &
    (~final_rca_frame["change_point_detected"])
)

final_rca_frame["is_large_noise"] = (
    final_rca_frame["is_noise"] &
    (final_rca_frame["relative_fip_impact"] > 0.5)
)

final_rca_frame["is_signal"] = ~final_rca_frame["is_noise"]


In [76]:
final_rca_frame.head()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,normalized_cost_change,quantity_change_sign,quantity_persistence_score,quantity_zscore,is_quantity_outlier,rolling_avg_quantity_12w,change_point_detected,is_noise,is_large_noise,is_signal
9001,#101800,#139,202612,1896.0,198740.240718,Oracle EBS Cost,TBD,104.820802,rr,,...,,,0,,False,,False,False,False,True
16954,#101800,#139,202612,1896.0,198740.240718,Oracle EBS Cost,TBD,104.820802,rr,,...,,0.0,0,,False,,False,False,False,True
26363,#101800,#139,202612,1896.0,198740.240718,Oracle EBS Cost,TBD,104.820802,rr,,...,,,0,,False,,False,False,False,True
32839,#101800,#139,202612,1896.0,198740.240718,Oracle EBS Cost,TBD,104.820802,rr,,...,,0.0,0,,False,,False,False,False,True
45387,#101800,#139,202612,1896.0,198740.240718,Oracle EBS Cost,TBD,104.820802,rr,,...,,0.0,0,,False,,False,False,False,True


In [77]:
final_rca_frame.columns

Index(['material', 'plant', 'date', 'quantity_curr', 'total_cost_curr',
       'concost_source', 'unit_of_measure', 'cost_per_unit_curr', 'source',
       'corporate_brand', 'material_type', 'development_lifecycle_status',
       'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
       'corp_brand_id', 'network_or_business_unit', 'snapshot_type',
       'snapshot_date', 'prev_snapshot_date', 'snapshot_date_prev',
       'quantity_prev', 'cost_per_unit_prev', 'total_cost_prev',
       'is_new_in_current_snapshot', 'is_dropped_in_current_snapshot',
       'delta_quantity', 'delta_cost_per_unit', 'quantity_impact',
       'cost_impact', 'interaction_impact', 'total_fip_change',
       'delta_quantity_pct', 'delta_cost_per_unit_pct',
       'quantity_impact_pct_of_total', 'cost_impact_pct_of_total',
       'interaction_pct', 'relative_fip_impact', 'quantity_volatility_12w',
       'cost_volatility_12w', 'normalized_quantity_change',
       'normalized_cost_change', 'q

In [78]:
v = final_rca_frame[(final_rca_frame["corporate_brand"]=='REVLIMID') & (final_rca_frame ["snapshot_date"] == "2026-01-09") 
    &   (final_rca_frame ["material"] == '1456877')]
v

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,normalized_cost_change,quantity_change_sign,quantity_persistence_score,quantity_zscore,is_quantity_outlier,rolling_avg_quantity_12w,change_point_detected,is_noise,is_large_noise,is_signal
60435,1456877,2091,202612,0.0,0.0,concost dp,G,20.82289,rr,REVLIMID,...,0.0,-1.0,1,-3.5e-05,False,14124.75,False,False,False,True


In [54]:
v.to_csv('revlimid_output_1.csv')

In [None]:
# cols_to_remove = [
#     'development_lifecycle_status',
#        'enterprise_category', 'enterprise_sub_category', 'dosage_form_parent',
#        'corp_brand_id', 'network_or_business_unit'
# ]

# df_final = final_rca_frame.drop(columns=cols_to_remove, errors="ignore")



# v = final_rca_frame[(final_rca_frame["corporate_brand"]=='REVLIMID') & (final_rca_frame ["snapshot_date"] == "2026-01-09") 
#     &   (final_rca_frame ["material"] == '1456877')]