In [23]:
import pandas as pd
import pyarrow.dataset as ds
import numpy as np

## Dataset Import

In [24]:
# Import FIP Dataset

s3_path_fip = (
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "publish/data-product/financial_inventory_projection_report_network_update/"
)

dataset = ds.dataset(
    s3_path_fip,
    format="parquet",
    partitioning="hive" 
)

table = dataset.to_table(
    filter=(
        (ds.field("snapshot_date") >= "2025-12-01") &       # data after dec 2025
        (ds.field("date") == "202612") &                    # filter only 2026 YE FIP data
        ~(
            (ds.field("snapshot_date") == "2026-01-23") &
            (ds.field("snapshot_type") == "friday")         # drop 23rd Fri snapshot data to avoid double counting
        )
    )
)

df_fip = table.to_pandas()
df_fip.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date
0,1217818,1622,202612,0.0,0.0,missing,,,sap,Tempra,RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,4506130.0,PHARMA,friday,2025-12-05
1,1231993,1557,202612,0.0,0.0,missing,,,sap,Aripiprazole,HALB,COMMERCIAL,OTHER PROCESS MATERIALS,OTHER PROCESS MATERIALS,,2701577.0,PHARMA,friday,2025-12-05
2,1416303,1721,202612,0.0,0.0,concost dp,ST,1.321201,sap,Metformin,FIN,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,TABLET,221118.0,PHARMA,friday,2025-12-05
3,1335676,1487,202612,990.0,0.0,missing,,,sap,,FIN,CLINICAL,OTHER PROCESS MATERIALS,OTHER PROCESS MATERIALS,,,PHARMA,friday,2025-12-05
4,1345028,1487,202612,32.0,0.0,missing,,,sap,,FIN,CLINICAL,OTHER PROCESS MATERIALS,OTHER PROCESS MATERIALS,,,PHARMA,friday,2025-12-05


In [25]:
df_fip.shape

(385660, 19)

In [None]:
# s3_path_plants = (
#     "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
#     "refined/data-asset/fin_inv_proj/"
#     "bms_internal_vs_external_plants/"
#     "bms_internal_vs_external_plants.parquet"
# )

# df_plants = pd.read_parquet(s3_path_plants)
# # df_plants.head()

In [2]:
# import boto3

# s3 = boto3.client("s3")

# bucket = "m3-intel-hub-dp-us-east-1-517292-prod"
# prefix = "refined/data-asset/fin_inv_proj/sap_material_master/"

# response = s3.list_objects_v2(
#     Bucket=bucket,
#     Prefix=prefix
# )

# if "Contents" in response:
#     for obj in response["Contents"]:
#         print(obj["Key"], obj["Size"])
# else:
#     print("No objects found or no access.")

## Data Prep

In [26]:
# Create has_non_zero flag at material–plant level

df_fip["has_non_zero"] = (
    df_fip
    .groupby(["material", "plant"])["total_cost"]
    .transform(lambda x: (x != 0).any())
    .astype(int)
)

# Apply the filter
base = df_fip.loc[df_fip["has_non_zero"] == 1].drop(columns="has_non_zero")


In [27]:
print(df_fip.shape)
base.shape

(385660, 20)


(92589, 19)

In [72]:
# fip copy df for data prep
df = base.copy()
df["snapshot_date"] = pd.to_datetime(df["snapshot_date"])


# snapshot lookup table
snapshot_calendar = (
    df[["snapshot_type", "snapshot_date"]]
    .drop_duplicates()
    .sort_values(["snapshot_type", "snapshot_date"])
)


# attach prev snapshot to snapshot calendar
snapshot_calendar["prev_snapshot_date"] = (
    snapshot_calendar
    .groupby("snapshot_type")["snapshot_date"]
    .shift(1)
)
snapshot_calendar      # comparing bd13 - bd13 snapshots and friday-friday snapshots. no bd13-fri snapshots

Unnamed: 0,snapshot_type,snapshot_date,prev_snapshot_date
78159,bd13,2025-12-17,NaT
309446,bd13,2026-01-23,2025-12-17
332,friday,2025-12-05,NaT
39169,friday,2025-12-12,2025-12-05
117174,friday,2025-12-19,2025-12-12
156266,friday,2025-12-26,2025-12-19
195322,friday,2026-01-02,2025-12-26
232451,friday,2026-01-09,2026-01-02
271376,friday,2026-01-16,2026-01-09
347624,friday,2026-01-30,2026-01-16


In [73]:
# Attach previous snapshot date to each row

df = df.merge(
    snapshot_calendar,
    on=["snapshot_type", "snapshot_date"],
    how="left"
)

In [74]:
df.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date
0,1278101,1721,202612,0.0,0.0,concost dp,TH,121.374929,sap,Entecavir,HALB,COMMERCIAL,DRUG PRODUCT,INTERMEDIATE,TABLET,2201397,PHARMA,friday,2025-12-05,NaT
1,1436539,2071,202612,47.0,67.21,concost dp,ST,1.43,sap,BREYANZI ((lisocabt),PACK,,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302160,CTO,friday,2025-12-05,NaT
2,1436602,2071,202612,3877.0,736.63,concost dp,ST,0.19,sap,BREYANZI ((lisocabt),PACK,COMMERCIAL,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302160,CTO,friday,2025-12-05,NaT
3,1436617,2071,202612,167.0,1.67,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,3302160,CTO,friday,2025-12-05,NaT
4,1436657,2071,202612,20.0,0.2,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,3302160,CTO,friday,2025-12-05,NaT


In [75]:
# Prepare current and previous frames

# Current snapshot frame
current_df = df.copy()

current_df = current_df.rename(columns={
    "quantity": "quantity_curr",
    "cost_per_unit": "cost_per_unit_curr",
    "total_cost": "total_cost_curr",
})

# Previous snapshot frame
previous_df = df.rename(columns={
    "snapshot_date": "snapshot_date_prev",
    "quantity": "quantity_prev",
    "cost_per_unit": "cost_per_unit_prev",
    "total_cost": "total_cost_prev",
})[
    [
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
        "quantity_prev",
        "cost_per_unit_prev",
        "total_cost_prev",
    ]
]


# Join current to previous snapshot
rca_base = current_df.merge(
    previous_df,
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    how="left"
)

In [76]:
df.shape

(92589, 20)

In [77]:
rca_base.tail(5)

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev
92584,1426743,1718,202612,84.8,6244.069072,concost dp,ST,73.63289,rr,SPRYCEL (dasatinib),...,TABLET,3301502,PHARMA,friday,2026-01-30,2026-01-16,2026-01-16,84.8,73.63289,6244.069072
92585,1279436,2057,202612,104.0,70117.17648,concost dp,ST,674.20362,rr,Ipilimumab (Yervoy),...,"INJECTION, SOLUTION, CONCENTRATE",3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,104.0,674.20362,70117.17648
92586,1273903,2058,202612,184.0,13544.378,concost dp,ST,73.61075,rr,Ipilimumab (Yervoy),...,"INJECTION, SOLUTION, CONCENTRATE",3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,794.0,73.61075,58446.9355
92587,1430657,1610,202612,135.0,7252.3755,concost dp,ST,53.7213,rr,Ipilimumab (Yervoy),...,INJECTION,3301503,BIOLOGICS,friday,2026-01-30,2026-01-16,2026-01-16,135.0,53.7213,7252.3755
92588,1431873,2061,202612,0.0,0.0,concost dp,ST,151.54812,rr,ZEPOSIA (ozanimod),...,CAPSULE,212057,PHARMA,friday,2026-01-30,2026-01-16,2026-01-16,170.0,151.54812,25763.1804


In [78]:
# Flags for material-plant presence / absence between snapshots

# Not present in previous snapshot 
rca_base["not_present_in_previous_snapshot"] = (
    rca_base["prev_snapshot_date"].notna() &
    rca_base["quantity_prev"].isna()
)

# Present in previous but missing in current snapshot
rca_base["not_present_in_current_snapshot"] = False


# Present in previous but missing in current snapshot
# Identify rows present in previous snapshot but missing in current
prev_only = previous_df.merge(
    current_df[
        [
            "material",
            "plant",
            "date",
            "snapshot_type",
            "prev_snapshot_date",
        ]
    ],
    left_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "snapshot_date_prev",
    ],
    right_on=[
        "material",
        "plant",
        "date",
        "snapshot_type",
        "prev_snapshot_date",
    ],
    how="left",
    indicator=True
).query("_merge == 'left_only'")


prev_only["not_present_in_current_snapshot"] = True

# Add missing columns for consistency
for col in rca_base.columns:
    if col not in prev_only.columns:
        prev_only[col] = np.nan

rca_base["not_present_in_current_snapshot"] = False

final_rca_frame = pd.concat(
    [rca_base, prev_only[rca_base.columns]],
    ignore_index=True
)

  final_rca_frame = pd.concat(


In [86]:
for col in [
    "not_present_in_previous_snapshot",
    "not_present_in_current_snapshot",
]:
    final_rca_frame[col] = (
        final_rca_frame[col]
            .replace({1: True, 0: False})   # normalize numeric bools
            .fillna(False)                  # handle NaNs
            .astype("boolean")              # now safe
    )


  .fillna(False)                  # handle NaNs


In [87]:
final_rca_frame.shape

(111792, 26)

In [88]:
final_rca_frame.tail()

Unnamed: 0,material,plant,date,quantity_curr,total_cost_curr,concost_source,unit_of_measure,cost_per_unit_curr,source,corporate_brand,...,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date,snapshot_date_prev,quantity_prev,cost_per_unit_prev,total_cost_prev,not_present_in_previous_snapshot,not_present_in_current_snapshot
111787,1426743,1718,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-30,84.8,73.63289,6244.069072,False,True
111788,1279436,2057,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-30,104.0,674.20362,70117.17648,False,True
111789,1273903,2058,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-30,184.0,73.61075,13544.378,False,True
111790,1430657,1610,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-30,135.0,53.7213,7252.3755,False,True
111791,1431873,2061,202612,,,,,,,,...,,friday,NaT,NaT,2026-01-30,0.0,151.54812,0.0,False,True
