In [23]:
import pandas as pd
import pyarrow.dataset as ds
import numpy as np

## Dataset Import

In [24]:
# Load the FIP (financial inventory projection) dataset from S3 into pandas.

s3_path_fip = (
    "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
    "publish/data-product/financial_inventory_projection_report_network_update/"
)

# Parquet files laid out in hive-style partition directories.
dataset = ds.dataset(s3_path_fip, format="parquet", partitioning="hive")

# Row filter, assembled from named pieces and applied while reading.
on_or_after_dec_2025 = ds.field("snapshot_date") >= "2025-12-01"
is_2026_year_end = ds.field("date") == "202612"          # only 2026 YE FIP data
# The Friday snapshot dated 2026-01-23 is dropped to avoid double counting.
is_duplicate_friday = (
    (ds.field("snapshot_date") == "2026-01-23")
    & (ds.field("snapshot_type") == "friday")
)

table = dataset.to_table(
    filter=on_or_after_dec_2025 & is_2026_year_end & ~is_duplicate_friday
)

df_fip = table.to_pandas()
df_fip.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date
0,1217818,1622,202612,0.0,0.0,missing,,,sap,Tempra,RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,4506130.0,PHARMA,friday,2025-12-05
1,1231993,1557,202612,0.0,0.0,missing,,,sap,Aripiprazole,HALB,COMMERCIAL,OTHER PROCESS MATERIALS,OTHER PROCESS MATERIALS,,2701577.0,PHARMA,friday,2025-12-05
2,1416303,1721,202612,0.0,0.0,concost dp,ST,1.321201,sap,Metformin,FIN,COMMERCIAL,MARKET UNIT,SINGLE PRODUCT,TABLET,221118.0,PHARMA,friday,2025-12-05
3,1335676,1487,202612,990.0,0.0,missing,,,sap,,FIN,CLINICAL,OTHER PROCESS MATERIALS,OTHER PROCESS MATERIALS,,,PHARMA,friday,2025-12-05
4,1345028,1487,202612,32.0,0.0,missing,,,sap,,FIN,CLINICAL,OTHER PROCESS MATERIALS,OTHER PROCESS MATERIALS,,,PHARMA,friday,2025-12-05


In [25]:
# Row and column count of the filtered FIP extract loaded above.
df_fip.shape

(385660, 19)

In [None]:
# s3_path_plants = (
#     "s3://m3-intel-hub-dp-us-east-1-517292-prod/"
#     "refined/data-asset/fin_inv_proj/"
#     "bms_internal_vs_external_plants/"
#     "bms_internal_vs_external_plants.parquet"
# )

# df_plants = pd.read_parquet(s3_path_plants)
# # df_plants.head()

In [2]:
# import boto3

# s3 = boto3.client("s3")

# bucket = "m3-intel-hub-dp-us-east-1-517292-prod"
# prefix = "refined/data-asset/fin_inv_proj/sap_material_master/"

# response = s3.list_objects_v2(
#     Bucket=bucket,
#     Prefix=prefix
# )

# if "Contents" in response:
#     for obj in response["Contents"]:
#         print(obj["Key"], obj["Size"])
# else:
#     print("No objects found or no access.")

## Data Prep

In [26]:
# Create has_non_zero flag at material–plant level:
# 1 when at least one row of the (material, plant) pair has a non-zero
# total_cost, otherwise 0.
#
# The row-level mask is computed once with a vectorised comparison and reduced
# with the built-in "any" aggregation, instead of calling a Python lambda per
# group. A NaN total_cost still counts as non-zero (NaN != 0), as before.
# dropna=False keeps rows whose material or plant is missing in a group of
# their own, so they get a real 0/1 flag instead of NaN (which cannot be cast
# to int).

df_fip["has_non_zero"] = (
    df_fip["total_cost"]
    .ne(0)
    .groupby([df_fip["material"], df_fip["plant"]], dropna=False)
    .transform("any")
    .astype(int)
)

# Apply the filter: keep only pairs with some non-zero cost, then drop the helper flag
base = df_fip.loc[df_fip["has_non_zero"] == 1].drop(columns="has_non_zero")


In [27]:
# Compare dimensions: full FIP frame (printed) vs. `base` (cell output).
print(df_fip.shape)
base.shape

(385660, 20)


(92589, 19)

In [28]:
# Working copy of `base` for data prep, with snapshot_date parsed to datetime
df = base.assign(snapshot_date=pd.to_datetime(base["snapshot_date"]))


# Snapshot lookup table: one row per (snapshot_type, snapshot_date), together
# with the preceding snapshot date of the same type (NaT for the earliest one).
# The lag is taken within snapshot_type, so bd13 is compared with bd13 and
# friday with friday; there are no bd13-vs-friday comparisons.
snapshot_calendar = (
    df.loc[:, ["snapshot_type", "snapshot_date"]]
    .drop_duplicates()
    .sort_values(by=["snapshot_type", "snapshot_date"])
    .assign(
        prev_snapshot_date=lambda cal: (
            cal.groupby("snapshot_type")["snapshot_date"].shift()
        )
    )
)
snapshot_calendar

In [31]:
# Attach previous snapshot date to each row.
#
# Any prev_snapshot_date column already on `df` is dropped first so the cell
# can be re-run safely: without this, a second run would merge the column in
# again as prev_snapshot_date_x / prev_snapshot_date_y and break later cells
# that look up "prev_snapshot_date".
# validate="many_to_one" makes pandas raise if snapshot_calendar ever holds
# more than one row per (snapshot_type, snapshot_date), which would otherwise
# silently duplicate rows of `df`.

df = (
    df
    .drop(columns="prev_snapshot_date", errors="ignore")
    .merge(
        snapshot_calendar,
        on=["snapshot_type", "snapshot_date"],
        how="left",
        validate="many_to_one",
    )
)

In [39]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,material,plant,date,quantity,total_cost,concost_source,unit_of_measure,cost_per_unit,source,corporate_brand,material_type,development_lifecycle_status,enterprise_category,enterprise_sub_category,dosage_form_parent,corp_brand_id,network_or_business_unit,snapshot_type,snapshot_date,prev_snapshot_date
0,1278101,1721,202612,0.0,0.0,concost dp,TH,121.374929,sap,Entecavir,HALB,COMMERCIAL,DRUG PRODUCT,INTERMEDIATE,TABLET,2201397,PHARMA,friday,2025-12-05,NaT
1,1436539,2071,202612,47.0,67.21,concost dp,ST,1.43,sap,BREYANZI ((lisocabt),PACK,,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302160,CTO,friday,2025-12-05,NaT
2,1436602,2071,202612,3877.0,736.63,concost dp,ST,0.19,sap,BREYANZI ((lisocabt),PACK,COMMERCIAL,PACKAGE COMPONENT,PACKAGE COMPONENT,,3302160,CTO,friday,2025-12-05,NaT
3,1436617,2071,202612,167.0,1.67,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,3302160,CTO,friday,2025-12-05,NaT
4,1436657,2071,202612,20.0,0.2,concost dp,ST,0.01,sap,BREYANZI ((lisocabt),RAW,COMMERCIAL,RAW MATERIAL,RAW MATERIAL,,3302160,CTO,friday,2025-12-05,NaT


In [36]:
# Build "current" and "previous" views of `df`, then pair every row with the
# row for the same material / plant / date / snapshot_type taken at the
# preceding snapshot.

rca_join_keys = ["material", "plant", "date", "snapshot_type"]
rca_measures = ["quantity", "cost_per_unit", "total_cost"]

# Current snapshot frame: all columns kept, measures renamed with a _curr suffix
current_df = df.rename(columns={col: f"{col}_curr" for col in rca_measures})

# Previous snapshot frame: keys, snapshot date and measures only, renamed to _prev
previous_df = df[rca_join_keys + ["snapshot_date"] + rca_measures].rename(
    columns={
        "snapshot_date": "snapshot_date_prev",
        **{col: f"{col}_prev" for col in rca_measures},
    }
)


# Join current to previous snapshot: a row's prev_snapshot_date must equal the
# snapshot date of the previous-frame row. Rows with no earlier snapshot
# (prev_snapshot_date is NaT) keep NaN in the *_prev columns.
# NOTE(review): assumes at most one row per
# (material, plant, date, snapshot_type, snapshot_date); otherwise this left
# join multiplies rows. Confirm, e.g. by comparing len(rca_base) with len(df).
rca_base = pd.merge(
    current_df,
    previous_df,
    how="left",
    left_on=rca_join_keys + ["prev_snapshot_date"],
    right_on=rca_join_keys + ["snapshot_date_prev"],
)

In [40]:
# Row and column count of the prepared frame `df`.
df.shape

(92589, 20)