In [1]:
import base64
import os

import numpy as np
import pandas as pd
from IPython.display import HTML

In [2]:
# Location of the raw inventory export. The default is the original author's
# local Windows path; set the BAKERY_INVENTORY_CSV environment variable to
# point at the file on another machine without editing this cell.
DEFAULT_INVENTORY_CSV = r"C:\Users\bukky\bakery_chain\bakery_chain\raw dataset\dirty_inventory.csv"
inventory_csv_path = os.environ.get("BAKERY_INVENTORY_CSV", DEFAULT_INVENTORY_CSV)

# Load the raw inventory data
df_inv = pd.read_csv(inventory_csv_path)

# Preview up to the first 30 rows
df_inv.head(30)

Unnamed: 0,Branch ID,Product ID,Product Name,Current Stock,Reorder Point,Supplier Lead Time (Days),Cost per Unit ($)
0,B01,101,Fruit Biscuits,500,200.0,7,2.5
1,B02,101,Fruit Biscuits,500,200.0,7,2.5
2,B03,101,Fruit Biscuits,500,200.0,7,2.5
3,B04,101,Fruit Biscuits,500,200.0,7,2.5
4,B01,102,Plum Cake,300,100.0,5,5.0
5,B02,102,fruit biscuits,300,100.0,5,5.0
6,B03,102,Plum Cake,300,100.0,5,5.0
7,B04,102,Plum Cake,300,100.0,5,5.0
8,B01,103,Almond Cookies,200,150.0,10,3.0
9,B02,103,Almond Cookies,200,150.0,10,3.0


In [3]:
# Check columns, data types, and missing values (per-column non-null counts)
# on the raw frame, before any cleaning is applied.
df_inv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Branch ID                  60 non-null     object 
 1   Product ID                 60 non-null     int64  
 2   Product Name               60 non-null     object 
 3   Current Stock              60 non-null     int64  
 4   Reorder Point              59 non-null     float64
 5   Supplier Lead Time (Days)  60 non-null     int64  
 6   Cost per Unit ($)          60 non-null     float64
dtypes: float64(2), int64(3), object(2)
memory usage: 3.4+ KB


In [4]:
# Get summary statistics for every column; include='all' also reports the
# non-numeric columns (count / unique / top / freq) alongside the numeric ones.
df_inv.describe(include='all')

Unnamed: 0,Branch ID,Product ID,Product Name,Current Stock,Reorder Point,Supplier Lead Time (Days),Cost per Unit ($)
count,60,60.0,60,60.0,59.0,60.0,60.0
unique,4,,16,,,,
top,B01,,Fruit Biscuits,,,,
freq,15,,5,,,,
mean,,107.916667,,445.0,175.084746,5.833333,2.558333
std,,4.442826,,156.75199,67.27127,1.486284,1.395063
min,,101.0,,200.0,90.0,4.0,-2.5
25%,,104.0,,300.0,110.0,5.0,2.0
50%,,108.0,,400.0,170.0,5.0,2.3
75%,,112.0,,525.0,200.0,7.0,2.65


In [5]:
# Normalize the column headers to snake_case so later cells can refer to them
# without spaces, parentheses, or the "$" sign.
# Reassign instead of using inplace=True: the result is the same frame
# contents, but the cell stays chainable and follows pandas' recommended style.
df_inv = df_inv.rename(columns={
    "Branch ID": "branch_id",
    "Product ID": "product_id",
    "Product Name": "product_name",
    "Current Stock": "current_stock",
    "Reorder Point": "reorder_point",
    "Supplier Lead Time (Days)": "supplier_lead_time_days",
    "Cost per Unit ($)": "cost_per_unit"
})

In [6]:
# Impute missing reorder_point values with the column-wide median
# (Series.median ignores NaN, so only the known values contribute).
median_reorder = df_inv["reorder_point"].median()
missing_reorder = df_inv["reorder_point"].isna()
df_inv.loc[missing_reorder, "reorder_point"] = median_reorder

In [7]:
# Preview up to the first 30 rows after renaming the columns and filling
# the missing reorder_point value.
df_inv.head(30)

Unnamed: 0,branch_id,product_id,product_name,current_stock,reorder_point,supplier_lead_time_days,cost_per_unit
0,B01,101,Fruit Biscuits,500,200.0,7,2.5
1,B02,101,Fruit Biscuits,500,200.0,7,2.5
2,B03,101,Fruit Biscuits,500,200.0,7,2.5
3,B04,101,Fruit Biscuits,500,200.0,7,2.5
4,B01,102,Plum Cake,300,100.0,5,5.0
5,B02,102,fruit biscuits,300,100.0,5,5.0
6,B03,102,Plum Cake,300,100.0,5,5.0
7,B04,102,Plum Cake,300,100.0,5,5.0
8,B01,103,Almond Cookies,200,150.0,10,3.0
9,B02,103,Almond Cookies,200,150.0,10,3.0


In [8]:
# Replace negative cost_per_unit with NaN so it can be imputed later.
# Series.where keeps values where the condition is True and writes NaN
# elsewhere; existing NaN values compare False and therefore stay NaN,
# which matches the previous element-wise lambda without a Python-level loop.
df_inv["cost_per_unit"] = df_inv["cost_per_unit"].where(df_inv["cost_per_unit"] >= 0)

In [9]:
# Preview up to the first 30 rows after negative cost_per_unit values
# were replaced with NaN.
df_inv.head(30)

Unnamed: 0,branch_id,product_id,product_name,current_stock,reorder_point,supplier_lead_time_days,cost_per_unit
0,B01,101,Fruit Biscuits,500,200.0,7,2.5
1,B02,101,Fruit Biscuits,500,200.0,7,2.5
2,B03,101,Fruit Biscuits,500,200.0,7,2.5
3,B04,101,Fruit Biscuits,500,200.0,7,2.5
4,B01,102,Plum Cake,300,100.0,5,5.0
5,B02,102,fruit biscuits,300,100.0,5,5.0
6,B03,102,Plum Cake,300,100.0,5,5.0
7,B04,102,Plum Cake,300,100.0,5,5.0
8,B01,103,Almond Cookies,200,150.0,10,3.0
9,B02,103,Almond Cookies,200,150.0,10,3.0


In [11]:
# Fill missing cost_per_unit using product_name.
# Build a {product_name: median cost} lookup from the rows whose cost is known.
# NOTE(review): the preview shows product_id 102 labelled "fruit biscuits" in
# one branch and "Plum Cake" in the others, so product_name is not a fully
# reliable key; consider grouping by product_id instead - confirm with the
# data owner before changing.
product_cost_map = (
    df_inv[df_inv["cost_per_unit"].notna()]
    .groupby("product_name")["cost_per_unit"]
    .median()
    .to_dict()
)

# Fill missing values from the lookup in one vectorized step: map() yields NaN
# for product names absent from the lookup, so those rows remain NaN exactly
# as before, and rows that already have a cost are left untouched by fillna().
df_inv["cost_per_unit"] = df_inv["cost_per_unit"].fillna(
    df_inv["product_name"].map(product_cost_map)
)

In [12]:
# Preview up to the first 30 rows after missing cost_per_unit values
# were filled from the per-product median.
df_inv.head(30)

Unnamed: 0,branch_id,product_id,product_name,current_stock,reorder_point,supplier_lead_time_days,cost_per_unit
0,B01,101,Fruit Biscuits,500,200.0,7,2.5
1,B02,101,Fruit Biscuits,500,200.0,7,2.5
2,B03,101,Fruit Biscuits,500,200.0,7,2.5
3,B04,101,Fruit Biscuits,500,200.0,7,2.5
4,B01,102,Plum Cake,300,100.0,5,5.0
5,B02,102,fruit biscuits,300,100.0,5,5.0
6,B03,102,Plum Cake,300,100.0,5,5.0
7,B04,102,Plum Cake,300,100.0,5,5.0
8,B01,103,Almond Cookies,200,150.0,10,3.0
9,B02,103,Almond Cookies,200,150.0,10,3.0


In [20]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (no renumbering).
df_inv = df_inv.drop_duplicates(keep="first", ignore_index=False)

In [21]:
# Preview up to the first 30 rows after duplicate rows were removed.
df_inv.head(30)

Unnamed: 0,branch_id,product_id,product_name,current_stock,reorder_point,supplier_lead_time_days,cost_per_unit
0,B01,101,Fruit Biscuits,500,200.0,7,2.5
1,B02,101,Fruit Biscuits,500,200.0,7,2.5
2,B03,101,Fruit Biscuits,500,200.0,7,2.5
3,B04,101,Fruit Biscuits,500,200.0,7,2.5
4,B01,102,Plum Cake,300,100.0,5,5.0
5,B02,102,fruit biscuits,300,100.0,5,5.0
6,B03,102,Plum Cake,300,100.0,5,5.0
7,B04,102,Plum Cake,300,100.0,5,5.0
8,B01,103,Almond Cookies,200,150.0,10,3.0
9,B02,103,Almond Cookies,200,150.0,10,3.0


In [22]:
# Re-check columns, data types, and non-null counts after cleaning and
# de-duplication, to confirm no missing values remain.
df_inv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 0 to 59
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   branch_id                59 non-null     object 
 1   product_id               59 non-null     int64  
 2   product_name             59 non-null     object 
 3   current_stock            59 non-null     int64  
 4   reorder_point            59 non-null     float64
 5   supplier_lead_time_days  59 non-null     int64  
 6   cost_per_unit            59 non-null     float64
dtypes: float64(2), int64(3), object(2)
memory usage: 3.7+ KB


In [23]:
def create_download_link(df_inv, filename="cleaned_inventory.csv"):
    """Build an HTML anchor that downloads ``df_inv`` as a CSV file.

    The frame is serialized to CSV without its index, base64-encoded, and
    embedded in a ``data:text/csv`` URI, so the link carries the data itself.

    Parameters
    ----------
    df_inv : object providing ``to_csv`` (a pandas DataFrame in this notebook)
        The data to export.
    filename : str, default "cleaned_inventory.csv"
        Value placed in the anchor's ``download`` attribute.

    Returns
    -------
    IPython.display.HTML
        A displayable object wrapping the ``<a>`` element.
    """
    csv_bytes = df_inv.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    return HTML(f'<a download="{filename}" href="data:text/csv;base64,{b64}">Download CSV</a>')


# Render the download link for the cleaned inventory as this cell's output.
create_download_link(df_inv)