In [26]:
import base64
import html

import numpy as np
import pandas as pd
from IPython.display import HTML

In [27]:
# Load the raw (uncleaned) sales data from the local CSV file.
# NOTE(review): this is an absolute, machine-specific path; the cell will only
# run as written on a machine that has the file at this exact location.
SALES_CSV_PATH = r"C:\Users\bukky\bakery_chain\bakery_chain\raw dataset\dirty_sales.csv"
df_sales = pd.read_csv(SALES_CSV_PATH)

In [28]:
# Preview the first 30 rows of the loaded frame
df_sales.head(30)

Unnamed: 0,branch_id,product_id,product_name,units_sold,sales_revenue,date_of_sale
0,B01,101,Fruit Biscuits,273,1047.22,2024-03-21
1,B04,107,Whole Wheat Bread,71,233.26,2024-04-08
2,B02,109,Bagels,268,484.64,2024-06-03
3,B01,104,Chocolate Cake,156,1730.35,2024-02-21
4,B03,109,Bagels,286,576.17,2024-05-07
5,B03,107,Whole Wheat Bread,282,1192.18,2024-05-16
6,B02,111,Banana Bread,133,824.72,2024-05-26
7,B02,112,Multigrain Bread,156,624.98,2024-01-01
8,B01,113,Garlic Bread,201,959.7,2024-04-08
9,B01,101,Fruit Biscuits,18,101.77,2024-05-24


In [29]:
# Show column names, non-null counts, and dtypes
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   branch_id      500 non-null    object 
 1   product_id     500 non-null    int64  
 2   product_name   500 non-null    object 
 3   units_sold     500 non-null    int64  
 4   sales_revenue  500 non-null    float64
 5   date_of_sale   500 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 23.6+ KB


In [30]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq)
df_sales.describe(include='all')

Unnamed: 0,branch_id,product_id,product_name,units_sold,sales_revenue,date_of_sale
count,500,500.0,500,500.0,500.0,500
unique,4,,16,,,173
top,B03,,Multigrain Bread,,,2024-05-26
freq,128,,48,,,8
mean,,108.464,,161.15,779.56068,
std,,4.187574,,81.211293,640.116641,
min,,101.0,,12.0,-1169.34,
25%,,105.0,,91.0,369.98,
50%,,109.0,,158.0,691.105,
75%,,112.0,,233.0,1075.0475,


In [31]:
# Rename the human-readable CSV headers to snake_case.
# rename() ignores mapping keys that are not present in the frame, so this
# cell is a harmless no-op when the headers are already snake_case.
# The result is rebound instead of using inplace=True: same outcome for this
# notebook, but the cell stays chainable and safe to re-run.
SNAKE_CASE_COLUMNS = {
    "Branch ID": "branch_id",
    "Product ID": "product_id",
    "Product Name": "product_name",
    "Units Sold": "units_sold",
    "Sales Revenue ($)": "sales_revenue",
    "Date of Sale": "date_of_sale",
}
df_sales = df_sales.rename(columns=SNAKE_CASE_COLUMNS)

In [32]:
# Re-check the summary statistics (all columns) after the column rename step
df_sales.describe(include='all')

Unnamed: 0,branch_id,product_id,product_name,units_sold,sales_revenue,date_of_sale
count,500,500.0,500,500.0,500.0,500
unique,4,,16,,,173
top,B03,,Multigrain Bread,,,2024-05-26
freq,128,,48,,,8
mean,,108.464,,161.15,779.56068,
std,,4.187574,,81.211293,640.116641,
min,,101.0,,12.0,-1169.34,
25%,,105.0,,91.0,369.98,
50%,,109.0,,158.0,691.105,
75%,,112.0,,233.0,1075.0475,


In [33]:
# Convert date_of_sale to datetime.
# errors="coerce" turns unparseable values into NaT instead of raising, which
# would otherwise hide bad dates silently -- so count and report how many
# previously non-null values were lost to coercion before storing the result.
parsed_dates = pd.to_datetime(df_sales["date_of_sale"], errors="coerce")
unparseable_count = int((parsed_dates.isna() & df_sales["date_of_sale"].notna()).sum())
if unparseable_count:
    print(f"Warning: {unparseable_count} date_of_sale value(s) could not be parsed and were set to NaT")
df_sales["date_of_sale"] = parsed_dates

In [35]:
# Preview the first 50 rows after the date conversion
df_sales.head(50)

Unnamed: 0,branch_id,product_id,product_name,units_sold,sales_revenue,date_of_sale
0,B01,101,Fruit Biscuits,273,1047.22,2024-03-21
1,B04,107,Whole Wheat Bread,71,233.26,2024-04-08
2,B02,109,Bagels,268,484.64,2024-06-03
3,B01,104,Chocolate Cake,156,1730.35,2024-02-21
4,B03,109,Bagels,286,576.17,2024-05-07
5,B03,107,Whole Wheat Bread,282,1192.18,2024-05-16
6,B02,111,Banana Bread,133,824.72,2024-05-26
7,B02,112,Multigrain Bread,156,624.98,2024-01-01
8,B01,113,Garlic Bread,201,959.7,2024-04-08
9,B01,101,Fruit Biscuits,18,101.77,2024-05-24


In [36]:
# Clean negative units_sold by replacing each value with its absolute value.
# NOTE(review): this assumes a negative count is a sign/entry error rather
# than a genuine return or adjustment -- confirm with the data owner.
df_sales["units_sold"] = df_sales["units_sold"].abs()

In [37]:
# Validate or fill sales_revenue.
# Missing revenue is estimated as units_sold * median unit price of the same
# product, where the unit price comes from rows that do have a revenue value.
revenue_missing = df_sales["sales_revenue"].isna()
if revenue_missing.any():
    # Rows usable for price estimation: revenue present and a non-zero unit
    # count (a zero count would produce inf/NaN unit prices and skew the median).
    priced_rows = df_sales.loc[~revenue_missing & df_sales["units_sold"].ne(0)]

    # Use the magnitude of the unit price: negative revenues are still present
    # at this point and are treated as sign errors later in the notebook, so
    # they must not drag the median towards (or below) zero.
    unit_price_map = (
        (priced_rows["sales_revenue"] / priced_rows["units_sold"])
        .abs()
        .groupby(priced_rows["product_name"])
        .median()
        .to_dict()
    )

    # Vectorised fill: products without a known price map to NaN, so their
    # missing revenue stays NaN exactly as before.
    estimated_revenue = df_sales["units_sold"] * df_sales["product_name"].map(unit_price_map)
    df_sales["sales_revenue"] = df_sales["sales_revenue"].fillna(estimated_revenue)

In [38]:
# Preview the first 50 rows after the revenue fill step
df_sales.head(50)

Unnamed: 0,branch_id,product_id,product_name,units_sold,sales_revenue,date_of_sale
0,B01,101,Fruit Biscuits,273,1047.22,2024-03-21
1,B04,107,Whole Wheat Bread,71,233.26,2024-04-08
2,B02,109,Bagels,268,484.64,2024-06-03
3,B01,104,Chocolate Cake,156,1730.35,2024-02-21
4,B03,109,Bagels,286,576.17,2024-05-07
5,B03,107,Whole Wheat Bread,282,1192.18,2024-05-16
6,B02,111,Banana Bread,133,824.72,2024-05-26
7,B02,112,Multigrain Bread,156,624.98,2024-01-01
8,B01,113,Garlic Bread,201,959.7,2024-04-08
9,B01,101,Fruit Biscuits,18,101.77,2024-05-24


In [39]:
# Clean negative sales_revenue by replacing each value with its absolute value.
# NOTE(review): this assumes negative revenue is a sign error rather than a
# refund or correction entry -- confirm with the data owner.
df_sales["sales_revenue"] = df_sales["sales_revenue"].abs()

In [40]:
# Preview the first 50 rows after taking the absolute value of sales_revenue
df_sales.head(50)

Unnamed: 0,branch_id,product_id,product_name,units_sold,sales_revenue,date_of_sale
0,B01,101,Fruit Biscuits,273,1047.22,2024-03-21
1,B04,107,Whole Wheat Bread,71,233.26,2024-04-08
2,B02,109,Bagels,268,484.64,2024-06-03
3,B01,104,Chocolate Cake,156,1730.35,2024-02-21
4,B03,109,Bagels,286,576.17,2024-05-07
5,B03,107,Whole Wheat Bread,282,1192.18,2024-05-16
6,B02,111,Banana Bread,133,824.72,2024-05-26
7,B02,112,Multigrain Bread,156,624.98,2024-01-01
8,B01,113,Garlic Bread,201,959.7,2024-04-08
9,B01,101,Fruit Biscuits,18,101.77,2024-05-24


In [41]:
# Re-check column dtypes and non-null counts after the cleaning steps
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   branch_id      500 non-null    object        
 1   product_id     500 non-null    int64         
 2   product_name   500 non-null    object        
 3   units_sold     500 non-null    int64         
 4   sales_revenue  500 non-null    float64       
 5   date_of_sale   500 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 23.6+ KB


In [42]:
# Final sanity check: print dtypes / non-null counts, then display the number
# of missing values per column as the cell's result
df_sales.info()
df_sales.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   branch_id      500 non-null    object        
 1   product_id     500 non-null    int64         
 2   product_name   500 non-null    object        
 3   units_sold     500 non-null    int64         
 4   sales_revenue  500 non-null    float64       
 5   date_of_sale   500 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 23.6+ KB


branch_id        0
product_id       0
product_name     0
units_sold       0
sales_revenue    0
date_of_sale     0
dtype: int64

In [43]:
def create_download_link(df_sales, filename="cleaned_sales.csv"):
    """Build an HTML link that downloads a frame as a CSV file.

    The frame is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in a ``data:`` URI, so the CSV travels inside the link itself.

    Parameters
    ----------
    df_sales
        Object exposing a pandas-style ``to_csv(index=False)`` that returns
        the CSV text (in this notebook, the cleaned sales DataFrame).
    filename : str, default "cleaned_sales.csv"
        File name placed in the anchor's ``download`` attribute.

    Returns
    -------
    IPython.display.HTML
        Display object wrapping a "Download CSV" anchor.
    """
    csv_text = df_sales.to_csv(index=False)
    encoded_csv = base64.b64encode(csv_text.encode()).decode()
    # Escape the file name so quotes or angle brackets in it cannot break out
    # of the download="..." attribute and inject markup into the notebook.
    safe_filename = html.escape(str(filename), quote=True)
    return HTML(
        f'<a download="{safe_filename}" href="data:text/csv;base64,{encoded_csv}">Download CSV</a>'
    )

# Display the download link for the cleaned frame as this cell's output
create_download_link(df_sales)