In [1]:
import base64
import html
import os
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import HTML

In [7]:
# Folder holding the raw CSV exports. The default is the original author's
# local folder; set the BAKERY_RAW_DATA_DIR environment variable to point the
# notebook at another location without editing this cell.
RAW_DATA_DIR = Path(
    os.environ.get(
        "BAKERY_RAW_DATA_DIR",
        r"C:\Users\bukky\bakery_chain\bakery_chain\raw dataset",
    )
)

# Load the raw distribution data
df_dist = pd.read_csv(RAW_DATA_DIR / "dirty_distribution.csv")

# Preview the first rows (up to 30)
df_dist.head(30)

Unnamed: 0,Branch ID,Order ID,Delivery Date,Delivery Time (Hours),Route ID,Transportation Cost ($),Customer Location
0,B04,1,2024-03-23,2,R4,91.0,"Mesa, AZ"
1,B04,2,2024-05-04,5,R2,44.0,"Tolleson, AZ"
2,B02,3,2024-04-14,6,R5,48.0,"Bradenton, FL"
3,B02,4,2024-05-04,2,R3,69.0,"Savannah, GA"
4,B02,5,2024-06-30,-5,R5,42.0,"Bradenton, FL"
5,B01,6,2024-06-10,2,R5,69.0,"Modesto, CA"
6,B01,7,2024-03-08,3,R2,101.0,"Birmingham, AL"
7,B04,8,2024-06-27,7,R4,64.0,"Birmingham, AL"
8,B04,9,2024-04-11,6,R4,74.0,
9,B02,10,2024-05-31,2,R2,100.0,"Tolleson, AZ"


In [8]:
# Inspect the raw frame's schema: column names, non-null counts and dtypes
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Branch ID                601 non-null    object 
 1   Order ID                 601 non-null    int64  
 2   Delivery Date            601 non-null    object 
 3   Delivery Time (Hours)    601 non-null    int64  
 4   Route ID                 601 non-null    object 
 5   Transportation Cost ($)  600 non-null    float64
 6   Customer Location        600 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 33.0+ KB


In [9]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns, which report unique/top/freq instead of mean/std/quantiles
df_dist.describe(include='all')

Unnamed: 0,Branch ID,Order ID,Delivery Date,Delivery Time (Hours),Route ID,Transportation Cost ($),Customer Location
count,601,601.0,601,601.0,601,600.0,600
unique,4,,176,,5,,10
top,B02,,2024-03-21,,R2,,"Villa Rica, GA"
freq,162,,9,,138,,75
mean,,300.004992,,4.387687,,80.605,
std,,173.629438,,1.746744,,22.493159,
min,,1.0,,-5.0,,40.0,
25%,,150.0,,3.0,,62.0,
50%,,300.0,,4.0,,80.0,
75%,,450.0,,6.0,,100.25,


In [10]:
# Rename to snake_case for easier handling.
# The result is rebound instead of using inplace=True, so the frame object
# that earlier cells displayed is not mutated behind their output.
df_dist = df_dist.rename(columns={
    "Branch ID": "branch_id",
    "Order ID": "order_id",
    "Delivery Date": "delivery_date",
    "Delivery Time (Hours)": "delivery_time_hours",
    "Route ID": "route_id",
    "Transportation Cost ($)": "transportation_cost",
    "Customer Location": "customer_location"
})

In [11]:
# Confirm the columns now carry the snake_case names
df_dist.columns

Index(['branch_id', 'order_id', 'delivery_date', 'delivery_time_hours',
       'route_id', 'transportation_cost', 'customer_location'],
      dtype='object')

In [12]:
# Parse delivery_date into datetime values; with errors='coerce', entries that
# cannot be parsed become NaT instead of raising
parsed_delivery_dates = pd.to_datetime(df_dist["delivery_date"], errors='coerce')
df_dist["delivery_date"] = parsed_delivery_dates

In [13]:
# Re-check dtypes: delivery_date should now be datetime64[ns]
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   branch_id            601 non-null    object        
 1   order_id             601 non-null    int64         
 2   delivery_date        601 non-null    datetime64[ns]
 3   delivery_time_hours  601 non-null    int64         
 4   route_id             601 non-null    object        
 5   transportation_cost  600 non-null    float64       
 6   customer_location    600 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 33.0+ KB


In [15]:
# Replace negative delivery times with NaN.
# Series.where keeps values where the condition holds and puts a missing value
# elsewhere; it is the vectorised equivalent of a per-element apply/lambda.
df_dist["delivery_time_hours"] = df_dist["delivery_time_hours"].where(
    df_dist["delivery_time_hours"] >= 0
)


In [16]:
# Preview up to the first 30 rows to confirm negative delivery times became NaN
df_dist.head(30)

Unnamed: 0,branch_id,order_id,delivery_date,delivery_time_hours,route_id,transportation_cost,customer_location
0,B04,1,2024-03-23,2.0,R4,91.0,"Mesa, AZ"
1,B04,2,2024-05-04,5.0,R2,44.0,"Tolleson, AZ"
2,B02,3,2024-04-14,6.0,R5,48.0,"Bradenton, FL"
3,B02,4,2024-05-04,2.0,R3,69.0,"Savannah, GA"
4,B02,5,2024-06-30,,R5,42.0,"Bradenton, FL"
5,B01,6,2024-06-10,2.0,R5,69.0,"Modesto, CA"
6,B01,7,2024-03-08,3.0,R2,101.0,"Birmingham, AL"
7,B04,8,2024-06-27,7.0,R4,64.0,"Birmingham, AL"
8,B04,9,2024-04-11,6.0,R4,74.0,
9,B02,10,2024-05-31,2.0,R2,100.0,"Tolleson, AZ"


In [17]:
# Impute missing delivery times with the column median.
# Series.median skips NaN by default, so only valid durations contribute.
delivery_hours = df_dist["delivery_time_hours"]
median_delivery_time = delivery_hours.median()
df_dist["delivery_time_hours"] = delivery_hours.fillna(median_delivery_time)


In [18]:
# Preview up to the first 30 rows to confirm the NaN delivery times were filled
df_dist.head(30)

Unnamed: 0,branch_id,order_id,delivery_date,delivery_time_hours,route_id,transportation_cost,customer_location
0,B04,1,2024-03-23,2.0,R4,91.0,"Mesa, AZ"
1,B04,2,2024-05-04,5.0,R2,44.0,"Tolleson, AZ"
2,B02,3,2024-04-14,6.0,R5,48.0,"Bradenton, FL"
3,B02,4,2024-05-04,2.0,R3,69.0,"Savannah, GA"
4,B02,5,2024-06-30,4.0,R5,42.0,"Bradenton, FL"
5,B01,6,2024-06-10,2.0,R5,69.0,"Modesto, CA"
6,B01,7,2024-03-08,3.0,R2,101.0,"Birmingham, AL"
7,B04,8,2024-06-27,7.0,R4,64.0,"Birmingham, AL"
8,B04,9,2024-04-11,6.0,R4,74.0,
9,B02,10,2024-05-31,2.0,R2,100.0,"Tolleson, AZ"


In [21]:
# Convert delivery_time_hours to the nullable integer dtype.
# The median used for imputation can be fractional (e.g. 4.5 when the two
# middle values differ), and casting a non-integral float to Int64 raises, so
# round to whole hours first. The raw column was loaded as int64, so only
# imputed entries can be affected by the rounding.
df_dist["delivery_time_hours"] = df_dist["delivery_time_hours"].round().astype("Int64")


In [22]:
# Preview up to the first 30 rows to confirm delivery_time_hours shows whole numbers
df_dist.head(30)

Unnamed: 0,branch_id,order_id,delivery_date,delivery_time_hours,route_id,transportation_cost,customer_location
0,B04,1,2024-03-23,2,R4,91.0,"Mesa, AZ"
1,B04,2,2024-05-04,5,R2,44.0,"Tolleson, AZ"
2,B02,3,2024-04-14,6,R5,48.0,"Bradenton, FL"
3,B02,4,2024-05-04,2,R3,69.0,"Savannah, GA"
4,B02,5,2024-06-30,4,R5,42.0,"Bradenton, FL"
5,B01,6,2024-06-10,2,R5,69.0,"Modesto, CA"
6,B01,7,2024-03-08,3,R2,101.0,"Birmingham, AL"
7,B04,8,2024-06-27,7,R4,64.0,"Birmingham, AL"
8,B04,9,2024-04-11,6,R4,74.0,
9,B02,10,2024-05-31,2,R2,100.0,"Tolleson, AZ"


In [23]:
# Re-check dtypes: delivery_time_hours should now be Int64
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   branch_id            601 non-null    object        
 1   order_id             601 non-null    int64         
 2   delivery_date        601 non-null    datetime64[ns]
 3   delivery_time_hours  601 non-null    Int64         
 4   route_id             601 non-null    object        
 5   transportation_cost  600 non-null    float64       
 6   customer_location    600 non-null    object        
dtypes: Int64(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 33.6+ KB


In [24]:
# Impute missing transportation costs with the column median
median_transportation_cost = df_dist["transportation_cost"].median()
df_dist["transportation_cost"] = df_dist["transportation_cost"].fillna(
    median_transportation_cost
)


In [25]:
# Re-check non-null counts: transportation_cost should have no missing values left
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 601 entries, 0 to 600
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   branch_id            601 non-null    object        
 1   order_id             601 non-null    int64         
 2   delivery_date        601 non-null    datetime64[ns]
 3   delivery_time_hours  601 non-null    Int64         
 4   route_id             601 non-null    object        
 5   transportation_cost  601 non-null    float64       
 6   customer_location    600 non-null    object        
dtypes: Int64(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 33.6+ KB


In [26]:

# Impute missing customer locations with the most frequent location.
# mode() returns the most frequent value(s); the first entry is used.
location_column = df_dist["customer_location"]
most_common_location = location_column.mode()[0]
df_dist["customer_location"] = location_column.fillna(most_common_location)


In [27]:
# Preview up to the first 30 rows to confirm missing customer locations were filled
df_dist.head(30)

Unnamed: 0,branch_id,order_id,delivery_date,delivery_time_hours,route_id,transportation_cost,customer_location
0,B04,1,2024-03-23,2,R4,91.0,"Mesa, AZ"
1,B04,2,2024-05-04,5,R2,44.0,"Tolleson, AZ"
2,B02,3,2024-04-14,6,R5,48.0,"Bradenton, FL"
3,B02,4,2024-05-04,2,R3,69.0,"Savannah, GA"
4,B02,5,2024-06-30,4,R5,42.0,"Bradenton, FL"
5,B01,6,2024-06-10,2,R5,69.0,"Modesto, CA"
6,B01,7,2024-03-08,3,R2,101.0,"Birmingham, AL"
7,B04,8,2024-06-27,7,R4,64.0,"Birmingham, AL"
8,B04,9,2024-04-11,6,R4,74.0,"Villa Rica, GA"
9,B02,10,2024-05-31,2,R2,100.0,"Tolleson, AZ"


In [28]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence
is_repeated_row = df_dist.duplicated()
df_dist = df_dist[~is_repeated_row]

In [29]:
# Re-check the row count and dtypes after dropping duplicate rows
df_dist.info()

<class 'pandas.core.frame.DataFrame'>
Index: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   branch_id            600 non-null    object        
 1   order_id             600 non-null    int64         
 2   delivery_date        600 non-null    datetime64[ns]
 3   delivery_time_hours  600 non-null    Int64         
 4   route_id             600 non-null    object        
 5   transportation_cost  600 non-null    float64       
 6   customer_location    600 non-null    object        
dtypes: Int64(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 38.1+ KB


In [30]:
def create_download_link(df_dist, filename="cleaned_distribution.csv"):
    """Build an HTML anchor that downloads ``df_dist`` as a CSV file.

    The frame is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in a ``data:`` URI inside the anchor's ``href``.

    Parameters
    ----------
    df_dist : pandas.DataFrame
        Frame to export.
    filename : str, default "cleaned_distribution.csv"
        Name placed in the anchor's ``download`` attribute. It is HTML-escaped
        so quotes or angle brackets cannot break out of the attribute.

    Returns
    -------
    IPython.display.HTML
        Display object wrapping the ``<a>`` element.
    """
    csv_text = df_dist.to_csv(index=False)
    b64 = base64.b64encode(csv_text.encode()).decode()
    # str() keeps non-string filenames (e.g. Path objects) working, as the
    # f-string interpolation did before escaping was added.
    safe_filename = html.escape(str(filename), quote=True)
    return HTML(f'<a download="{safe_filename}" href="data:text/csv;base64,{b64}">Download CSV</a>')

# Build the link for the cleaned frame; as the cell's last expression it is
# rendered as the cell output
create_download_link(df_dist)