# Zomato Delivery Data Exploration

Quick exploratory checks on the cleaned delivery dataset to validate the preprocessing pipeline and surface first-look insights.

In [1]:
# Import analysis libraries and configure plotting style for consistency.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply the shared seaborn theme, then set the default figure size used by later plots.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the cleaned CSV (the output of src.data_cleaning) rather than the raw export.
processed_path = '../data/processed/zomato_deliveries_clean.csv'
df = pd.read_csv(filepath_or_buffer=processed_path)

# Preview the top five rows as a quick check of the columns and their values.
df.head(n=5)

In [None]:
# Count the missing entries in each of the key columns to see which gaps remain.
key_columns = [
    'Time_Orderd_clean',
    'Time_Order_picked_clean',
    'Order_Date_clean',
    'order_to_pick_minutes',
    'pickup_to_delivery_minutes',
]
missing_per_column = df[key_columns].isna().sum()
# One-column frame (indexed by column name) so the notebook renders it as a table.
null_summary = missing_per_column.rename('missing_values').to_frame()
null_summary

In [None]:
# Histogram (25 bins) with a KDE overlay showing how delivery durations are distributed.
fig, ax = plt.subplots()
sns.histplot(data=df, x='Time_taken (min)', bins=25, kde=True, ax=ax)
ax.set(
    title='Delivery Duration Distribution',
    xlabel='Time taken (minutes)',
    ylabel='Order count',
)
plt.show()

In [None]:
# Box plots of delivery time per traffic-density level, drawn in the fixed
# order given by traffic_order so the categories read consistently.
traffic_order = ['Low', 'Medium', 'High', 'Jam']
fig, ax = plt.subplots()
sns.boxplot(
    data=df,
    x='Road_traffic_density',
    y='Time_taken (min)',
    order=traffic_order,
    ax=ax,
)
ax.set(
    title='Delivery Time by Traffic Density',
    xlabel='Traffic density',
    ylabel='Time taken (minutes)',
)
plt.show()

In [None]:
# Bar chart of the median order-to-pickup time for each Festival value.
# Rows missing either the pickup interval or the Festival flag are dropped first.
festival_rows = df.dropna(subset=['order_to_pick_minutes', 'Festival'])
fig, ax = plt.subplots()
sns.barplot(
    data=festival_rows,
    x='Festival',
    y='order_to_pick_minutes',
    estimator='median',
    ax=ax,
)
ax.set(
    title='Median Order-to-Pickup Time by Festival Flag',
    xlabel='Festival',
    ylabel='Order-to-pickup (minutes)',
)
plt.show()

In [None]:
# Per city, compute the fraction of rows whose cleaned order time is missing,
# sorted ascending so the bars run from the smallest to the largest share.
order_time_missing = df['Time_Orderd_clean'].isna()
city_summary = (
    order_time_missing.groupby(df['City'])
    .mean()
    .sort_values()
    .rename('share_missing_order_time')
    .reset_index()
)

fig, ax = plt.subplots()
sns.barplot(data=city_summary, x='City', y='share_missing_order_time', ax=ax)
ax.set(
    title='Share of Missing Order Times by City',
    xlabel='City',
    ylabel='Missing share (0-1)',
)
plt.show()