Data Cleaning

In [1]:
import os

import numpy as np
import pandas as pd

# Load the raw orders CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data_raw/food_orders_raw.csv')

Step 1: Handle missing values


In [2]:
# Step 1: Missing (safe columns from Kaggle: time, weather, etc.)
#
# Numeric columns are filled with the column median; categorical columns are
# filled with the most frequent value. Listed columns that are absent from df
# are skipped.

num_cols = ['Time_taken(min)', 'Food_preparing_time(min)', 'Delivery_time(min)', 'Price_of_total_order']  # Real cols
cat_cols = ['Weather', 'Road_traffic_density', 'Type_of_order', 'Type_of_vehicle']

for col in num_cols:
    if col in df.columns:
        # NOTE(review): median() requires a numeric dtype — confirm these
        # columns are not read in as strings from the raw CSV.
        df[col] = df[col].fillna(df[col].median())

for col in cat_cols:
    if col in df.columns:
        # mode() is computed once per column; it is empty when the column has
        # no non-null values, in which case 'Unknown' is used as the fill.
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        df[col] = df[col].fillna(fill_value)

Step 2: Remove duplicates


In [3]:
# Step 2: Duplicates
# Rows are compared across every column (no ID-based subset is applied), so
# only fully identical rows are removed. Rebinding df instead of using
# inplace=True keeps the step chain-friendly and avoids mutating a shared object.

df = df.drop_duplicates()
print("After duplicates:", df.shape)

After duplicates: (100000, 25)


Step 3: Fix invalid ratings

In [4]:
# Step 3: Ratings — any rating column present in df is forced into [1, 5];
# columns that are not present are skipped.

rating_cols = ['Rating_by_delivery_person', 'Restaurant_rating']
for col in rating_cols:
    if col not in df.columns:
        continue
    ratings = df[col]
    df[col] = ratings.clip(lower=1, upper=5)

Step 4: Cap outliers (clip to the 1st–99th percentile range)

In [5]:
# Step 4: Outliers cap
# Columns whose values are passed through cap_outliers when present in df.
outlier_cols = ['Time_taken(min)', 'Price_of_total_order']
def cap_outliers(series, lower_q=0.01, upper_q=0.99):
    """Clip a Series to bounds taken from its own quantiles.

    Values below the ``lower_q`` quantile are raised to it and values above
    the ``upper_q`` quantile are lowered to it. The input is not modified.

    Parameters
    ----------
    series : pd.Series
        Values to cap; must support ``quantile`` (i.e. be numeric).
    lower_q : float, default 0.01
        Quantile used as the lower bound.
    upper_q : float, default 0.99
        Quantile used as the upper bound.

    Returns
    -------
    pd.Series
        A new Series with the same index, clipped to the two bounds.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, got {lower_q!r} and {upper_q!r}"
        )
    lower_bound = series.quantile(lower_q)
    upper_bound = series.quantile(upper_q)
    return series.clip(lower=lower_bound, upper=upper_bound)
# Cap each configured column that actually exists in df; others are ignored.
for col in [name for name in outlier_cols if name in df.columns]:
    capped = cap_outliers(df[col])
    df[col] = capped

Step 5: Logical consistency (placeholder — no checks implemented yet)

In [6]:
# Step 5: Logical (adapt to dataset)
# TODO: this cell is currently a no-op — it changes nothing in df.
if 'Case_id' in df.columns:  # Intended as a proxy for order status (unverified)
    pass  # TODO: add consistency rules here once a cancelled/status column is available

Step 6: Save cleaned data

In [7]:
# Step 6: Save
# The output directory is derived from the target path, so the directory that
# gets created is always the one the CSV is written to.
cleaned_path = '../data_processed/food_orders_cleaned.csv'
os.makedirs(os.path.dirname(cleaned_path), exist_ok=True)
df.to_csv(cleaned_path, index=False)  # index=False: the row index is not written as a column
print("✅ Saved cleaned:", df.shape)

✅ Saved cleaned: (100000, 25)
