In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def scrape_zepto():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get("https://www.zeptonow.com/pn/mumbai")
    time.sleep(5)

    products = []
    try:
        for _ in range(3):  # Scroll 3 times
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(2)

        product_cards = driver.find_elements(By.CLASS_NAME, "ProductCard__Container-sc-__sc-1k3x8z-1")
        print(f"Found {len(product_cards)} products.")

        for card in product_cards:
            try:
                name = card.find_element(By.CLASS_NAME, "ProductCard__Name-sc-__sc-1k3x8z-7").text
                price = card.find_element(By.CLASS_NAME, "ProductCard__PriceLabel-sc-__sc-1k3x8z-10").text
                delivery_time = "10 minutes"  # fixed or use header banner
                products.append({
                    "product_name": name,
                    "price": price,
                    "delivery_time": delivery_time,
                    "platform": "Zepto"
                })
            except Exception as e:
                print("Error:", e)
                continue
    finally:
        driver.quit()

    df = pd.DataFrame(products)
    df.to_csv("data/zepto_products.csv", index=False)
    print("Zepto data saved to data/zepto_products.csv")

if __name__ == "__main__":
    scrape_zepto()


ModuleNotFoundError: No module named 'faker'

In [15]:
# Check and confirm if data supports the analysis conclusions mentioned

# Load datasets
zepto_df = pd.read_csv("D:/scrap_project/zepto_synthetic_1000.csv")
blinkit_df = pd.read_csv("D:/scrap_project/blinkit_synthetic_1000.csv")

# Combine for comparison
combined_df = pd.concat([zepto_df, blinkit_df])

# Analysis 1: Average delivery time
avg_delivery_time = combined_df.groupby("platform")["delivery_time_min"].mean()

# Analysis 2: Average discount amount
avg_discount = combined_df.groupby("platform")["discount_amount"].mean()

# Analysis 3: Net order value comparison
avg_net_order_value = combined_df.groupby("platform")["net_order_value"].mean()

# Analysis 4: City distribution
city_counts = combined_df.groupby(["platform", "city"]).size().unstack().fillna(0)

# Analysis 5: Payment method distribution
payment_dist = combined_df.groupby(["platform", "payment_method"]).size().unstack().fillna(0)

# Analysis 6: Raw order value
avg_order_value = combined_df.groupby("platform")["order_value"].mean()

# Analysis 7: Category distribution
category_dist = combined_df.groupby(["platform", "product_category"]).size().unstack().fillna(0)

# Analysis 8: Order time range check (between 6PM–10PM)
combined_df["order_hour"] = pd.to_datetime(combined_df["order_time"], format='%H:%M:%S').dt.hour
peak_order_time = combined_df[(combined_df["order_hour"] >= 18) & (combined_df["order_hour"] <= 22)]
peak_order_dist = peak_order_time["platform"].value_counts(normalize=True) * 100

{
    "Avg Delivery Time (min)": avg_delivery_time.to_dict(),
    "Avg Discount Amount (₹)": avg_discount.to_dict(),
    "Avg Net Order Value (₹)": avg_net_order_value.to_dict(),
    "Avg Raw Order Value (₹)": avg_order_value.to_dict(),
    "City Presence (Surat, Jaipur)": city_counts[["Surat", "Jaipur"]].to_dict(),
    "Metro City Presence (Delhi, Mumbai, Bangalore)": city_counts[["Delhi", "Mumbai", "Bangalore"]].to_dict(),
    "Payment Method (UPI %)": (payment_dist.div(payment_dist.sum(axis=1), axis=0)["UPI"] * 100).to_dict(),
    "Order Volume in 6PM–10PM (%)": peak_order_dist.to_dict(),
    "Category Distribution": category_dist.to_dict()
}


{'Avg Delivery Time (min)': {'Blinkit': 16.487, 'Zepto': 11.788},
 'Avg Discount Amount (₹)': {'Blinkit': 17.51759, 'Zepto': 34.93879},
 'Avg Net Order Value (₹)': {'Blinkit': 431.83337,
  'Zepto': 367.05240000000003},
 'Avg Raw Order Value (₹)': {'Blinkit': 449.35096000000004,
  'Zepto': 401.99119},
 'City Presence (Surat, Jaipur)': {'Surat': {'Blinkit': 42, 'Zepto': 38},
  'Jaipur': {'Blinkit': 43, 'Zepto': 40}},
 'Metro City Presence (Delhi, Mumbai, Bangalore)': {'Delhi': {'Blinkit': 183,
   'Zepto': 168},
  'Mumbai': {'Blinkit': 205, 'Zepto': 207},
  'Bangalore': {'Blinkit': 135, 'Zepto': 149}},
 'Payment Method (UPI %)': {'Blinkit': 60.3, 'Zepto': 59.599999999999994},
 'Order Volume in 6PM–10PM (%)': {'Zepto': 50.23696682464455,
  'Blinkit': 49.763033175355446},
 'Category Distribution': {'Bakery': {'Blinkit': 104, 'Zepto': 108},
  'Beverages': {'Blinkit': 106, 'Zepto': 112},
  'Dairy': {'Blinkit': 101, 'Zepto': 122},
  'Frozen Foods': {'Blinkit': 112, 'Zepto': 101},
  'Fruits': {

In [16]:
import pandas as pd

# Assuming zepto_df and blinkit_df are already loaded DataFrames

# Make sure the 'platform' column exists in both, else add it
zepto_df['platform'] = 'Zepto'
blinkit_df['platform'] = 'Blinkit'

# Concatenate vertically
combined_df = pd.concat([zepto_df, blinkit_df], ignore_index=True)

# Save combined DataFrame as CSV
combined_df.to_csv('combined_zepto_blinkit.csv', index=False)

print("Combined CSV saved as 'combined_zepto_blinkit.csv'")


Combined CSV saved as 'combined_zepto_blinkit.csv'
