In [None]:
!pip install playwright tqdm --quiet
!playwright install chromium
!playwright install-deps chromium

Installing dependencies...
Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,398 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,838

In [39]:
import os
import re
import pandas as pd
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

# Allow nested event loops (for Colab)
nest_asyncio.apply()

In [40]:
# Scrape plan: main category -> list of alfatah.pk collection slugs.
# - The key is used as the output folder name (see scrape_category).
# - Each slug is appended to https://alfatah.pk/collections/ to build the page
#   URL, and is also used as the CSV file name inside that folder.
CATEGORY_GROUPS = {
    "Groceries": [
        "oil-ghee",
        "sauces-price-in-pakistan",
        "local-drinks-price-in-pakistan",
        "milk-price-in-pakistan",
        "butter-price-in-pakistan",
        "egg-price-in-pakistan"
    ],
    "Electronics": [
        "air-purifier-price-in-pakistan",
        "microwave-oven-price-in-pakistan",
        "blender-price-in-pakistan",
        "washing-machine-price-in-pakistan",
        "food-processor-price-in-pakistan"
    ],
    "Perfumes": [
        "men-perfume-price-in-pakistan",
        "best-perfume-for-women",
        "buy-perfume-gift-set",
        "deodorant-price-in-pakistan"
    ],
    "Makeup": [
        "makeup-foundation-price-in-pakistan",
        "lipstick-price-in-pakistan",
        "makeup-palettes-in-pakistan",
        "buy-eyeshadow-online"
    ],
    "Skin Care": [
        "best-bb-and-cc-creams",
        "sheet-mask-price-in-pakistan",
        "face-wash-price-in-pakistan",
        "body-scrub-in-pakistan"
    ],
    "Toys": [
        "toy-car-price-in-pakistan",
        "gaming-console-price-in-pakistan",
        "video-game-price-in-pakistan",
        "activity-toys",
        "baby-push-car-price-in-pakistan"
    ]
}

In [41]:
async def extract_text(element, selector: str):
    """Return the stripped inner text of the first match of ``selector``.

    Args:
        element: Object exposing an awaitable ``query_selector(selector)``
            (a Playwright page or element handle in this notebook).
        selector: CSS selector looked up beneath ``element``.

    Returns:
        The matched node's ``inner_text()`` with surrounding whitespace
        removed, or ``None`` when nothing matches or the lookup fails.
    """
    try:
        el = await element.query_selector(selector)
        return (await el.inner_text()).strip() if el else None
    except Exception:
        # Best-effort lookup: ordinary failures degrade to None. Catching
        # Exception (not a bare except) lets task cancellation and
        # KeyboardInterrupt propagate so the scraper can still be stopped.
        return None


async def extract_attribute(element, selector: str, attribute: str):
    """Return one attribute of the first match of ``selector``.

    Args:
        element: Object exposing an awaitable ``query_selector(selector)``
            (a Playwright page or element handle in this notebook).
        selector: CSS selector looked up beneath ``element``.
        attribute: Name of the attribute to read from the matched node.

    Returns:
        Whatever ``get_attribute(attribute)`` yields for the matched node, or
        ``None`` when nothing matches or the lookup fails.
    """
    try:
        el = await element.query_selector(selector)
        return await el.get_attribute(attribute) if el else None
    except Exception:
        # Best-effort lookup: ordinary failures degrade to None. Catching
        # Exception (not a bare except) lets task cancellation and
        # KeyboardInterrupt propagate so the scraper can still be stopped.
        return None


def parse_price_text(price_text: str):
    """Extract original and discounted prices from a price label.

    Every ``Rs.<amount>`` occurrence is collected (optional whitespace after
    the prefix is tolerated) and thousands separators are removed.

    Args:
        price_text: Raw text of the price element; may be ``None`` or empty.

    Returns:
        A ``(original, discounted)`` tuple of digit strings:
        * two amounts found -> the second is returned as the original price
          and the first as the discounted price
          (assumes the card lists the sale price first — TODO confirm against
          the site markup);
        * one amount found -> that amount for both;
        * anything else (no text, no amount, three or more amounts) ->
          ``(None, None)``.
    """
    if not price_text:
        return None, None

    # Require at least one digit in the capture so a stray "Rs.," cannot yield
    # an empty price string or inflate the match count.
    raw_amounts = re.findall(r"Rs\.\s*([0-9,]*[0-9][0-9,]*)", price_text)
    amounts = [amount.replace(",", "") for amount in raw_amounts]

    if len(amounts) == 2:
        return amounts[1], amounts[0]
    if len(amounts) == 1:
        return amounts[0], amounts[0]
    return None, None


In [42]:
def _to_absolute_url(raw_url, base_url="https://alfatah.pk"):
    """Resolve a scraped href/src against the shop's base URL (None if blank)."""
    from urllib.parse import urljoin

    cleaned = (raw_url or "").strip()
    if not cleaned:
        return None
    # Already-absolute http(s) URLs are passed through untouched.
    if cleaned.startswith("http"):
        return cleaned
    # urljoin covers "//host/path", "/path" and "path", and leaves other
    # schemes (e.g. "data:") alone instead of gluing the base in front.
    return urljoin(base_url, cleaned)


async def extract_product_data(card):
    """Extract data from a single product card.

    Args:
        card: Element handle of one ``.product-card`` node.

    Returns:
        A dict with the keys ``Name``, ``Original Price (Rs)``,
        ``Discounted Price (Rs)``, ``Discount``, ``Product URL`` and
        ``Image URL``. Missing values are reported as ``"N/A"`` (or
        ``"No Discount"`` for the badge). Returns ``None`` if extraction
        raises, after printing the error.
    """
    try:
        name_elem = await card.query_selector(".product-title a")
        name = (await name_elem.inner_text()).strip() if name_elem else "N/A"
        url_part = await name_elem.get_attribute("href") if name_elem else ""
        # Absolute hrefs must not be prefixed with the base URL a second time.
        full_url = _to_absolute_url(url_part) or "N/A"

        price_text = await extract_text(card, ".product-price")
        original_price, discount_price = parse_price_text(price_text)

        discount_elem = await card.query_selector(".product-sale-badge")
        discount_text = (await discount_elem.inner_text()).strip() if discount_elem else "No Discount"

        img_elem = await card.query_selector("img")
        image_url = await img_elem.get_attribute("src") if img_elem else None
        if not image_url and img_elem:
            # Fall back to the first srcset candidate; split() without an
            # argument ignores leading/repeated whitespace around the URL.
            srcset = await img_elem.get_attribute("srcset") or ""
            first_candidate = srcset.split(",")[0].split()
            image_url = first_candidate[0] if first_candidate else None
        image_url = _to_absolute_url(image_url)

        return {
            "Name": name,
            "Original Price (Rs)": original_price or "N/A",
            "Discounted Price (Rs)": discount_price or "N/A",
            "Discount": discount_text,
            "Product URL": full_url,
            "Image URL": image_url or "N/A",
        }
    except Exception as e:
        # One malformed card should not abort the whole collection.
        print(f"Error extracting product: {e}")
        return None

In [43]:
async def scrape_category(page, main_category, subcategory):
    """Scrape one collection page and write its products to a CSV file.

    The file is saved as ``<main_category>/<subcategory>.csv``. The folder is
    created before navigation. When the page fails to load, or no product
    could be extracted, a message is printed and no CSV is written.

    Args:
        page: Playwright page used for navigation and queries.
        main_category: Folder name for the output file.
        subcategory: Collection slug; used in the URL and as the file name.
    """
    url = f"https://alfatah.pk/collections/{subcategory}"
    print(f"\n{'='*80}")
    print(f"Scraping [{main_category}] → {subcategory}")
    print(f"URL: {url}")
    print(f"{'='*80}")

    # Output folder for this main category (created even if loading fails).
    os.makedirs(main_category, exist_ok=True)

    try:
        await page.goto(url, timeout=60000)
        await page.wait_for_selector(".product-card", timeout=30000)
        # Short fixed pause after the first card appears
        # (presumably to let remaining cards render — TODO confirm).
        await asyncio.sleep(2)
    except Exception as e:
        print(f"Failed to load {subcategory}: {e}")
        return

    product_cards = await page.query_selector_all(".product-card")
    print(f"Found {len(product_cards)} products in {subcategory}")

    rows = []
    for idx, card in enumerate(product_cards, start=1):
        data = await extract_product_data(card)
        if not data:
            # Extraction failed for this card; skip it.
            continue
        rows.append(data)
        print(f"Extracted {idx}/{len(product_cards)}: {data['Name']}")

    if not rows:
        print(f"No products found for {subcategory}")
        return

    csv_path = os.path.join(main_category, f"{subcategory}.csv")
    df = pd.DataFrame(rows)
    df.to_csv(csv_path, index=False)
    print(f"Saved {len(df)} products to: {csv_path}")

In [44]:
async def run_scraper():
    """Scrape every subcategory listed in ``CATEGORY_GROUPS``.

    Launches one headless Chromium instance, reuses a single page for all
    collections, and visits them sequentially via ``scrape_category``.
    The browser is closed even if a scrape raises, and the exception then
    propagates to the caller.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            for main_category, subcategories in CATEGORY_GROUPS.items():
                print(f"\n{'#'*100}")
                print(f"Processing MAIN CATEGORY: {main_category}")
                print(f"{'#'*100}")
                for subcat in subcategories:
                    await scrape_category(page, main_category, subcat)
        finally:
            # Always release the browser process, including on errors.
            await browser.close()
    print("\nAll categories processed successfully.")


In [45]:
# Entry point of the notebook. The top-level `await` relies on the notebook
# kernel's support for awaiting inside cells; in a plain Python script use
# `asyncio.run(run_scraper())` instead.
print("Starting scraper...")
await run_scraper()


Starting scraper...

####################################################################################################
Processing MAIN CATEGORY: Groceries
####################################################################################################

Scraping [Groceries] → oil-ghee
URL: https://alfatah.pk/collections/oil-ghee
Found 160 products in oil-ghee
Extracted 1/160: DALDA COOKING OIL POUCH 1 LTR
Extracted 2/160: MUNDIAL OLIVE OIL POMACE TIN 4 LTR
Extracted 3/160: SUFI CANOLA COOKING OIL POUCH 1 LTR
Extracted 4/160: SASSO OLIVE OIL TIN 100 ML
Extracted 5/160: SEASONS CANOLA OIL POUCH 1 LTR
Extracted 6/160: FILIPPO BERIO OLIVE OIL EXTRA VIRGIN BOTTLE 250 ML
Extracted 7/160: CANOLIVE PREMIUM CANOLA OIL POUCH 1 LTR
Extracted 8/160: SASSO OLIVE OIL TIN 200 ML
Extracted 9/160: MEZAN OLIVOLA OLIVE AND CANOLA OIL 1 LTR POUCH
Extracted 10/160: BORGES EXTRA VIRGIN OLIVE OIL BOTTLE 125 ML
Extracted 11/160: SUFI SUNFLOWER COOKING OIL POUCH 1 LTR
Extracted 12/160: DALDA CANOLA OIL P