In [2]:
!pip install playwright
!playwright install
!apt-get install libxcomposite1 libgtk-3-0 libatk1.0-0

Collecting playwright
  Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.55.0 pyee-13.0.0
Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 0.0s[0K[1G173.7 MiB [] 0% 34.2s[0K[1G173.7 MiB [] 0% 14.2s[0K[1G173.7 MiB [] 0% 9.1s[0K[1G173.7 MiB [] 1% 6.3s[0K[1G173.7 MiB [] 1% 5.3s[0K[1G173.7 MiB [] 2% 4.8s[0K[1G173.7 MiB [] 2% 4.1s[0K[1G173.7 MiB [] 3% 3.8s[0K[1G173.7 MiB [] 4% 3.5s

In [4]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import json
from datetime import datetime

# Site root; every href scraped from the navigation bar is resolved against it.
_BASE_URL = "https://jalalsons.com.pk"

# Top-level navigation entries whose product listings are scraped.
_TARGET_CATEGORIES = ("BAKERY", "DELI", "JS ICECREAM", "SWEETS", "DEALS", "GROCERY")

# Column order of the exported CSV / Excel files.
_OUTPUT_COLUMNS = [
    "branch", "category", "subcategory", "product_name",
    "price", "image_url", "product_url",
]


def _absolute_url(href):
    """Resolve a scraped href against the site root (handles relative and absolute hrefs)."""
    from urllib.parse import urljoin

    return urljoin(_BASE_URL + "/", href)


async def _dismiss_popup(page):
    """Close the landing-page popup if it is visible; failures are reported, not raised."""
    try:
        if await page.locator("#website_custom_popup").is_visible():
            await page.locator('#website_custom_popup .modal-header a.cursor-pointer.ms-auto').click()
            print("Popup closed.")
    except Exception as exc:
        # Best effort: a missing popup must not abort the scrape, but the reason
        # is printed so real problems (e.g. a changed selector) stay visible.
        print(f"No popup found or already closed: {exc}")


async def _collect_category_links(page):
    """Map each target main category to a list of {"name", "url"} listing pages."""
    nav_items = await page.locator("ul.navbar-nav > li.nav-item").element_handles()
    category_links = {}

    # The last three nav entries are skipped, exactly as before
    # (presumably they are not product categories -- TODO confirm).
    for li in nav_items[:-3]:
        main_cat = await li.query_selector("a.nav-link")
        if not main_cat:
            continue

        main_name = (await main_cat.inner_text()).strip()
        if main_name not in _TARGET_CATEGORIES:
            continue

        # Hover first, then give the dropdown a moment before querying its links.
        await main_cat.hover()
        await page.wait_for_timeout(500)
        sub_links = await li.query_selector_all("ul.dropdown-content a")

        if sub_links:
            urls = []
            for sub in sub_links:
                sub_name = (await sub.inner_text()).strip()
                sub_href = await sub.get_attribute("href")
                if sub_href:
                    urls.append({"name": sub_name, "url": _absolute_url(sub_href)})
            category_links[main_name] = urls
        else:
            # No dropdown: the main link itself is the only listing page.
            href = await main_cat.get_attribute("href")
            if href:
                category_links[main_name] = [{"name": main_name, "url": _absolute_url(href)}]

    return category_links


async def _scrape_listing(page, branch_name, main_name, sub_name, sub_url):
    """Return one record per product on a listing page that has both a name and a price."""
    try:
        await page.goto(sub_url, timeout=60000)
        await page.wait_for_selector(".single_product_theme", timeout=10000)
    except Exception as e:
        print(f"   Could not load {sub_name}: {e}")
        return []

    products = await page.query_selector_all(".single_product_theme")

    # A listing named after its main category has no separate subcategory.
    subcategory = None if main_name == sub_name else sub_name

    records = []
    for product in products:
        name_el = await product.query_selector("p.product_name_theme")
        price_el = await product.query_selector("span.price-value")
        img_el = await product.query_selector("img")

        name = (await name_el.inner_text()).strip() if name_el else None
        price = (await price_el.inner_text()).strip() if price_el else None
        image = await img_el.get_attribute("src") if img_el else None

        if name and price:
            records.append({
                "branch": branch_name,
                "category": main_name,
                "subcategory": subcategory,
                "product_name": name,
                "price": price,
                "image_url": image,
                # This is the listing page the product was found on.
                "product_url": sub_url,
            })

    print(f"   Scraped {len(products)} products from {sub_name}")
    return records


def _save_and_summarize(records, branch_name):
    """Write the records to timestamped CSV and Excel files and print a summary."""
    df = pd.DataFrame(records)
    df = df[_OUTPUT_COLUMNS]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"jalalsons_products_{timestamp}.csv"
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')

    # pandas needs an Excel writer engine (e.g. openpyxl) installed for .xlsx output.
    excel_filename = f"jalalsons_products_{timestamp}.xlsx"
    df.to_excel(excel_filename, index=False, sheet_name='Products')

    print(f"\n{'='*80}")
    print("SCRAPING SUMMARY")
    print(f"{'='*80}")
    print(f"Total products scraped: {len(df)}")
    print(f"Branch: {branch_name}")
    print(f"Total categories: {df['category'].nunique()}")
    print("\nProducts per category:")
    print(df['category'].value_counts())
    print("\nData saved to:")
    print(f"   - {csv_filename}")
    print(f"   - {excel_filename}")

    print("\nSample data:")
    print(df.head(10).to_string())


async def scrape(branch_index: int = 2):
    """Scrape product listings for one Lahore delivery branch of jalalsons.com.pk.

    Opens the site in headless Chromium, selects one delivery branch, walks the
    target categories in the navigation bar, collects name / price / image for
    every product on each listing page, and writes the result to timestamped
    CSV and Excel files in the working directory.

    Args:
        branch_index: Zero-based position of the branch within the Lahore
            branches offered by the delivery selector. Defaults to 2, the
            third branch, which is what the original comments, messages and
            length guard describe. The original code actually read index 3,
            which picked the fourth branch and raised IndexError when exactly
            three branches were available; pass 3 to keep that selection.

    Returns:
        None. Results are written to disk and a summary is printed.

    Raises:
        ValueError: If branch_index is negative.
    """
    if branch_index < 0:
        raise ValueError("branch_index must be non-negative")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            await page.goto(_BASE_URL, timeout=60000)
            print("Navigated to site.")

            await _dismiss_popup(page)

            # Open the delivery tab and read every branch offered by the selector.
            await page.click("a#delivery-loc-tab")
            await page.wait_for_selector("#selectDeliveryBranch", timeout=10000)
            branches = await page.locator("#selectDeliveryBranch option").all_text_contents()
            valid_branches = [b for b in branches if "Please select" not in b and "Lahore" in b]

            print(f"\nFound {len(valid_branches)} valid branches in Lahore: {valid_branches}")

            # Guard and index now agree, so a short branch list cannot raise IndexError.
            if branch_index >= len(valid_branches):
                print(f"Not enough branches found! Need at least {branch_index + 1} branches.")
                return

            branch_name = valid_branches[branch_index]

            print(f"\n{'='*80}")
            print(f"Processing ONLY ASKARI Branch: {branch_name}")
            print(f"{'='*80}")

            # Select the branch and confirm the delivery order.
            await page.select_option("#selectDeliveryBranch", label=branch_name)
            await page.wait_for_timeout(2000)
            await page.click("a#delivery_order")
            print(f"Selected branch: {branch_name}")

            await page.wait_for_selector("ul.navbar-nav", timeout=10000)

            category_links = await _collect_category_links(page)

            # Accumulate plain dicts and build the DataFrame once at the end.
            all_products_data = []
            for main_name, subcats in category_links.items():
                print(f"\nCategory: {main_name}")
                for sub in subcats:
                    print(f"   Scraping {sub['name']}: {sub['url']}")
                    all_products_data.extend(
                        await _scrape_listing(page, branch_name, main_name, sub["name"], sub["url"])
                    )

            if all_products_data:
                _save_and_summarize(all_products_data, branch_name)
            else:
                print(f"No products were scraped from branch: {branch_name}")
        finally:
            # Single exit point: the browser is closed on success, early return and error.
            await browser.close()

# Run the scraper. A bare top-level `await` is used because this is a notebook
# cell (IPython allows awaiting at the top level); a plain script would need
# asyncio.run(scrape()) instead.
await scrape()

Navigated to site.
Popup closed.

Found 37 valid branches in Lahore: ['Allama Iqbal Town Branch Lahore', 'Askari 10 Branch, Sector F Lahore', 'Askari 10 sector S Lahore', 'Askari 11 Branch Lahore', 'Ayubia Market Branch Lahore', 'Bahria Orchard Lahore', 'Bahria Town ( Canal road ) Branch Lahore', 'Bahria Town ( Nishtar block ) Branch Lahore', 'Cantt CSD Branch  Lahore', 'Chauburji Branch Lahore', 'College Road Branch Lahore', 'DHA Phase 1, H- block Branch Lahore', 'DHA Phase 3, XX1A  Branch Lahore', 'DHA Phase 3, Y Block Branch Lahore', 'DHA Phase 4 Branch Lahore', 'DHA Phase 5 Branch Lahore', 'DHA Phase 6 Branch Lahore', 'DHA Phase 6 Sector C Branch Lahore', 'DHA Phase 8 Branch Lahore', 'Eden Canal Villas Lahore', 'Hussain Chowk Branch Lahore', 'Izmir Town Branch Lahore', 'Johar Town Branch Lahore', 'Johar Town Near Shadiwal Chowk Branch Lahore', 'Johar Town Shaukat Khanum Flyover Branch Lahore', 'Lake City Branch Lahore', 'Main Market Branch Lahore', 'Mall road Branch Lahore', 'Model