In [1]:
!pip install playwright
!playwright install

Collecting playwright
  Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.55.0 pyee-13.0.0
Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 132.2s[0K[1G173.7 MiB [] 0% 36.3s[0K[1G173.7 MiB [] 0% 18.7s[0K[1G173.7 MiB [] 0% 9.6s[0K[1G173.7 MiB [] 1% 5.8s[0K[1G173.7 MiB [] 1% 4.5s[0K[1G173.7 MiB [] 2% 3.6s[0K[1G173.7 MiB [] 3% 3.1s[0K[1G173.7 MiB [] 4% 2.8s[0K[1G173.7 MiB [] 5% 2.6

In [2]:
!pip install asyncio

Collecting asyncio
  Downloading asyncio-4.0.0-py3-none-any.whl.metadata (994 bytes)
Downloading asyncio-4.0.0-py3-none-any.whl (5.6 kB)
Installing collected packages: asyncio
Successfully installed asyncio-4.0.0


In [3]:
import asyncio
from playwright.async_api import async_playwright
import json
import time

class MetroLahoreScraper:
    def __init__(self, headless=True):
        self.headless = headless
        self.base_url = "https://www.metro-online.pk/home"

    async def scroll_to_load_all_products(self, page):
        """Scroll to bottom multiple times to load all lazy-loaded products"""
        print("    Scrolling to load all products...")
        previous_height = 0
        scroll_attempts = 0
        max_scroll_attempts = 15
        no_change_count = 0

        while scroll_attempts < max_scroll_attempts:
            # Get current scroll height
            current_height = await page.evaluate('''() => {
                window.scrollTo(0, document.body.scrollHeight);
                return document.body.scrollHeight;
            }''')

            # Wait for content to load
            await asyncio.sleep(2)

            # Check for load more button
            load_more_selectors = [
                'button:has-text("Load More")',
                'button:has-text("Show More")',
                'button:has-text("View More")',
                '.load-more',
                '.show-more'
            ]

            for selector in load_more_selectors:
                try:
                    load_more_btn = await page.query_selector(selector)
                    if load_more_btn and await load_more_btn.is_visible():
                        await load_more_btn.click()
                        await asyncio.sleep(3)
                        break
                except:
                    continue

            # Check if we've reached the bottom
            if current_height == previous_height:
                no_change_count += 1
                if no_change_count >= 2:  # If no change for 2 consecutive scrolls
                    break
            else:
                no_change_count = 0

            previous_height = current_height
            scroll_attempts += 1

            # Check if we have a reasonable number of products loaded
            product_count = await page.evaluate('''() => {
                return document.querySelectorAll('.CategoryGrid_product_card__FUMXW').length;
            }''')

            print(f"    Scroll {scroll_attempts}: Loaded {product_count} products so far...")

        print(f"    Finished scrolling. Total attempts: {scroll_attempts}")
        return scroll_attempts

    async def scrape_main_categories(self, page):
        """Extract all main categories from homepage"""
        print("Step 1: Extracting main categories...")

        await page.wait_for_selector('.CategoryGrid_grid_container__ouyHW', timeout=15000)

        main_categories = await page.evaluate('''(base_url) => {
            const categories = [];
            const categoryElements = document.querySelectorAll('.CategoryGrid_grid_item__FXimL');

            categoryElements.forEach((element) => {
                const linkElement = element.querySelector('a');
                const imgElement = element.querySelector('img');

                if (linkElement && imgElement) {
                    const category = {
                        name: imgElement.alt || 'No name',
                        url: linkElement.href || 'No URL',
                        image_url: imgElement.src || 'No image',
                        sub_categories: []
                    };

                    categories.push(category);
                }
            });

            return categories;
        }''', self.base_url)

        print(f" Found {len(main_categories)} main categories")
        return main_categories

    async def scrape_sub_categories(self, browser, main_categories):
        """Extract sub-categories for each main category"""
        print("\nStep 2: Extracting sub-categories...")

        all_subcategory_links = []

        for i, category in enumerate(main_categories, 1):
            print(f"  Processing category {i}/{len(main_categories)}: {category['name']}")

            try:
                category_page = await browser.new_page()

                # Build full URL
                if category['url'].startswith('/'):
                    full_url = f"{self.base_url}{category['url']}"
                else:
                    full_url = category['url']

                await category_page.goto(full_url, wait_until='networkidle', timeout=45000)

                # Try multiple selectors for sub-categories container
                sub_category_selectors = [
                    '.sc-gKPRtg.jJzJeK',
                ]

                sub_categories = []
                for selector in sub_category_selectors:
                    try:
                        await category_page.wait_for_selector(selector, timeout=5000)
                        sub_categories = await category_page.evaluate('''(selector) => {
                            const subCats = [];
                            const container = document.querySelector(selector);

                            if (container) {
                                const links = container.querySelectorAll('a');
                                links.forEach((link) => {
                                    const imgElement = link.querySelector('img');
                                    const nameElement = link.querySelector('h6, .sc-cwSeag, [class*="name"], [class*="title"]');

                                    if (link.href && nameElement) {
                                        const subCat = {
                                            name: nameElement.textContent?.trim() || 'No name',
                                            url: link.href,
                                            image_url: imgElement?.src || 'No image',
                                            alt_text: imgElement?.alt || 'No alt text'
                                        };
                                        subCats.push(subCat);
                                    }
                                });
                            }
                            return subCats;
                        }''', selector)

                        if sub_categories:
                            break
                    except:
                        continue

                category['sub_categories'] = sub_categories

                # Add to master list
                for sub_cat in sub_categories:
                    all_subcategory_links.append({
                        'main_category': category['name'],
                        'sub_category': sub_cat['name'],
                        'url': sub_cat['url'],
                        'image_url': sub_cat['image_url'],
                        'main_category_url': category['url']
                    })

                print(f"    Found {len(sub_categories)} sub-categories")
                await category_page.close()
                await asyncio.sleep(1.5)

            except Exception as e:
                print(f"    Error processing {category['name']}: {str(e)}")
                category['sub_categories'] = []
                try:
                    await category_page.close()
                except:
                    pass
                continue

        total_subcategories = sum(len(cat['sub_categories']) for cat in main_categories)
        print(f"\nTotal sub-categories found: {total_subcategories}")
        return main_categories, all_subcategory_links

    async def scrape_products_from_subcategory(self, page, subcat_link):
        """Scrape all products from a single sub-category with lazy loading handling"""
        try:
            # Build full URL
            if subcat_link['url'].startswith('/'):
                full_url = f"{self.base_url}{subcat_link['url']}"
            else:
                full_url = subcat_link['url']

            await page.goto(full_url, wait_until='networkidle', timeout=45000)

            # Wait for initial products
            try:
                await page.wait_for_selector('.CategoryGrid_product_card__FUMXW', timeout=10000)
            except:
                return []  # No products found

            # Scroll to load all lazy-loaded products
            await self.scroll_to_load_all_products(page)

            # Final wait to ensure everything is loaded
            await asyncio.sleep(2)

            # Extract products
            products = await page.evaluate('''() => {
                const products = [];
                const productElements = document.querySelectorAll('.CategoryGrid_product_card__FUMXW');

                productElements.forEach((productEl) => {
                    // Product name
                    const nameElement = productEl.querySelector('.CategoryGrid_product_name__3nYsN');
                    const productName = nameElement?.textContent?.trim() || 'No name';

                    // Product price
                    const priceElement = productEl.querySelector('.CategoryGrid_product_price__Svf8T');
                    const productPrice = priceElement?.textContent?.trim() || 'No price';

                    // Product URL
                    const linkElement = productEl.querySelector('a[href*="/detail/"]');
                    const productUrl = linkElement?.href || 'No URL';
                    const productPath = linkElement?.getAttribute('href') || 'No path';

                    // Product image
                    const imgElement = productEl.querySelector('img');
                    const productImage = imgElement?.src || 'No image';
                    const productAlt = imgElement?.alt || 'No alt text';

                    // Badge
                    const badgeElement = productEl.querySelector('[data-after-content]');
                    const badge = badgeElement?.getAttribute('data-after-content') || null;

                    // Product ID from URL
                    const urlParts = productUrl.split('/');
                    const productId = urlParts[urlParts.length - 1] || 'No ID';

                    const product = {
                        id: productId,
                        name: productName,
                        price: productPrice,
                        url: productUrl,
                        path: productPath,
                        image_url: productImage,
                        alt_text: productAlt,
                        badge: badge,
                        scraped_at: new Date().toISOString()
                    };

                    products.push(product);
                });

                return products;
            }''')

            # Add category info
            for product in products:
                product.update({
                    'main_category': subcat_link['main_category'],
                    'sub_category': subcat_link['sub_category'],
                    'main_category_url': subcat_link['main_category_url'],
                    'sub_category_url': subcat_link['url']
                })

            return products

        except Exception as e:
            print(f"      Error scraping products: {str(e)}")
            return []

    async def scrape_all_products(self, browser, all_subcategory_links, max_subcategories=None):
        """Scrape products from all sub-categories"""
        print("\nStep 3: Scraping products from sub-categories...")

        if max_subcategories:
            all_subcategory_links = all_subcategory_links[:max_subcategories]
            print(f"  Testing mode: Scraping first {max_subcategories} sub-categories")

        all_products = []

        for i, subcat_link in enumerate(all_subcategory_links, 1):
            print(f"  Processing sub-category {i}/{len(all_subcategory_links)}: {subcat_link['sub_category']}")

            try:
                product_page = await browser.new_page()
                products = await self.scrape_products_from_subcategory(product_page, subcat_link)
                all_products.extend(products)

                print(f"    Found {len(products)} products")
                await product_page.close()
                await asyncio.sleep(2)  # Rate limiting

            except Exception as e:
                print(f"    Failed to process {subcat_link['sub_category']}: {str(e)}")
                try:
                    await product_page.close()
                except:
                    pass
                continue

        return all_products

    async def run_complete_scraping(self, test_mode=True):
        """Main function to run complete scraping process"""
        print("Starting Metro Lahore Complete Scraper")
        print("=" * 60)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)

            try:
                # Step 1: Get main page and categories
                main_page = await browser.new_page()
                await main_page.goto(self.base_url, wait_until='networkidle', timeout=45000)

                main_categories = await self.scrape_main_categories(main_page)
                await main_page.close()

                if not main_categories:
                    print("No main categories found. Exiting.")
                    return

                # Step 2: Get sub-categories
                main_categories_with_subs, all_subcategory_links = await self.scrape_sub_categories(browser, main_categories)

                if not all_subcategory_links:
                    print("No sub-categories found. Exiting.")
                    return

                # Step 3: Get products
                if test_mode:
                    all_products = await self.scrape_all_products(browser, all_subcategory_links, max_subcategories=1)
                else:
                    all_products = await self.scrape_all_products(browser, all_subcategory_links)

                # Save results
                print(f"\n" + "=" * 60)
                print(" Saving results...")

                # Save complete hierarchy
                with open('metro_complete_hierarchy.json', 'w', encoding='utf-8') as f:
                    json.dump(main_categories_with_subs, f, indent=2, ensure_ascii=False)
                print(" Saved: metro_complete_hierarchy.json")

                # Save sub-category links
                with open('metro_subcategory_links.json', 'w', encoding='utf-8') as f:
                    json.dump(all_subcategory_links, f, indent=2, ensure_ascii=False)
                print(" Saved: metro_subcategory_links.json")

                # Save products
                with open('metro_products.json', 'w', encoding='utf-8') as f:
                    json.dump(all_products, f, indent=2, ensure_ascii=False)
                print(" Saved: metro_products.json")

                # Print summary
                print(f"\nSCRAPING COMPLETED!")
                print(f"Summary:")
                print(f"   • Main Categories: {len(main_categories_with_subs)}")
                print(f"   • Sub-categories: {len(all_subcategory_links)}")
                print(f"   • Products: {len(all_products)}")

                if all_products:
                    print(f"\n Sample Products:")
                    for i, product in enumerate(all_products[:5]):
                        print(f"   {i+1}. {product['name']} - {product['price']}")

            except Exception as e:
                print(f" Critical error: {str(e)}")
            finally:
                await browser.close()

async def main():
    scraper = MetroLahoreScraper(headless=True)

    await scraper.run_complete_scraping(test_mode=False)

if __name__ == "__main__":

    await main()

Starting Metro Lahore Complete Scraper
Step 1: Extracting main categories...
 Found 12 main categories

Step 2: Extracting sub-categories...
  Processing category 1/12: Metro Post Grocery
    Found 0 sub-categories
  Processing category 2/12: Fruits And Vegetables
    Found 3 sub-categories
  Processing category 3/12: Meat
    Found 3 sub-categories
  Processing category 4/12: Tea and Coffee
    Found 2 sub-categories
  Processing category 5/12: Commodities
    Found 5 sub-categories
  Processing category 6/12: Beverages
    Found 6 sub-categories
  Processing category 7/12: Dairy
    Found 6 sub-categories
  Processing category 8/12: Snacks
    Found 5 sub-categories
  Processing category 9/12: Toiletries
    Found 2 sub-categories
  Processing category 10/12: Frozen Ready to Cook
    Found 4 sub-categories
  Processing category 11/12: Laundry
    Found 3 sub-categories
  Processing category 12/12: Toiletries
    Found 2 sub-categories

Total sub-categories found: 41

Step 3: Scraping