In [1]:
!pip install playwright
!playwright install chromium
!apt-get install -y libnss3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \
                     libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 \
                     libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2


Collecting playwright
  Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.55.0 pyee-13.0.0
Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 11.5s[0K[1G173.7 MiB [] 0% 37.4s[0K[1G173.7 MiB [] 0% 41.7s[0K[1G173.7 MiB [] 0% 28.7s[0K[1G173.7 MiB [] 0% 30.1s[0K[1G173.7 MiB [] 0% 22.3s[0K[1G173.7 MiB [] 0% 14.8s[0K[1G173.7 MiB [] 1% 19.4s[0K[1G173.7 MiB [] 1% 18.1s[0K[1G173.7 MiB [] 1

In [2]:
import os
import re
import pandas as pd
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
import nest_asyncio

# Allow nested event loops (for Colab)
nest_asyncio.apply()

In [3]:
# Site root. Site-relative hrefs (those starting with "/") found while crawling
# are prefixed with this value to form absolute URLs.
BASE_URL = "https://rajasahibfresh.pk"

def clean_category_name(raw_name: str) -> str:
    """
    Turn a URL slug such as ``cleaning--household-31043`` into a display name.

    Steps, in order:
    - hyphens become spaces
    - every run of digits is dropped
    - runs of whitespace are collapsed to a single space
    - the result is title-cased and trimmed

    A slug made only of digits and hyphens yields an empty string.
    """
    cleaned = raw_name.replace("-", " ")
    # Apply the regex substitutions as a small pipeline: digits first, then
    # whitespace, so gaps left behind by removed digits are collapsed too.
    for pattern, replacement in ((r"\d+", ""), (r"\s+", " ")):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.title().strip()


def _to_absolute_url(href: str) -> str:
    """Prefix site-relative hrefs (starting with "/") with BASE_URL; leave others as-is."""
    return BASE_URL + href if href.startswith("/") else href


async def run():
    """
    Crawl the homepage and each main category page to build the scrape plan.

    Step 1 reads every link in the homepage carousel (``div.swiper-wrapper a``).
    Links for Fruits & Vegetables, Mega Discounts and Shoe Care are set aside;
    the rest are visited in Step 2, where the subcategory links inside the
    subcategory grid are collected. Step 3 adds Fruits & Vegetables and
    Shoe Care back as single-link categories and prints a summary.

    Returns:
        dict: cleaned category name -> list of absolute URLs to scrape.
        The list is empty only when the category page failed to load or the
        subcategory lookup raised; a category page without a subcategory grid
        is stored as a one-element list containing the category URL itself.
    """
    # Grid that holds the subcategory tiles on a category page.
    subcategory_grid = "div.MuiGrid-root.MuiGrid-container.blink-style-1d3bbye"

    category_dict = {}
    fruitsAndVeg = None
    megaDiscounts = None  # recorded for a future add-on; not used in the result yet
    shoeCare = None

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            print("--- Step 1: Collecting Main Category Links ---")
            await page.goto(BASE_URL, timeout=60000)
            await page.wait_for_selector("div.swiper-wrapper a")

            links = page.locator("div.swiper-wrapper a")
            count = await links.count()
            print(f"Found {count} links on homepage.\n")

            category_links = []
            for i in range(count):
                href = await links.nth(i).get_attribute("href")
                if not href or href == "/":
                    continue

                href = _to_absolute_url(href)

                if "fruits-and-vegetables" in href:
                    fruitsAndVeg = href
                    print(f"Stored Fruits & Vegetables category separately: {href}")
                elif "mega-discounts" in href:
                    megaDiscounts = href
                    print(f"Stored Mega Discounts category separately: {href}")
                elif "shoe-care" in href:
                    shoeCare = href
                    print(f"Stored Shoe Care category separately: {href}")
                else:
                    category_links.append(href)

            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # traversal order (and the log) is the same on every run.
            category_links = list(dict.fromkeys(category_links))
            print(f"\nCollected unique category links for traversal: {len(category_links)}")

            # Step 2: Visit each main category to find subcategories
            print("\n--- Step 2: Traversing Categories for Subcategory Links ---")
            for idx, link in enumerate(category_links):
                # rstrip so a trailing "/" does not produce an empty last segment.
                raw_name = link.rstrip("/").split("/")[-1]
                category_name = clean_category_name(raw_name)

                # Skip blank/invalid category names (e.g. "1542314")
                if not category_name:
                    print(f"\n[{idx+1}/{len(category_links)}] Skipping invalid category at {link}")
                    continue

                print(f"\n[{idx+1}/{len(category_links)}] Visiting category: {category_name} at {link}")

                try:
                    await page.goto(link, timeout=60000, wait_until="domcontentloaded")
                    # Fixed pause to give client-side rendering time to add the grid.
                    await page.wait_for_timeout(2000)
                    has_subcategories = await page.locator(subcategory_grid).count() > 0
                except Exception as e:
                    print(f"  Error loading page: {str(e)[:100]}")
                    category_dict[category_name] = []
                    continue

                if not has_subcategories:
                    print("  No subcategory grid found - likely a product listing page")
                    # Keep the category page itself as the single link so its
                    # products are still scraped, the same way Fruits & Vegetables
                    # and Shoe Care are registered in Step 3.
                    category_dict[category_name] = [link]
                    continue

                try:
                    anchors = page.locator(f"{subcategory_grid} a")
                    link_count = await anchors.count()
                    print(f"  Found {link_count} potential sub-category links")

                    sub_links = []
                    for i in range(link_count):
                        try:
                            href = await anchors.nth(i).get_attribute("href")
                        except Exception as e:
                            print(f"    Skipped link at index {i}: {str(e)[:50]}")
                            continue
                        if not href:
                            continue
                        href = _to_absolute_url(href)
                        # Only catalog links that are not the category page itself.
                        if "/catalog/" in href and href != link:
                            sub_links.append(href)

                    sub_links = list(dict.fromkeys(sub_links))
                    category_dict[category_name] = sub_links
                    print(f"  Stored {len(sub_links)} unique subcategory links for {category_name}")
                except Exception as e:
                    print(f"  Error finding subcategories: {str(e)[:100]}")
                    category_dict[category_name] = []
        finally:
            # Close the browser even when a step above raised.
            await browser.close()

        # Step 3: Summary
        print("\n--- Step 3: Summary of Collected Data ---")

        # Add Fruits & Vegetables and Shoe Care manually as categories
        if fruitsAndVeg:
            category_dict["Fruits And Vegetables"] = [fruitsAndVeg]
        if shoeCare:
            category_dict["Shoe Care"] = [shoeCare]

        total_subcategories = 0
        for k, v in category_dict.items():
            print(f"{k}: {len(v)} links")
            total_subcategories += len(v)

        print(f"\nTotal categories: {len(category_dict)}")
        print(f"Total subcategories: {total_subcategories}")

        return category_dict


# Main: run the category crawl and keep the resulting
# {category name: [links]} mapping for the cells that follow.
category_dict = await run()


--- Step 1: Collecting Main Category Links ---
Found 27 links on homepage.

Stored Mega Discounts category separately: https://rajasahibfresh.pk/catalog/mega-discounts-32350
Stored Shoe Care category separately: https://rajasahibfresh.pk/catalog/shoe-care-31055
Stored Fruits & Vegetables category separately: https://rajasahibfresh.pk/catalog/fruits-and-vegetables-31049

Collected unique category links for traversal: 24

--- Step 2: Traversing Categories for Subcategory Links ---

[1/24] Visiting category: Cleaning Household at https://rajasahibfresh.pk/catalog/cleaning--household-31043
  Found 10 potential sub-category links
  Stored 8 unique subcategory links for Cleaning Household

[2/24] Visiting category: Hair Care at https://rajasahibfresh.pk/catalog/hair-care-31050
  Found 11 potential sub-category links
  Stored 9 unique subcategory links for Hair Care

[3/24] Visiting category: Chips And Snacks at https://rajasahibfresh.pk/catalog/chips-and-snacks-31042
  Found 14 potential sub

In [4]:
# Quick sanity check of the crawl result returned by run().
print(category_dict.keys())       # List all category names

dict_keys(['Cleaning Household', 'Hair Care', 'Chips And Snacks', 'Car Care', 'Pet Care', 'Frozen', 'Bath Body', 'Cooking Essentials', 'Mens Care', 'Fragrances', 'Health Wellness', 'Beverages', 'Meat', 'Women Adult Care', 'Hand Foot Care', 'Breakfast', 'Dental Care', 'Baby Foods Diapers', 'Dairy', 'Makeup', 'Kitchen Home Appliances', 'Electronic Accessories', 'Skin Care', 'Fruits And Vegetables', 'Shoe Care'])


In [5]:
# Dump every category together with its list of collected links for manual inspection.
for key, value in category_dict.items():
    print(key, ":", value)


Cleaning Household : ['https://rajasahibfresh.pk/catalog/cleaning--household-31043/laundry--detergents-56003', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/foil--cling-film-paper-55999', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/tissue--toilet-rolls-56005', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/garbage-bag-56000', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/household-cleaners-56001', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/kitchen-cleaners-56002', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/repellents--insecticides-56004', 'https://rajasahibfresh.pk/catalog/cleaning--household-31043/air-fresheners-55998']
Hair Care : ['https://rajasahibfresh.pk/catalog/hair-care-31050/hair-masks-56166', 'https://rajasahibfresh.pk/catalog/hair-care-31050/hair-dryers--straighteners-60651', 'https://rajasahibfresh.pk/catalog/hair-care-31050/hair-serum-56235', 'https://rajasahibfresh.pk/catalog/

In [6]:
async def extract_products_from_url(page, url, category_name, subcategory_name=None):
    """
    Extract all products from a listing page that loads more items on scroll.

    The page is scrolled to the bottom repeatedly. After each scroll every
    product tile currently in the DOM is read in a single ``page.evaluate``
    call, and tiles whose DOM ``id`` has not been seen yet are added to the
    result.

    Args:
        page: Playwright page used for navigation and evaluation.
        url: Listing URL to open.
        category_name: Value stored in each product's ``category`` field.
        subcategory_name: Value stored in ``subcategory``; when falsy the
            category name is used instead.

    Returns:
        list[dict]: one dict per product with the keys ``product_id``,
        ``title``, ``description``, ``price``, ``image_url``, ``category``,
        ``subcategory`` and ``url``. If an error interrupts the scrape, the
        products collected up to that point are returned (possibly none).
    """
    item_selector = "div.hazle-product-item_product_item__FSm1N"
    max_scroll_attempts = 20

    # Reads every product tile in one round trip. Each entry carries the raw
    # DOM id ("dom_id") so de-duplication can happen on the Python side.
    extract_js = """([selector, category, subcategory]) => {
        const containers = document.querySelectorAll(selector);
        return Array.from(containers, (container) => {
            const textOf = (sel) => {
                const el = container.querySelector(sel);
                return el ? el.textContent.trim() : '';
            };
            const imgEl = container.querySelector('img');
            const linkEl = container.querySelector('a');
            return {
                dom_id: container.id || '',
                product_id: container.id ? container.id.replace('product-item-', '') : '',
                title: textOf('h4'),
                description: textOf('.hazle-product-item_product_item_description__ejRDa'),
                price: textOf('.hazle-product-item_product_item_price_label__ET_we span'),
                image_url: imgEl ? imgEl.src : '',
                category: category,
                subcategory: subcategory || category,
                url: (linkEl && linkEl.href) ? linkEl.href : ''
            };
        });
    }"""

    # Created before the try block so partial results survive an error.
    products = []
    seen_product_ids = set()

    print(f"    Extracting products from: {url.split('/')[-1]}")

    try:
        await page.goto(url, timeout=60000, wait_until="domcontentloaded")
        await page.wait_for_timeout(3000)

        product_containers = page.locator(item_selector)
        scroll_attempts = 0
        no_new_products_count = 0

        while scroll_attempts < max_scroll_attempts:
            try:
                await page.wait_for_selector(item_selector)
            except PlaywrightTimeoutError:
                # No tile showed up within the page's timeout: nothing to scrape.
                print("No products found on page")
                break

            scraped = await page.evaluate(
                extract_js, [item_selector, category_name, subcategory_name]
            ) or []
            current_count = len(scraped)

            if current_count == 0:
                print("No products found on page")
                break

            batch_products = []
            for item in scraped:
                dom_id = item.pop("dom_id", None)
                if not dom_id or dom_id in seen_product_ids:
                    continue
                # A tile without a title is skipped and NOT marked as seen, so it
                # is picked up on a later pass once its title is present.
                if not item.get("title"):
                    continue
                seen_product_ids.add(dom_id)
                batch_products.append(item)

            if batch_products:
                products.extend(batch_products)
                print(f"Found {len(batch_products)} new products (Total: {len(products)})")
                no_new_products_count = 0
            else:
                no_new_products_count += 1
                if no_new_products_count == 1:
                    print("No new products in this batch")

            # Three consecutive passes without anything new: assume the end of the list.
            if no_new_products_count >= 3:
                print("Stopping scroll - no new products")
                break

            # Scroll to the bottom to trigger loading of the next batch.
            previous_count = current_count
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(3000)

            new_count = await product_containers.count()
            if new_count <= previous_count:
                scroll_attempts += 1
                if scroll_attempts % 5 == 0:
                    print(f"Scroll attempt {scroll_attempts}/{max_scroll_attempts}")
            else:
                # The page grew, so reset the stall counter.
                scroll_attempts = 0

    except Exception as e:
        # Report the failure but keep whatever was collected before it.
        print(f"Error processing URL: {str(e)[:100]}")

    print(f"Extracted {len(products)} products")
    return products

def _is_category_level_url(url: str) -> bool:
    """True for ``.../catalog/<category>`` URLs, i.e. no subcategory segment follows."""
    _, marker, tail = url.partition("/catalog/")
    return bool(marker) and "/" not in tail.strip("/")


async def scrape_all_products(category_dict):
    """
    Scrape every URL listed in ``category_dict`` and return all products.

    Args:
        category_dict: mapping of category name -> list of listing URLs.

    Returns:
        list[dict]: the product dicts returned by ``extract_products_from_url``
        for every URL, in traversal order.

    A URL that points at the category page itself (``/catalog/<category>``) is
    scraped without a subcategory name; for any other URL the subcategory name
    is derived from the URL's last path segment.
    """
    all_products = []
    total_categories = len(category_dict)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # Default timeout (ms) for page operations that do not pass their own.
            page.set_default_timeout(30000)

            print("\n--- Starting Product Extraction ---")

            for current_category, (category_name, subcategory_urls) in enumerate(
                category_dict.items(), 1
            ):
                print(f"\n[{current_category}/{total_categories}] Processing category: {category_name}")

                category_products = []

                for url_idx, url in enumerate(subcategory_urls, 1):
                    print(f"Subcategory {url_idx}/{len(subcategory_urls)}: {url.split('/')[-1]}")

                    # Decide by URL structure rather than by substring match: a
                    # subcategory URL also contains its parent category's slug, so
                    # a slug match cannot tell the two kinds of page apart.
                    if _is_category_level_url(url):
                        products = await extract_products_from_url(page, url, category_name)
                    else:
                        subcategory_name = clean_category_name(url.rstrip("/").split("/")[-1])
                        products = await extract_products_from_url(
                            page, url, category_name, subcategory_name
                        )

                    category_products.extend(products)

                    # Short pause between listing pages.
                    await page.wait_for_timeout(2000)

                all_products.extend(category_products)
                print(f"Completed {category_name}: {len(category_products)} products")
        finally:
            # Close the browser even when scraping was interrupted by an error.
            await browser.close()

        print("\n--- Product Extraction Complete ---")
        print(f"Total products extracted: {len(all_products)}")

        return all_products

# Run the full scrape over the category -> links mapping built earlier; the flat
# list of product dicts is kept in `all_products` for the export step.
print("Starting product extraction...")
all_products = await scrape_all_products(category_dict)

Starting product extraction...

--- Starting Product Extraction ---

[1/25] Processing category: Cleaning Household
Subcategory 1/8: laundry--detergents-56003
    Extracting products from: laundry--detergents-56003
Found 30 new products (Total: 30)
Found 52 new products (Total: 82)
Found 15 new products (Total: 97)
No new products in this batch
Stopping scroll - no new products
Extracted 97 products
Subcategory 2/8: foil--cling-film-paper-55999
    Extracting products from: foil--cling-film-paper-55999
Found 30 new products (Total: 30)
Found 13 new products (Total: 43)
No new products in this batch
Stopping scroll - no new products
Extracted 43 products
Subcategory 3/8: tissue--toilet-rolls-56005
    Extracting products from: tissue--toilet-rolls-56005
Found 30 new products (Total: 30)
Found 55 new products (Total: 85)
Found 10 new products (Total: 95)
No new products in this batch
Stopping scroll - no new products
Extracted 95 products
Subcategory 4/8: garbage-bag-56000
    Extracting

In [7]:
def save_products_to_dataframe(products_list):
    """
    Build a DataFrame from the scraped product dicts and write it to a CSV.

    The known columns are placed first in a fixed order; any other columns
    follow in their original order. The CSV is written to the current working
    directory under a timestamped name.

    Returns:
        The DataFrame, or None when ``products_list`` is empty.
    """
    if not products_list:
        print("No products to save!")
        return None

    preferred = ('product_id', 'title', 'description', 'price', 'category', 'subcategory', 'image_url', 'url')

    frame = pd.DataFrame(products_list)
    leading = [name for name in preferred if name in frame.columns]
    trailing = [name for name in frame.columns if name not in leading]
    frame = frame[leading + trailing]

    # Timestamp in the name keeps successive exports from overwriting each other.
    stamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    filename = f"raja_sahib_products_{stamp}.csv"
    frame.to_csv(filename, index=False, encoding='utf-8')

    print(f"Products saved to: {filename}")
    print(f"DataFrame shape: {frame.shape}")

    return frame


# Export the scraped products; `products_df` is None when nothing was scraped.
products_df = save_products_to_dataframe(all_products)


if products_df is not None:
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print("\nFirst 5 products:")
    display(products_df.head())

Products saved to: raja_sahib_products_20251109_172536.csv
DataFrame shape: (7596, 8)
\nFirst 5 products:


Unnamed: 0,product_id,title,description,price,category,subcategory,image_url,url
0,2027699,Iwash Detergent Powder-500g,500g,Rs. 140.00,Cleaning Household,Laundry Detergents,https://em-cdn.eatmubarak.pk/55545/gallery/896...,
1,2244863,Softlan Fabric Softener Lavender Pouch-1 Ltr,1 Ltr,Rs. 450.00,Cleaning Household,Laundry Detergents,https://g-cdn.blinkco.io/ordering-system/55545...,
2,2063693,Breeo Premium Detergent Washing Powder-1 Kg,1 Kg,Rs. 525.00,Cleaning Household,Laundry Detergents,https://g-cdn.blinkco.io/ordering-system/55545...,
3,2114517,Ariel Original Detergent Promo Pack-1 Kg,1 Kg,Rs. 580.00,Cleaning Household,Laundry Detergents,https://g-cdn.blinkco.io/ordering-system/55545...,
4,1491647,Bonus Tristar Washing Powder-3 Kg,3 Kg,Rs. 650.00,Cleaning Household,Laundry Detergents,https://hypr-images.s3.amazonaws.com/images/rs...,


In [8]:

if products_df is not None:
    print("--- Extraction Summary ---")
    print(f"Total products: {len(products_df)}")
    print(f"Total categories: {products_df['category'].nunique()}")
    print(f"Total subcategories: {products_df['subcategory'].nunique()}")

    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print("\nProducts per category:")
    category_counts = products_df['category'].value_counts()
    for category, count in category_counts.items():
        print(f"  {category}: {count} products")

    print("\nProducts with missing data:")
    # The scraper stores a missing field as an empty string rather than NaN, so
    # blank strings are counted as missing alongside real NaN values.
    for label, column in (("titles", "title"), ("prices", "price"), ("images", "image_url")):
        values = products_df[column]
        missing = (values.isna() | (values.astype(str).str.strip() == "")).sum()
        print(f"  Missing {label}: {missing}")

--- Extraction Summary ---
Total products: 7596
Total categories: 25
Total subcategories: 141
\nProducts per category:
  Cooking Essentials: 1351 products
  Chips And Snacks: 1077 products
  Hair Care: 681 products
  Skin Care: 677 products
  Cleaning Household: 651 products
  Beverages: 426 products
  Baby Foods Diapers: 367 products
  Bath Body: 364 products
  Breakfast: 339 products
  Fragrances: 293 products
  Dental Care: 241 products
  Makeup: 179 products
  Frozen: 170 products
  Women Adult Care: 161 products
  Dairy: 147 products
  Mens Care: 117 products
  Health Wellness: 87 products
  Electronic Accessories: 53 products
  Pet Care: 45 products
  Car Care: 43 products
  Shoe Care: 41 products
  Fruits And Vegetables: 35 products
  Meat: 28 products
  Hand Foot Care: 17 products
  Kitchen Home Appliances: 6 products
\nProducts with missing data:
  Missing titles: 0
  Missing prices: 0
  Missing images: 0
