In [2]:
import asyncio
from playwright.async_api import async_playwright
import csv
import os
from typing import List, Dict, Optional
import logging
from datetime import datetime
import time
import sys

# Configure root logging for this session: every record is written both to a
# UTF-8 log file and to stdout, so progress is visible in the cell output.
# force=True replaces handlers that are already attached to the root logger;
# without it basicConfig() silently does nothing when logging was configured
# earlier (by the hosting kernel or by a previous run of this cell).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%H:%M:%S',
    handlers=[
        logging.FileHandler('rahim_scraper_detailed.log', encoding='utf-8'),
        logging.StreamHandler(sys.stdout)
    ],
    force=True
)
logger = logging.getLogger(__name__)

class RahimStoreScraper:
    def __init__(self):
        self.base_url = "https://www.rahimstore.com/department/"
        self.departments = ['001', '002', '003', '004', '005', '006', '007']
        self.output_file = f'rahim_store_products_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        self.product_data = []
        self.stats = {
            'total_products': 0,
            'failed_products': 0,
            'department_stats': {},
            'start_time': None,
            'end_time': None,
            'current_department': None,
            'current_page': 0
        }

    def print_banner(self):
        """Print startup banner"""
        print("\n" + "═" * 80)
        print("RAHIM STORE WEB SCRAPER - REAL-TIME MONITORING")
        print("═" * 80)
        print(f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Output file: {self.output_file}")
        print(f"Departments: {', '.join(self.departments)}")
        print("═" * 80)
        print("LIVE LOGS STARTING...")
        print("─" * 80)

    def print_live_status(self, action: str, details: str = ""):
        """Print real-time status updates"""
        timestamp = datetime.now().strftime('%H:%M:%S')
        print(f"[{timestamp}] {action}: {details}")

    def print_product_extracted(self, product_name: str, product_id: str, price: str):
        """Print when a product is successfully extracted"""
        timestamp = datetime.now().strftime('%H:%M:%S')
        print(f"[{timestamp}] EXTRACTED: '{product_name}' (ID: {product_id}) - Price: {price}")

    def print_page_progress(self, dept: str, page: int, current: int, total: int):
        """Print page-by-page progress"""
        progress = f"Dept {dept} | Page {page} | Progress: {current}/{total} products"

    def print_department_summary(self, dept: str, total_products: int, duration: float):
        """Print department completion summary"""
        print("─" * 80)
        print(f"DEPARTMENT {dept} COMPLETED!")
        print(f"Total products: {total_products}")
        print(f"Time taken: {duration:.2f} seconds")
        print("─" * 80)

    def print_final_summary(self):
        """Print final summary"""
        total_duration = self.stats['end_time'] - self.stats['start_time']

        print("\n" + "═" * 80)
        print("SCRAPING COMPLETED - FINAL SUMMARY")
        print("═" * 80)
        print(f"Total duration: {total_duration:.2f} seconds")
        print(f"Total products: {self.stats['total_products']}")
        print(f"Failed extractions: {self.stats['failed_products']}")
        print(f"Output file: {self.output_file}")

        print("\nDEPARTMENT BREAKDOWN:")
        for dept in self.departments:
            count = self.stats['department_stats'].get(dept, 0)
            print(f"   └── Department {dept}: {count} products")

        total_attempts = self.stats['total_products'] + self.stats['failed_products']
        success_rate = (self.stats['total_products'] / total_attempts * 100) if total_attempts > 0 else 0
        print(f"\nSuccess rate: {success_rate:.1f}%")
        print("═" * 80)

    async def setup_browser(self) -> bool:
        """Initialize browser and context"""
        try:
            self.print_live_status("INITIALIZING BROWSER", "Starting Playwright...")
            self.stats['start_time'] = time.time()

            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(
                headless=True,
                args=[
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-blink-features=AutomationControlled',
                    '--disable-features=VizDisplayCompositor'
                ]
            )

            self.context = await self.browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                extra_http_headers={
                    'Accept-Language': 'en-US,en;q=0.9',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
                }
            )

            self.print_live_status("BROWSER READY", "Chromium initialized successfully")
            return True

        except Exception as e:
            self.print_live_status("BROWSER FAILED", f"Error: {str(e)}")
            return False

    async def close_browser(self):
        """Close browser resources"""
        try:
            if hasattr(self, 'context'):
                await self.context.close()
            if hasattr(self, 'browser'):
                await self.browser.close()
            if hasattr(self, 'playwright'):
                await self.playwright.stop()
            self.print_live_status("BROWSER CLOSED", "All resources cleaned up")
        except Exception as e:
            self.print_live_status("CLEANUP ERROR", f"Error: {str(e)}")

    def save_to_csv(self) -> bool:
        """Save data to CSV file"""
        try:
            if not self.product_data:
                self.print_live_status("CSV SAVE", "No data to save")
                return False

            fieldnames = [
                'department_id', 'product_id', 'product_name', 'current_price',
                'original_price', 'product_url', 'image_url', 'was_price',
                'scraped_timestamp'
            ]

            with open(self.output_file, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(self.product_data)

            self.print_live_status("CSV SAVED", f"{len(self.product_data)} products saved to {self.output_file}")
            return True

        except Exception as e:
            self.print_live_status("CSV ERROR", f"Failed to save: {str(e)}")
            return False

    async def wait_for_products_fully_loaded(self, page, department_id: str, page_number: int):
        """Wait for products to be fully loaded with multiple verification steps"""
        try:
            self.print_live_status("WAITING", f"Waiting for products to fully load on page {page_number}")

            # Wait for the main product container to be present
            await page.wait_for_selector('.item.img-hover-zoom--quick-zoom', timeout=20000)
            self.print_live_status("LOAD CHECK", "Initial product container found")

            # Wait for additional time to ensure all content is loaded
            await asyncio.sleep(2)

            # Check if product images are loaded
            images_loaded = await page.query_selector_all('img.img-fluid[src]')
            self.print_live_status("LOAD CHECK", f"{len(images_loaded)} product images found")

            # Wait for prices to be loaded
            prices_loaded = await page.query_selector_all('strong')
            self.print_live_status("LOAD CHECK", f"{len(prices_loaded)} price elements found")

            # Final wait to ensure everything is rendered
            await asyncio.sleep(1)

            self.print_live_status("LOAD COMPLETE", "All products fully loaded and ready for scraping")
            return True

        except Exception as e:
            self.print_live_status("LOAD ERROR", f"Failed to wait for products: {str(e)}")
            return False

    async def extract_product_info(self, product_card, department_id: str, card_index: int) -> Optional[Dict]:
        """Extract detailed product information with real-time logging"""
        try:
            # Extract product name
            name_element = await product_card.query_selector('a[style="display:block; height:50px;"]')
            if not name_element:
                self.print_live_status("EXTRACTION FAILED", f"Card {card_index}: No name element found")
                return None

            product_name = await name_element.inner_text()
            product_name = product_name.strip() if product_name else "N/A"

            # Extract product URL and ID
            product_url = await name_element.get_attribute('href') or "N/A"
            product_id = await name_element.get_attribute('productid') or "N/A"

            # Extract image
            img_element = await product_card.query_selector('img.img-fluid')
            image_url = await img_element.get_attribute('src') if img_element else "N/A"

            # Extract prices
            strong_element = await product_card.query_selector('strong')
            current_price = await strong_element.inner_text() if strong_element else "N/A"
            current_price = current_price.replace('Rs', '').replace('sup', '').strip()

            strike_element = await product_card.query_selector('strike')
            was_price = await strike_element.inner_text() if strike_element else "N/A"

            # Get additional price data from button
            button_element = await product_card.query_selector('button.btn-success')
            original_price = was_price

            if button_element:
                button_data = await button_element.get_attribute('data')
                if button_data and '~' in button_data:
                    data_parts = button_data.split('~')
                    if len(data_parts) >= 4:
                        original_price = data_parts[3]

            product_info = {
                'department_id': department_id,
                'product_id': product_id,
                'product_name': product_name,
                'current_price': current_price,
                'original_price': original_price,
                'product_url': product_url,
                'image_url': image_url,
                'was_price': was_price,
                'scraped_timestamp': datetime.now().isoformat()
            }

            # Print successful extraction
            self.print_product_extracted(product_name, product_id, current_price)
            return product_info

        except Exception as e:
            self.print_live_status("EXTRACTION ERROR", f"Card {card_index}: {str(e)}")
            self.stats['failed_products'] += 1
            return None

    async def handle_pagination(self, page, department_id: str, current_page: int) -> bool:
        """Handle pagination to next page with proper waiting"""
        try:
            self.print_live_status("PAGINATION", f"Checking for page {current_page + 1}")

            # Wait for pagination to load
            await page.wait_for_selector('.pagination', timeout=10000)

            # Find next button
            next_button = await page.query_selector('a.page-link[aria-label="Next"]')
            if not next_button:
                self.print_live_status("PAGINATION", "No next button found")
                return False

            # Check if next button is disabled
            is_disabled = await next_button.evaluate('(element) => element.parentElement.classList.contains("disabled")')
            if is_disabled:
                self.print_live_status("PAGINATION", f"Reached last page ({current_page})")
                return False

            # Click next button
            self.print_live_status("PAGINATION", f"Moving to page {current_page + 1}")
            await next_button.click()

            # Wait for navigation to complete and new page to load
            self.print_live_status("WAITING", "Waiting for new page to load after pagination...")
            await page.wait_for_timeout(4000)  # Increased wait time for page transition

            # Wait for products to be fully loaded on the new page
            await self.wait_for_products_fully_loaded(page, department_id, current_page + 1)

            self.print_live_status("PAGINATION", f"Successfully loaded page {current_page + 1}")
            return True

        except Exception as e:
            self.print_live_status("PAGINATION ERROR", f"Page {current_page}: {str(e)}")
            return False

    async def scrape_department_page(self, page, department_id: str, page_number: int) -> bool:
        """Scrape a single page of products with proper loading waits"""
        try:
            self.stats['current_page'] = page_number
            self.print_live_status("PAGE START", f"Department {department_id} - Page {page_number}")

            # Wait for products to be fully loaded before scraping
            if not await self.wait_for_products_fully_loaded(page, department_id, page_number):
                self.print_live_status("PAGE ERROR", f"Products not loaded properly on page {page_number}")
                return False

            # Get all product cards after ensuring they're loaded
            product_cards = await page.query_selector_all('.item.img-hover-zoom--quick-zoom')

            if not product_cards:
                self.print_live_status("PAGE EMPTY", "No product cards found after waiting")
                return False

            self.print_live_status("PRODUCTS FOUND", f"Found {len(product_cards)} products on page {page_number}")

            # Extract each product with small delays between extractions
            successful_extractions = 0
            for i, card in enumerate(product_cards, 1):
                # Show progress every 5 products
                if i % 5 == 0 or i == len(product_cards):
                    self.print_page_progress(department_id, page_number, i, len(product_cards))

                # Small delay between product extractions to be respectful
                if i > 1:
                    await asyncio.sleep(0.1)

                product_info = await self.extract_product_info(card, department_id, i)
                if product_info:
                    self.product_data.append(product_info)
                    successful_extractions += 1
                    self.stats['total_products'] += 1

            self.print_live_status("PAGE COMPLETE",
                f"Page {page_number}: {successful_extractions}/{len(product_cards)} products extracted")

            # Handle pagination
            has_next_page = await self.handle_pagination(page, department_id, page_number)
            return has_next_page

        except Exception as e:
            self.print_live_status("PAGE ERROR", f"Department {department_id} Page {page_number}: {str(e)}")
            return False

    async def scrape_department(self, department_id: str):
        """Scrape all pages of a department with proper loading waits"""
        dept_start_time = time.time()
        url = f"{self.base_url}{department_id}"

        self.stats['current_department'] = department_id
        print(f"\n{'=' * 80}")
        print(f"STARTING DEPARTMENT {department_id}")
        print(f"URL: {url}")
        print(f"{'=' * 80}")

        page = await self.context.new_page()
        dept_products_start = len(self.product_data)

        try:
            # Navigate to department with longer timeout
            self.print_live_status("NAVIGATING", f"Loading {url} (waiting for full load)...")
            response = await page.goto(url, wait_until='domcontentloaded', timeout=45000)

            if not response or response.status != 200:
                status_code = getattr(response, 'status', 'Unknown')
                self.print_live_status("NAVIGATION FAILED", f"HTTP {status_code}")
                return

            self.print_live_status("PAGE LOADED", "Initial page loaded, waiting for full content...")

            # Wait for the department page to be fully ready
            await self.wait_for_products_fully_loaded(page, department_id, 1)

            # Scrape all pages
            page_number = 1
            max_pages = 100  # Safety limit

            while page_number <= max_pages:
                has_next_page = await self.scrape_department_page(page, department_id, page_number)

                if not has_next_page:
                    self.print_live_status("DEPARTMENT COMPLETE", f"No more pages after page {page_number}")
                    break

                page_number += 1
                # Increased pause between pages
                await asyncio.sleep(2)

            # Department summary
            dept_products_count = len(self.product_data) - dept_products_start
            self.stats['department_stats'][department_id] = dept_products_count
            dept_duration = time.time() - dept_start_time

            self.print_department_summary(department_id, dept_products_count, dept_duration)

        except Exception as e:
            self.print_live_status("DEPARTMENT ERROR", f"Department {department_id}: {str(e)}")
        finally:
            await page.close()
            self.print_live_status("PAGE CLOSED", f"Department {department_id} browser page closed")

    async def run_scraper(self):
        """Main scraper execution function"""
        self.print_banner()

        if not await self.setup_browser():
            return

        try:
            # Scrape each department
            for i, department_id in enumerate(self.departments, 1):
                print(f"\nOVERALL PROGRESS: Department {i}/{len(self.departments)}")
                await self.scrape_department(department_id)

                # Longer pause between departments to be respectful
                if i < len(self.departments):
                    self.print_live_status("PAUSING", "Waiting 5 seconds before next department...")
                    await asyncio.sleep(5)

            # Final save and summary
            self.stats['end_time'] = time.time()
            self.save_to_csv()
            self.print_final_summary()

        except KeyboardInterrupt:
            self.print_live_status("INTERRUPTED", "User stopped the scraper")
            print("\nSaving collected data before exit...")
            self.save_to_csv()
        except Exception as e:
            self.print_live_status("FATAL ERROR", f"Scraper crashed: {str(e)}")
        finally:
            await self.close_browser()

async def main():
    """Entry coroutine: build a scraper and run it, printing any unexpected error."""
    print("Initializing Rahim Store Scraper with Real-time Monitoring...")

    try:
        # Construction and the run share one guard, so a failure in either
        # is reported the same way.
        await RahimStoreScraper().run_scraper()
    except Exception as startup_error:
        print("Fatal initialization error: {}".format(startup_error))

if __name__ == "__main__":
    # Run the scraper.
    # NOTE: a bare top-level `await` is only valid where cells are executed with
    # top-level await support (IPython/Jupyter). In a plain Python script this
    # line is a SyntaxError and the call would need to be asyncio.run(main()).
    await main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[15:34:33] LOAD CHECK: Initial product container found
[15:34:35] LOAD CHECK: 20 product images found
[15:34:35] LOAD CHECK: 22 price elements found
[15:34:36] LOAD COMPLETE: All products fully loaded and ready for scraping
[15:34:36] PRODUCTS FOUND: Found 33 products on page 13
[15:34:36] EXTRACTION FAILED: Card 1: No name element found
[15:34:36] EXTRACTION FAILED: Card 2: No name element found
[15:34:36] EXTRACTION FAILED: Card 3: No name element found
[15:34:36] EXTRACTION FAILED: Card 4: No name element found
[15:34:37] EXTRACTION FAILED: Card 5: No name element found
[15:34:37] EXTRACTION FAILED: Card 6: No name element found
[15:34:37] EXTRACTION FAILED: Card 7: No name element found
[15:34:37] EXTRACTION FAILED: Card 8: No name element found
[15:34:37] EXTRACTION FAILED: Card 9: No name element found
[15:34:37] EXTRACTION FAILED: Card 10: No name element found
[15:34:37] EXTRACTION FAILED: Card 11: No name element

In [1]:
!pip install playwright
!playwright install

Collecting playwright
  Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.55.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.55.0 pyee-13.0.0
Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 11.5s[0K[1G173.7 MiB [] 0% 3.2s[0K[1G173.7 MiB [] 1% 2.4s[0K[1G173.7 MiB [] 2% 2.2s[0K[1G173.7 MiB [] 3% 2.0s[0K[1G173.7 MiB [] 4% 1.9s[0K[1G173.7 MiB [] 5% 2.1s[0K[1G173.7 MiB [] 6% 2.2s[0K[1G173.7 MiB [] 7% 2.1s[0K[1G173.7 MiB [] 8% 2.0s[