In [1]:
# Install necessary packages
!pip install selenium
!pip install webdriver-manager



In [2]:
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    NoSuchElementException, WebDriverException, TimeoutException,
    ElementClickInterceptedException, StaleElementReferenceException,
    UnexpectedTagNameException
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select # If needed in future, keep it

# Standard libraries
import queue
import threading
import time
import re
import os
import traceback # For detailed error logging

# Webdriver Manager (if using - recommended for local runs)
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.chrome.service import Service

print("Imports successful.")

Imports successful.


In [5]:
# --- Stage 2 work queues: jobs waiting to be scraped, and rows already scraped ---
job_queue_stage2, results_queue_stage2 = queue.Queue(), queue.Queue()

# --- Lock that serialises console output across worker threads ---
print_lock = threading.Lock()

# --- Results accumulated by the main thread, and the lock guarding that list ---
all_results_stage2 = list()
results_stage2_lock = threading.Lock()

# --- Thread-Safe Print Function ---
def safe_print(message):
    """Print ``message`` while holding ``print_lock`` so output from threads does not interleave."""
    print_lock.acquire()
    try:
        print(message)
    finally:
        # Always release, even if print() itself fails.
        print_lock.release()

# --- WebDriver Initialization for Threads ---
# Make sure this function matches the one that worked in your Stage 1 notebook
# If using webdriver-manager:
# def initialize_thread_driver():
#     """Initialize a new WebDriver instance for the thread using webdriver-manager."""
#     options = webdriver.ChromeOptions()
#     options.add_argument('--headless')
#     options.add_argument('--no-sandbox')
#     options.add_argument('--disable-dev-shm-usage')
#     options.add_argument('--disable-gpu')
#     options.add_argument('--disable-extensions')
#     options.add_argument("window-size=1920,1080") # Often helps headless stability
#     options.add_argument('--log-level=1') # Reduce default logging noise

#     try:
#         # Using webdriver-manager
#         service = Service(ChromeDriverManager().install())
#         driver = webdriver.Chrome(service=service, options=options)
#         safe_print("WebDriver initialized using webdriver-manager.")
#     except Exception as e:
#         safe_print(f"ERROR initializing driver with webdriver-manager: {e}")
#         # Fallback or re-raise - adjust as needed
#         raise # Re-raise the error if initialization fails

#     driver.set_page_load_timeout(60) # Increase timeout for detail pages
#     driver.implicitly_wait(5)  # Lower implicit wait, rely more on explicit waits
#     return driver

# If specifying chromedriver path directly (replace with your actual path if needed):
def initialize_thread_driver():
    """Create and return a headless Chrome WebDriver for use by a single thread.

    The driver is configured with a 60 second page-load timeout and a
    5 second implicit wait before it is returned.
    """
    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        "window-size=1920,1080",
        '--log-level=1',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # This relies on chromedriver being on PATH or resolved automatically
    # (e.g. Colab). To pin a specific binary instead, build a Service and pass it:
    #     service = Service(executable_path='/path/to/your/chromedriver')
    #     browser = webdriver.Chrome(service=service, options=chrome_options)
    browser = webdriver.Chrome(options=chrome_options)

    browser.set_page_load_timeout(60)
    browser.implicitly_wait(5)
    return browser


# --- Helper functions for safe text extraction ---
def safe_get_text(driver, locator, default="", timeout=10):
    """Return the stripped text of the first element matching ``locator``.

    Waits up to ``timeout`` seconds for the element to be present.
    ``locator`` is a ``(by, value)`` pair. ``default`` is returned when the
    element does not appear in time or any other error occurs (the latter is
    also reported through ``safe_print``).
    """
    try:
        waiter = WebDriverWait(driver, timeout)
        is_present = EC.presence_of_element_located((locator[0], locator[1]))
        node = waiter.until(is_present)
        result = node.text.strip()
    except (NoSuchElementException, TimeoutException):
        # Missing element is an expected outcome; stay quiet.
        result = default
    except Exception as err:
        safe_print(f"Warning: Unexpected error getting text for {locator}: {type(err).__name__}")
        result = default
    return result

# Helper function to safely get multiple elements' text and join
def safe_get_joined_text(driver, locator, separator="\n", default="", timeout=10):
    """Return the non-empty texts of all elements matching ``locator``, joined by ``separator``.

    Waits up to ``timeout`` seconds for at least one matching element to be
    present, then collects every match. ``locator`` is a ``(by, value)`` pair.

    Returns ``default`` when nothing matches in time or an error occurs
    (unexpected errors are also reported through ``safe_print``). If elements
    match but none carries text, the result is an empty string.
    """
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((locator[0], locator[1]))
        )
        elements = driver.find_elements(locator[0], locator[1])
        if not elements:
            return default

        texts = []
        for el in elements:
            # Read .text exactly once per element: each access is a WebDriver
            # round trip, and repeated reads widen the window in which the
            # element can go stale between the check and the use.
            text = (el.text or "").strip()
            if text:
                texts.append(text)
        return separator.join(texts)
    except (NoSuchElementException, TimeoutException):
        return default
    except Exception as e:
        safe_print(f"Warning: Unexpected error getting joined text for {locator}: {type(e).__name__}")
        return default
#-----------------------------------

def safe_get_outer_html(driver, locator, default="", timeout=10):
    """Return the ``outerHTML`` of the first element matching ``locator``.

    Waits up to ``timeout`` seconds for the element to be present and reads
    its markup through a JavaScript call. ``locator`` is a ``(by, value)``
    pair. ``default`` is returned (and a message printed) when the element is
    missing, the wait times out, or any other error occurs.
    """
    try:
        waiter = WebDriverWait(driver, timeout)
        is_present = EC.presence_of_element_located((locator[0], locator[1]))
        node = waiter.until(is_present)
        # outerHTML is fetched via script execution on the located element.
        markup = driver.execute_script("return arguments[0].outerHTML;", node)
    except (NoSuchElementException, TimeoutException):
        safe_print(f"Debug: Container element for outerHTML not found or timed out: {locator}")
        markup = default
    except Exception as err:
        safe_print(f"Warning: Unexpected error getting outerHTML for {locator}: {type(err).__name__}")
        markup = default
    return markup

In [7]:
def _extract_decision_no(raw_title):
    """Pull the decision number that follows 'निर्णय नं.' out of the page title ('' if absent)."""
    if not raw_title or 'निर्णय नं.' not in raw_title:
        return ""
    match = re.search(r"निर्णय नं\.\s*(\S+)", raw_title)
    if match:
        return match.group(1)
    # Label present but nothing follows it: fall back to whatever precedes the first '-'.
    return raw_title.split('-')[0].replace('निर्णय नं.', '').strip()


def _extract_case_no(raw_text):
    """Return the first XXX-YY-ZZZ style case number (Devanagari digits) in ``raw_text``, or ''."""
    match = re.search(r"([०१२३४५६७८९]+[-–][A-Z]+[-–][०१२३४५६७८९]+)", raw_text or "")
    return match.group(1) if match else ""


def _extract_decision_date(raw_meta):
    """Return the token following 'फैसला मिति :' in ``raw_meta``, or ''.

    Whitespace around the colon is optional, matching what the regex accepts.
    """
    if not raw_meta:
        return ""
    match = re.search(r"फैसला मिति\s*:\s*(\S+)", raw_meta)
    return match.group(1) if match else ""


def _extract_subject(raw_title):
    """Return the part of the title after the first '-', or the whole title if it has no '-'."""
    if not raw_title:
        return ""
    if '-' in raw_title:
        return raw_title.split('-', 1)[1].strip()
    return raw_title


def _strip_leading_label(text, labels):
    """Remove the first of ``labels`` that ``text`` starts with and trim the remainder."""
    for label in labels:
        if text.startswith(label):
            return text[len(label):].strip()
    return text


def scrape_detail_page(driver, url):
    """
    Navigate to a case detail URL and scrape a fixed set of data points.

    Args:
        driver: Selenium WebDriver used for navigation and element lookup.
        url: Address of the detail page.

    Returns:
        dict with the keys ``decision_no``, ``case_no``, ``nkp_volume``,
        ``nkp_year``, ``nkp_month``, ``nkp_issue``, ``decision_date``,
        ``judges``, ``subject``, ``petitioner``, ``respondent``, ``lawyers``
        and ``full_text`` (outerHTML of the main content container). Fields
        that cannot be found are empty strings.
        ``None`` when navigation fails or ``#decision_summary`` does not
        appear within 30 seconds.
    """
    scraped_data = {}
    current_thread_id = threading.current_thread().name
    safe_print(f"[{current_thread_id}] Attempting navigation to: {url}")

    try:
        driver.get(url)
        WebDriverWait(driver, 30).until(
             EC.presence_of_element_located((By.ID, "decision_summary"))
        )
        safe_print(f"[{current_thread_id}] Navigation successful, page title: {driver.title}")
    except TimeoutException:
         safe_print(f"[{current_thread_id}] CRITICAL: Timeout waiting for #decision_summary navigating to {url}. Aborting.")
         return None
    except Exception as e:
        safe_print(f"[{current_thread_id}] CRITICAL: Error navigating to URL {url}: {type(e).__name__} - {e}. Aborting.")
        return None

    # --- Container (VERIFY THIS XPATH) ---
    CONTAINER_XPATH = "//div[@id='faisala_detail ']" # ID has a trailing space!
    container_locator = (By.XPATH, CONTAINER_XPATH)
    container_found = False
    try:
        WebDriverWait(driver, 7).until(EC.presence_of_element_located(container_locator))
        safe_print(f"[{current_thread_id}] Main content container ({CONTAINER_XPATH}) found.")
        container_found = True
    except TimeoutException:
         safe_print(f"[{current_thread_id}] WARNING: Main content container ({CONTAINER_XPATH}) not found quickly.")

    # --- Locators ---
    LOCATORS = {
        # Header/Summary. The title link supplies both the decision number and the subject.
        "decision_no_raw": (By.XPATH, "//h1[@class='post-title']/a"),
        "nkp_volume": (By.XPATH, "//div[@id='edition-info']/span[contains(text(), 'भाग:')]/strong"),
        "nkp_year": (By.XPATH, "//div[@id='edition-info']/span[contains(text(), 'साल:')]/strong"),
        "nkp_month": (By.XPATH, "//div[@id='edition-info']/span[contains(text(), 'महिना:')]/strong"),
        "nkp_issue": (By.XPATH, "//div[@id='edition-info']/span[contains(text(), 'अंक:')]/strong"),
        "decision_date_raw": (By.XPATH, "//div[contains(@class, 'post-meta')]"),

        # Inside Container (using CONTAINER_XPATH prefix)
        "case_no_paragraph": (By.XPATH, CONTAINER_XPATH + "//p[contains(normalize-space(.), 'सालको रि.नं.') or contains(normalize-space(.), 'मुद्दा नं :') or contains(normalize-space(.),'-CR-') or contains(normalize-space(.),'-CI-') or contains(normalize-space(.),'-WO-') or contains(normalize-space(.), '-MS-')]"),
        "judges": (By.XPATH, CONTAINER_XPATH + "//p[starts-with(normalize-space(.), 'माननीय न्यायाधीश')]"),
        "petitioner_raw": (By.XPATH, CONTAINER_XPATH + "//p[starts-with(normalize-space(.), 'निवेदक :') or starts-with(normalize-space(.), 'पुनरावेदक / वादी :') or starts-with(normalize-space(.), 'पुनरावेदक/निवेदक :')]"),
        "respondent_raw": (By.XPATH, CONTAINER_XPATH + "//p[starts-with(normalize-space(.), 'विपक्षी :') or starts-with(normalize-space(.), 'प्रत्यर्थी / प्रतिवादी :') or starts-with(normalize-space(.), 'विपक्षी/प्रत्यर्थी :')]"),
        "petitioner_lawyer_raw": (By.XPATH, CONTAINER_XPATH + "//p[starts-with(normalize-space(.), 'निवेदकतर्फबाट :')]"),
        "respondent_lawyer_raw": (By.XPATH, CONTAINER_XPATH + "//p[starts-with(normalize-space(.), 'विपक्षीतर्फबाट :')]"),
    }
    safe_print(f"[{current_thread_id}] Locators defined. Starting extraction...")

    # 1. Decision Number (from the page title)
    raw_title = safe_get_text(driver, LOCATORS["decision_no_raw"])
    safe_print(f"[{current_thread_id}] Raw Title: '{raw_title}'")
    scraped_data["decision_no"] = _extract_decision_no(raw_title)
    safe_print(f"[{current_thread_id}] Extracted decision_no: '{scraped_data['decision_no']}'")

    # 2. Case Number (strict regex filter on the candidate paragraph)
    case_no_text_raw = safe_get_text(driver, LOCATORS["case_no_paragraph"])
    safe_print(f"[{current_thread_id}] Raw text potentially containing case_no: '{case_no_text_raw}'")
    scraped_data["case_no"] = _extract_case_no(case_no_text_raw)
    if case_no_text_raw:
        if scraped_data["case_no"]:
            safe_print(f"[{current_thread_id}] Extracted case_no (Regex Match): '{scraped_data['case_no']}'")
        else:
            safe_print(f"[{current_thread_id}] Desired case_no pattern (XXX-YY-ZZZ) not found via Regex.")

    # 3. NKP Details
    for nkp_field in ("nkp_volume", "nkp_year", "nkp_month", "nkp_issue"):
        scraped_data[nkp_field] = safe_get_text(driver, LOCATORS[nkp_field])
    safe_print(f"[{current_thread_id}] Found NKP Details: V='{scraped_data['nkp_volume']}', Y='{scraped_data['nkp_year']}', M='{scraped_data['nkp_month']}', I='{scraped_data['nkp_issue']}'")

    # 4. Decision Date
    raw_meta = safe_get_text(driver, LOCATORS["decision_date_raw"])
    safe_print(f"[{current_thread_id}] Raw Meta: '{raw_meta}'")
    scraped_data["decision_date"] = _extract_decision_date(raw_meta)
    safe_print(f"[{current_thread_id}] Extracted decision_date: '{scraped_data['decision_date']}'")

    # 5. Judges
    scraped_data["judges"] = safe_get_joined_text(driver, LOCATORS["judges"], separator="\n")
    safe_print(f"[{current_thread_id}] Found judges:\n{scraped_data['judges']}")

    # 6. Subject. Reuses the title fetched in step 1 rather than querying the
    #    same element again (which costs another full wait when it is missing).
    scraped_data["subject"] = _extract_subject(raw_title)
    safe_print(f"[{current_thread_id}] Extracted subject (from title): '{scraped_data['subject']}'")

    # 7. Petitioner
    raw_petitioner = safe_get_text(driver, LOCATORS["petitioner_raw"])
    safe_print(f"[{current_thread_id}] Raw Petitioner: '{raw_petitioner}'")
    scraped_data["petitioner"] = _strip_leading_label(
        raw_petitioner, ["निवेदक :", "पुनरावेदक / वादी :", "पुनरावेदक/निवेदक :"]
    )
    safe_print(f"[{current_thread_id}] Cleaned petitioner: '{scraped_data['petitioner']}'")

    # 8. Respondent
    raw_respondent = safe_get_text(driver, LOCATORS["respondent_raw"])
    safe_print(f"[{current_thread_id}] Raw Respondent: '{raw_respondent}'")
    scraped_data["respondent"] = _strip_leading_label(
        raw_respondent, ["विपक्षी :", "प्रत्यर्थी / प्रतिवादी :", "विपक्षी/प्रत्यर्थी :"]
    )
    safe_print(f"[{current_thread_id}] Cleaned respondent: '{scraped_data['respondent']}'")

    # 9. Lawyers (one line per side, only for sides that were found)
    p_lawyer_raw = safe_get_text(driver, LOCATORS["petitioner_lawyer_raw"])
    r_lawyer_raw = safe_get_text(driver, LOCATORS["respondent_lawyer_raw"])
    p_lawyer = p_lawyer_raw.replace("निवेदकतर्फबाट :", "").strip()
    r_lawyer = r_lawyer_raw.replace("विपक्षीतर्फबाट :", "").strip()
    lawyers_list = []
    if p_lawyer:
        lawyers_list.append(f"निवेदकतर्फबाट: {p_lawyer}")
    if r_lawyer:
        lawyers_list.append(f"विपक्षीतर्फबाट: {r_lawyer}")
    scraped_data["lawyers"] = "\n".join(lawyers_list)
    safe_print(f"[{current_thread_id}] Combined lawyers:\n{scraped_data['lawyers']}")

    # 10. Full Text (outerHTML of the container)
    if container_found:
        scraped_data["full_text"] = safe_get_outer_html(driver, container_locator)
        safe_print(f"[{current_thread_id}] Found full_text HTML (Length: {len(scraped_data['full_text'])} chars)")
    else:
        # Without the container there is no HTML to capture.
        scraped_data["full_text"] = ""
        safe_print(f"[{current_thread_id}] Could not extract full_text HTML because container was not found.")

    safe_print(f"[{current_thread_id}] Extraction complete for {url}")
    return scraped_data

print("REVISED scrape_detail_page function (Capture Full HTML for text) defined.")

REVISED scrape_detail_page function (Capture Full HTML for text) defined.


In [9]:
def worker_thread_stage2(thread_id):
    """Worker thread body: scrape detail pages for jobs taken from ``job_queue_stage2``.

    Each job is expected to be a dict (one input CSV row) carrying a ``link``
    key. For every job whose page is scraped successfully, the row merged with
    the scraped fields is put on ``results_queue_stage2``. The thread exits
    when the job queue stays empty for one second, or when the WebDriver
    cannot be (re)initialised.

    Every job taken from the queue is acknowledged with exactly one
    ``task_done()`` call, whatever happens while processing it.

    Args:
        thread_id: Identifier used only to label this thread's log lines.
    """
    tag = f"[Thread-{thread_id} Stage2]"
    safe_print(f"{tag} Started.")
    driver = None

    try:  # Outer try: guarantees the driver is quit on every exit path
        try:
            driver = initialize_thread_driver()
            safe_print(f"{tag} WebDriver initialized.")
        except Exception as e:
            safe_print(f"{tag} CRITICAL: Failed to initialize WebDriver: {type(e).__name__} - {e}. Thread exiting.")
            return  # Cannot proceed without a driver

        while True:
            try:
                # Wait up to 1 second for a job; an empty queue ends the thread.
                job_data = job_queue_stage2.get(block=True, timeout=1)
            except queue.Empty:
                safe_print(f"{tag} Job queue empty after waiting. Exiting loop.")
                break

            # Resolve log labels up front so the exception handlers below can
            # never fail themselves on a malformed (non-dict) job.
            is_row = isinstance(job_data, dict)
            link_for_log = job_data.get('link', 'N/A') if is_row else 'N/A'
            title_for_log = job_data.get('title', 'N/A') if is_row else 'N/A'

            try:
                link_to_scrape = job_data.get('link') if is_row else None
                if not link_to_scrape or not isinstance(link_to_scrape, str) or not link_to_scrape.startswith('http'):
                    safe_print(f"{tag} Skipping job due to invalid/missing 'link': {title_for_log}")
                    continue

                safe_print(f"{tag} Processing Link: {link_to_scrape} (Title: {title_for_log})")

                # Returns None on a critical failure (already logged inside).
                scraped_details = scrape_detail_page(driver, link_to_scrape)

                if scraped_details is not None:
                    combined_result = job_data.copy()        # original row...
                    combined_result.update(scraped_details)  # ...plus/overridden by scraped fields
                    results_queue_stage2.put(combined_result)
                else:
                    # Nothing is queued as a result for critical failures.
                    safe_print(f"{tag} Scrape attempt finished with critical failure (check logs above): {link_to_scrape}")

            except WebDriverException as e:
                safe_print(f"{tag} WebDriverException for link {link_for_log}: {type(e).__name__}. Reinitializing driver.")
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception as quit_err:
                        safe_print(f"{tag} Ignoring error while quitting broken WebDriver: {type(quit_err).__name__}")
                    # Drop the reference so the final cleanup does not quit it again.
                    driver = None
                try:
                    driver = initialize_thread_driver()
                    safe_print(f"{tag} WebDriver reinitialized.")
                except Exception as init_err:
                    safe_print(f"{tag} CRITICAL: Failed re-init WebDriver: {init_err}. Thread exiting.")
                    break

            except Exception as e:
                safe_print(f"{tag} UNEXPECTED ERROR processing link {link_for_log}: {type(e).__name__} - {e}")
                safe_print(traceback.format_exc())
                safe_print(f"{tag} Skipping to next job attempt.")

            finally:
                # Runs on success, skip (continue), failure and break alike, so
                # each dequeued job is acknowledged once and only once.
                try:
                    job_queue_stage2.task_done()
                except ValueError:
                    # The counter was reset elsewhere; nothing left to acknowledge.
                    pass

    finally:
        if driver:
            try:
                driver.quit()
                safe_print(f"{tag} WebDriver quit.")
            except Exception as e:
                safe_print(f"{tag} Error quitting WebDriver: {e}")
        safe_print(f"{tag} Finished.")

print("worker_thread_stage2 function defined.")

worker_thread_stage2 function defined.


In [11]:
def _drain_queue_stage2(target_queue, mark_done=False):
    """Remove every pending item from ``target_queue``; return how many were removed.

    With ``mark_done`` each removed item is also acknowledged via ``task_done()``
    so the queue's unfinished-task counter drops with it.
    """
    removed = 0
    while True:
        try:
            target_queue.get_nowait()
        except queue.Empty:
            return removed
        removed += 1
        if mark_done:
            try:
                target_queue.task_done()
            except ValueError:
                pass  # counter already at zero


def _collect_results_stage2():
    """Move everything waiting on ``results_queue_stage2`` into ``all_results_stage2``; return the count."""
    batch = []
    while True:
        try:
            batch.append(results_queue_stage2.get_nowait())
        except queue.Empty:
            break
    if batch:
        with results_stage2_lock:
            all_results_stage2.extend(batch)
    return len(batch)


def _save_results_stage2(results, output_csv_path, leading_columns):
    """Write ``results`` (a list of row dicts) to CSV in a fixed column order; return the row count."""
    column_order = list(leading_columns) + [
        'decision_no', 'case_no', 'nkp_volume', 'nkp_year', 'nkp_month',
        'nkp_issue', 'decision_date', 'judges', 'subject', 'petitioner',
        'respondent', 'lawyers', 'full_text'
    ]
    # reindex adds any missing columns (as empty) and fixes the order.
    frame = pd.DataFrame(results).reindex(columns=column_order)
    frame.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    return len(frame)


def run_scrape_stage2(input_csv_path, output_csv_path, num_threads=5, test_limit=0): # Default to fewer threads
    """
    Read links from the input CSV, scrape their detail pages with worker
    threads, and save the combined rows to the output CSV.

    Results are written incrementally (every 100 collected rows) and once more
    at the end. The run stops waiting, and still saves what it has, when it is
    interrupted with Ctrl+C or when no worker thread is left alive while jobs
    remain; previously both situations blocked on the job queue indefinitely.

    Args:
        input_csv_path: CSV containing at least a ``link`` column.
        output_csv_path: Destination CSV (overwritten on every save).
        num_threads: Number of worker threads to start.
        test_limit: If > 0, only the first ``test_limit`` input rows are used.

    Returns:
        None. Collected rows remain available in ``all_results_stage2``.
    """
    print(f"\n=== INITIALIZING DETAIL SCRAPE (STAGE 2) ===")
    if test_limit > 0:
        print(f"--- !!! TEST RUN MODE: Processing only first {test_limit} rows !!! ---")
    print(f"Input CSV: {input_csv_path}")
    print(f"Output CSV: {output_csv_path}")
    print(f"Threads: {num_threads}")

    # --- Input Validation ---
    if not os.path.exists(input_csv_path):
        print(f"ERROR: Input CSV file not found at '{input_csv_path}'. Exiting.")
        return

    # --- Read Input CSV ---
    try:
        print(f"Reading input CSV: {input_csv_path}...")
        # Columns carried over from the input when they exist there.
        columns_to_keep = [
            'link', 'title', 'ijlas_name', 'mudda_type_value', 'mudda_type_text',
            'mudda_name_value', 'mudda_name_text', 'faisala_type_value', 'page'
        ]
        df_input = pd.read_csv(input_csv_path)

        if 'link' not in df_input.columns:
             print(f"ERROR: Input CSV must contain a 'link' column. Exiting.")
             return

        existing_cols_to_keep = [col for col in columns_to_keep if col in df_input.columns]
        if not existing_cols_to_keep:
             print(f"ERROR: No relevant columns ({columns_to_keep}) found in input CSV. Exiting.")
             return
        df_input = df_input[existing_cols_to_keep].copy()

        # --- Apply Test Limit ---
        if test_limit > 0 and test_limit < len(df_input):
            print(f"Applying test limit: Selecting first {test_limit} rows from {len(df_input)} total.")
            df_input = df_input.head(test_limit).copy()

        if df_input.empty:
            print("No links to process after filtering/limiting. Exiting.")
            return

        # Drop rows with missing or invalid links before counting and queuing,
        # so the reported total matches what is actually processed.
        original_count = len(df_input)
        df_input = df_input.dropna(subset=['link'])
        df_input = df_input[df_input['link'].astype(str).str.startswith('http')]
        if len(df_input) < original_count:
            print(f"Dropped {original_count - len(df_input)} rows with missing/invalid links.")
        print(f"Found {len(df_input)} links to process {'(after applying test limit)' if test_limit > 0 else ''}.")

    except Exception as e:
        print(f"ERROR reading input CSV: {e}. Exiting.")
        print(traceback.format_exc())
        return

    # --- Queue Setup ---
    print("Clearing Stage 2 queues...")
    _drain_queue_stage2(job_queue_stage2)
    try:
        # Reset the unfinished-task counter left over from any earlier run:
        # task_done() raises ValueError once the counter would go below zero.
        while True:
            job_queue_stage2.task_done()
    except ValueError:
        pass
    _drain_queue_stage2(results_queue_stage2)
    print("Queues cleared.")

    with results_stage2_lock:
        all_results_stage2.clear()

    # --- Populate Job Queue ---
    print("Populating job queue...")
    for job_data in df_input.to_dict('records'):
        job_queue_stage2.put(job_data)
    total_jobs_initial = job_queue_stage2.qsize()
    if total_jobs_initial == 0:
        print("No valid jobs to add to queue. Exiting.")
        return
    print(f"Added {total_jobs_initial} jobs to the queue.")

    # --- Threading Setup ---
    start_time = time.time()
    save_interval_results_collected = 100 # Save every N results successfully collected
    last_save_count_results = 0

    print(f"\nStarting {num_threads} worker threads for Stage 2...")
    threads = []
    for i in range(num_threads):
        t = threading.Thread(target=worker_thread_stage2, args=(i,), name=f"Thread-{i}")
        t.daemon = True
        t.start()
        threads.append(t)
    print("Stage 2 worker threads started.")

    # --- Monitoring and Saving Loop ---
    print("\n--- Starting Stage 2 Scrape Monitoring ---")
    monitoring_interval = 10 # Print progress at least every 10 seconds
    last_print_time = time.time()
    stopped_early = False  # True when we give up before every job was processed
    try:
        while job_queue_stage2.unfinished_tasks > 0:
            current_time = time.time()
            unfinished_tasks = job_queue_stage2.unfinished_tasks
            completed_jobs = max(0, total_jobs_initial - unfinished_tasks)
            progress = (completed_jobs / total_jobs_initial) * 100 if total_jobs_initial > 0 else 100
            elapsed = current_time - start_time

            collected_now = _collect_results_stage2()

            # Print progress periodically or whenever results were collected
            if collected_now or current_time - last_print_time > monitoring_interval:
                eta_str = "N/A"
                if completed_jobs > 0 and progress < 100:
                    avg_time = elapsed / completed_jobs
                    eta = avg_time * unfinished_tasks
                    eta_str = f"{eta / 60:.1f} mins" if eta > 0 else "Near completion"
                active_threads = sum(1 for t in threads if t.is_alive())
                print(f"Progress: {progress:.1f}% ({completed_jobs}/{total_jobs_initial}) | Unfinished: {unfinished_tasks} | Active: {active_threads}/{num_threads} | Scraped OK: {len(all_results_stage2)} | Elapsed: {elapsed/60:.1f}m | ETA: {eta_str}")
                last_print_time = current_time

                # --- Incremental Saving ---
                if len(all_results_stage2) >= last_save_count_results + save_interval_results_collected:
                     print(f"\nSaving incrementally ({len(all_results_stage2)} collected >= {last_save_count_results} + {save_interval_results_collected})...")
                     try:
                        with results_stage2_lock:
                             results_to_save = list(all_results_stage2) # Copy under lock

                        _save_results_stage2(results_to_save, output_csv_path, existing_cols_to_keep)
                        print(f"Incremental save successful: {len(results_to_save)} results saved to {output_csv_path}\n")
                        last_save_count_results = len(results_to_save)
                     except Exception as save_err:
                         print(f"ERROR during incremental save: {save_err}\n")
                         print(traceback.format_exc())

            # Check liveness first, then re-read the counter: a worker only
            # exits after acknowledging its last job, so if none is alive and
            # tasks are still outstanding, nothing will ever finish them.
            if not any(t.is_alive() for t in threads) and job_queue_stage2.unfinished_tasks > 0:
                print("\n--- WARNING: No worker threads are alive but jobs remain. Stopping monitoring. ---")
                stopped_early = True
                break

            time.sleep(1) # Small sleep to prevent busy-waiting
        else:
            print("\n--- Monitoring loop finished (Unfinished tasks 0). ---")

    except KeyboardInterrupt:
        print("\n--- KeyboardInterrupt detected! Stopping monitoring... ---")
        stopped_early = True
    except Exception as e:
        print(f"\n--- UNEXPECTED ERROR in main monitoring loop: {type(e).__name__} - {e} ---")
        print(traceback.format_exc())

    finally:
        print("\n--- Starting Final Cleanup (Stage 2) ---")

        abandoned_jobs = 0
        if stopped_early:
            # Discard jobs nobody has started so workers run dry and exit
            # instead of grinding through the rest of the queue.
            abandoned_jobs = _drain_queue_stage2(job_queue_stage2, mark_done=True)
            print(f"Stopped early: discarded {abandoned_jobs} queued jobs that were never started.")

        # Wait for in-flight jobs, but only while a worker exists to finish
        # them; a plain job_queue_stage2.join() would block forever otherwise.
        print("Waiting for in-flight Stage 2 jobs to finish...")
        try:
            while job_queue_stage2.unfinished_tasks > 0 and any(t.is_alive() for t in threads):
                time.sleep(0.5)
        except KeyboardInterrupt:
            print("Interrupted again; not waiting for in-flight jobs.")
        orphaned_jobs = job_queue_stage2.unfinished_tasks
        if orphaned_jobs > 0:
            print(f"WARNING: {orphaned_jobs} jobs were never marked done.")
        else:
            print("All Stage 2 tasks marked done.")

        # Collect final results
        print("Collecting final results from queue...")
        final_count = _collect_results_stage2()
        if final_count:
             print(f"Collected {final_count} final results. Total: {len(all_results_stage2)}")

        # --- Final Save ---
        end_time = time.time()
        total_time_minutes = (end_time - start_time) / 60
        final_completed_jobs = max(0, total_jobs_initial - abandoned_jobs - orphaned_jobs)

        print("\n--- Stage 2 Scrape Summary ---")
        print(f"Run Type: {'TEST RUN (Limited)' if test_limit > 0 else 'Full Run'}")
        print(f"Total execution time: {total_time_minutes:.2f} minutes.")
        print(f"Input links processed: {final_completed_jobs}")
        print(f"Detail pages successfully scraped (results collected): {len(all_results_stage2)}")
        print(f"Output file: {output_csv_path}")
        print("NOTE: Check logs for any 'CRITICAL' or 'WARNING' messages.")

        if all_results_stage2:
            print(f"\nPerforming final save to {output_csv_path}...")
            try:
                with results_stage2_lock:
                    final_rows = list(all_results_stage2)
                _save_results_stage2(final_rows, output_csv_path, existing_cols_to_keep)
                print(f"Successfully saved {len(final_rows)} results.")
            except Exception as save_err:
                print(f"ERROR during final save: {save_err}")
                print(traceback.format_exc())
        else:
            print("\nNo results collected to save.")

        print("\n=== DETAIL SCRAPING PROCESS (STAGE 2) FINISHED ===\n")

print("run_scrape_stage2 function defined.")

run_scrape_stage2 function defined.


In [13]:
def worker_thread_stage2(thread_id):
    """Worker thread function for scraping detail pages.

    Creates one WebDriver for this thread, then repeatedly takes a job from
    ``job_queue_stage2``, scrapes the page at the job's ``'link'`` with
    ``scrape_detail_page`` and puts the merged record (a copy of the job
    updated with the scraped fields) on ``results_queue_stage2``.

    The loop ends when no job arrives within one second, or when the driver
    cannot be re-created after a ``WebDriverException``. Every job taken from
    the queue is acknowledged with ``task_done()`` exactly once, whether it
    was scraped, skipped, or failed -- including malformed jobs (e.g. ``None``)
    that previously crashed the error handlers and left the job
    unacknowledged.

    Args:
        thread_id: Identifier used only to tag this thread's log messages.

    Returns:
        None. Results are delivered through ``results_queue_stage2``.
    """
    safe_print(f"[Thread {thread_id} Stage2] Started.")
    driver = None

    try:  # Outer try: guarantees driver cleanup in the `finally` below
        try:  # Initial driver setup
            driver = initialize_thread_driver()  # Reuse driver init logic
            safe_print(f"[Thread {thread_id} Stage2] WebDriver initialized.")
        except Exception as e:
            safe_print(f"[Thread {thread_id} Stage2] CRITICAL: Failed to initialize WebDriver: {type(e).__name__} - {e}. Thread exiting.")
            return  # Cannot proceed without a driver

        while True:  # Main job processing loop
            # Fetch separately from processing so that "queue is empty" is the
            # only outcome that skips the task_done() bookkeeping below.
            try:
                # Wait up to 1 second for a job before giving up
                job_data = job_queue_stage2.get(block=True, timeout=1)
            except queue.Empty:
                safe_print(f"[Thread {thread_id} Stage2] Job queue empty after waiting. Exiting loop.")
                break

            # Log-safe label: a malformed job (None, or anything without
            # .get) must not raise a second time inside the error handlers.
            link_label = job_data.get('link', 'N/A') if hasattr(job_data, 'get') else 'N/A'

            try:
                link_to_scrape = job_data.get('link')
                if not link_to_scrape:
                    safe_print(f"[Thread {thread_id} Stage2] Skipping job due to missing 'link': {job_data.get('title', 'N/A')}")
                    continue  # `finally` still marks the job done

                safe_print(f"[Thread {thread_id} Stage2] Processing Link: {link_to_scrape} (Title: {job_data.get('title', 'N/A')})")

                # --- Call the detail scraping function ---
                scraped_details = scrape_detail_page(driver, link_to_scrape)

                if scraped_details is not None:
                    # Original row first, then add/overwrite with scraped fields
                    combined_result = job_data.copy()
                    combined_result.update(scraped_details)
                    results_queue_stage2.put(combined_result)
                    safe_print(f"[Thread {thread_id} Stage2] Successfully scraped: {link_to_scrape}")
                else:
                    # None signals a navigation/critical failure; the job is
                    # dropped rather than retried or recorded as a placeholder.
                    safe_print(f"[Thread {thread_id} Stage2] Failed to scrape (navigation/critical error): {link_to_scrape}")

            except WebDriverException as e:
                safe_print(f"[Thread {thread_id} Stage2] WebDriverException for link {link_label}: {type(e).__name__}. Reinitializing driver.")
                # Drop our reference first so the outer `finally` never quits
                # the same driver a second time if re-initialisation fails.
                stale_driver, driver = driver, None
                try:
                    if stale_driver:
                        stale_driver.quit()
                except Exception:
                    pass  # Best effort: the old session may already be dead
                try:
                    driver = initialize_thread_driver()
                    safe_print(f"[Thread {thread_id} Stage2] WebDriver reinitialized.")
                except Exception as init_err:
                    safe_print(f"[Thread {thread_id} Stage2] CRITICAL: Failed re-init WebDriver: {init_err}. Thread exiting.")
                    break  # `finally` below still marks the current job done

            except Exception as e:
                safe_print(f"[Thread {thread_id} Stage2] UNEXPECTED ERROR processing link {link_label}: {type(e).__name__} - {e}")
                safe_print(f"[Thread {thread_id} Stage2] Traceback:\n{traceback.format_exc()}")
                safe_print(f"[Thread {thread_id} Stage2] Skipping to next job attempt.")
                # Fall through to the next iteration

            finally:
                # Exactly one task_done() per successful get(), regardless of
                # scraping success/failure (we attempted it).
                job_queue_stage2.task_done()

        # <<< End of while True loop >>>

    finally:
        # Cleanup
        if driver:
            try:
                driver.quit()
                safe_print(f"[Thread {thread_id} Stage2] WebDriver quit.")
            except Exception as e:
                safe_print(f"[Thread {thread_id} Stage2] Error quitting WebDriver: {e}")
        safe_print(f"[Thread {thread_id} Stage2] Finished.")

In [15]:
# === Stage 2 run settings: edit these before executing the cell ===

# Input CSV produced by Stage 1 (use the exact path).
#    Example: input_file = "nepal_law_type5_full_640r_202j.csv"
input_file = "nepal_law_type4_full_1976.csv"

# Output CSV that will receive the scraped details.
#    Example: output_file = "nepal_law_type5_DETAILS_final.csv"
output_file = "nepal_law_type4_FULL_DETAILS.csv"

# Number of worker threads handed to the scraper.
num_threads_stage2 = 10

# 0 runs on all links; N > 0 is a test run on the first N links.
test_run_limit = 0

# === Launch ===
# Start the scrape only when neither filename is still a template
# placeholder; otherwise print a reminder banner instead.
if input_file != "YOUR_INPUT_FILE.csv" and output_file != "YOUR_OUTPUT_FILE_DETAILS.csv":
    run_scrape_stage2(
        test_limit=test_run_limit,
        num_threads=num_threads_stage2,
        output_csv_path=output_file,
        input_csv_path=input_file,
    )
else:
    print(
        "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
        "!!! PLEASE EDIT CELL 6 TO SET YOUR INPUT AND OUTPUT FILENAMES !!!",
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
        sep="\n",
    )


=== INITIALIZING DETAIL SCRAPE (STAGE 2) ===
Input CSV: nepal_law_type4_full_1976.csv
Output CSV: nepal_law_type4_FULL_DETAILS.csv
Threads: 10
Reading input CSV: nepal_law_type4_full_1976.csv...
Found 1976 links to process .
Clearing Stage 2 queues...
Queues cleared.
Populating job queue...
Added 1976 jobs to the queue.

Starting 10 worker threads for Stage 2...
[Thread 0 Stage2] Started.
[Thread 1 Stage2] Started.
[Thread 2 Stage2] Started.
[Thread 3 Stage2] Started.
[Thread 4 Stage2] Started.
[Thread 5 Stage2] Started.
[Thread 6 Stage2] Started.
[Thread 7 Stage2] Started.
[Thread 8 Stage2] Started.
[Thread 9 Stage2] Started.
Stage 2 worker threads started.

--- Starting Stage 2 Scrape Monitoring ---
[Thread 6 Stage2] WebDriver initialized.
[Thread 6 Stage2] Processing Link: https://nkp.gov.np/full_detail/3621 (Title: निर्णय नं. ८३७४ - कुटपीट अंगभंग ।)
[Thread-6] Attempting navigation to: https://nkp.gov.np/full_detail/3621
[Thread 7 Stage2] WebDriver initialized.
[Thread 7 Stage2] P