In [None]:
import os
import json
import nest_asyncio
import asyncio
import aiofiles
from pathlib import Path
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from asyncio import Semaphore
from datetime import datetime

# Apply nest_asyncio for compatibility with Jupyter/Async environments:
# it lets asyncio.run() be called at the bottom of this cell even when an
# event loop is already running.
nest_asyncio.apply()

# Shared limit on concurrent scrapes: scrape_and_clean_text() holds this
# semaphore for the whole lifetime of its browser, so at most 3 browsers
# are open at any time.
SEMAPHORE = Semaphore(3)
# Progress marker: process_json_file() overwrites this file with the name of
# the JSON file it has just handled. Note that log_message() does NOT write
# here; it appends to "error_log.txt".
LOG_FILE = "[STEP4]get_the_archive_html_playwright_process_log.txt"  # Log file to track last processed file


def log_message(message):
    """Print *message* with a timestamp and append the same line to ``error_log.txt``.

    The timestamp is the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``. The log file is opened in append mode in the
    current working directory for every call.
    """
    stamped_line = "{} - {}".format(
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"), message
    )
    print(stamped_line)
    with open("error_log.txt", mode="a", encoding="utf-8") as sink:
        sink.write(stamped_line + "\n")


def clean_text_with_beautifulsoup(html_content):
    """Return the plain text contained in *html_content*.

    The markup is parsed with BeautifulSoup's built-in ``html.parser``; the
    text fragments are stripped of surrounding whitespace and joined with a
    single space.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    plain_text = soup.get_text(separator=" ", strip=True)
    return plain_text


async def extract_html_from_frames(page):
    """Extract raw HTML content from all frames on the page.

    Every frame in ``page.frames`` is asked for the ``innerHTML`` of its
    ``<body>`` concurrently, and the results are joined with a single space.

    A frame that cannot be evaluated (for example because it was detached
    while the page was still loading) is logged and skipped, so one bad
    frame no longer discards the HTML of all the others. A frame whose
    document has no ``<body>`` contributes an empty string.

    Args:
        page: Object exposing a ``frames`` iterable whose items provide an
            awaitable ``evaluate(script)`` method (a Playwright ``Page``).

    Returns:
        The joined HTML string, or ``None`` if the frames could not be
        enumerated or every frame failed.
    """
    # `document.body` is null for documents without a <body>; guard in JS so
    # such frames yield "" instead of throwing.
    script = """() => document.body ? document.body.innerHTML : ''"""
    try:
        results = await asyncio.gather(
            *[frame.evaluate(script) for frame in page.frames],
            return_exceptions=True,  # collect per-frame errors instead of aborting
        )
    except Exception as e:
        log_message(f"Error extracting HTML from frames: {e}")
        return None

    html_parts = []
    failed_frames = 0
    for result in results:
        if isinstance(result, BaseException):
            failed_frames += 1
            log_message(f"Error extracting HTML from a frame: {result}")
        else:
            html_parts.append(result)

    # Nothing usable at all: report failure the same way as before.
    if failed_frames and not html_parts:
        return None
    return " ".join(html_parts)


async def scrape_and_clean_text(url, file_name):
    """Scrape visible text from a webpage, clean it, and return it.

    A headless Firefox instance is launched for this single URL while the
    module-level ``SEMAPHORE`` is held, which caps the number of browsers
    open at once. Navigation is attempted up to three times (15 s for
    ``goto`` plus 10 s waiting for the network to go idle); the retries are
    immediate, with no delay between them.

    Args:
        url: Address of the page to load.
        file_name: Name of the JSON file the URL came from; used only as a
            prefix in log messages.

    Returns:
        The page text with markup removed, or ``None`` if the browser could
        not be started, navigation failed on every attempt, or no HTML could
        be extracted.
    """
    async with SEMAPHORE:
        async with async_playwright() as p:
            # Stays None when launch() itself fails, so the cleanup below does
            # not raise UnboundLocalError and mask the original error.
            browser = None
            try:
                browser = await p.firefox.launch(headless=True)
                page = await browser.new_page()

                max_retries = 3
                for attempt in range(max_retries):
                    try:
                        log_message(f"[{file_name}] Accessing: {url}")
                        await page.goto(url, timeout=15000)
                        await page.wait_for_load_state('networkidle', timeout=10000)
                        break
                    except Exception as e:
                        log_message(f"[{file_name}] Retry {attempt + 1} failed for {url}: {e}")
                        if attempt == max_retries - 1:
                            log_message(f"[{file_name}] Skipping {url} after {max_retries} attempts.")
                            return None

                html_content = await extract_html_from_frames(page)
                return clean_text_with_beautifulsoup(html_content) if html_content else None
            except Exception as e:
                log_message(f"[{file_name}] Error processing {url}: {e}")
                return None
            finally:
                if browser is not None:
                    try:
                        await browser.close()
                    except Exception as e:
                        # A failing close must not discard text that was
                        # already scraped successfully.
                        log_message(f"[{file_name}] Error closing browser for {url}: {e}")


async def process_json_file(json_file, output_folder):
    """Process a single JSON file, updating it with scraped HTML text.

    The file is expected to contain a list of job dictionaries. For every
    job that has a ``"Job Link"``, the linked page is scraped and the result
    (or ``None`` on failure) is stored under ``"HTML_Text"``. Jobs without a
    link are logged and left out of the output. The updated list is written
    to ``output_folder`` under the same file name.

    The progress marker ``LOG_FILE`` is overwritten with this file's name
    only after the output was saved successfully, so a failed save is never
    recorded as processed.

    Args:
        json_file: ``pathlib.Path`` of the input JSON file.
        output_folder: Directory for the updated file; created if missing.

    Returns:
        None. An unparsable input file or a failed save is logged and the
        function returns early.
    """
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    log_message(f"Processing file: {json_file.name}")
    async with aiofiles.open(json_file, mode="r", encoding="utf-8") as f:
        try:
            data = json.loads(await f.read())
        except json.JSONDecodeError as e:
            log_message(f"Error reading JSON file {json_file.name}: {e}")
            return

    updated_data = []
    for job in data:
        job_link = job.get("Job Link")
        if not job_link:
            log_message(f"[{json_file.name}] Skipping job listing without 'Job Link'")
            continue

        # Scrape and clean the content
        cleaned_text = await scrape_and_clean_text(job_link, json_file.name)
        job["HTML_Text"] = cleaned_text
        status = "successfully" if cleaned_text else "unsuccessfully"
        log_message(f"[{json_file.name}] Processed {job_link} {status}.")
        updated_data.append(job)

    # Save the updated JSON file
    output_file = output_path / json_file.name
    try:
        async with aiofiles.open(output_file, mode="w", encoding="utf-8") as out_f:
            await out_f.write(json.dumps(updated_data, ensure_ascii=False, indent=4))
        log_message(f"Saved updated file: {output_file}")
    except Exception as e:
        log_message(f"Error saving file {output_file}: {e}")
        # Do not record progress for a file whose output was not written.
        return

    # Record the processed file only now that its output is on disk
    async with aiofiles.open(LOG_FILE, mode="w", encoding="utf-8") as log_f:
        await log_f.write(json_file.name)


async def process_files_in_batches(input_folder, output_folder, start_file=None, batch_size=3):
    """Process JSON files in batches.

    All ``*.json`` files in ``input_folder`` are sorted by path and handled
    ``batch_size`` at a time; the files of one batch run concurrently and
    the next batch starts only when the whole batch has finished.

    Args:
        input_folder: Directory containing the input JSON files.
        output_folder: Directory passed on to ``process_json_file``.
        start_file: Name of the file to resume from. When it is ``None``,
            empty, or not present in ``input_folder``, processing starts at
            the first file; the not-found case is logged.
        batch_size: Number of files processed concurrently; must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() would raise for 0 and silently do nothing for negatives.
        raise ValueError("batch_size must be a positive integer")

    input_path = Path(input_folder)
    json_files = sorted(input_path.glob("*.json"))

    # Find the start index if a starting file is provided
    start_index = next(
        (i for i, file in enumerate(json_files) if file.name == start_file),
        None,
    )
    if start_index is None:
        if start_file:
            # Make the fallback visible: a typo would otherwise silently
            # re-process every file from the beginning.
            log_message(
                f"Starting file '{start_file}' not found in {input_folder}; "
                "starting from the beginning."
            )
        start_index = 0

    # Process files in batches
    for i in range(start_index, len(json_files), batch_size):
        batch = json_files[i:i + batch_size]
        tasks = [process_json_file(file, output_folder) for file in batch]
        await asyncio.gather(*tasks)


async def main():
    """Ask for an optional starting file, then run the batch processor.

    Input is read from the ``archive_filtered`` folder and results are
    written to ``final_withhtml``. A blank answer means "start from the
    first file". Any exception raised along the way is logged rather than
    propagated.
    """
    input_folder = "archive_filtered"  # source JSON files
    output_folder = "final_withhtml"  # destination for updated JSON files

    try:
        answer = input("Enter the name of the starting file (or press Enter to start from the beginning): ")
        chosen_file = answer.strip() or None
        await process_files_in_batches(input_folder, output_folder, start_file=chosen_file)
    except Exception as e:
        log_message(f"Unexpected error: {e}")

# Run the main coroutine to completion. This executes as soon as the cell
# (or module) is run and blocks until every batch has been processed.
asyncio.run(main())


2024-12-11 12:21:10 - Processing file: job_listings_20240110_archive.json
2024-12-11 12:21:10 - Processing file: job_listings_20240111_archive.json
2024-12-11 12:21:10 - Processing file: job_listings_20240112_archive.json
2024-12-11 12:21:19 - [job_listings_20240112_archive.json] Accessing: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h1433576
2024-12-11 12:21:26 - [job_listings_20240110_archive.json] Accessing: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h1432675
2024-12-11 12:21:33 - [job_listings_20240111_archive.json] Accessing: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h1433129
2024-12-11 12:21:41 - [job_listings_20240110_archive.json] Retry 1 failed for https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h1432675: Page.goto: Timeout 15000ms exceeded.
Call log:
  - navigating to "https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h1432675", waiting until "load"

2024-12-11 12:21:41 - [job_listings_20240110_archive.json] Accessing: https://www.jobinde