In [None]:
import os
import json
import nest_asyncio
import asyncio
from pathlib import Path
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from asyncio import Semaphore

# Apply nest_asyncio for compatibility with Jupyter/Async environments.
# NOTE(review): presumably needed because the asyncio.run(main()) call at the
# bottom of this cell is executed while the notebook's own event loop is
# already running - confirm before removing.
nest_asyncio.apply()

# Shared cap on simultaneous scrapes: scrape_and_clean_text() acquires this
# semaphore before it launches a browser and holds it for the whole page visit.
SEMAPHORE = Semaphore(15)  # Limit concurrent tasks (adjust based on your system)
# process_json_file() overwrites this file with the name of the JSON file it
# has just finished writing.
LOG_FILE = "process_log.txt"  # Log file to track last processed file

def clean_text_with_beautifulsoup(html_content):
    """Strip the markup from *html_content* and return the remaining text.

    The HTML is parsed with Beautiful Soup's ``html.parser`` backend. Each
    text fragment is stripped of surrounding whitespace and the fragments are
    joined with a single space.
    """
    parsed_document = BeautifulSoup(html_content, features='html.parser')
    return parsed_document.get_text(separator=' ', strip=True)

async def extract_html_from_frame(frame):
    """Return the ``innerHTML`` of the ``<body>`` of *frame*.

    The value is obtained by evaluating a small JavaScript function inside the
    frame; whatever that evaluation raises is propagated to the caller.
    """
    body_html_script = "() => document.body.innerHTML"
    body_html = await frame.evaluate(body_html_script)
    return body_html

async def scrape_and_clean_text(url):
    """Scrape the text of the page at *url*, clean it, and return it.

    A headless Chromium instance is launched for this single page while
    ``SEMAPHORE`` is held, so at most that many browsers run at once. The
    browser is always closed again, even when loading or extraction fails.

    Loading is best-effort: if navigation or the short network-idle wait
    fails, the error is printed and whatever has been loaded so far is still
    extracted. Every frame of the page is read independently, so a frame that
    cannot be read (for example one without a ``<body>``) is skipped instead
    of discarding the HTML of all the other frames.

    Args:
        url: Address of the page to load.

    Returns:
        The cleaned text of all readable frames joined together, or ``None``
        when no frame could be read or the combined HTML is empty.
    """
    async with SEMAPHORE:  # Ensure limited concurrent tasks
        async with async_playwright() as p:
            # Launch headless browser
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                # Navigate to the URL
                print(f"Accessing: {url}")
                try:
                    # goto() runs with Playwright's default navigation
                    # timeout; only the extra network-idle wait is shortened.
                    await page.goto(url)
                    await page.wait_for_load_state('networkidle', timeout=1500)
                except Exception as e:
                    # Deliberately broad: proceed with whatever is loaded.
                    print(f"Encountered an error while loading {url}: {e}")

                # return_exceptions=True keeps the HTML of the frames that
                # succeeded when another frame raises.
                frame_results = await asyncio.gather(
                    *(extract_html_from_frame(frame) for frame in page.frames),
                    return_exceptions=True,
                )
            finally:
                # Close the browser on every path so no Chromium process leaks.
                await browser.close()

    frame_errors = [res for res in frame_results if isinstance(res, BaseException)]
    frame_htmls = [res for res in frame_results if isinstance(res, str)]

    if frame_errors and not frame_htmls:
        print(f"Failed to extract HTML content for {url}: {frame_errors[0]}")
        return None
    if frame_errors:
        print(
            f"Skipped {len(frame_errors)} unreadable frame(s) for {url}: "
            f"{frame_errors[0]}"
        )

    # Combine all frame HTML content
    combined_html = " ".join(frame_htmls)
    if not combined_html:
        return None

    # Clean the combined HTML using Beautiful Soup
    return clean_text_with_beautifulsoup(combined_html)

async def process_json_file(json_file, output_folder):
    """Enrich every job listing of one JSON file with its scraped page text.

    The file is expected to hold a list of job dictionaries. For each job
    that has a "Job Link", the linked page is scraped and the cleaned text is
    stored under "HTML_Text" (``None`` when scraping yields nothing). Jobs
    without a link are skipped and are not written to the output. The result
    is saved under the same file name inside *output_folder*, and LOG_FILE is
    overwritten with that file name afterwards.
    """
    destination_dir = Path(output_folder)
    # Create output folder if not exists
    destination_dir.mkdir(parents=True, exist_ok=True)

    print(f"Processing file: {json_file.name}")
    listings = json.loads(Path(json_file).read_text(encoding="utf-8"))

    enriched_listings = []
    for listing in listings:
        link = listing.get("Job Link")
        if link:
            print(f"Scraping URL: {link}")
            page_text = await scrape_and_clean_text(link)
            # Falsy results (None or empty text) are normalised to None.
            listing["HTML_Text"] = page_text or None
            if page_text:
                print(f"Extracted HTML text added to job listing: {link}")
            else:
                print(f"Failed to extract HTML text for: {link}")
            enriched_listings.append(listing)
        else:
            print("Skipping job listing without 'Job Link'")

    # Write the enriched listings next to their siblings in the output folder.
    destination_file = destination_dir / json_file.name
    with open(destination_file, "w", encoding="utf-8") as sink:
        json.dump(enriched_listings, sink, ensure_ascii=False, indent=4)
    print(f"Saved updated JSON file to: {destination_file}")

    # Record this file as the most recently completed one.
    with open(LOG_FILE, "w", encoding="utf-8") as progress_log:
        progress_log.write(json_file.name)

async def process_files(input_folder, output_folder, start_file=None, max_concurrent_files=15):
    """Process the JSON files of *input_folder* with bounded concurrency.

    Files are handled in sorted name order, each by its own
    ``process_json_file`` task, with at most *max_concurrent_files* tasks in
    flight. A file that fails is reported by name and does not stop the
    remaining files from being processed.

    Args:
        input_folder: Folder that is searched (non-recursively) for ``*.json``.
        output_folder: Folder passed on to ``process_json_file``.
        start_file: Optional file name to resume from. When it is not found a
            warning is printed and processing starts from the first file.
        max_concurrent_files: Maximum number of files processed at once.

    Raises:
        ValueError: If *max_concurrent_files* is smaller than 1.
    """
    if max_concurrent_files < 1:
        raise ValueError("max_concurrent_files must be at least 1")

    json_files = sorted(Path(input_folder).glob("*.json"))  # Sort files for consistency

    # Find the index to start processing from
    start_index = 0
    if start_file:
        file_names = [path.name for path in json_files]
        if start_file in file_names:
            start_index = file_names.index(start_file)
        else:
            print(
                f"Start file '{start_file}' not found in {input_folder}; "
                "starting from the beginning."
            )

    running = {}  # Maps each in-flight task to the name of its file

    def report_finished(finished_tasks):
        """Drop finished tasks from `running` and print any failure."""
        for task in finished_tasks:
            file_name = running.pop(task)
            if task.cancelled():
                print(f"Processing of {file_name} was cancelled")
                continue
            # Reading the exception marks it as retrieved, so failures are
            # reported here instead of being lost when the task is collected.
            error = task.exception()
            if error is not None:
                print(f"Failed to process {file_name}: {error!r}")

    for json_file in json_files[start_index:]:
        if len(running) >= max_concurrent_files:
            # Wait for a slot to free up before starting another file
            done, _pending = await asyncio.wait(
                list(running), return_when=asyncio.FIRST_COMPLETED
            )
            report_finished(done)

        task = asyncio.create_task(process_json_file(json_file, output_folder))
        running[task] = json_file.name

    # Wait for the remaining tasks to complete
    if running:
        done, _pending = await asyncio.wait(list(running))
        report_finished(done)

async def main():
    """Ask where to start, then process every JSON file of the archive.

    Files are read from the "archive" folder and the enriched copies are
    written to "final_withhtml". An empty answer to the prompt means
    processing starts from the first file.
    """
    source_dir, target_dir = "archive", "final_withhtml"

    # Allow user to specify the starting file; blank input becomes None.
    answer = input("Enter the name of the starting file (or press Enter to start from the beginning): ")
    resume_from = answer.strip() or None

    await process_files(
        source_dir,
        target_dir,
        start_file=resume_from,
        max_concurrent_files=15,
    )

# Run the main coroutine. This executes as soon as the cell is run and blocks
# until every file has been processed; it relies on the nest_asyncio.apply()
# call near the top of the cell.
asyncio.run(main())


Processing file: job_listings_20180106_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h757986
Processing file: job_listings_20180108_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h758416
Processing file: job_listings_20180109_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h758781
Processing file: job_listings_20180110_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h759180
Processing file: job_listings_20180111_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h759675
Processing file: job_listings_20180112_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h760093
Processing file: job_listings_20180114_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid=h759175
Processing file: job_listings_20180115_archive.json
Scraping URL: https://www.jobindexarkiv.dk/cgi/showarchive.