In [None]:
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to load progress from progress.json
def load_progress(progress_file):
    """Read the resume log and return it as a dict.

    Args:
        progress_file: Path of the JSON progress log.

    Returns:
        The parsed JSON object when the file exists and holds a JSON
        object; otherwise ``{"processed_files": []}``. The fallback is used
        when the file is missing, empty, truncated, not valid JSON/text, or
        holds something other than a JSON object.
    """
    try:
        with open(progress_file, 'r') as file:
            progress = json.load(file)
    except FileNotFoundError:
        # First run: nothing has been recorded yet.
        return {"processed_files": []}
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # An interrupted write can leave the log empty or cut short; starting
        # over is preferable to aborting the whole run.
        print(f"Warning: could not parse {progress_file}; starting with empty progress")
        return {"processed_files": []}

    if not isinstance(progress, dict):
        # The caller uses progress.get(...), so anything but a dict is unusable.
        print(f"Warning: unexpected content in {progress_file}; starting with empty progress")
        return {"processed_files": []}
    return progress

# Serializes writers of the progress log: process_file calls save_progress
# from several ThreadPoolExecutor worker threads at once.
_PROGRESS_WRITE_LOCK = threading.Lock()


# Function to save progress to progress.json
def save_progress(progress_file, processed_files):
    """Write the list of processed files to the progress log.

    The log is written to ``<progress_file>.tmp`` and then moved over the
    real file with ``os.replace``, so a reader never observes a partially
    written log. A module-level lock keeps concurrent callers from
    interleaving their writes.

    Args:
        progress_file: Path of the JSON progress log to (over)write.
        processed_files: Iterable of filenames to record under the
            ``"processed_files"`` key.
    """
    tmp_path = f"{progress_file}.tmp"
    with _PROGRESS_WRITE_LOCK:
        # Copy first so the data being serialized cannot change while
        # other threads keep appending to the shared list.
        snapshot = list(processed_files)
        with open(tmp_path, 'w') as file:
            json.dump({"processed_files": snapshot}, file)
        os.replace(tmp_path, progress_file)

# Function to convert Jobindex URL to archive URL
def generate_archive_url(job_url):
    """Build the jobindexarkiv.dk archive URL for a Jobindex job-ad URL.

    The job ID is the last path segment after
    ``https://www.jobindex.dk/vis-job/``. Surrounding whitespace, a trailing
    slash, and any query string or fragment are ignored when extracting it.

    Args:
        job_url: The original job-ad URL. Non-string values are rejected.

    Returns:
        ``{"Job Link": <archive url>}`` on success.
        ``{"Job Link": "Invalid Job URL", "Content": None}`` when the value
        is not a string or does not start with the expected prefix.
        ``{"Job Link": "Invalid Job ID", "Content": None}`` when no job ID
        follows the prefix.
    """
    prefix = "https://www.jobindex.dk/vis-job/"

    if not isinstance(job_url, str):
        return {"Job Link": "Invalid Job URL", "Content": None}

    url = job_url.strip()
    if not url.startswith(prefix):
        return {"Job Link": "Invalid Job URL", "Content": None}

    # Work only on the part after the prefix so that a bare prefix yields an
    # empty ID instead of picking up "vis-job" as the last segment.
    remainder = url[len(prefix):]
    # Query string and fragment are not part of the job ID.
    remainder = remainder.split('?', 1)[0].split('#', 1)[0]
    job_id = remainder.strip('/').split('/')[-1].strip()

    if not job_id:
        return {"Job Link": "Invalid Job ID", "Content": None}

    archive_url = f"https://www.jobindexarkiv.dk/cgi/showarchive.cgi?tid={job_id}"
    return {"Job Link": archive_url}



# Function to process a single file
def process_file(filename, input_folder, output_folder, progress_file, processed_files):
    """Add a 'Job Link' column to one listings file and save the result.

    Reads ``input_folder/filename`` with ``pd.read_json``, passes every value
    of its 'URL' column through ``generate_archive_url``, stores the returned
    'Job Link' strings in a new column and writes the frame to
    ``output_folder/<name>_archive.json``. On success the filename is
    appended to ``processed_files`` and the progress log is saved.

    The output is first written to a ``.tmp`` sibling and then renamed, so an
    interrupted write cannot leave a partial ``*_archive.json`` behind that a
    later run would mistake for a finished file.

    Args:
        filename: Name of the file inside ``input_folder``.
        input_folder: Directory holding the input JSON files.
        output_folder: Directory that receives the ``*_archive.json`` files.
        progress_file: Path of the progress log passed to ``save_progress``.
        processed_files: Shared list of finished filenames; mutated in place.

    Returns:
        A status string describing what happened. Errors raised while
        processing are reported in that string rather than propagated.
    """
    suffix = '.json'
    if not filename.endswith(suffix):
        return f"Skipping non-JSON file: {filename}"

    # Swap only the trailing extension; str.replace would also rewrite a
    # '.json' that happens to appear earlier in the name.
    output_name = filename[:-len(suffix)] + '_archive.json'
    output_json_path = os.path.join(output_folder, output_name)

    # Check if the output file already exists
    if os.path.exists(output_json_path):
        return f"Skipping file (already processed): {filename}"

    input_file_path = os.path.join(input_folder, filename)
    tmp_output_path = output_json_path + '.tmp'
    print(f"Processing file: {filename}")

    try:
        # Load the JSON file into a DataFrame
        df = pd.read_json(input_file_path)

        # Generate the archive URL for every row and keep its 'Job Link'
        df['Job Link'] = df['URL'].apply(
            lambda url: generate_archive_url(url)['Job Link']
        )

        # Write to a temp file, then move it into place in one step
        df.to_json(tmp_output_path, orient='records', indent=4)
        os.replace(tmp_output_path, output_json_path)

        # Update progress after processing the file
        processed_files.append(filename)
        save_progress(progress_file, processed_files)

        return f"Processed and saved: {filename}"
    except Exception as e:
        # Best-effort cleanup of a half-written temp file; the original
        # error is what gets reported.
        try:
            os.remove(tmp_output_path)
        except OSError:
            pass
        return f"Error processing {filename}: {str(e)}"

# Directories
input_folder = 'Merged_Job_Listings'
output_folder = 'Merged_Job_Listings_with_ArchiveLink'
progress_file = '[STEP2]generate_archive_url_html_progress_log.json'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Load progress
progress = load_progress(progress_file)
processed_files = progress.get("processed_files", [])

# Set copy for O(1) membership tests; processed_files itself stays a list
# because it is handed to the workers, which append to it.
already_done = set(processed_files)

# JSON files in the input folder that are not yet recorded as processed,
# sorted for consistent ordering
files_to_process = sorted(
    f for f in os.listdir(input_folder)
    if f.endswith('.json') and f not in already_done
)

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its filename for error reporting
    futures = {
        executor.submit(process_file, f, input_folder, output_folder, progress_file, processed_files): f
        for f in files_to_process
    }

    # Report results in submission order
    for future in futures:
        try:
            print(future.result())
        except Exception as e:
            print(f"Error with file {futures[future]}: {str(e)}")


Processing file: job_listings_20160101.json
Processing file: job_listings_20160102.json
Processing file: job_listings_20160103.json
Processing file: job_listings_20160104.json
Processing file: job_listings_20160105.json
Processing file: job_listings_20160106.json
Processing file: job_listings_20160107.json
Processing file: job_listings_20160108.json
Processing file: job_listings_20160109.json
Processing file: job_listings_20160110.json
Processing file: job_listings_20160111.json
Processed and saved: job_listings_20160101.json
Processing file: job_listings_20160113.json
Processing file: job_listings_20160112.json
Processing file: job_listings_20160114.json
Processing file: job_listings_20160115.json
Processed and saved: job_listings_20160102.json
Processed and saved: job_listings_20160103.json
Processed and saved: job_listings_20160104.json
Processed and saved: job_listings_20160105.json
Processed and saved: job_listings_20160106.json
Processing file: job_listings_20160118.json
Processi