In [1]:
import os
import json
from collections import defaultdict

# Input folders that are merged, and the folder the merged result is written to.
# All paths are relative to the current working directory.
folder_1 = "./Merged_Job_Listings_with_ArchiveLink"
folder_2 = "./final_withhtml"
output_directory = "./final_withhtml_withcategories"

# Keys copied into every merged job record; any other key found in the
# input records is left out of the output.
fields_to_keep = [
    "Title",
    "URL",
    "Area",
    "Category_Job_ID",
    "Category_Job",
    "Published",
    "Job Link",
    "HTML_Text",
]

def load_json_files(folder_path):
    """Load every ``*.json`` file in *folder_path* into one dict of jobs.

    Each file is expected to contain a JSON list of job objects. Jobs are
    keyed by their "Job Link" value; jobs without a truthy "Job Link" are
    ignored. When the same link occurs more than once, the record read last
    wins. Files are processed in sorted name order so that this rule gives
    the same result on every run and platform.

    Files that cannot be decoded or parsed, or whose top-level value is not
    a list, are reported with ``print`` and skipped. List entries that are
    not JSON objects are skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping each "Job Link" value to its job dict.

    Raises:
        OSError: If *folder_path* cannot be listed or a file cannot be opened.
    """
    all_jobs = {}
    # os.listdir() returns entries in arbitrary order; sort so that the
    # "last record wins" rule for duplicate links is reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        if not (file_name.endswith(".json") and os.path.isfile(file_path)):
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error reading {file_path}: {e}")
            continue

        if not isinstance(data, list):
            # Iterating a JSON object would yield its keys (strings), which
            # have no .get(); report the file instead of crashing on it.
            print(
                f"Error reading {file_path}: expected a JSON list of jobs, "
                f"got {type(data).__name__}"
            )
            continue

        for job in data:
            if not isinstance(job, dict):
                continue
            job_link = job.get("Job Link")
            if job_link:
                all_jobs[job_link] = job
    return all_jobs

# Load job listings from both folders, each keyed by its "Job Link".
jobs_folder_1 = load_json_files(folder_1)
jobs_folder_2 = load_json_files(folder_2)

# Merge job listings by Job Link and group them by their "Published" value.
# Folder 1 drives the merge: a listing that exists only in folder 2 is not
# part of the output.
merged_jobs_by_date = defaultdict(list)
skipped_without_date = 0
for job_link, job in jobs_folder_1.items():
    merged_job = {key: job.get(key) for key in fields_to_keep}
    # Folder 2 values take precedence, but only when they carry data: a
    # missing key or an explicit null must not erase what folder 1 provided.
    extra = jobs_folder_2.get(job_link, {})
    for key in fields_to_keep:
        value = extra.get(key)
        if value is not None:
            merged_job[key] = value
    published_date = merged_job.get("Published")
    if published_date:
        merged_jobs_by_date[published_date].append(merged_job)
    else:
        # Without a date there is no output file to put the listing in.
        skipped_without_date += 1

# Ensure output directory exists
os.makedirs(output_directory, exist_ok=True)

# Resolve every date to its output file name first. Two spellings of the same
# date (e.g. "2024-01-02" and "20240102") map to the same file and must be
# combined rather than overwrite each other. str() guards against a
# non-string "Published" value.
jobs_by_file = defaultdict(list)
for date, jobs in merged_jobs_by_date.items():
    file_name = f"job_listings_{str(date).replace('-', '')}.json"
    jobs_by_file[file_name].extend(jobs)

# Save each day's job listings to a separate file
for file_name, jobs in jobs_by_file.items():
    output_file = os.path.join(output_directory, file_name)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(jobs, f, indent=4, ensure_ascii=False)

if skipped_without_date:
    print(f"Skipped {skipped_without_date} job listings without a 'Published' value")
print(f"Merged job listings saved to {output_directory}")


Merged job listings saved to ./final_withhtml_withcategories
