In [2]:
from bs4 import BeautifulSoup
import json

In [None]:
# Folder containing the input HTML files; html_to_json() opens
# f'{folder_path}/{job_id}' relative to this folder.
folder_path = "job-htmls"


def _summary_text(summary_section):
    """Return the body text of the summary section.

    Returns "Not available" when the section is missing. Otherwise returns the
    text that follows the first "Summary" heading, stripped of surrounding
    whitespace. If the heading word does not occur at all, the whole section
    text is returned instead of failing.
    """
    if not summary_section:
        return "Not available"

    full_text = summary_section.get_text()
    # Cut only at the FIRST "Summary" (the heading). Splitting on every
    # occurrence and taking element [1] would truncate the body at a later
    # "Summary" and raise IndexError when the word is absent.
    _before, heading, body = full_text.partition("Summary")
    return (body if heading else full_text).strip()


def _list_item_texts(section):
    """Return the stripped text of every <li> under `section` ([] if it is missing)."""
    if not section:
        return []
    return [li.get_text(strip=True) for li in section.find_all("li")]


def html_to_json(job_id):
    """Extract the main sections of one job-posting HTML file and save them as JSON.

    The input is read from f'{folder_path}/{job_id}' and parsed with
    BeautifulSoup's "html.parser". The extracted data is written to
    f"{job_id}.json" (resolved against the current working directory) and a
    "Processed: ..." line is printed.

    Args:
        job_id: File name of the HTML posting inside `folder_path`
            (e.g. 'usa_jobs_520636600.html'). It is also used verbatim as the
            stem of the output file name.

    Returns:
        None. The result is the JSON file with the keys "summary", "overview",
        "hiring_paths", "duties", "requirements", "evaluation" and
        "required_documents". Missing text sections become "Not available";
        missing list/dict sections become empty containers.

    Raises:
        OSError: If the input file cannot be opened (e.g. FileNotFoundError)
            or the output file cannot be written.
    """
    with open(f'{folder_path}/{job_id}', "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Extract Summary (heading text removed)
    summary = _summary_text(soup.find("div", id="summary"))

    # Extract Overview (and all subsections).
    # NOTE: a multi-class string given to class_ is matched against the exact
    # value of the class attribute, so the class order in the HTML matters.
    overview_section = soup.find("div", class_="usajobs-joa-section usajobs-joa-section-beta desktop-display-none")
    overview_dict = {}

    if overview_section:
        for item in overview_section.find_all("li", class_="usajobs-joa-summary__item"):
            label = item.find("h3")
            value = item.find("p")
            # Only items that carry both a heading and a value are recorded.
            if label and value:
                overview_dict[label.get_text(strip=True)] = value.get_text(strip=True)

    # Extract Hiring Paths
    hiring_paths_section = soup.find("div", id="hiring-paths")
    hiring_paths = [item.get_text(strip=True) for item in hiring_paths_section.find_all("div", class_="usajobs-joa-intro-hiring-paths__title")] if hiring_paths_section else []

    # Extract the bullet-list sections
    duties = _list_item_texts(soup.find("div", id="duties"))
    requirements = _list_item_texts(soup.find("div", id="requirements"))
    required_documents = _list_item_texts(soup.find("div", id="required-documents"))

    # Extract How You Will Be Evaluated
    evaluation_section = soup.find("div", id="how-you-will-be-evaluated")
    evaluation_text = evaluation_section.get_text(strip=True) if evaluation_section else "Not available"

    # Structuring the extracted data into a JSON dictionary
    job_data = {
        "summary": summary,
        "overview": overview_dict,
        "hiring_paths": hiring_paths,
        "duties": duties,
        "requirements": requirements,
        "evaluation": evaluation_text,
        "required_documents": required_documents
    }

    # Save the extracted data to a JSON file
    json_file_path = f"{job_id}.json"
    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(job_data, json_file, indent=4)
        print(f"Processed: {json_file_path}")



In [6]:
# Convert a single posting: reads 'usa_jobs_520636600.html' from folder_path
# and writes 'usa_jobs_520636600.html.json'.
html_to_json('usa_jobs_520636600.html')