In [1]:
import scrapy
import os
from scrapy.crawler import CrawlerProcess

class USAJobsHTMLSpider(scrapy.Spider):
    """Download the printable HTML page of each USAJobs announcement.

    One request is issued per job ID against
    ``https://www.usajobs.gov/job/<job_id>/print``. Successful responses are
    written to ``<output_dir>/usa_jobs_<job_id>.html``; the IDs of requests
    that end in the errback are appended to ``failed_ids_path`` so they can
    be reprocessed later.
    """

    name = "usajobs_batch_scraper"

    def __init__(self, job_ids, *args, output_dir="job_html",
                 failed_ids_path="failed_jobID.txt", **kwargs):
        """Store the crawl inputs and make sure the output directory exists.

        Args:
            job_ids: Iterable of job control numbers to download.
            output_dir: Directory that receives the saved HTML files.
            failed_ids_path: Text file that collects one failed job ID per line.
            *args, **kwargs: Forwarded unchanged to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        self.job_ids = job_ids
        self.output_dir = output_dir
        self.failed_ids_path = failed_ids_path
        os.makedirs(self.output_dir, exist_ok=True)  # Ensure directory exists

    def start_requests(self):
        """Yield one request per job ID, carrying the ID along in ``meta``."""
        base_url = "https://www.usajobs.gov/job/{}/print"
        for job_id in self.job_ids:
            yield scrapy.Request(
                base_url.format(job_id),
                callback=self.save_html,
                meta={'job_id': job_id},  # read back in save_html / handle_error
                errback=self.handle_error,
            )

    def save_html(self, response):
        """Write the response body to ``<output_dir>/usa_jobs_<job_id>.html``."""
        job_id = response.meta['job_id']
        file_path = f"{self.output_dir}/usa_jobs_{job_id}.html"

        with open(file_path, "w", encoding="utf-8") as file:
            file.write(response.text)

        self.log(f"Saved: {file_path}")

    def handle_error(self, failure):
        """Record a failed job ID for later reprocessing.

        The ID is appended (one per line) to ``failed_ids_path``. The notice
        is emitted at WARNING level so that it is still shown when the crawl
        is configured with ``LOG_LEVEL`` set to ``"WARNING"``.
        """
        job_id = failure.request.meta['job_id']

        with open(self.failed_ids_path, "a", encoding="utf-8") as f:
            f.write(f"{job_id}\n")  # Save one failed job ID per line

        self.logger.warning(
            "Job ID %s failed (%r) and saved to %s",
            job_id, failure.value, self.failed_ids_path,
        )

# 1. Download the HTML

In [None]:
### Download HTML


# 1. Read the job control numbers (one per line) from the txt file that was
#    extracted from the job DB, and convert each one to int.
#    Blank / whitespace-only lines are skipped: int("") raises ValueError and
#    would otherwise abort the whole load because of a single stray empty line.
with open('job_control_number_list.txt') as f:
    job_control_number_list = [
        int(line) for line in f.read().splitlines() if line.strip()
    ]

total_jobs_count = len(job_control_number_list)
print("Total number of jobs:", total_jobs_count)

### potential chunking steps
# # Split the list into batches of 1000
# chunk_size = 1000
# chunks = [job_control_number_list[i:i + chunk_size] for i in range(0, len(job_control_number_list), chunk_size)]
# print("Split into", len(chunks), "chunks", "with size", chunk_size)

Total number of jobs: 2329297


In [None]:
# Function to Run Scrapy with Optimized Settings
def download_html_batch(job_ids, concurrent_requests=20):
    """Crawl the print page of every job in ``job_ids`` with USAJobsHTMLSpider.

    Args:
        job_ids: Job control numbers handed to the spider.
        concurrent_requests: Value used for Scrapy's ``CONCURRENT_REQUESTS``.

    NOTE(review): a Twisted reactor generally cannot be restarted, so calling
    this function a second time in the same kernel is expected to fail --
    confirm before relying on repeated calls.
    """
    crawl_settings = {
        # Throughput / politeness
        "CONCURRENT_REQUESTS": concurrent_requests,
        "DOWNLOAD_DELAY": 2,
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 1,
        "AUTOTHROTTLE_MAX_DELAY": 5,
        # Only warnings and errors are shown; switch to "INFO" for more detail
        "LOG_LEVEL": "WARNING",
        # Request identity
        "COOKIES_ENABLED": False,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
    }

    crawler_process = CrawlerProcess(settings=crawl_settings)
    crawler_process.crawl(USAJobsHTMLSpider, job_ids=job_ids)
    crawler_process.start()  # runs the scheduled crawl


In [None]:
### Run Scrapy with Optimized Settings
# Launches the crawl for every control number loaded into
# job_control_number_list, using download_html_batch's default
# concurrent_requests value.
download_html_batch(job_control_number_list)  # Download all job HTMLs

2025-03-05 10:25:49 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.usajobs.gov/job/500620300/print> (failed 3 times): TCP connection timed out: 60: Operation timed out.
2025-03-05 10:25:52 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.usajobs.gov/job/501366300/print> (failed 3 times): TCP connection timed out: 60: Operation timed out.
2025-03-05 10:25:54 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.usajobs.gov/job/507491100/print> (failed 3 times): TCP connection timed out: 60: Operation timed out.
2025-03-05 10:25:56 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.usajobs.gov/job/491506300/print> (failed 3 times): TCP connection timed out: 60: Operation timed out.
2025-03-05 10:26:53 [scrapy.downloadermiddlewares.retry] ERROR: Gave up retrying <GET https://www.usajobs.gov/job/500665100/print> (failed 3 times): TCP connection timed out: 60: Operation timed out.


# 2. Parse the HTML into JSON 

In [6]:
### Parse HTML to JSON
from bs4 import BeautifulSoup
import json

def _text_after_heading(text, heading):
    """Return the stripped text after the first `heading`; the whole stripped text if absent."""
    _, found, remainder = text.partition(heading)
    return remainder.strip() if found else text.strip()


def _list_item_texts(section):
    """Return the stripped text of every <li> inside `section`, or [] when there is no section."""
    if not section:
        return []
    return [li.get_text(strip=True) for li in section.find_all("li")]


def parse_html_to_json(job_id):
    """Parse a saved job announcement page and write its fields to JSON.

    Reads ``job_html/usa_jobs_<job_id>.html``, extracts the summary, overview,
    hiring paths, duties, requirements, evaluation text and required
    documents, and writes them to ``job_json/usa_jobs_<job_id>.json``
    (the ``job_json`` directory is created if needed).

    Args:
        job_id: Job control number used to build both file names.

    Returns:
        None. If the HTML file does not exist a message is printed and
        nothing is written.
    """
    file_path = f"job_html/usa_jobs_{job_id}.html"

    if not os.path.exists(file_path):
        print(f"HTML file for job {job_id} not found! Please run download_html_batch() first.")
        return

    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Extract Summary: keep only the text after the 'Summary' heading.
    # partition() splits on the FIRST occurrence only, so a summary whose body
    # mentions "Summary" again is no longer cut short, and a section without
    # the heading text no longer raises IndexError.
    summary_section = soup.find("div", id="summary")
    if summary_section:
        summary = _text_after_heading(summary_section.get_text(), "Summary")
    else:
        summary = "Not available"

    # Extract Overview (all subsections) as {heading text: paragraph text}
    overview_section = soup.find("div", class_="usajobs-joa-section usajobs-joa-section-beta desktop-display-none")
    overview_dict = {}

    if overview_section:
        for item in overview_section.find_all("li", class_="usajobs-joa-summary__item"):
            label = item.find("h3")
            value = item.find("p")
            if label and value:  # skip items missing either part
                overview_dict[label.get_text(strip=True)] = value.get_text(strip=True)

    # Extract Hiring Paths
    hiring_paths_section = soup.find("div", id="hiring-paths")
    if hiring_paths_section:
        hiring_paths = [
            item.get_text(strip=True)
            for item in hiring_paths_section.find_all("div", class_="usajobs-joa-intro-hiring-paths__title")
        ]
    else:
        hiring_paths = []

    # Extract Duties, Requirements and Required Documents (bullet lists)
    duties = _list_item_texts(soup.find("div", id="duties"))
    requirements = _list_item_texts(soup.find("div", id="requirements"))
    required_documents = _list_item_texts(soup.find("div", id="required-documents"))

    # Extract How You Will Be Evaluated (free text)
    evaluation_section = soup.find("div", id="how-you-will-be-evaluated")
    evaluation_text = evaluation_section.get_text(strip=True) if evaluation_section else "Not available"

    # Structuring the extracted data into a JSON dictionary
    job_data = {
        "job_id": job_id,
        "summary": summary,
        "overview": overview_dict,
        "hiring_paths": hiring_paths,
        "duties": duties,
        "requirements": requirements,
        "evaluation": evaluation_text,
        "required_documents": required_documents
    }

    # Save the extracted data to a JSON file in the job_json folder
    os.makedirs("job_json", exist_ok=True)  # create the directory if it doesn't exist
    json_file_path = f"job_json/usa_jobs_{job_id}.json"

    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(job_data, json_file, indent=4)

    print(f"Saved JSON file: {json_file_path}")
    
    


In [7]:
### create a list called job_ids, including all the job control numbers in job_html folder
import os
folder_path = 'job_html'

# Only files named "usa_jobs_<id>.html" are considered. Without the prefix
# check, any other .html file in the folder either raised IndexError (fewer
# than two underscores in its name) or produced a bogus job ID.
job_ids = [
    file_name.split("_")[2].split(".")[0]
    for file_name in os.listdir(folder_path)
    if file_name.startswith("usa_jobs_") and file_name.endswith(".html")
]

print("Total number of jobs:", len(job_ids))

Total number of jobs: 30


In [8]:
### Parse each HTML file into JSON
# A failure on one job is reported and the loop moves on to the next one.
# Only Exception is caught (not a bare except) so that KeyboardInterrupt /
# SystemExit can still stop a long run, and the error itself is printed so
# the cause of a failed parse is not lost.
for job_id in job_ids:
    try:
        parse_html_to_json(job_id)  # parse the html file to json
    except Exception as exc:
        print(f"Error parsing job {job_id}: {exc!r}")

Saved JSON file: job_json/usa_jobs_535784600.json
Saved JSON file: job_json/usa_jobs_555186600.json
Saved JSON file: job_json/usa_jobs_545139600.json
Saved JSON file: job_json/usa_jobs_538360500.json
Saved JSON file: job_json/usa_jobs_550621300.json
Saved JSON file: job_json/usa_jobs_527728800.json
Saved JSON file: job_json/usa_jobs_544120700.json
Saved JSON file: job_json/usa_jobs_534103300.json
Saved JSON file: job_json/usa_jobs_526191100.json
Saved JSON file: job_json/usa_jobs_530191900.json
Saved JSON file: job_json/usa_jobs_548471000.json
Saved JSON file: job_json/usa_jobs_539292900.json
Saved JSON file: job_json/usa_jobs_539216100.json
Saved JSON file: job_json/usa_jobs_540226800.json
Saved JSON file: job_json/usa_jobs_539293500.json
Saved JSON file: job_json/usa_jobs_529651900.json
Saved JSON file: job_json/usa_jobs_524360100.json
Saved JSON file: job_json/usa_jobs_534788400.json
Saved JSON file: job_json/usa_jobs_547650100.json
Saved JSON file: job_json/usa_jobs_526456600.json
