In [1]:
import os

# Directory (relative to the current working directory) into which
# scrape_date_range writes its per-day JSON files.
OUTPUT_DIR = "job_listings"

# Create the directory up front; exist_ok=True makes re-running this cell
# harmless when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [2]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup



def _fetch_job_details(url, timeout):
    """Download one job-ad page and extract its description and apply link.

    Args:
        url: Address of the job-ad page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(description, job_link)`` tuple, or ``None`` when the page could
        not be retrieved (network error or a non-200 status code).
    """
    try:
        details_response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable ad page should not discard the rest of the day.
        print(f"Could not fetch job details from {url}: {exc}. Skipping listing.")
        return None
    if details_response.status_code != 200:
        return None

    details_soup = BeautifulSoup(details_response.content, 'html.parser')

    # The description is the text of every <p> inside the PaidJob-inner div.
    paid_job_inner = details_soup.find('div', class_='PaidJob-inner')
    if paid_job_inner:
        job_details = " ".join(
            paragraph.get_text(strip=True)
            for paragraph in paid_job_inner.find_all('p')
        )
    else:
        job_details = "Relevant details not found in the PaidJob-inner section."

    # The link to the original posting is the href of this specific button.
    button = details_soup.find('a', class_='btn btn-sm btn-block btn-primary d-md-none mt-2 seejobmobil')
    if button and 'href' in button.attrs:
        button_href = button['href']
    else:
        button_href = "No URL found"

    return job_details, button_href


def fetch_job_listings(keyword, mindate, maxdate, page_limit=1, timeout=30):
    """Collect job listings from the jobindex.dk archive search.

    Requests result pages 1..page_limit for the given query and date window,
    then visits every ad's own page to pick up its description and link.
    Paging stops early at the first page that contains no job ads.

    Args:
        keyword: Search string sent as the ``q`` query parameter.
        mindate: Lower date bound, sent as the ``mindate`` query parameter.
        maxdate: Upper date bound, sent as the ``maxdate`` query parameter.
        page_limit: Maximum number of result pages to request.
        timeout: Seconds to wait on each HTTP request, so that a stalled
            connection cannot block the caller indefinitely.

    Returns:
        A list of dicts with the keys ``Title``, ``URL``, ``Area``,
        ``Published``, ``Description`` and ``Job Link``. Ads whose detail
        page cannot be retrieved, or that lack the share element carrying
        the title and URL, are left out.

    Raises:
        requests.RequestException: If a search-result page request fails at
            the network level (including a timeout).
    """
    base_url = "https://www.jobindex.dk/jobsoegning.json?jobage=archive&subid=1&subid=2&subid=3&subid=4&subid=6&subid=7&subid=93"
    job_listings = []

    for page in range(1, page_limit + 1):
        params = {
            "q": keyword,
            "mindate": mindate,
            "maxdate": maxdate,
            "page": page,
        }
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            # Keep going with the next page, but do not fail silently.
            print(f"Page {page} returned HTTP {response.status_code}. Skipping page.")
            continue

        data = response.json()
        html_content = data.get('result_list_box_html', '')
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all job ads
        job_ads = soup.find_all('div', class_='jobsearch-result')

        # Stop if no job postings are found
        if not job_ads:
            print(f"No job postings found on page {page}. Stopping early.")
            break

        for job_ad in job_ads:
            # Title and URL live on the same share element: look it up once
            # and skip the ad instead of crashing when it is absent.
            share = job_ad.find('div', class_='jobad-element-menu-share')
            if share is None:
                continue
            title = share.get('data-share-title')
            url = share.get('data-share-url')
            if title is None or url is None:
                continue

            job_location = job_ad.find('span', class_='jix_robotjob--area')

            # Extract the publication date
            published_tag = job_ad.find('time')
            published_date = (
                published_tag.get('datetime', 'Unknown') if published_tag else 'Unknown'
            )

            # Fetch additional details from the job URL
            details = _fetch_job_details(url, timeout)
            if details is None:
                continue
            job_details, button_href = details

            job_listings.append({
                "Title": title,
                "URL": url,
                "Area": job_location.get_text(strip=True) if job_location else 'Unknown',
                "Published": published_date,
                "Description": job_details,
                "Job Link": button_href,
            })

    return job_listings



def update_json_if_new(output_file, new_data, update_threshold=5):
    """Merge new listings into a JSON file once enough new ones have arrived.

    Rows are identified by their ``(Title, URL)`` pair. The same key decides
    both which rows are written and which rows count as new, so the reported
    number always matches what is actually added to the file.

    Args:
        output_file: Path of the JSON file holding a list of row dicts. A
            missing or unparsable file is treated as an empty dataset.
        new_data: List of row dicts to merge in.
        update_threshold: Minimum number of new rows required before the file
            is rewritten.

    Returns:
        None. The result is reported via ``print`` and, when the threshold is
        met, by rewriting ``output_file``.
    """
    # Load existing data safely
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        print(f"Error loading JSON file: {output_file}. Starting with an empty dataset.")
        existing_data = []

    def row_key(row):
        return (row.get("Title"), row.get("URL"))

    # De-duplicate in plain Python, keeping the first occurrence of each key.
    # This also works when both inputs are empty and never introduces NaN
    # placeholders for rows with differing keys.
    seen = set()
    combined_data = []
    for row in existing_data:
        key = row_key(row)
        if key not in seen:
            seen.add(key)
            combined_data.append(row)

    # Identify new rows: those whose key is not already stored
    new_rows = []
    for row in new_data:
        key = row_key(row)
        if key not in seen:
            seen.add(key)
            new_rows.append(row)
            combined_data.append(row)

    # Update only if new rows exceed the threshold
    if len(new_rows) >= update_threshold:
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(combined_data, f, indent=4)
            print(f"JSON updated with {len(new_rows)} new rows.")
        except Exception as e:
            print(f"Error writing to JSON file: {output_file}. Error: {e}")
    else:
        print(f"Only {len(new_rows)} new rows found. Waiting for {update_threshold - len(new_rows)} more.")






def generate_daily_date_ranges(start_date, end_date, last_processed=None):
    """Build one (mindate, maxdate) pair per day, skipping processed days.

    Args:
        start_date: ``datetime`` of the first day to include.
        end_date: ``datetime`` marking the end of the window; this day itself
            is NOT included.
        last_processed: Optional ``'YYYYMMDD'`` string of the last day that
            was already handled. Generation then resumes on the following
            day, but never earlier than ``start_date``.

    Returns:
        A list of ``(day, day)`` tuples of ``'YYYYMMDD'`` strings, in
        chronological order. Both elements of a tuple are the same day.
    """
    current_date = start_date

    # Resume on the day after last_processed. Clamp to start_date so that an
    # old progress marker cannot pull in days before the requested window.
    if last_processed:
        resume_date = datetime.strptime(last_processed, '%Y%m%d') + timedelta(days=1)
        current_date = max(start_date, resume_date)

    date_ranges = []
    one_day = timedelta(days=1)
    while current_date < end_date:
        day = current_date.strftime('%Y%m%d')
        date_ranges.append((day, day))
        current_date += one_day

    return date_ranges





def scrape_date_range(date_range, keyword, output_file_base, page_limit=1):
    """Scrape one date range and store the listings in its own JSON file.

    Args:
        date_range: ``(mindate, maxdate)`` pair passed to fetch_job_listings.
        keyword: Search keyword passed to fetch_job_listings.
        output_file_base: File-name prefix; the file is written to
            ``OUTPUT_DIR`` as ``<output_file_base>_<mindate>.json``.
        page_limit: Maximum number of result pages to request.

    Returns:
        None. Any exception raised while fetching or saving is caught and
        reported with ``print``, so the call itself does not raise for those.
    """
    mindate, maxdate = date_range
    print(f"Scraping data from {mindate} to {maxdate}...")
    try:
        listings = fetch_job_listings(keyword, mindate, maxdate, page_limit)

        # Nothing to store: report it and leave without creating a file.
        if not listings:
            print(f"No data found for {mindate}.")
            return

        file_name = f"{output_file_base}_{mindate}.json"
        output_file = os.path.join(OUTPUT_DIR, file_name)

        with open(output_file, 'w') as sink:
            json.dump(listings, sink, indent=4)
        print(f"Data for {mindate} saved to {output_file}.")
    except Exception as e:
        print(f"Error scraping date range {mindate} to {maxdate}: {e}. Skipping.")



In [3]:
import json

def save_progress(last_processed_date):
    """Write the last processed date to ``progress.json``.

    The file is created in the current working directory (or overwritten)
    and holds a single object of the form ``{"last_processed": <value>}``.
    """
    payload = {'last_processed': last_processed_date}
    serialized = json.dumps(payload)
    with open('progress.json', 'w') as progress_file:
        progress_file.write(serialized)
    print(f"Progress saved: {last_processed_date}")


In [4]:
def load_progress():
    """Read the last processed date back from ``progress.json``.

    Returns:
        The value stored under ``'last_processed'``, or ``None`` when that
        key is absent. If the file is missing or is not valid JSON, the
        progress file is rewritten with ``None`` and ``None`` is returned.
    """
    try:
        with open('progress.json', 'r') as progress_file:
            state = json.load(progress_file)
    except (FileNotFoundError, json.JSONDecodeError):
        print("Progress file is missing or corrupted. Resetting progress.")
        # Leave a well-formed file behind so that the next load succeeds.
        save_progress(None)
        return None
    return state.get('last_processed', None)


In [5]:
if __name__ == "__main__":
    # Driver: scrape every day in [start_date, end_date) in parallel, one
    # task per day, resuming after the day recorded in progress.json.
    try:
        # Configuration
        output_file_base = "job_listings"  # Base name for output files
        keyword = ""
        start_date = datetime(2016, 1, 1)
        end_date = datetime(2024, 11, 22)
        page_limit = 1000

        # Load progress first so that only the days AFTER the last processed
        # one are generated; that day itself is not scraped again.
        last_processed = load_progress()
        date_ranges = generate_daily_date_ranges(start_date, end_date, last_processed)

        # Parallel scraping
        with ThreadPoolExecutor(max_workers=40) as executor:
            futures = [
                (date_range,
                 executor.submit(scrape_date_range, date_range, keyword, output_file_base, page_limit))
                for date_range in date_ranges
            ]

            # Results are awaited in submission (chronological) order, so
            # when a day's task has returned, every earlier day has returned
            # too. That makes the day a valid resume point, as long as no
            # earlier task raised.
            resume_point_valid = True
            for date_range, future in futures:
                try:
                    future.result()
                except Exception as e:
                    print(f"Error in parallel scraping task: {e}. Skipping.")
                    # Do not advance the marker past a day that failed.
                    resume_point_valid = False
                else:
                    if resume_point_valid:
                        save_progress(date_range[0])

        print("All scraping tasks completed.")
    except Exception as e:
        print(f"Critical error in the scraping pipeline: {e}")


Scraping data from 20160101 to 20160101...
Scraping data from 20160102 to 20160102...
Scraping data from 20160103 to 20160103...
Scraping data from 20160104 to 20160104...
Scraping data from 20160105 to 20160105...
Scraping data from 20160106 to 20160106...
Scraping data from 20160107 to 20160107...
Scraping data from 20160108 to 20160108...
Scraping data from 20160109 to 20160109...
Scraping data from 20160110 to 20160110...
Scraping data from 20160111 to 20160111...
Scraping data from 20160112 to 20160112...
Scraping data from 20160113 to 20160113...
Scraping data from 20160114 to 20160114...
Scraping data from 20160115 to 20160115...
Scraping data from 20160116 to 20160116...
Scraping data from 20160117 to 20160117...
Scraping data from 20160118 to 20160118...
Scraping data from 20160119 to 20160119...
Scraping data from 20160120 to 20160120...
Scraping data from 20160121 to 20160121...
Scraping data from 20160122 to 20160122...
Scraping data from 20160123 to 20160123...
Scraping da