In [1]:
import os
import json
import re
import pandas as pd

# Define input and output folder paths
input_folder = "archive"  # Replace with the path to your folder containing the JSON files
output_folder = "archive_filtered"

# Create the output folder if it doesn't exist.
# exist_ok=True makes this a single call with no window between an
# "exists?" check and the creation, and it fails loudly (FileExistsError)
# if the path is already taken by a regular file instead of a directory.
os.makedirs(output_folder, exist_ok=True)

# Function to clean trailing indicators
# Matches one final whitespace-separated token made only of letters
# (ASCII plus the Danish Æ/Ø/Å in either case) at the end of the string.
_TRAILING_WORD_RE = re.compile(r'\s+[a-zA-ZÆØÅæøå]+$')


def clean_trailing_indicators(area):
    """Strip surrounding whitespace and drop the last letters-only word.

    Intended for suffixes such as the "K" in "København K", but the pattern
    is not limited to short tokens: the final word of any multi-word value
    is removed as well (e.g. "Nykøbing Falster" becomes "Nykøbing").
    Tokens containing digits or punctuation are left in place.

    Args:
        area: Raw area string; may be None or empty.

    Returns:
        The trimmed string without its trailing word, or None when ``area``
        is falsy.
    """
    if area:
        trimmed = area.strip()
        return _TRAILING_WORD_RE.sub('', trimmed)
    return None

# Function to keep the first meaningful city/place
# Separators between alternative places: a comma or slash (with optional
# surrounding whitespace), or one of the conjunctions "og"/"eller" (Danish)
# and "or"/"and" (English) preceded by whitespace and ending on a word
# boundary, so words that merely start with those letters are not split.
_PLACE_SEPARATOR_RE = re.compile(r'\s*[,/]\s*|\s+(?:og|eller|or|and)\b')


def keep_first_place_simple(area):
    """Return the first place named in ``area``, or None if there is none.

    The string is cut at the earliest separator, whichever kind it is, so
    mixed inputs such as "Aarhus og Odense, Aalborg" yield "Aarhus".
    Surrounding whitespace is removed from the result.

    Args:
        area: Area string, possibly listing several places; may be None
            or empty.

    Returns:
        The first place name, or None when ``area`` is falsy or the first
        place is two characters or shorter (treated as not meaningful).
    """
    if not area:
        return None
    # maxsplit=1: only the text before the first separator is needed.
    first_place = _PLACE_SEPARATOR_RE.split(area, maxsplit=1)[0].strip()
    return first_place if len(first_place) > 2 else None

# Danish to English city name mapping.
# Keys are Danish spellings as they may appear in a cleaned "Area" value;
# values are the English spellings. Names without an entry are used as-is.
# NOTE(review): 'Elsinore' is not present in biggest_cities, so a listing
# mapped from 'Helsingør' is still dropped by the filter — confirm whether
# that is intended.
city_name_mapping = {
    'København': 'Copenhagen',
    'Århus': 'Aarhus',
    'Helsingør': 'Elsinore'
}

# List of big Danish cities/towns used as the filter whitelist: a job is kept
# only when its cleaned "Area" equals one of these names exactly.
# The list holds 50 entries. Copenhagen and Aarhus are spelled the way
# city_name_mapping outputs them; the other names keep their Danish spelling.
biggest_cities = [
    "Copenhagen", "Aarhus", "Odense", "Aalborg", "Esbjerg", "Randers", "Kolding", "Horsens", "Vejle", "Roskilde",
    "Herning", "Hørsholm", "Silkeborg", "Næstved", "Fredericia", "Viborg", "Køge", "Holstebro", "Taastrup", "Slagelse",
    "Hillerød", "Sønderborg", "Svendborg", "Hjørring", "Holbæk", "Frederikshavn", "Nørresundby", "Ringsted", "Haderslev",
    "Skive", "Ølstykke-Stenløse", "Nykøbing Falster", "Greve Strand", "Kalundborg", "Ballerup", "Rødovre", "Lyngby",
    "Albertslund", "Hvidovre", "Glostrup", "Ishøj", "Birkerød", "Farum", "Frederikssund", "Brøndby Strand",
    "Skanderborg", "Hedensted", "Frederiksværk", "Lillerød", "Solrød Strand"
]

# Process each JSON file in the input folder

# Set copy of the whitelist, built once so every membership test is O(1).
allowed_cities = set(biggest_cities)


def _resolve_city(raw_area):
    """Return the whitelisted city named by a raw "Area" value, or None.

    Two candidates are tried in order: the value as given (trimmed), then
    the value with its trailing word removed by clean_trailing_indicators().
    Each candidate is reduced to its first place, translated through
    city_name_mapping, and accepted if it is in biggest_cities.

    Trying the un-truncated value first lets whitelist names that consist
    of several words (e.g. "Nykøbing Falster", "Greve Strand") match;
    stripping the trailing word unconditionally would turn them into
    "Nykøbing" / "Greve", which are not in the list.

    Args:
        raw_area: The "Area" value of a job record; any type.

    Returns:
        The canonical city name, or None when the value is not a string
        or does not resolve to a whitelisted city.
    """
    if not isinstance(raw_area, str):
        # Numbers, lists, None etc. cannot name a city; filter them out
        # instead of letting the string helpers raise.
        return None
    candidates = (raw_area.strip(), clean_trailing_indicators(raw_area))
    for candidate in candidates:
        place = keep_first_place_simple(candidate)
        place = city_name_mapping.get(place, place)  # Map to English names
        if place in allowed_cities:
            return place
    return None


# sorted() gives a deterministic processing order; os.listdir() order is
# arbitrary.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith(".json"):
        continue
    input_file_path = os.path.join(input_folder, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    # Read the JSON file
    print(f"Processing file: {file_name}")
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Filter and clean jobs: keep only records whose area resolves to a
    # whitelisted city, and store the canonical name back on the record.
    filtered_jobs = []
    for job in data:
        city = _resolve_city(job.get("Area"))
        if city is not None:
            job["Area"] = city  # Update area in the job record
            filtered_jobs.append(job)

    # Feedback on the number of jobs retained
    print(f"File: {file_name} | Original jobs: {len(data)} | Filtered jobs: {len(filtered_jobs)}")

    # Write the filtered jobs to a new JSON file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(filtered_jobs, output_file, ensure_ascii=False, indent=4)

print(f"All files have been processed. Filtered files are saved in the '{output_folder}' directory.")


Processing file: job_listings_20160101_archive.json
File: job_listings_20160101_archive.json | Original jobs: 19 | Filtered jobs: 15
Processing file: job_listings_20160102_archive.json
File: job_listings_20160102_archive.json | Original jobs: 4 | Filtered jobs: 3
Processing file: job_listings_20160103_archive.json
File: job_listings_20160103_archive.json | Original jobs: 7 | Filtered jobs: 7
Processing file: job_listings_20160104_archive.json
File: job_listings_20160104_archive.json | Original jobs: 60 | Filtered jobs: 31
Processing file: job_listings_20160105_archive.json
File: job_listings_20160105_archive.json | Original jobs: 96 | Filtered jobs: 53
Processing file: job_listings_20160106_archive.json
File: job_listings_20160106_archive.json | Original jobs: 61 | Filtered jobs: 28
Processing file: job_listings_20160107_archive.json
File: job_listings_20160107_archive.json | Original jobs: 53 | Filtered jobs: 30
Processing file: job_listings_20160108_archive.json
File: job_listings_20