# LLM setup

In [13]:
# Importing all the necessary libraries
import pandas as pd
import zipfile
import os
import json
import ollama
import re
from collections import Counter
import matplotlib.pyplot as plt

In [14]:
import os
import json

# Path to the folder containing JSON files
folder_path = 'final_withhtml_withcategories'

# Initialize a list to store all job listings
all_job_listings = []

# Iterate over each JSON file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        file_path = os.path.join(folder_path, file_name)
        try:
            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    all_job_listings.extend(data)  # Add all job listings to the master list
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Process the combined job listings (example: count total jobs)
total_jobs = len(all_job_listings)
print(f"Total job listings processed: {total_jobs}")

# Example: Count jobs by Area
jobs_by_area = {}
for job in all_job_listings:
    area = job.get("Area", "Unknown")
    jobs_by_area[area] = jobs_by_area.get(area, 0) + 1

print("Job counts by area:")
for area, count in jobs_by_area.items():
    print(f"{area}: {count}")

# Example: Filter jobs with 'Python' in the title
python_jobs = [job for job in all_job_listings if 'Python' in job.get("Title", "")]
print(f"Total Python-related job listings: {len(python_jobs)}")


Total job listings processed: 164249
Job counts by area:
Copenhagen: 19369
Odense: 3258
Unknown: 15204
Ballerup: 6259
Slagelse: 209
Randers: 416
Søborg: 3014
Hellerup: 1087
Kalundborg: 271
Aalborg: 2956
Fredericia: 1315
Vejen: 222
Billund: 1718
Aarhus: 7015
Viby J: 820
Smørum: 886
Hvidovre: 1461
Bagsværd: 1660
Region Sjælland: 95
Skovlunde: 149
Herlev: 1261
Holte: 143
Struer: 86
Herning: 856
Allerød: 883
Storbritannien: 164
Middelfart: 404
Søften, Hinnerup: 1
Albertslund: 660
Karup J: 82
Frederiksberg C: 170
Kolding: 1588
Valby: 1210
Greve: 163
Lystrup: 402
Nice: 4
Kgs. Lyngby: 1491
Region Midtjylland: 416
Region Midtjylland, Vejle: 1
Skanderborg: 1116
Udlandet: 172
Farum: 285
Glostrup: 1430
Bornholm: 14
Gentofte: 1020
Silkeborg: 1609
Hørsholm: 867
Kastrup: 560
Taastrup: 1703
Rønde: 25
Værløse: 118
Broendby: 2
Hasselager: 206
Tyskland: 143
Frederikshavn: 97
Lynge: 196
Egebækgård in Nærum: 1
Hadsten: 63
Frederiksberg: 3148
Birkerød: 331
Hadsund: 87
Horsens: 799
Grønland: 22
Ølgod: 41
Sa

## maybe we should remove the ones where the area is unknown

In [15]:
import os
import json

# Path to the folder containing JSON files
folder_path = 'final_withhtml_withcategories'

# Initialize a counter for 'Unknown' areas
unknown_area_count = 0

# Iterate over each JSON file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        file_path = os.path.join(folder_path, file_name)
        try:
            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Count 'Unknown' areas in the current file
                    unknown_area_count += sum(1 for job in data if job.get("Area") == "Unknown")
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Print the total count of 'Unknown' areas
print(f"Number of 'Unknown' Areas: {unknown_area_count}")


Number of 'Unknown' Areas: 15204


In [16]:
import os
import json

# Path to the folder containing JSON files
folder_path = 'final_withhtml_withcategories'

# Path to save the cleaned JSON files
cleaned_folder_path = 'final_withhtml_withcategories'
os.makedirs(cleaned_folder_path, exist_ok=True)

# Iterate over each JSON file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        file_path = os.path.join(folder_path, file_name)
        try:
            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Filter out rows where 'Area' is 'Unknown'
                    cleaned_data = [job for job in data if job.get("Area") != "Unknown"]

                    # Save the cleaned data back to a new file
                    cleaned_file_path = os.path.join(cleaned_folder_path, file_name)
                    with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
                        json.dump(cleaned_data, cleaned_file, indent=4, ensure_ascii=False)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")


## the ads with a word count bellow this number don't seem to give us much to work with

In [17]:
# Path to the folder containing JSON files
input_folder = 'final_withhtml_withcategories'

# Path to save the filtered JSON files
output_folder = 'final_withhtml_withcategories'
os.makedirs(output_folder, exist_ok=True)

min_word_count = 400

# Iterate over each JSON file in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        input_file_path = os.path.join(input_folder, file_name)
        try:
            # Read the JSON file
            with open(input_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Filter out rows where 'HTML_Text' is NaN, None, or below the word count threshold
                    filtered_data = [
                        job for job in data
                        if job.get("HTML_Text") and len(job["HTML_Text"].split()) >= min_word_count
                    ]

                    # Save the filtered data to a new file
                    output_file_path = os.path.join(output_folder, file_name)
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        json.dump(filtered_data, output_file, indent=4, ensure_ascii=False)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

In [None]:
# Path to the folder containing JSON files
input_folder = 'final_withhtml_withcategories'

# Path to save the cleaned JSON files
output_folder = 'final_withhtml_withcategories'
os.makedirs(output_folder, exist_ok=True)

# Function to clean trailing indicators
def clean_trailing_indicators(area):
    if not area or not isinstance(area, str):  # Handle None or non-string values
        return None
    # Remove trailing indicators like 'o', 'c', 'SØ', 'N', 'K'
    area = re.sub(r'\s+[a-zA-ZÆØÅæøå]+$', '', area.strip())
    return area

# Iterate over each JSON file in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        input_file_path = os.path.join(input_folder, file_name)
        try:
            # Read the JSON file
            with open(input_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Clean the 'Area' field for each job listing
                    for job in data:
                        if 'Area' in job:
                            job['Area'] = clean_trailing_indicators(job['Area'])

                    # Save the cleaned data to a new file
                    output_file_path = os.path.join(output_folder, file_name)
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        json.dump(data, output_file, indent=4, ensure_ascii=False)

                    print(f"Cleaned file saved: {output_file_path}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

Cleaned file saved: final_withhtml_withcategories\job_listings_20160101.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160102.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160103.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160104.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160105.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160106.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160107.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160108.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160109.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160110.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160111.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160112.json
Cleaned file saved: final_withhtml_withcategories\job_listings_20160113.json

In [None]:
# Path to the folder containing JSON files
input_folder = 'final_withhtml_withcategories'

# Path to save the cleaned JSON files
output_folder = 'final_withhtml_withcategories'
os.makedirs(output_folder, exist_ok=True)

# Function to keep the first meaningful place
def keep_first_place_simple(area):
    if not area or not isinstance(area, str):  # Handle None or non-string values
        return None
    # Split by delimiters and keep the first meaningful part
    delimiters = [',', '/', ' og', ' eller', ' or', ' and']
    for delim in delimiters:
        if delim in area:
            # Keep only the first part
            area = area.split(delim)[0].strip()
            break
    # Return cleaned area if it is meaningful
    return area if len(area) > 2 else None

# Iterate over each JSON file in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        input_file_path = os.path.join(input_folder, file_name)
        try:
            # Read the JSON file
            with open(input_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Simplify the 'Area' field for each job listing
                    for job in data:
                        if 'Area' in job:
                            job['Area'] = keep_first_place_simple(job['Area'])

                    # Save the cleaned data to a new file
                    output_file_path = os.path.join(output_folder, file_name)
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        json.dump(data, output_file, indent=4, ensure_ascii=False)

                    print(f"Cleaned file saved: {output_file_path}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

Cleaned file saved: sampled_final_withhtml\job_listings_20160101.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160102.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160103.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160104.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160105.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160106.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160107.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160108.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160109.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160110.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160111.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160112.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160113.json
Cleaned file saved: sampled_final_withhtml\job_listings_20160114.json
Cleaned file saved: 

In [None]:

# Path to the folder containing JSON files
input_folder = 'final_withhtml_withcategories'

# Initialize a set to store unique areas
unique_areas = set()

# Iterate over each JSON file in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        file_path = os.path.join(input_folder, file_name)
        try:
            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Add each 'Area' value to the set
                    for job in data:
                        area = job.get("Area")
                        if area:  # Ensure area is not None or empty
                            unique_areas.add(area)
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Display all unique areas
print("Unique areas:")
for area in sorted(unique_areas):  # Sort the areas for easier readability
    print(area)


Unique areas:
Aabenraa
Aabybro
Aabyhøj
Aahus
Aalborg
Aarhhus
Aarhus
Aarhus &
Aarhus (Egå)
Aarhus eller Gødstrup
Aars
Aarslev
Aarup
Aasiaat
Abu
Afghanistan
Albertslund
Algeriet
Allerød
Allingåbro
Amerikansk
Amsterdam
Ans
Ansager
Arden
Argentina
Aruba
Asiaq
Asnæs
Asperup
Assens
Athen
Athens
Augustenborg
Aulum
Aulum &
Auning
Australien
Austria
Avedøre
Avedøreværket i Hvidovre
Bagsværd
Ballerup
Bangladesh
Barrit
Barritskov
Beckum
Belgien
Berlin
Billlund
Billund
Bindeballe
Birkerød
Bjeringbro
Bjerringbro
Bjert
Bjæverskov
Blokhus
Blommenslyst
Bogense
Bolderslev
Bording
Bornholm
Borup
Brabrand
Bramming
Brande
Brande &
Brande and
Brande og
Brasilien
Bredebro
Bredsten
Broager
Broby
Broendby
Brovst
Brrande
Bryrup
Brøndby
Brøndbyøster
Brønderslev
Brønshøj
Brørup
Budapest
Bulgarien
Bække
Børkop
Canada
Cham
Charlottenlund
Chile
China
Christiansfeld
Colombia
Comorerne
Cop
Copenhagen
Copenhagen (Vallensbæk)
Copenhagen area
Costa
Cypern
DIBS i
Danmark
Darmstadt
De Forenede Arabiske
Den Europæiske
Den 

In [10]:

# Mapping of Danish to English city names
city_name_mapping = {
    'København': 'Copenhagen',
    'Århus': 'Aarhus',
    'Helsingør': 'Elsinore'
}

# Function to translate Danish city names to English
def translate_city_name(area):
    if area in city_name_mapping:
        return city_name_mapping[area]
    return area

# Iterate over each JSON file in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        input_file_path = os.path.join(input_folder, file_name)
        try:
            # Read the JSON file
            with open(input_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
 
                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Replace Danish names with their English equivalents
                    for job in data:
                        if 'Area' in job:
                            job['Area'] = translate_city_name(job['Area'])

                    # Save the updated data to a new file
                    output_file_path = os.path.join(output_folder, file_name)
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        json.dump(data, output_file, indent=4, ensure_ascii=False)

        except Exception as e:
            print(f"Error processing {file_name}: {e}")


In [11]:
cities = [
    "Copenhagen", "Aarhus", "Odense", "Aalborg", "Esbjerg", "Randers", "Kolding", "Horsens", "Vejle", "Roskilde",
    "Herning", "Hørsholm", "Silkeborg", "Næstved", "Fredericia", "Viborg", "Køge", "Holstebro", "Taastrup", "Slagelse",
    "Hillerød", "Sønderborg", "Svendborg", "Hjørring", "Holbæk", "Frederikshavn", "Nørresundby", "Ringsted", "Haderslev",
    "Skive", "Ølstykke-Stenløse", "Nykøbing Falster", "Greve Strand", "Kalundborg", "Ballerup", "Rødovre", "Lyngby",
    "Albertslund", "Hvidovre", "Glostrup", "Ishøj", "Birkerød", "Farum", "Frederikssund", "Brøndby Strand",
    "Skanderborg", "Hedensted", "Frederiksværk", "Lillerød", "Solrød Strand"
]

In [None]:

# Iterate over each JSON file in the folder
for file_name in os.listdir(input_folder):
    if file_name.endswith('.json'):  # Check if it's a JSON file
        input_file_path = os.path.join(input_folder, file_name)
        try:
            # Read the JSON file
            with open(input_file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Ensure the data is a list (since each JSON file contains job listings)
                if isinstance(data, list):
                    # Filter job listings to include only those in the cities list
                    filtered_data = [job for job in data if job.get("Area") in cities]

                    # Save the filtered data to a new file
                    output_file_path = os.path.join(output_folder, file_name)
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        json.dump(filtered_data, output_file, indent=4, ensure_ascii=False)

                    print(f"Filtered file saved: {output_file_path} (Removed {len(data) - len(filtered_data)} listings)")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")


Filtered file saved: sampled_final_withhtml\job_listings_20160101.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160102.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160103.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160104.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160105.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160106.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160107.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160108.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160109.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160110.json (Removed 0 listings)
Filtered file saved: sampled_final_withhtml\job_listings_20160111.json (Removed 