# LLM setup notebook

In [None]:
!pip install ollama -q > /dev/null

In [None]:
# Importing all the necessary libraries
import pandas as pd
import zipfile
import os
import json
import ollama
import re
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive; the ZIP path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

### Code to combine all the files within the ZIP file into one DataFrame

In [None]:
# Path to the ZIP file
zip_file_path = '/content/drive/My Drive/Files/file.zip'

# Temporary folder to extract JSON files
extract_folder = '/content/json_files'

# Extract all files from the ZIP
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# One DataFrame per successfully parsed JSON file
dataframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the combined DataFrame identical from run to run.
for file_name in sorted(os.listdir(extract_folder)):
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(extract_folder, file_name)
    try:
        # Only the read happens while the file is open
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        df_temp = pd.DataFrame(data)
        dataframes.append(df_temp)
    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the load
        print(f"Error reading {file_name}: {e}")

# Fail with an actionable message instead of pandas' "No objects to concatenate"
if not dataframes:
    raise ValueError(f"No readable JSON files found in {extract_folder}")

# Combine all DataFrames into one named df
df = pd.concat(dataframes, ignore_index=True)

## Data Preprocessing Before LLM

In this step, we preprocess the data to ensure it is ready for the LLM. We remove rows with missing or insufficient information, as well as those that do not align with the project's objectives, ensuring that only relevant and complete data is passed for further processing.

In [None]:
 # Calculate word counts for each row in the 'HTML_Text' column
# Count whitespace-separated words per listing. The vectorised .str accessor
# yields NaN for missing / non-string entries instead of raising, and those
# rows are counted as 0 words.
df['word_count'] = df['HTML_Text'].str.split().str.len().fillna(0).astype(int)

# Generate statistics on the word counts
stats = df['word_count'].describe()

# Print the statistics
print(stats)

### We remove the HTML texts where the word count is less than 400, as those do not give us enough information to work with.

In [None]:
# Minimum number of words a listing must contain to be kept
min_word_count = 400

# Word count per listing; NaN for missing / non-string HTML_Text
word_counts = df['HTML_Text'].str.split().str.len()

# Keep listings with at least `min_word_count` words. The bound is inclusive:
# only texts with *fewer* than the minimum are removed. NaN counts compare as
# False and are therefore dropped as well.
df = df[word_counts >= min_word_count].copy()

df.shape

### Removing trailing district letters from city names. For example, "Aalborg C" would become "Aalborg"

In [None]:
# Function to strip trailing city-district indicators
def clean_trailing_indicators(area):
    """Remove a trailing 1-2 letter district indicator from a city name.

    Examples of indicators: 'C', 'K', 'N', 'SØ', 'NV' ("Aalborg C" -> "Aalborg").
    Longer trailing words are kept because they are part of multi-word city
    names such as "Nykøbing Falster" or "Greve Strand".

    Args:
        area: City name string, or a missing value (None / NaN).

    Returns:
        The stripped name without the trailing indicator, or None when
        `area` is missing.
    """
    if pd.isna(area):
        return None
    # Limit the match to one or two letters so real name parts survive
    area = re.sub(r'\s+[a-zA-ZÆØÅæøå]{1,2}$', '', area.strip())
    return area

# Apply the cleaning function to every value of the 'Area' column
# (missing values come back as None)
df['Area'] = df['Area'].apply(clean_trailing_indicators)

### Some ads have more than one city listed as an option. We only keep the city that was listed as the first option

In [None]:
# Simplified function to keep the first city
def keep_first_place_simple(area):
    """Return only the first city of a multi-city area string.

    The string is cut at the earliest delimiter found: ',', '/', or one of the
    words 'og', 'eller', 'or', 'and' preceded by whitespace. The words must be
    whole words, so a name that merely starts with those letters is not split.

    Args:
        area: Area string, or a missing value (None / NaN).

    Returns:
        The first city with surrounding whitespace removed, or None when
        `area` is missing or the result has two characters or fewer.
    """
    if pd.isna(area):
        return None
    # One split at the leftmost delimiter, whichever kind it is, so mixed
    # inputs such as "Aarhus og Odense, Aalborg" still reduce to "Aarhus"
    first_city = re.split(r'[,/]|\s(?:og|eller|or|and)\b', area, maxsplit=1)[0].strip()
    # Return cleaned area if it is meaningful
    return first_city if len(first_city) > 2 else None

# Keep only the first listed city, then strip district indicators once more:
# the earlier cleanup only sees the end of the whole string, so for an entry
# like "Aarhus C, Odense" the "C" of the first city is still present here.
df['Area'] = (
    df['Area']
    .apply(keep_first_place_simple)
    .apply(clean_trailing_indicators)
)

### Mapping the 3 cities that have different names in English vs. Danish

In [None]:
# Danish spellings and the English names they are replaced with
city_name_mapping = {'København': 'Copenhagen', 'Århus': 'Aarhus', 'Helsingør': 'Elsinore'}

# Swap each 'Area' value that exactly equals a Danish key for its English
# counterpart; every other value is left as it is
df['Area'] = df['Area'].replace(to_replace=city_name_mapping)

### A list of cities we want to look at. If the area is not one of the ones listed, it is removed. The list contains 50 of the biggest cities and towns in Denmark

In [None]:
# Cities to keep (50 entries). Names use the English spelling where the
# notebook maps one (e.g. "Copenhagen", "Aarhus"); the 'Area' column is
# matched against this list with an exact comparison.
cities = [
    "Copenhagen", "Aarhus", "Odense", "Aalborg", "Esbjerg", "Randers", "Kolding", "Horsens", "Vejle", "Roskilde",
    "Herning", "Hørsholm", "Silkeborg", "Næstved", "Fredericia", "Viborg", "Køge", "Holstebro", "Taastrup", "Slagelse",
    "Hillerød", "Sønderborg", "Svendborg", "Hjørring", "Holbæk", "Frederikshavn", "Nørresundby", "Ringsted", "Haderslev",
    "Skive", "Ølstykke-Stenløse", "Nykøbing Falster", "Greve Strand", "Kalundborg", "Ballerup", "Rødovre", "Lyngby",
    "Albertslund", "Hvidovre", "Glostrup", "Ishøj", "Birkerød", "Farum", "Frederikssund", "Brøndby Strand",
    "Skanderborg", "Hedensted", "Frederiksværk", "Lillerød", "Solrød Strand"
]

In [None]:
# Keep only listings whose Area is one of the selected cities. The explicit
# .copy() makes the result an independent DataFrame, so the skill columns
# assigned to `df` later do not operate on a slice of the unfiltered frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['Area'].isin(cities)].copy()

# Display the filtered DataFrame
print("Filtered DataFrame:")
print(df.shape)

# This is sample data, covering only a few months

In [None]:
# Install Ollama
# pciutils provides `lspci`; presumably the Ollama installer uses it to
# detect a GPU — TODO confirm
!sudo apt-get install -y pciutils
# Download the Ollama install script and run it with sh
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Sets up environment variables and starts the Ollama server
import os
import threading
import subprocess

def start_ollama():
    """Configure the Ollama environment variables and spawn `ollama serve`.

    Sets OLLAMA_HOST to '0.0.0.0:11434' and OLLAMA_ORIGINS to '*' in this
    process's environment, then starts the server as a child process. The
    child is not waited on and its handle is not kept.
    """
    server_env = {
        'OLLAMA_HOST': '0.0.0.0:11434',
        'OLLAMA_ORIGINS': '*',
    }
    os.environ.update(server_env)
    subprocess.Popen(["ollama", "serve"])


# Run the launcher on a separate thread so the cell returns immediately
ollama_thread = threading.Thread(target=start_ollama)
ollama_thread.start()

In [None]:
!ollama run llama3.1:8b

In [None]:
# System-style instructions for the skill-extraction task: definitions of
# primary vs. secondary skills, extraction rules, the expected JSON output
# format and two worked examples.
# NOTE(review): none of the functions visible in this notebook reference
# SYSTEM_PROMPT — presumably it was meant to be passed to the model alongside
# the per-listing prompt; confirm before relying on it.
SYSTEM_PROMPT = """
You are given a job listing. Your task is to extract relevant skills, categorizing them into **primary IT-related technical skills** and **secondary (soft/general) skills**.

### Definitions:
- **Primary Skills**: Technical, actionable skills directly relevant to IT roles, including tools, technologies, methods, or domain expertise.
- **Secondary Skills**: Soft skills, behavioral traits, or general competencies that support workplace effectiveness.

### Rules:
1. Separate extracted skills into **primary** (IT-related technical) and **secondary** (soft/general) categories.
2. Split compound terms into individual skills (e.g., "Skill A and Skill B" → ["Skill A", "Skill B"]).
3. Exclude vague, repetitive, or non-actionable terms.
4. Translate all extracted skills into **English**, regardless of the input language (e.g., Danish to English).
5. Ensure the output is clear, specific, concise, and in valid JSON format.

---

### Output Format:
```json
{
  "skills": {
    "primary": ["Skill 1", "Skill 2", "..."],
    "secondary": ["Skill A", "Skill B", "..."]
  }
}
```

### Examples:
#### Example 1
**Job Listing Text:** "Looking for expertise in modern development practices and effective communication skills."

**Output:**
```json
{
  "skills": {
    "primary": ["Modern development practices"],
    "secondary": ["Communication"]
  }
}
```

#### Example 2
**Job Listing Text:** "Vi søger en erfaren person med kendskab til ny teknologi og gode samarbejdsevner."

**Output:**
```json
{
  "skills": {
    "primary": ["New technology"],
    "secondary": ["Collaboration"]
  }
}
```
"""

In [None]:
# Sets up environment variables and starts the Ollama server
import os
import threading
import subprocess

def start_ollama():
    """Configure the Ollama environment variables and start the server once.

    Sets OLLAMA_HOST to '0.0.0.0:11434' and OLLAMA_ORIGINS to '*' in this
    process's environment. If something is already accepting connections on
    local port 11434 (for example a server started by an earlier run of this
    cell) nothing is spawned; otherwise `ollama serve` is started as a child
    process that is not waited on.
    """
    import socket

    os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
    os.environ['OLLAMA_ORIGINS'] = '*'

    # A second `ollama serve` cannot bind a port that is already in use, so
    # probe the port first and skip the spawn when a server is listening.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(1)
        already_running = probe.connect_ex(('127.0.0.1', 11434)) == 0
    if already_running:
        return

    subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=start_ollama)
ollama_thread.start()

In [None]:
import pandas as pd
import json

# Model and sampling options shared by the extraction and refinement calls
_LLM_MODEL = 'llama3.1:8b'
_LLM_OPTIONS = {"temperature": 0.1}


def _empty_skills():
    """Return a new, empty skills structure (fresh lists on every call)."""
    return {"skills": {"primary": [], "secondary": []}}


def _parse_skills_response(response_content):
    """Extract and validate the skills JSON object from a raw model reply.

    The span from the first '{' to the last '}' is parsed as JSON. The result
    must be an object with a 'skills' object whose 'primary' and 'secondary'
    entries are lists (a missing entry counts as an empty list). Only
    non-empty string items are kept, stripped of surrounding whitespace.

    Raises:
        ValueError: no JSON object is present, the JSON is invalid
            (json.JSONDecodeError is a ValueError), or the shape is wrong.
    """
    json_start = response_content.find('{')
    json_end = response_content.rfind('}')
    # rfind() is compared before adding 1, so "not found" is actually detected
    if json_start == -1 or json_end < json_start:
        raise ValueError("JSON block not found in the response")

    parsed = json.loads(response_content[json_start:json_end + 1])

    skills = parsed.get('skills') if isinstance(parsed, dict) else None
    if not isinstance(skills, dict):
        raise ValueError("'skills' object missing from the response JSON")

    normalized = {}
    for category in ('primary', 'secondary'):
        values = skills.get(category)
        if values is None:
            values = []
        if not isinstance(values, list):
            raise ValueError(f"'{category}' is not a list in the response JSON")
        normalized[category] = [
            item.strip() for item in values
            if isinstance(item, str) and item.strip()
        ]
    return {"skills": normalized}


def _ask_llm_for_skills(prompt):
    """Send `prompt` to the model and return the validated skills structure."""
    response = ollama.generate(
        model=_LLM_MODEL,
        prompt=prompt,
        options=_LLM_OPTIONS
    )
    return _parse_skills_response(response.get('response', ''))


# 1) Extract skills
def extract_skills_from_text(html_text, row_id):
    """Ask the LLM for the primary and secondary skills in one job listing.

    Args:
        html_text: Text of the job listing.
        row_id: Identifier used only in the printed error messages.

    Returns:
        {"skills": {"primary": [...], "secondary": [...]}}. Both lists are
        empty when the model call fails or its reply cannot be parsed, so the
        caller can always index 'skills', 'primary' and 'secondary'.
    """
    prompt = f"""
    Extract the relevant skills from the following job listing, categorizing them into **primary (IT-related technical skills)** and **secondary (soft/general skills)**.

    ### Rules:
    - Focus on actionable, specific, and clearly stated skills.
    - Outputs must be in **clear and concise English only**, regardless of the input language.
    - If no skills are present, return empty lists for both categories.

    Provide the results in this JSON format:
    {{
      "skills": {{
        "primary": ["Skill 1", "Skill 2", "..."],
        "secondary": ["Skill A", "Skill B", "..."]
      }}
    }}

    Job Listing Text:
    {html_text}
    """
    try:
        return _ask_llm_for_skills(prompt)
    except ValueError as e:
        # Covers a missing / invalid / wrongly shaped JSON reply
        print(f"Error extracting skills for Row {row_id}: {e}")
        return _empty_skills()
    except Exception as e:
        # Best effort: a failed model call must not abort the whole batch
        print(f"Unexpected error extracting skills for Row {row_id}: {e}")
        return _empty_skills()

# 2) Refine extracted skills
def refine_skills_with_llm(skills_data, row_id):
    """Ask the LLM to deduplicate and re-categorise already extracted skills.

    Args:
        skills_data: Skills structure as returned by extract_skills_from_text.
        row_id: Identifier used only in the printed error messages.

    Returns:
        The refined skills structure, or `skills_data` unchanged when the
        model call fails or its reply cannot be parsed.
    """
    # Prepare the prompt for refinement
    prompt = f"""
    Refine the following extracted skills to ensure they are:
    - Deduplicated (remove synonyms/redundant entries).
    - Correctly separated into **primary (IT/technical)** vs **secondary (soft/general)**.
    - Outputs must be in **clear and concise English only**, regardless of the input language.


    Input:
    {json.dumps(skills_data, indent=2)}

    Return the output in the same JSON structure:
    {{
      "skills": {{
        "primary": ["Refined Skill 1", "Refined Skill 2", "..."],
        "secondary": ["Refined Skill A", "Refined Skill B", "..."]
      }}
    }}
    """

    try:
        return _ask_llm_for_skills(prompt)
    except ValueError as e:
        print(f"Error refining skills for Row {row_id}: {e}")
        # Fall back to unrefined data
        return skills_data
    except Exception as e:
        print(f"Unexpected error refining skills for Row {row_id}: {e}")
        # Fall back to unrefined data
        return skills_data

# 3) Process each row by extracting first, then refining
def process_row(html_text, row_id):
    """Extract and then refine the skills of one listing.

    Prints both intermediate results and returns a
    (primary_skills, secondary_skills) tuple of lists.
    """
    extracted = extract_skills_from_text(html_text, row_id)
    refined = refine_skills_with_llm(extracted, row_id)
    print(f"Row {row_id} - Extracted Skills: {extracted}")
    print(f"Row {row_id} - Refined Skills: {refined}")

    return refined['skills']['primary'], refined['skills']['secondary']

# Run extraction + refinement for every listing in 'HTML_Text'. The row id
# passed for logging is the row's own index label + 1; iterating over
# (label, text) pairs avoids searching the whole column for each text, which
# was quadratic and reported the first match's id for duplicate listings.
skill_pairs = [
    process_row(text, row_label + 1)
    for row_label, text in df['HTML_Text'].items()
]

# Build the columns from the collected pairs; unlike zip(*...) unpacking this
# also works when the DataFrame is empty.
df['Primary_Skills'] = [primary for primary, _ in skill_pairs]
df['Secondary_Skills'] = [secondary for _, secondary in skill_pairs]

# Save final results
output_file = 'updated_job_listings_with_refined_skills.csv'
try:
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Updated job listings saved to {output_file}.")
except OSError as e:
    # IOError is an alias of OSError in Python 3
    print(f"Error saving file: {e}")