In [None]:
!pip install datasets




In [None]:
import json
import re
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import Dataset
from transformers import TrainerCallback
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer

# Shared Porter stemmer; annotate_text uses it to reduce both skill words and
# description tokens to a common stem before comparing them.
stemmer = PorterStemmer()

In [None]:
# Load the raw job postings. JSON text is UTF-8, so the encoding is set
# explicitly instead of relying on the platform default.
with open('job_descriptions_200.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preview a couple of records rather than dumping the whole file into the output.
print(f"Loaded {len(data)} records")
print(data[:2])

# Hold out 10% of the postings for validation; the fixed seed keeps the split reproducible.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

[{'job_title': 'Mobile Application Developer', 'job_description': 'Seeking a Mobile Application Developer with 3 years of experience in developing native and cross-platform mobile applications. Proficiency in Swift, Kotlin, and React Native is required.', 'skills': ['Swift', 'Kotlin', 'React Native', 'Mobile UI/UX', 'RESTful APIs'], 'years_experience': '3 years'}, {'job_title': 'Cloud Solutions Architect', 'job_description': 'Looking for a Cloud Solutions Architect with 5+ years of experience in designing and implementing scalable cloud architectures. Expertise in AWS, Azure, and GCP is essential.', 'skills': ['AWS', 'Azure', 'Google Cloud Platform', 'Infrastructure as Code', 'Networking'], 'years_experience': '5+ years'}, {'job_title': 'Cybersecurity Analyst', 'job_description': 'Hiring a Cybersecurity Analyst with 4 years of experience in identifying and mitigating security threats. Familiarity with SIEM tools and intrusion detection systems is required.', 'skills': ['SIEM', 'Intrusi

In [None]:


def _normalize_token(raw):
    """Return the comparison key used to match a skill word against a text token.

    Every character except word characters and ``+ # & / .`` is removed,
    leading/trailing ``. , ; :`` are trimmed, and the lowercased remainder is
    Porter-stemmed.
    """
    cleaned = re.sub(r'[^\w+#&/.]', '', raw)
    return stemmer.stem(cleaned.strip('.,;:').lower())


def annotate_text(example):
    """Add whitespace tokens, BIO labels and character spans to one job posting.

    The posting's ``job_description`` is split on whitespace. Each token gets a
    label from ``O``, ``B-SKILL``/``I-SKILL`` (token sequence matches one of
    ``example["skills"]`` after normalisation and stemming) or
    ``B-YEXP``/``I-YEXP`` (token sequence matches ``example["years_experience"]``
    ignoring case and surrounding punctuation).

    Args:
        example: dict with at least ``job_description`` (str), ``skills``
            (iterable of str) and ``years_experience`` (str, may be empty).

    Returns:
        The same dict object, mutated in place, with three new keys:
        ``tokens`` (list of str), ``labels`` (list of str, one per token) and
        ``spans`` (list of ``(start, end)`` character offsets, one per token).
    """
    text = example["job_description"]

    # Split the description into maximal runs of non-whitespace characters and
    # remember where each run sits in the original string.
    tokens = []
    spans = []
    for match in re.finditer(r'\S+', text):
        tokens.append(match.group(0))
        spans.append((match.start(), match.end()))
    labels = ["O"] * len(tokens)

    # Normalise every token once up front instead of once per (skill, position).
    token_keys = [_normalize_token(token) for token in tokens]

    # --- Skills -----------------------------------------------------------
    processed_skills = []
    for skill in example["skills"]:
        parts = [_normalize_token(part) for part in skill.split()]
        # Skip skills that clean down to nothing (e.g. "-"); their empty keys
        # would otherwise match any pure-punctuation token in the text.
        if any(parts):
            processed_skills.append(parts)

    # Try longer skills first (more words, then more characters) so that a
    # multi-word skill claims its tokens before a shorter skill can.
    sorted_skills = sorted(
        processed_skills,
        key=lambda x: (-len(x), -sum(len(part) for part in x)),
    )

    # Token positions already assigned to a skill; used to prevent overlaps.
    labeled_indices = set()

    for skill_parts in sorted_skills:
        width = len(skill_parts)
        for i in range(len(tokens) - width + 1):
            window = range(i, i + width)
            if any(idx in labeled_indices for idx in window):
                continue
            if token_keys[i:i + width] != skill_parts:
                continue
            labels[i] = "B-SKILL"
            for idx in window[1:]:
                labels[idx] = "I-SKILL"
            labeled_indices.update(window)

    # --- Years of experience ---------------------------------------------
    years = example["years_experience"]
    if years:
        # Compare case-insensitively and ignore sentence punctuation glued to a
        # word ("years." / "years,") so the phrase is found at clause ends too.
        # "+" is deliberately kept so "5+" stays distinct from "5".
        edge_punct = '.,;:!?()'
        exp_keys = [word.strip(edge_punct).lower() for word in str(years).split()]
        len_exp = len(exp_keys)
        # Guard: an empty phrase (whitespace-only field) would vacuously match
        # at every position and tag the whole description.
        if len_exp:
            token_exp_keys = [token.strip(edge_punct).lower() for token in tokens]
            for i in range(len(tokens) - len_exp + 1):
                if token_exp_keys[i:i + len_exp] == exp_keys:
                    labels[i] = "B-YEXP"
                    for j in range(1, len_exp):
                        labels[i + j] = "I-YEXP"

    # Attach the annotation to the (caller-owned) example dict.
    example["tokens"] = tokens
    example["labels"] = labels
    example["spans"] = spans

    return example
# annotate_text mutates and returns each dict, so the entries of train_data
# themselves gain the tokens/labels/spans keys.
annotated_data = [annotate_text(example) for example in train_data]
dataset_train = Dataset.from_list(annotated_data)
# Show a single annotated example as a sanity check instead of the whole training set.
print(annotated_data[:1])

[{'job_title': 'Product Manager', 'job_description': 'Hiring a Product Manager with 5 years of experience in managing the product lifecycle from conception to launch. Strong market research and stakeholder management skills are essential.', 'skills': ['Product Lifecycle Management', 'Market Research', 'Stakeholder Management', 'Agile', 'Roadmap Planning'], 'years_experience': '5 years', 'tokens': ['Hiring', 'a', 'Product', 'Manager', 'with', '5', 'years', 'of', 'experience', 'in', 'managing', 'the', 'product', 'lifecycle', 'from', 'conception', 'to', 'launch.', 'Strong', 'market', 'research', 'and', 'stakeholder', 'management', 'skills', 'are', 'essential.'], 'labels': ['O', 'O', 'O', 'O', 'O', 'B-YEXP', 'I-YEXP', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SKILL', 'I-SKILL', 'O', 'B-SKILL', 'I-SKILL', 'O', 'O', 'O'], 'spans': [(0, 6), (7, 8), (9, 16), (17, 24), (25, 29), (30, 31), (32, 37), (38, 40), (41, 51), (52, 54), (55, 63), (64, 67), (68, 75), (76, 85), (86, 9

In [None]:
# Fast tokenizer for the same "bert-base-uncased" checkpoint the model is loaded from.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# BIO tag inventory. A tag's position in label_list is its integer class id,
# and label_map is the tag -> id lookup derived from that ordering.
label_list = ["O", "B-SKILL", "I-SKILL", "B-YEXP", "I-YEXP"]
label_map = {tag: class_id for class_id, tag in enumerate(label_list)}

# Corrected tokenization and label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of job descriptions and project word-level tags onto sub-words.

    Each description is encoded with the module-level ``tokenizer``, truncated
    and padded to 128 positions. For every encoded position a label id is
    produced:

    * positions with an empty character range (``start == end``) get ``-100``;
    * the first sub-word that falls inside a given whitespace token's character
      span gets that token's tag id from ``label_map``;
    * further consecutive sub-words of the same token, and sub-words that fit
      inside no span, get ``-100``.

    Args:
        examples: batch dict with parallel lists under ``job_description``,
            ``labels`` (word-level tag strings) and ``spans`` (word-level
            ``(start, end)`` character offsets).

    Returns:
        The tokenizer output, including ``offset_mapping``, with an added
        ``labels`` entry holding one list of label ids per description.
    """
    encoded = tokenizer(
        examples["job_description"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_offsets_mapping=True
    )

    def owning_word(start, end, word_spans):
        """Index of the first word span that fully contains [start, end), else None."""
        return next(
            (
                k
                for k, (word_start, word_end) in enumerate(word_spans)
                if word_start <= start and end <= word_end
            ),
            None,
        )

    batch_label_ids = []
    for row in range(len(examples["job_description"])):
        offsets = encoded["offset_mapping"][row]
        word_tags = examples["labels"][row]
        word_spans = examples["spans"][row]

        row_label_ids = []
        previous_word = -1  # index of the word that most recently received a label
        for start, end in offsets:
            if start == end:
                # Empty character range: no source text behind this position.
                row_label_ids.append(-100)
                continue

            word = owning_word(start, end, word_spans)
            if word is None or word == previous_word:
                # Either no word contains this piece, or it continues the word
                # that was just labelled: mask it.
                row_label_ids.append(-100)
            else:
                row_label_ids.append(label_map[word_tags[word]])
                previous_word = word

        batch_label_ids.append(row_label_ids)

    encoded["labels"] = batch_label_ids
    return encoded
# Encode the training split in batches; the mapped dataset keeps the original
# columns and gains the tokenizer outputs, with `labels` replaced by label ids.
tokenized_dataset = dataset_train.map(tokenize_and_align_labels, batched=True)
# Printing a Dataset shows only its column names and row count.
print(tokenized_dataset)

Map:   0%|          | 0/136 [00:00<?, ? examples/s]

Dataset({
    features: ['job_title', 'job_description', 'skills', 'years_experience', 'tokens', 'labels', 'spans', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 136
})


In [None]:
# Model setup: BERT encoder plus a token-classification head with one output
# class per BIO tag. id2label/label2id store the tag names in the model config.
num_labels = len(label_list)
model = BertForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label={v: k for k, v in label_map.items()},
    label2id=label_map,
)

# Prepare the validation split exactly like the training split. annotate_text
# mutates the dicts in val_data in place, so val_data itself gains "tokens".
annotated_val_data = [annotate_text(example) for example in val_data]
val_dataset = Dataset.from_list(annotated_val_data)
tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True)

# Training arguments: evaluate, log and checkpoint once per epoch, and reload
# the checkpoint with the lowest validation loss when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    num_train_epochs=9,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only runtimes instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="wandb",
    run_name="skill-extraction-bert",
    logging_dir="./logs",
)

# Optional console logger; enable it by passing callbacks=[LossLoggerCallback()] to Trainer.
class LossLoggerCallback(TrainerCallback):
    """Print the most recently logged training and validation loss after each epoch.

    Training and evaluation metrics arrive as separate ``state.log_history``
    entries, and the history can still be empty when the first epoch ends, so
    the values are looked up by key instead of read from the last entry.
    Whichever loss has not been logged yet is simply not printed.

    NOTE(review): the Trainer appears to append an epoch's own log entries
    after this hook fires, in which case the printed values belong to the
    previous epoch — confirm against the installed transformers version.
    """

    @staticmethod
    def _latest(log_history, key):
        """Return the newest logged value stored under ``key``, or None if absent."""
        for entry in reversed(log_history):
            if key in entry:
                return entry[key]
        return None

    def on_epoch_end(self, args, state, control, **kwargs):
        train_loss = self._latest(state.log_history, "loss")
        eval_loss = self._latest(state.log_history, "eval_loss")
        if train_loss is not None:
            print(f"\nEpoch {state.epoch} - Training Loss: {train_loss:.4f}")
        if eval_loss is not None:
            print(f"Epoch {state.epoch} - Validation Loss: {eval_loss:.4f}")

# Trainer setup: fine-tune `model` on the tokenized training split and
# evaluate on the tokenized validation split using `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_val_dataset,
   # callbacks=[LossLoggerCallback()]  (disabled: epoch-end loss printing)
)

# Start training
print("Starting training...")
trainer.train()
# Score the validation split with the model left in the trainer after training
# (training_args sets load_best_model_at_end=True).
eval_results = trainer.evaluate()
print(f"Validation Loss: {eval_results['eval_loss']:.4f}")

# Make predictions on the validation set.
# `predictions` holds the model outputs and `labels` the gold label ids
# (including the -100 masks) for tokenized_val_dataset; the third item is discarded.
predictions, labels, _ = trainer.predict(tokenized_val_dataset)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Starting training...


Epoch,Training Loss,Validation Loss
1,0.9121,0.465546
2,0.3896,0.261319
3,0.2297,0.19358
4,0.1606,0.162173
5,0.1165,0.147913
6,0.0905,0.138778
7,0.0702,0.142049
8,0.0597,0.134516
9,0.0499,0.136459


Validation Loss: 0.1345


In [None]:
import numpy as np

# Predict on the validation split: the loop below walks val_data and prints
# "Validation Set Predictions", so row i of the outputs has to belong to
# val_data[i]. Predicting on the training split here would pair every
# validation posting with the outputs of an unrelated training posting.
predictions, labels, _ = trainer.predict(tokenized_val_dataset)
# Function to extract predicted skills and experience
def extract_entities(tokens, pred_indices, label_list):
    """Decode per-token BIO predictions into skill phrases and an experience phrase.

    A ``B-SKILL`` / ``B-YEXP`` tag opens an entity. ``I-SKILL`` / ``I-YEXP``
    tags that directly continue an open entity of the same type are appended to
    it with a single space, so multi-word entities such as "machine learning"
    or "5 years" are returned whole instead of being cut to their first word.
    An ``I-`` tag with no matching open entity is ignored.

    Args:
        tokens: words of the text, parallel to ``pred_indices``.
        pred_indices: predicted class id per word; ``-100`` entries are skipped.
        label_list: tag names indexed by class id.

    Returns:
        ``(skills, experience)`` where ``skills`` is a list of phrases in text
        order and ``experience`` is the last experience phrase found, or
        ``None`` when no ``B-YEXP`` tag was predicted.
    """
    skills = []
    experience = None
    open_kind = None  # "SKILL" or "YEXP" while an entity can still be extended

    for token, pred_idx in zip(tokens, pred_indices):
        if pred_idx == -100:
            continue  # Masked position: carries no prediction
        label = label_list[pred_idx]  # Convert index to label name

        if label == "B-SKILL":
            skills.append(token)
            open_kind = "SKILL"
        elif label == "B-YEXP":
            experience = token
            open_kind = "YEXP"
        elif label == "I-SKILL" and open_kind == "SKILL":
            skills[-1] = f"{skills[-1]} {token}"
        elif label == "I-YEXP" and open_kind == "YEXP":
            experience = f"{experience} {token}"
        else:
            # "O", or an I- tag that does not continue the open entity.
            open_kind = None

    return skills, experience
# Collapse the logits to one predicted class id per encoded (sub-word) position.
# NOTE(review): `predictions` and `labels` must be the outputs of
# trainer.predict on the validation split, in the same order as val_data.
pred_indices = np.argmax(predictions, axis=-1)  # Convert logits to label indices

# Display predictions
print("\n=== Validation Set Predictions ===")
for i, example in enumerate(val_data):
    tokens = example["tokens"]
    # The model scores every encoded position ([CLS], word pieces, [SEP],
    # padding) while `tokens` holds whitespace-delimited words, so the two
    # cannot be zipped directly. tokenize_and_align_labels gave a real label id
    # only to the first sub-word of each word and -100 to everything else;
    # keeping the non-masked positions yields one prediction per word
    # (assuming every word produced at least one sub-word and was not truncated).
    word_mask = np.asarray(labels[i]) != -100
    pred_idx = pred_indices[i][word_mask]  # 1D array of word-level label indices
    pred_skills, pred_experience = extract_entities(tokens, pred_idx, label_list)

    print(f"\nJob Description: {example['job_description']}")
    print(f"Predicted Skills: {pred_skills}")
    print(f"Predicted Experience: {pred_experience}")
    print(f"Actual Skills: {example['skills']}")
    print(f"Actual Experience: {example['years_experience']}")



=== Validation Set Predictions ===

Job Description: Hiring a Technical Support Engineer with 2 years of experience in providing technical assistance to clients. Strong troubleshooting and communication skills are essential.
Predicted Skills: ['essential.']
Predicted Experience: skills
Actual Skills: ['Troubleshooting', 'Customer Support', 'Ticketing Systems', 'Networking Basics', 'Product Knowledge']
Actual Experience: 2 years

Job Description: We are hiring an E-commerce Manager with three to four years of experience in online retail operations, digital marketing, and platform optimization. Strong analytical skills and knowledge of SEO are required.
Predicted Skills: ['operations,', 'and', 'and', 'of', 'SEO']
Predicted Experience: None
Actual Skills: ['E-commerce', 'Digital Marketing', 'SEO', 'Platform Optimization']
Actual Experience: three to four years

Job Description: Looking for a Business Development Manager to drive revenue growth and build strong client relationships. Prove

In [None]:
import numpy as np

def extract_entities(tokens, pred_indices, label_list):
    """Group per-token BIO predictions into skill phrases and an experience phrase.

    Skills and experience are tracked separately. A ``B-`` tag closes any open
    entity of its type and opens a new one; an ``I-`` tag extends the open
    entity of its type, or opens one if none is open; any other tag closes the
    open entity of that type. Words inside an entity are joined with one space.

    Args:
        tokens: words of the text, parallel to ``pred_indices``.
        pred_indices: predicted class id per word; ``-100`` entries are skipped.
        label_list: tag names indexed by class id.

    Returns:
        ``(skills, experience)``: the list of skill phrases in text order, and
        the first experience phrase found (``None`` if there is none).
    """
    found_skills = []
    found_experience = []
    open_skill = None  # text of the skill phrase being built
    open_exp = None    # words of the experience phrase being built

    for word, class_id in zip(tokens, pred_indices):
        if class_id == -100:
            continue  # masked position

        tag = label_list[class_id]

        # --- skill track ---
        if tag in ("B-SKILL", "I-SKILL"):
            if tag == "B-SKILL" and open_skill is not None:
                # A new skill begins: close the one in progress first.
                found_skills.append(open_skill)
                open_skill = None
            if open_skill is None:
                open_skill = word
            else:
                open_skill = open_skill + " " + word
        elif open_skill is not None:
            found_skills.append(open_skill)
            open_skill = None

        # --- experience track ---
        if tag in ("B-YEXP", "I-YEXP"):
            if tag == "B-YEXP" and open_exp is not None:
                found_experience.append(" ".join(open_exp))
                open_exp = None
            if open_exp is None:
                open_exp = [word]
            else:
                open_exp.append(word)
        elif open_exp is not None:
            found_experience.append(" ".join(open_exp))
            open_exp = None

    # Flush whatever is still open when the text ends.
    if open_skill is not None:
        found_skills.append(open_skill)
    if open_exp is not None:
        found_experience.append(" ".join(open_exp))

    first_experience = found_experience[0] if found_experience else None
    return found_skills, first_experience

# Example usage
# NOTE(review): `predictions` and `labels` must be the outputs of
# trainer.predict on the validation split, in the same order as val_data.
pred_indices = np.argmax(predictions, axis=-1)  # Convert logits to label indices

print("\n=== Validation Set Predictions ===")
for i, example in enumerate(val_data):
    tokens = example["tokens"]
    # Predictions exist for every encoded position ([CLS], word pieces, [SEP],
    # padding) but `tokens` holds whitespace-delimited words. Only the first
    # sub-word of each word carries a real label id (everything else is -100),
    # so keeping the non-masked positions leaves one prediction per word
    # (assuming every word produced at least one sub-word and was not truncated).
    word_mask = np.asarray(labels[i]) != -100
    pred_idx = pred_indices[i][word_mask]  # 1D array of word-level label indices
    pred_skills, pred_experience = extract_entities(tokens, pred_idx, label_list)

    print(f"\nJob Description: {example['job_description']}")
    print(f"Predicted Skills: {pred_skills}")
    print(f"Predicted Experience: {pred_experience}")
    print(f"Actual Skills: {example['skills']}")
    print(f"Actual Experience: {example['years_experience']}")


=== Validation Set Predictions ===

Job Description: Hiring a Technical Support Engineer with 2 years of experience in providing technical assistance to clients. Strong troubleshooting and communication skills are essential.
Predicted Skills: ['essential.']
Predicted Experience: 2 years
Actual Skills: ['Troubleshooting', 'Customer Support', 'Ticketing Systems', 'Networking Basics', 'Product Knowledge']
Actual Experience: 2 years

Job Description: We are hiring an E-commerce Manager with three to four years of experience in online retail operations, digital marketing, and platform optimization. Strong analytical skills and knowledge of SEO are required.
Predicted Skills: ['operations, digital', 'and', 'and', 'of', 'SEO']
Predicted Experience: None
Actual Skills: ['E-commerce', 'Digital Marketing', 'SEO', 'Platform Optimization']
Actual Experience: three to four years

Job Description: Looking for a Business Development Manager to drive revenue growth and build strong client relationshi