In [1]:
# Install spaCy transformer pipeline and model
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m445.2 kB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from

In [None]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
import json
import re
from sklearn.model_selection import train_test_split

# Improved training data preparation
def create_training_example(job_description, skills):
    text = job_description.lower()  # Normalize case
    entities = []

    # Flexible skill matching with context awareness
    for skill in skills:
        # Handle special characters and variations
        pattern = r'\b{}\b(?!\w)'.format(re.escape(skill.lower()))
        for match in re.finditer(pattern, text):
            start, end = match.start(), match.end()
            # Expand matches to include common punctuation
            while start > 0 and text[start-1] in {',', '(', '"', "'"}:
                start -= 1
            entities.append((start, end, "SKILL"))

    return (text, {"entities": entities})

# Load and prepare data
with open('job_descriptions_202.json') as f:
    data = json.load(f)

train_examples = [create_training_example(entry["job_description"], entry["skills"]) for entry in data]
train_data, val_data = train_test_split(train_examples, test_size=0.2, random_state=42)

# Initialize model with better configuration
nlp = spacy.blank("en")  # Start fresh for custom NER
ner = nlp.add_pipe("ner")

# Add labels and configure transformer
ner.add_label("SKILL")

# Training parameters
learning_rate = 0.001
dropout = 0.3
epochs = 50
batch_sizes = compounding(2.0, 16.0, 1.001)

# Convert examples
train_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in train_data]
val_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in val_data]

# Enhanced training loop with validation
optimizer = nlp.initialize()
best_f1 = 0

for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}

    # Training phase
    batches = minibatch(train_examples, size=batch_sizes)
    for batch in batches:
        nlp.update(
            batch,
            drop=dropout,
            sgd=optimizer,
            losses=losses,
            annotates=["ner"]
        )

    # Validation phase
    val_loss = 0
    tp, fp, fn = 0, 0, 0  # True/False positives/negatives

    with nlp.use_params(optimizer.averages):
        for example in val_examples:
            doc = nlp(example.predicted)
            gold = example.reference

            # Calculate validation metrics
            for ent in doc.ents:
                if ent.label_ == "SKILL":
                    if any(gold_ent for gold_ent in gold.ents
                         if gold_ent.start == ent.start and gold_ent.end == ent.end):
                        tp += 1
                    else:
                        fp += 1
            for gold_ent in gold.ents:
                if not any(ent for ent in doc.ents
                          if ent.start == gold_ent.start and ent.end == gold_ent.end):
                    fn += 1

    # Calculate precision/recall/F1
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)

    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {losses['ner']:.4f}")
    print(f"Validation - P: {precision:.2f}, R: {recall:.2f}, F1: {f1:.2f}")

    # Save best model
    if f1 > best_f1:
        best_f1 = f1
        nlp.to_disk("best_model")
        print("New best model saved!")

print("Training complete!")

Epoch 1
Train Loss: 938.6939
Validation - P: 0.55, R: 0.37, F1: 0.44
New best model saved!
Epoch 2
Train Loss: 395.1526
Validation - P: 0.55, R: 0.52, F1: 0.53
New best model saved!
Epoch 3
Train Loss: 294.3825
Validation - P: 0.56, R: 0.60, F1: 0.58
New best model saved!
Epoch 4
Train Loss: 234.8492
Validation - P: 0.54, R: 0.62, F1: 0.58
Epoch 5
Train Loss: 191.9596
Validation - P: 0.53, R: 0.62, F1: 0.57
Epoch 6
Train Loss: 170.4020
Validation - P: 0.51, R: 0.62, F1: 0.56
Epoch 7
Train Loss: 124.6869
Validation - P: 0.51, R: 0.62, F1: 0.56
Epoch 8
Train Loss: 135.1011
Validation - P: 0.51, R: 0.62, F1: 0.56
Epoch 9
Train Loss: 100.4953
Validation - P: 0.50, R: 0.63, F1: 0.56
Epoch 10
Train Loss: 101.1940
Validation - P: 0.50, R: 0.63, F1: 0.56
Epoch 11
Train Loss: 87.0857
Validation - P: 0.50, R: 0.63, F1: 0.56
Epoch 12
Train Loss: 84.6494
Validation - P: 0.50, R: 0.63, F1: 0.56
Epoch 13
Train Loss: 95.0065
Validation - P: 0.50, R: 0.63, F1: 0.56
Epoch 14
Train Loss: 65.6851
Validat

In [None]:
import spacy

# Load your saved model
nlp = spacy.load("best_model")  # Path to your saved model directory
def predict_skills(job_description):
    # Preprocess text (match training preprocessing)
    text = job_description  # If you trained on lowercase

    # Process text
    doc = nlp(text)

    # Extract skills
    skills = list(set([ent.text.strip(".,:;") for ent in doc.ents if ent.label_ == "SKILL"]))

    return skills

In [None]:
new_job_description = """
Job opportunity for a Data Engineer / AI Engineer in India or Remote

We're seeking a Data Engineer, Innovation Team, to design and build large-scale data solutions that shape the future of people-facing and business-facing products. You'll collaborate with a lean team of software engineers, data scientists, and product managers to develop scalable data models, optimize ETL pipelines, and create impactful visualizations. Your expertise in SQL, ETL, data modeling, and programming (Python, C++, C#, or Scala) will drive product innovation and strategic insights for users. If you're passionate about solving complex data challenges, optimizing growth strategies, and influencing product development with data-driven insights, join us to make an impact!


"""

predicted_skills = predict_skills(new_job_description)
print("Predicted Skills:", predicted_skills)

Predicted Skills: ['C', 'SQL', 'software engineers', 'data modeling', 'Remote', 'Scala', '(Python', 'ETL pipelines', 'C++', 'ETL', 'data scientists', 'India']
