In [None]:
# Install spaCy transformer pipeline and model
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->spacy-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from t

In [None]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random
import json
import re
from sklearn.model_selection import train_test_split

In [None]:
def create_training_example(job_description, skills):
    """Build one NER training tuple from a job description and its skill list.

    The description is lowercased and every whole-word occurrence of each
    skill is located in it. Each hit becomes a ``(start, end, "SKILL")``
    character span.

    Args:
        job_description: Raw job-posting text.
        skills: Iterable of skill strings expected to occur in the text.
            Blank entries are ignored.

    Returns:
        ``(text, {"entities": spans})`` where ``text`` is the lowercased
        description and ``spans`` is a list of non-overlapping
        ``(start, end, "SKILL")`` tuples sorted by start offset.
    """
    text = job_description.lower()  # Normalize case
    leading_punctuation = {',', '(', '"', "'"}
    candidates = set()  # a set drops duplicate spans from repeated skills

    for skill in skills:
        needle = skill.strip().lower()
        if not needle:
            # An empty pattern would match at every position in the text.
            continue
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match around skills such as "c++", "c#" or ".net".
        pattern = r'(?<!\w){}(?!\w)'.format(re.escape(needle))
        for match in re.finditer(pattern, text):
            start, end = match.start(), match.end()
            # Expand matches to include common punctuation glued to the front
            while start > 0 and text[start - 1] in leading_punctuation:
                start -= 1
            candidates.add((start, end))

    # Keep the longest span wherever candidates overlap (e.g. "learning"
    # inside "machine learning"), so no character belongs to two entities.
    claimed = set()
    entities = []
    for start, end in sorted(candidates, key=lambda span: (span[0] - span[1], span[0])):
        span_chars = range(start, end)
        if claimed.isdisjoint(span_chars):
            claimed.update(span_chars)
            entities.append((start, end, "SKILL"))
    entities.sort()

    return (text, {"entities": entities})

# Load and prepare data
# Explicit UTF-8 so the JSON is decoded the same way regardless of the
# platform's default locale encoding.
with open('job_descriptions_202.json', encoding='utf-8') as f:
    data = json.load(f)

# One (text, annotations) tuple per record, then a fixed-seed 80/20 split.
train_examples = [create_training_example(entry["job_description"], entry["skills"]) for entry in data]
train_data, val_data = train_test_split(train_examples, test_size=0.2, random_state=42)

In [None]:
# Initialize model with better configuration
# Training parameters
epochs = 50
dropout = 0.3
# NOTE(review): learning_rate is not referenced anywhere else in the visible
# cells — confirm whether it was meant to configure the optimizer.
learning_rate = 0.001
batch_sizes = compounding(2.0, 16.0, 1.001)

# Blank English pipeline with one NER component that knows a single label
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("SKILL")


def _to_examples(pairs):
    """Wrap (text, annotations) tuples as spaCy Example objects."""
    return [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in pairs]


# Convert examples
train_examples = _to_examples(train_data)
val_examples = _to_examples(val_data)

# Optimizer and best-score tracker used by the training loop
optimizer = nlp.initialize()
best_f1 = 0

def evaluate_model(nlp, examples):
    """Score predicted SKILL entities against the reference annotations.

    Each example's reference text is run through ``nlp``. Predicted and
    reference entities are compared as ``(start, end, label)`` triples read
    from each entity's ``.start``, ``.end`` and ``.label_``, so only exact
    matches count as hits. The metrics are printed and returned.

    Args:
        nlp: Callable that turns a text into an object exposing ``.ents``.
        examples: Iterable of objects exposing ``.reference.text`` and
            ``.reference.ents``.

    Returns:
        ``(precision, recall, f1)`` as floats. A 1e-8 term in every
        denominator avoids division by zero.
    """
    true_pos = false_pos = false_neg = 0

    print("\n--- Evaluation on Validation Set ---")
    for example in examples:
        reference = example.reference
        gold_spans = [(span.start, span.end, span.label_) for span in reference.ents]
        predicted_spans = [
            (span.start, span.end, span.label_)
            for span in nlp(reference.text).ents
            if span.label_ == "SKILL"
        ]

        # Predictions found in the gold list are hits; the rest are spurious.
        hits = sum(1 for span in predicted_spans if span in gold_spans)
        true_pos += hits
        false_pos += len(predicted_spans) - hits
        # Gold spans the model failed to reproduce.
        false_neg += sum(1 for span in gold_spans if span not in predicted_spans)

    precision = true_pos / (true_pos + false_pos + 1e-8)
    recall = true_pos / (true_pos + false_neg + 1e-8)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)

    print("\nOverall Evaluation Metrics:")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    return precision, recall, f1

# Training loop with validation
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}

    # Training phase: one pass over the shuffled examples in growing batches
    batches = minibatch(train_examples, size=batch_sizes)
    for batch in batches:
        nlp.update(
            batch,
            drop=dropout,
            sgd=optimizer,
            losses=losses
        )

    print(f"\nEpoch {epoch+1}")
    print(f"Train Loss: {losses.get('ner', 0):.4f}")

    # Validation phase. The checkpoint is written inside the same
    # use_params context that produced the score, so the model on disk
    # carries exactly the parameters that were evaluated.
    with nlp.use_params(optimizer.averages):
        precision, recall, f1 = evaluate_model(nlp, val_examples)

        # Save the best model based on F1 score
        if f1 > best_f1:
            best_f1 = f1
            nlp.to_disk("best_model")
            print("New best model saved!")

print("\nTraining complete!")




Epoch 1
Train Loss: 947.2119

--- Evaluation on Validation Set ---

Overall Evaluation Metrics:
Precision: 0.43
Recall: 0.42
F1 Score: 0.43
New best model saved!

Epoch 2
Train Loss: 426.6366

--- Evaluation on Validation Set ---

Overall Evaluation Metrics:
Precision: 0.73
Recall: 0.66
F1 Score: 0.69
New best model saved!

Epoch 3
Train Loss: 274.1742

--- Evaluation on Validation Set ---

Overall Evaluation Metrics:
Precision: 0.67
Recall: 0.67
F1 Score: 0.67

Epoch 4
Train Loss: 237.5577

--- Evaluation on Validation Set ---

Overall Evaluation Metrics:
Precision: 0.68
Recall: 0.76
F1 Score: 0.72
New best model saved!

Epoch 5
Train Loss: 196.5519

--- Evaluation on Validation Set ---

Overall Evaluation Metrics:
Precision: 0.75
Recall: 0.75
F1 Score: 0.75
New best model saved!

Epoch 6
Train Loss: 149.1974

--- Evaluation on Validation Set ---

Overall Evaluation Metrics:
Precision: 0.84
Recall: 0.75
F1 Score: 0.80
New best model saved!

Epoch 7
Train Loss: 132.1199

--- Evaluatio

In [None]:
import spacy
from google.colab import drive
drive.mount('/content/drive')

# Load the best checkpoint written during training and copy it to Drive.
# (The in-memory model is not exported first: it would be written to the
# same directory and overwritten by the call below.)
nlp = spacy.load("best_model")  # Path to your saved model directory
nlp.to_disk("/content/drive/My Drive/ResumeMatcher/models/my_spacy_model")
def predict_skills(job_description):
    """Extract the distinct SKILL entities found in a job description.

    The training examples are lowercased before annotation, so the model is
    run on a lowercased copy of the input here as well. Entity character
    offsets are then mapped back onto the original text so the returned
    skills keep the casing the author used.

    Args:
        job_description: Raw job-posting text.

    Returns:
        List of unique skill strings in order of first appearance, with
        surrounding ``.,:;/()`` characters stripped.
    """
    # Preprocess text (match training preprocessing)
    lowered = job_description.lower()

    # Process text
    doc = nlp(lowered)

    # Lowercasing can change the length of a few Unicode characters; offsets
    # only map back onto the original when the lengths still agree.
    offsets_match = len(lowered) == len(job_description)

    # Extract skills; dict keys de-duplicate while preserving first-seen order
    skills = {}
    for ent in doc.ents:
        if ent.label_ != "SKILL":
            continue
        surface = job_description[ent.start_char:ent.end_char] if offsets_match else ent.text
        cleaned = surface.strip(".,:;/()")
        if cleaned:  # skip entities that were nothing but punctuation
            skills.setdefault(cleaned, None)

    return list(skills)
# Sample posting used to exercise the skill extractor
new_job_description = """
We are seeking a Full Stack Developer with 3-5 years of experience in building scalable web applications. The ideal candidate should be proficient in JavaScript, Node.js, and React, and have hands-on experience with MongoDB for database management. Familiarity with RESTful API design and cloud deployments is a plus.
"""

predicted_skills = predict_skills(new_job_description)
print("Predicted Skills:", predicted_skills)

# Results dict, starting with the extracted skills
prediction = {"skills": predicted_skills}

Mounted at /content/drive
Predicted Skills: ['MongoDB', 'Node.js', 'cloud deployments', 'JavaScript', 'React']


In [None]:
# General-purpose pipeline kept under its own name so the skill model bound
# to `nlp` (used by predict_skills) is not replaced when this cell runs.
experience_nlp = spacy.load("en_core_web_trf")

# A number (digits or a small number word), an optional range or "+", then a
# year unit: "3 years", "3-5 years", "3+ yrs", "three to five years".
_NUMBER = r"(?:\d+(?:\.\d+)?|one|two|three|four|five|six|seven|eight|nine|ten)"
_YEARS_RE = re.compile(
    rf"\b{_NUMBER}\s*(?:(?:-|–|to)\s*{_NUMBER}\s*)?\+?\s*(?:years?|yrs?)\b",
    re.IGNORECASE,
)


def extract_years_experience(text):
    """Return the first phrase in ``text`` that states a number of years.

    The text is processed with the general NER model. The first DATE,
    CARDINAL or QUANTITY entity that contains a year phrase such as
    "3 years" or "3-4 years" is returned. If no entity qualifies, the raw
    text is searched with the same regex.

    Args:
        text: Job-description text to search.

    Returns:
        The matching phrase as a string, or ``None`` when the text contains
        no recognisable years-of-experience phrase.
    """
    doc = experience_nlp(text)

    # Collect all entities of type DATE, CARDINAL, or any number (sometimes they come as QUANTITY)
    candidate_entities = [ent.text for ent in doc.ents if ent.label_ in {"DATE", "CARDINAL", "QUANTITY"}]

    # Prefer an entity that actually talks about years over the first number seen.
    for candidate in candidate_entities:
        if _YEARS_RE.search(candidate):
            return candidate

    # Fall back to the raw text in case the model did not tag the phrase.
    match = _YEARS_RE.search(text)
    return match.group(0) if match else None

In [None]:
# Run the experience extractor on the sample posting and record the result
prediction["years_experience"] = experience_pred = extract_years_experience(new_job_description)
print(prediction)

{'skills': ['MongoDB', 'Node.js', 'cloud deployments', 'JavaScript', 'React'], 'years_experience': '3-5 years'}


In [None]:

import json
from google.colab import drive

drive.mount('/content/drive')

# Save extracted job requirements: skills, experience and the source text
job_data = dict(
    required_skills=predicted_skills,
    required_experience=experience_pred,
    job_text=new_job_description,
)

with open('/content/drive/MyDrive/ResumeMatcher/job_requirements.json', 'w') as out_file:
    json.dump(job_data, out_file)

print("💼 Job Requirements Saved!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
💼 Job Requirements Saved!
