In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
sachinharshitha_resume_skill_mentions_annotations_path = kagglehub.dataset_download('sachinharshitha/resume-skill-mentions-annotations')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/sachinharshitha/resume-skill-mentions-annotations?dataset_version_number=1...


100%|██████████| 45.0k/45.0k [00:00<00:00, 4.10MB/s]

Extracting files...
Data source import complete.





# Load & Prepare **Data**

In [8]:
import json

# Load dataset
file_path = "/root/.cache/kagglehub/datasets/sachinharshitha/resume-skill-mentions-annotations/versions/1/project-1-at-2025-07-19-20-11-bc999050.json"
with open(file_path, "r") as f:
    data = json.load(f)

# Function to clean overlapping entities
def clean_entities(entities):
    entities = sorted(entities, key=lambda x: (x[0], -x[1]))  # sort by start, then longest
    cleaned, prev_end = [], -1
    for start, end, label in entities:
        if start >= prev_end:
            cleaned.append((start, end, label))
            prev_end = end
    return cleaned

# Convert annotations to spaCy format
TRAIN_DATA = []
for item in data:
    text = item.get("data", {}).get("text", "")
    entities = []
    for ann in item.get("annotations", []):
        for res in ann.get("result", []):
            if "labels" in res.get("value", {}) and "SKILL" in res["value"]["labels"]:
                entities.append((res["value"]["start"], res["value"]["end"], "SKILL"))
    entities = clean_entities(entities)
    if text and entities:
        TRAIN_DATA.append((text, {"entities": entities}))

# Add synthetic examples
extra_examples = [
    ("Experienced in Python, Pandas, TensorFlow.", {"entities":[(14,20,"SKILL"), (22,28,"SKILL"), (34,44,"SKILL")]}),
    ("Worked with AWS cloud and Docker.", {"entities":[(11,20,"SKILL"), (25,31,"SKILL")]}),
]
TRAIN_DATA.extend(extra_examples)

print(f"Total samples: {len(TRAIN_DATA)}")
print("Sample example:", TRAIN_DATA[0])


Total samples: 17
Sample example: ('--- Page 1 ---\nAnuradha Kulatunge\n+94-71-9872413 | ovinduanuradha@gmail.com | linkedin.com/in/anuradha-kulathunga | github.com/AnuradhaKulathunga\nWork Experience\nData Analyst Intern Oct 2023 – Apr 2024\nDIMO Lanka Colombo, Sri Lanka\n• Cleaning large data sets, fuzzy matching with Python Pandas data frames, Excel Power Query, PySpark, Spark\nSQL\n• Validating columns in large data sets using Excel default functions and Office Script referencing rule tables\n• Implementing advanced search boxes for large data sets using Excel\n• Software engineering with Power Apps, SharePoint, Excel, Python, and JavaScript\n• Automation skills with Power Automate and Python\nProject Experience\nTime Series Forecasting of Ethereum Prices Using Holt-Winters Exponential Smoothing | 2024\n• Implemented outlier removal, missing value filling, and stationarity checking.\n• Conducted ACF and PACF plot analysis for model selection and training.\nSri Lankan Used Phone Pri

In [4]:
import json

file_path = "/root/.cache/kagglehub/datasets/sachinharshitha/resume-skill-mentions-annotations/versions/1/project-1-at-2025-07-19-20-11-bc999050.json"

with open(file_path, "r") as f:
    data = json.load(f)

# Extract skill words for first 5 samples
for i, item in enumerate(data[:5], 1):
    skills = []
    for ann in item.get("annotations", []):
        for res in ann.get("result", []):
            if "labels" in res.get("value", {}) and "SKILL" in res["value"]["labels"]:
                skills.append(res["value"]["text"].strip())

    print(f"\n--- Sample {i} ---")
    print("Skills:", skills)



--- Sample 1 ---
Skills: ['Python', 'Pandas data frames', 'Excel Power Query', 'PySpark', 'Spark\\nSQL', 'Excel', 'Power Apps', 'SharePoint', 'Excel', 'Python', 'JavaScript', 'Power Automate', 'Python', 'Time Series Forecasting', 'Ethereum Prices Using Holt-Winters Exponential S', 'model selection and training.', 'web scraping', 'Power Automate', 'Python', 'data collection.', 'sentiment analysis', 'TensorFlow', 'deep learning', 'time series analysis', 'FBProphet and\\nLSTM.', 'web deployment', 'Machine Learning', 'machine learning', 'scikit-learn', 'Deep Learning', 'TensorFlow', 'Keras', 'Image Classification', 'Machine Learning', 'Python', 'SVM,RF,LR.', 'Data Reprocessing', 'EDA,model training,Time Series analysis,Computer Vinson,NLP', 'Python, R, Excel, SQL, NoSQL', 'Html', 'CSS', 'avaScript', 'lask', 'Flask', 'MySQL', 'MongoDB,', 'eo']

--- Sample 2 ---
Skills: ['Artificial Intelligence', 'Data Science', 'data science', 'machine learning', 'programming', 'problem-solving', 'PySpark

# Train-Test **Split**

In [9]:
from sklearn.model_selection import train_test_split

# Split into train and validation sets
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")


Training samples: 13, Validation samples: 4


# Train spaCy NER **Model**

In [10]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random

# Load spaCy model (Transformer for better accuracy, fallback to small model)
try:
    nlp = spacy.load("en_core_web_trf")
except:
    nlp = spacy.load("en_core_web_sm")

# Add NER pipeline if missing
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add custom label
ner.add_label("SKILL")

# Train NER model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()
    best_loss = float("inf")

    for itn in range(30):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            examples = [Example.from_dict(nlp.make_doc(texts[i]), annotations[i]) for i in range(len(texts))]
            nlp.update(examples, sgd=optimizer, drop=0.2, losses=losses)

        # Early stopping
        if losses["ner"] < best_loss:
            best_loss = losses["ner"]
        else:
            print(f"Early stopping at iteration {itn}")
            break

        print(f"Iteration {itn}, Loss: {losses}")


Iteration 0, Loss: {'ner': np.float32(1459.7595)}
Iteration 1, Loss: {'ner': np.float32(1087.3484)}
Iteration 2, Loss: {'ner': np.float32(871.3844)}
Iteration 3, Loss: {'ner': np.float32(680.45245)}
Iteration 4, Loss: {'ner': np.float32(639.7633)}
Iteration 5, Loss: {'ner': np.float32(593.4991)}
Iteration 6, Loss: {'ner': np.float32(527.4514)}
Iteration 7, Loss: {'ner': np.float32(459.32983)}
Iteration 8, Loss: {'ner': np.float32(439.3287)}
Iteration 9, Loss: {'ner': np.float32(402.64252)}
Iteration 10, Loss: {'ner': np.float32(373.29596)}
Early stopping at iteration 11


# Save & Test the **Model**

In [11]:
# Save the trained model
nlp.to_disk("skill_ner_model")
print("Model saved successfully!")

# Load and test
nlp2 = spacy.load("skill_ner_model")
test_text = "I have worked with Java, TensorFlow, Pandas, and AWS cloud."
doc = nlp2(test_text)

print("Extracted Skills:", [(ent.text, ent.label_) for ent in doc.ents])


Model saved successfully!
Extracted Skills: [('Java', 'SKILL'), ('TensorFlow', 'SKILL'), ('Pandas', 'SKILL')]


# **Jaccard Similarity**

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def jaccard_similarity(text1, text2):
    words1, words2 = set(text1.lower().split()), set(text2.lower().split())
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))
    return intersection / union if union != 0 else 0

# Example usage
resume_text = "Experienced in Python, Pandas, TensorFlow, and AWS Cloud"
job_text = "Looking for Python developer with AWS and Pandas knowledge"
score = jaccard_similarity(resume_text, job_text)
print(f"Jaccard Similarity Score: {score:.2f}")


Jaccard Similarity Score: 0.13


**TF-IDF + Cosine Similarity**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF representation
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([resume_text, job_text])

# Cosine similarity
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
print(f"TF-IDF Cosine Similarity Score: {cos_sim:.2f}")


TF-IDF Cosine Similarity Score: 0.31


**BERT Sentence Embeddings** – Semantic Understanding

In [14]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer, util

# Load pretrained BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast & accurate

# Encode sentences
embeddings = model.encode([resume_text, job_text], convert_to_tensor=True)

# Semantic similarity
bert_sim = util.cos_sim(embeddings[0], embeddings[1]).item()
print(f"BERT Semantic Similarity Score: {bert_sim:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BERT Semantic Similarity Score: 0.77
