# Resume Parser with Named Entity Recognition

This notebook demonstrates how to train a Named Entity Recognition (NER) model using spaCy to parse resumes.

In [4]:
# Import necessary libraries
import numpy as np
import pickle
import spacy
import random
from spacy.training import Example
from spacy.util import minibatch, compounding
import pandas as pd
import os
for dirname, _, filenames in os.walk('/Users/utsavsharma/Desktop/RR/Resumes'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/Users/utsavsharma/Desktop/RR/Resumes/HK.docx
/Users/utsavsharma/Desktop/RR/Resumes/.DS_Store
/Users/utsavsharma/Desktop/RR/Resumes/Resume (2) - Alex Kim.pdf
/Users/utsavsharma/Desktop/RR/Resumes/Resume - Selene H.pdf
/Users/utsavsharma/Desktop/RR/Resumes/Copy of Bản sao của LÊ MINH ĐẠT Resume - Đạt Lê minh.pdf
/Users/utsavsharma/Desktop/RR/Resumes/Ayush.Jain_Resume - Ayush.pdf
/Users/utsavsharma/Desktop/RR/Resumes/Mia Molinelli MASTER Resume  - Mia Molinelli.pdf
/Users/utsavsharma/Desktop/RR/Resumes/Jayden Spitulnik Resume - Jayden Spitulnik.docx
/Users/utsavsharma/Desktop/RR/Resumes/Andriana Detsis Fall 2023 Resume - Detsis, Andriana S..pdf
/Users/utsavsharma/Desktop/RR/Resumes/Evan_Hadam_Resume - Evan Hadam.pdf
/Users/utsavsharma/Desktop/RR/Resumes/navya_nair_Resume (2) - Navya Nair.pdf
/Users/utsavsharma/Desktop/RR/Resumes/2023 Resume Ella Mendelowitz - Ella.pdf
/Users/utsavsharma/Desktop/RR/Resumes/Ansh Parikh_2023 Resume - Ansh Parikh.pdf
/Users/utsavsharma/Desktop/RR/Resum

In [1]:
# Load Blank Model
nlp = spacy.blank('en')

def train_model(train_data):
    # Add the NER pipeline to the model
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    
    # Add labels to the NER pipeline
    for _, annotation in train_data:
        for ent in annotation.get('entities'):
            ner.add_label(ent[2])
    
    # Remove other pipelines if they are there
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):  # train for 10 iterations
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.5)):
                for text, annotations in batch:
                    try:
                        doc = nlp.make_doc(text)
                        example = Example.from_dict(doc, annotations)
                        nlp.update(
                            [example],  # batch of examples
                            drop=0.2,  # dropout - make it harder to memorize data
                            sgd=optimizer,  # callable to update weights
                            losses=losses)
                    except Exception as e:
                        print(f"Error in iteration {itn}: {e}")
                        pass
                
            print(losses)

# Example training data
train_data = [
    ("John Doe is a software engineer with 5 years of experience in Python and Java.", {"entities": [(0, 8, "PERSON"), (39, 45, "EXPERIENCE"), (49, 55, "EXPERIENCE"), (59, 65, "SKILL"), (70, 74, "SKILL")]}),
    ("Jane Smith, a data scientist skilled in machine learning and statistical analysis.", {"entities": [(0, 10, "PERSON"), (14, 29, "JOB_TITLE"), (42, 56, "SKILL"), (61, 81, "SKILL")]}),
]

# Start Training model
train_model(train_data)

Starting iteration 0
{'ner': 19.313601043075323}
Starting iteration 1
{'ner': 18.602520409971476}
Starting iteration 2
{'ner': 18.238527432084084}
Starting iteration 3
{'ner': 16.2943418584764}
Starting iteration 4
{'ner': 13.613409722223878}
Starting iteration 5
{'ner': 9.596332433633506}
Starting iteration 6
{'ner': 5.979694422334433}
Starting iteration 7




{'ner': 5.0591885706890025}
Starting iteration 8
{'ner': 4.6304952280224825}
Starting iteration 9
{'ner': 4.753946005311949}


## Test the trained model on new resumes

In [2]:
# Example resumes
resumes = [
    "Emily Clark, a senior data analyst with 10 years of experience in SQL and Python.",
    "Michael Brown is a machine learning engineer with expertise in TensorFlow and PyTorch.",
]

# Use the trained model to process the resumes
for resume in resumes:
    doc = nlp(resume)
    print(f"Entities in '{resume}':")
    for ent in doc.ents:
        print(f" - {ent.text} ({ent.label_})")

Entities in 'Emily Clark, a senior data analyst with 10 years of experience in SQL and Python.':
Entities in 'Michael Brown is a machine learning engineer with expertise in TensorFlow and PyTorch.':
