### Imports & Load Cleaned Data

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from pathlib import Path
import json
import nltk
from nltk.corpus import stopwords

OUTPUT_DIR = "output"
Path(OUTPUT_DIR).mkdir(exist_ok=True)

resumes_df = pd.read_csv("processed/resumes_clean.csv")
print(resumes_df.shape)
resumes_df.head()


(2484, 4)


Unnamed: 0,filename,filepath,text,clean_text
0,10554236.pdf,C:\Guvi\Talent Intelligence & Workforce Optimi...,ACCOUNTANT\nSummary\nFinancial Accountant spec...,ACCOUNTANT Summary Financial Accountant specia...
1,10674770.pdf,C:\Guvi\Talent Intelligence & Workforce Optimi...,STAFF ACCOUNTANT\nSummary\nHighly analytical a...,STAFF ACCOUNTANT Summary Highly analytical and...
2,11163645.pdf,C:\Guvi\Talent Intelligence & Workforce Optimi...,ACCOUNTANT\nProfessional Summary\nTo obtain a ...,ACCOUNTANT Professional Summary To obtain a po...
3,11759079.pdf,C:\Guvi\Talent Intelligence & Workforce Optimi...,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...,SENIOR ACCOUNTANT Experience Company Name June...
4,12065211.pdf,C:\Guvi\Talent Intelligence & Workforce Optimi...,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...,SENIOR ACCOUNTANT Professional Summary Senior ...


### Download Required NLP Resources

In [2]:
# Download stopwords only once
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)


True

### Load spaCy Pretrained NER Model

In [3]:
nlp = spacy.load("en_core_web_md")


### Helper Functions (Name, Email, Phone Extractors)

In [4]:
def extract_email(text):
    match = re.search(r"[A-Za-z0-9\._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}", text)
    return match.group(0) if match else ""

def extract_phone(text):
    match = re.findall(r'\+?\d[\d\-\s]{8,}\d', text)
    return match[0] if match else ""

def extract_name(doc):
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text
    return ""


### Skill Extraction (Keyword Based + NER Boost)

In [5]:
skill_keywords = [
    "python", "java", "sql", "power bi", "tableau", "excel", "machine learning",
    "deep learning", "data science", "nlp", "pandas", "numpy", "react",
    "nodejs", "html", "css", "javascript", "aws", "azure", "docker", "git"
]

def extract_skills(text):
    text = text.lower()
    found = [skill for skill in skill_keywords if skill in text]
    return list(set(found))


### Education Extraction

In [6]:
education_keywords = ["b.e", "btech", "b.tech", "m.tech", "m.sc", "msc", 
                      "bsc", "b.sc", "mba", "phd", "mca", "bca"]

def extract_education(text):
    text = text.lower()
    found = [deg for deg in education_keywords if deg in text]
    return list(set(found))


### Work Experience Extraction

In [7]:
def extract_experience(text):
    exp_pattern = r'(\d+)\s*(?:year|years|yrs)'
    matches = re.findall(exp_pattern, text.lower())
    if matches:
        return max([int(m) for m in matches])
    return 0


### Apply NER to Each Resume

In [8]:
resumes_df['clean_text'] = resumes_df['clean_text'].fillna("").astype(str)

parsed_resumes = []

for idx, row in resumes_df.iterrows():
    text = row['clean_text']

    # Skip empty text safely
    if not isinstance(text, str) or len(text.strip()) < 5:
        doc = None
    else:
        doc = nlp(text)

    name = extract_name(doc) if doc else ""
    email = extract_email(text)
    phone = extract_phone(text)
    skills = extract_skills(text)
    education = extract_education(text)
    experience = extract_experience(text)

    parsed_resumes.append({
        "filename": row.get("filename", ""),
        "name": name,
        "email": email,
        "phone": phone,
        "skills": skills,
        "education": education,
        "experience_years": experience,
        "text": text
    })

parsed_df = pd.DataFrame(parsed_resumes)
parsed_df.head()


Unnamed: 0,filename,name,email,phone,skills,education,experience_years,text
0,10554236.pdf,Wing,,,"[excel, aws]",[],0,ACCOUNTANT Summary Financial Accountant specia...
1,10674770.pdf,Adobe,,,[excel],[],0,STAFF ACCOUNTANT Summary Highly analytical and...
2,11163645.pdf,Gail L. Lugo,,2 864-472-7092,[excel],[],0,ACCOUNTANT Professional Summary To obtain a po...
3,11759079.pdf,John R. Jones Accounting Award,,,[excel],[],0,SENIOR ACCOUNTANT Experience Company Name June...
4,12065211.pdf,reconcileÂ,,2001 - 2002,"[excel, sql]",[],0,SENIOR ACCOUNTANT Professional Summary Senior ...


### Save Parsed Resumes as CSV + JSON

In [9]:
parsed_df.to_csv(f"{OUTPUT_DIR}/parsed_resumes.csv", index=False)
print("Saved parsed_resumes.csv")

json_path = f"{OUTPUT_DIR}/parsed_resumes.json"
parsed_df.to_json(json_path, orient="records", indent=4)
print("Saved parsed_resumes.json")


Saved parsed_resumes.csv
Saved parsed_resumes.json
