In [1]:
from pathlib import Path
import os, re, docx, json
from pypdf import PdfReader
from collections import defaultdict
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at import time; parse_resume()
# calls this shared object for every resume instead of reloading it per call.
nlp = spacy.load("en_core_web_sm")

# The project root is the parent of the current working directory (resolved to
# an absolute path), so the paths below depend on where the code is run from.
PROJECT_ROOT = Path(".").resolve().parent

# Directory scanned for resume files, and the JSON file the results go to.
RESUME_FOLDER = PROJECT_ROOT.joinpath("data", "resumes")
OUTPUT_FILE = PROJECT_ROOT.joinpath("data", "recruiter_output.json")

def extract_text(file_path):
    """Return the plain text of a PDF or DOCX file.

    Args:
        file_path: Location of the document, as a ``str`` or ``Path``.

    Returns:
        For ``.pdf`` files, the extracted text of every page joined with
        newlines (a page that yields no text contributes an empty string).
        For ``.docx`` files, the text of every paragraph joined with
        newlines. For any other extension, an empty string.
    """
    path = Path(file_path)
    # Compare case-insensitively so files such as "Resume.PDF" or "CV.Docx"
    # are not silently treated as unsupported.
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        reader = PdfReader(str(path))
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if suffix == ".docx":
        document = docx.Document(str(path))
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    return ""

# Skill keywords looked for in resume text: whole words, case-insensitive.
_SKILL_PATTERN = re.compile(r"\b(python|sql|aws|excel|pandas)\b", re.I)


def parse_resume(text):
    """Extract named entities and known skill keywords from resume text.

    Args:
        text: Full text of one resume.

    Returns:
        A dict with these keys, all always present so every record has the
        same shape:

        * ``names_orgs``: text of entities labelled ``PERSON`` or ``ORG``.
        * ``locations``: text of entities labelled ``GPE``.
        * ``dates``: text of entities labelled ``DATE``.
        * ``skills``: the distinct skill keywords found, lower-cased and
          sorted so the result is the same on every run.
        * ``raw``: the first 2000 characters of ``text``.
    """
    doc = nlp(text)
    # Start from a fixed set of keys so a resume with no entities of some
    # kind still reports an empty list instead of omitting the key.
    out = {"names_orgs": [], "locations": [], "dates": []}
    for ent in doc.ents:
        if ent.label_ in {"PERSON", "ORG"}:
            out["names_orgs"].append(ent.text)
        elif ent.label_ == "GPE":
            out["locations"].append(ent.text)
        elif ent.label_ == "DATE":
            out["dates"].append(ent.text)
    # Deduplicate through a set, then sort: set iteration order for strings
    # is not stable between interpreter runs.
    out["skills"] = sorted({match.lower() for match in _SKILL_PATTERN.findall(text)})
    out["raw"] = text[:2000]
    return out

# Parse all resumes
# iterdir() yields entries in arbitrary order; iterate in sorted order so the
# records in the output file appear in the same order on every run.
results = []
for file in sorted(RESUME_FOLDER.iterdir()):
    if file.suffix not in {".pdf", ".docx"}:
        continue
    try:
        text = extract_text(file)
        parsed = parse_resume(text)
    except Exception as e:
        # Best-effort batch: report the failing file and carry on with the rest.
        print(f"Error: {file.name} -> {e}")
        continue
    parsed["file_name"] = file.name
    results.append(parsed)

# Save
# Explicit encoding so the file does not depend on the platform's locale default.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("✅ Done parsing all resumes.")


✅ Done parsing all resumes.
