In [3]:
%pip install python-docx pdfminer.six

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting typing-extensions>=4.9.0 (from python-docx)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading typing_extensions-4.14.0-py3-none-any.whl (43 kB)
Installing collected packages: typing-extensions, python-docx, pdfminer.six
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.4.0
    Uninstalling typing_extensions-4.4.0:
      Successfully uninstalled typing_extensions-4.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that

In [13]:
# screen.py

import os
import re
from datetime import datetime
from io import StringIO

import docx
from pdfminer.high_level import extract_text

# 1. File → Text converters

def pdf_to_text(pdf_path):
    try:
        return extract_text(pdf_path)
    except Exception as e:
        print(f"Failed to extract from PDF {pdf_path}: {e}")
        return ""

def docx_to_text(docx_path):
    try:
        d = docx.Document(docx_path)
        return "\n".join([para.text for para in d.paragraphs])
    except Exception as e:
        print(f"Failed to extract from DOCX {docx_path}: {e}")
        return ""

def txt_to_text(txt_path):
    try:
        with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception as e:
        print(f"Failed to read TXT {txt_path}: {e}")
        return ""

def load_resume_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return pdf_to_text(file_path)
    elif ext == ".docx":
        return docx_to_text(file_path)
    elif ext == ".txt":
        return txt_to_text(file_path)
    else:
        return ""

# 2. Cleanup & Section segmentation

SECTION_HEADERS = [
    "Education", "Work Experience", "Professional Experience",
    "Experience", "Skills", "Certifications", "Projects", "Summary"
]

def clean_text(text):
    text = re.sub(r"\r\n|\r|\n", " ", text)
    text = re.sub(r"[ ]{2,}", " ", text)
    return text.strip()

def segment_sections(raw_text):
    pattern = re.compile(r"(?i)^\s*(%s)\s*:?" % "|".join(SECTION_HEADERS), re.MULTILINE)
    matches = list(pattern.finditer(raw_text))
    if not matches:
        return {"ALL": raw_text}
    segments = {}
    for idx, m in enumerate(matches):
        header = m.group(1).upper().strip()
        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(raw_text)
        segments[header] = raw_text[start:end].strip()
    return segments

# 3. Skill extraction

MASTER_SKILLS = {
    "python", "java", "c++", "sql", "docker", "kubernetes",
    "aws", "azure", "gcp", "django", "flask", "react", 
    "angular", "machine learning", "nlp", "tensorflow", "pytorch", "git"
}

def extract_skills(text_block):
    found = set()
    lowered = text_block.lower()
    for skill in MASTER_SKILLS:
        pattern = r"\b" + re.escape(skill.lower()) + r"\b"
        if re.search(pattern, lowered):
            found.add(skill)
    return found

# 4. Education extraction

DEGREE_KEYWORDS = [
    "bachelor", "b.sc", "b.s.", "b.s", "bachelor of science", 
    "master", "m.sc", "m.s.", "m.s", "master of science", 
    "phd", "doctor of philosophy"
]

def extract_education_info(edu_block):
    info = {"degrees": [], "years": []}
    lower = edu_block.lower()
    for kw in DEGREE_KEYWORDS:
        if kw in lower:
            info["degrees"].append(kw)
    for token in re.findall(r"\b(19|20)\d{2}\b", edu_block):
        info["years"].append(int(token))
    return info

# 5. Experience extraction

def parse_date_range(date_str):
    if not date_str:
        return None, None
    date_str = date_str.replace("–", "-").replace("—", "-")
    parts = [p.strip() for p in date_str.split("-")]
    def parse_part(p):
        if not p:
            return None
        if p.lower() in {"present", "current"}:
            return None
        for fmt in ("%b %Y", "%B %Y", "%Y"):
            try:
                return datetime.strptime(p, fmt)
            except:
                continue
        return None
    start = parse_part(parts[0]) if parts else None
    end = parse_part(parts[1]) if len(parts) > 1 else None
    return start, end

def compute_experience_months(start, end):
    if not start:
        return 0
    if not end:
        end = datetime.now()
    return (end.year - start.year) * 12 + (end.month - start.month)

def parse_experience_block(exp_block):
    roles = []
    # Split on double newline as naive delimiter
    chunks = re.split(r"\n{2,}", exp_block)
    for chunk in chunks:
        lines = [ln.strip() for ln in chunk.split("\n") if ln.strip()]
        if not lines:
            continue
        header = lines[0]
        m = re.match(r"^(.*?),\s*(.*?)\s*\((.*?)\)$", header)
        if m:
            title, company, date_range = m.groups()
        else:
            title = header
            company = None
            date_range = None
        details = " ".join(lines[1:])
        roles.append({
            "title": title,
            "company": company,
            "date_range": date_range,
            "details": details
        })
    return roles

# 6. Job Description parsing

def extract_jd_requirements(jd_text):
    jd_lower = jd_text.lower()
    required_skills = extract_skills(jd_text)
    exp_match = re.search(r"(\d+)\s+years?", jd_lower)
    min_experience = int(exp_match.group(1)) if exp_match else 0
    degree_level = None
    for deg in DEGREE_KEYWORDS:
        if deg in jd_lower:
            degree_level = deg
            break
    return {
        "required_skills": required_skills,
        "min_experience_years": min_experience,
        "degree": degree_level
    }

# 7. Scoring

def score_candidate(candidate, jd_req):
    score = 0
    # Skills
    matched_skills = candidate["skills"].intersection(jd_req["required_skills"])
    score += 10 * len(matched_skills)
    extra_skills = candidate["skills"] - jd_req["required_skills"]
    score += 2 * len(extra_skills)
    # Experience
    cand_years = candidate["experience_months"] / 12.0
    if cand_years >= jd_req["min_experience_years"]:
        score += 20
    else:
        if jd_req["min_experience_years"] > 0:
            frac = cand_years / jd_req["min_experience_years"]
            score += int(frac * 20)
    # Education
    if jd_req["degree"] and jd_req["degree"] in candidate["education_degrees"]:
        score += 15
    elif candidate["education_degrees"]:
        score += 5
    # Recent grad bonus
    if candidate["education_years"]:
        last_grad = max(candidate["education_years"])
        if last_grad >= datetime.now().year - 2:
            score += 5
    return score

# 8. Build profile & screening

def build_candidate_profile(file_path):
    raw = load_resume_text(file_path)
    if not raw:
        return None
    text = clean_text(raw)
    sections = segment_sections(text)

    skills_block = sections.get("SKILLS", "") + " " + text
    skills = extract_skills(skills_block)

    edu_block = sections.get("EDUCATION", "")
    edu_info = extract_education_info(edu_block)

    exp_block = sections.get("WORK EXPERIENCE", "") or sections.get("EXPERIENCE", "")
    exp_roles = parse_experience_block(exp_block)
    total_months = 0
    for role in exp_roles:
        if role.get("date_range"):
            start, end = parse_date_range(role["date_range"])
            months = compute_experience_months(start, end)
            total_months += months

    return {
        "filename": os.path.basename(file_path),
        "skills": skills,
        "education_degrees": edu_info["degrees"],
        "education_years": edu_info["years"],
        "experience_months": total_months
    }

def screen_resumes(resumes_folder, job_description_path):
    jd_text = open(job_description_path, encoding="utf-8").read()
    jd_req = extract_jd_requirements(jd_text)

    candidates = []
    for fname in os.listdir(resumes_folder):
        path = os.path.join(resumes_folder, fname)
        if not os.path.isfile(path):
            continue
        prof = build_candidate_profile(path)
        if prof:
            prof["score"] = score_candidate(prof, jd_req)
            candidates.append(prof)

    candidates_sorted = sorted(candidates, key=lambda x: x["score"], reverse=True)

    # Print a simple table
    print(f"{'Candidate':<30} {'Score':<6} {'#Skills':<7} {'Exp(yrs)':<8} {'Degrees':<20}")
    print("-" * 80)
    for cand in candidates_sorted:
        yrs = round(cand["experience_months"] / 12.0, 1)
        degs = ", ".join(cand["education_degrees"])
        print(f"{cand['filename']:<30} {cand['score']:<6} {len(cand['skills']):<7} {yrs:<8} {degs:<20}")

    return candidates_sorted

# 9. Entrypoint

if __name__ == "__main__":
    !mkdir resumes
    !mv /Users/xinruyu/Downloads/xinru.cv.pdf ./resumes/
    resumes_dir = "resumes/"            # Folder with .pdf/.docx/.txt resumes
    jd_file = "/Users/xinruyu/Documents/Untitled.txt"     # Text file containing the job description
    if not os.path.exists(resumes_dir):
        print(f"Folder '{resumes_dir}' not found. Please create it and add resumes.")
    elif not os.path.isfile(jd_file):
        print(f"Job description file '{jd_file}' not found. Create a plain text file.")
    else:
        screen_resumes(resumes_dir, jd_file)


Candidate                      Score  #Skills Exp(yrs) Degrees             
--------------------------------------------------------------------------------
xinru.cv.pdf                   9      2       0.0      b.sc, b.s, master, m.sc, m.s
