In [3]:
#!/usr/bin/env python3
"""
auto_apply.py

A complete Python script that:
1. Parses a CV (PDF/DOCX/TXT) to extract skills, education, and work experience.
2. Fetches job postings for a desired direction (e.g., “Data Analyst”) from Indeed.
3. Scores each job against your CV profile.
4. Automatically applies to the top matches via email (SMTP) and/or web‐form (Selenium).

Before running:
- Install dependencies:
    pip install pdfminer.six python-docx requests selenium
- Download ChromeDriver and ensure it’s in your PATH (compatible with your Chrome version).
- Fill in the placeholders (API keys, email credentials, personal info).
"""

!pip install pdfminer.six python-docx requests selenium

import argparse
import os
import re
import time
import smtplib
import requests

from datetime import datetime
from email.message import EmailMessage

# PDF extraction
from pdfminer.high_level import extract_text

# DOCX extraction
import docx

# Selenium for web‐form autofill
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# ----------------------------
# 1. CV Parsing / Profile Build
# ----------------------------

# Vocabularies that drive the CV parser. Extend them to suit your own field.

# Skill names searched for (case-insensitively) in CVs and job postings.
MASTER_SKILLS = {
    "python",
    "sql",
    "excel",
    "tableau",
    "pandas",
    "machine learning",
    "statistics",
    "data visualization",
}

# Lower-case substrings that indicate a degree inside the education section.
DEGREE_KEYWORDS = [
    "bachelor",
    "b.sc",
    "b.s.",
    "b.s",
    "bachelor of science",
    "master",
    "m.sc",
    "m.s.",
    "m.s",
    "master of science",
]

# Headings used to split a CV into named sections.
SECTION_HEADERS = [
    "Skills",
    "Work Experience",
    "Experience",
    "Education",
    "Projects",
    "Certifications",
    "Summary",
]


def load_cv_text(path):
    """
    Read a CV from disk and return its plain text.

    The format is chosen from the (lower-cased) file extension: ".pdf" goes
    through pdfminer's extract_text, ".docx" through python-docx (paragraph
    texts joined with newlines) and ".txt" is read as UTF-8 with undecodable
    bytes ignored. Any failure while reading a supported format is printed
    and an empty string is returned; an unsupported extension raises
    ValueError.
    """
    suffix = os.path.splitext(path)[1].lower()

    # Reject unknown formats up front; nothing below can handle them.
    if suffix not in (".pdf", ".docx", ".txt"):
        raise ValueError(f"Unsupported CV format: {suffix}")

    if suffix == ".pdf":
        try:
            content = extract_text(path)
        except Exception as e:
            print(f"[CV Load] Failed to extract PDF '{path}': {e}")
            content = ""
        return content

    if suffix == ".docx":
        try:
            document = docx.Document(path)
            content = "\n".join(para.text for para in document.paragraphs)
        except Exception as e:
            print(f"[CV Load] Failed to extract DOCX '{path}': {e}")
            content = ""
        return content

    # Only ".txt" is left at this point.
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            content = handle.read()
    except Exception as e:
        print(f"[CV Load] Failed to read TXT '{path}': {e}")
        content = ""
    return content


def clean_text(txt):
    """
    Flatten text onto a single line.

    Runs of CR/LF characters become one space, then any run of two or more
    whitespace characters is squeezed to one space, and the result is
    stripped at both ends.
    """
    single_line = re.sub(r"[\r\n]+", " ", txt)
    return re.sub(r"\s{2,}", " ", single_line).strip()


def segment_sections(txt, headers=None):
    """
    Split CV text into sections keyed by upper-cased header name.

    A header is recognised only at the start of a line (leading whitespace
    allowed), must end on a word boundary, and may be followed by a colon.
    The word boundary stops ordinary sentences such as "Experienced analyst"
    or "Skillset" from being mistaken for the "Experience"/"Skills" headings.

    Args:
        txt: CV text that still contains its line breaks.
        headers: optional iterable of header names; defaults to the
            module-level SECTION_HEADERS.

    Returns:
        dict mapping HEADER -> section body (stripped). If a header occurs
        more than once, the bodies are joined with a blank line rather than
        the later one overwriting the earlier. When no header is found the
        result is {"ALL": txt}.
    """
    names = SECTION_HEADERS if headers is None else list(headers)
    if not names:
        return {"ALL": txt}

    # Longest names first so a longer header is never shadowed by a shorter
    # alternative that happens to be its prefix.
    alternatives = "|".join(
        re.escape(name) for name in sorted(names, key=len, reverse=True)
    )
    pattern = re.compile(
        r"^\s*(%s)\b[ \t]*:?" % alternatives,
        re.MULTILINE | re.IGNORECASE,
    )

    matches = list(pattern.finditer(txt))
    if not matches:
        return {"ALL": txt}

    segments = {}
    for idx, m in enumerate(matches):
        header = m.group(1).upper().strip()
        start = m.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(txt)
        body = txt[start:end].strip()
        if header in segments and segments[header]:
            # Keep earlier content instead of silently dropping it.
            body = segments[header] + "\n\n" + body if body else segments[header]
        segments[header] = body
    return segments


def extract_skills(txt_block, skills=None):
    """
    Return the set of known skills mentioned in a block of text.

    Matching is case-insensitive and whole-token: a skill must not be
    directly preceded or followed by a word character. Lookarounds are used
    instead of ``\\b`` so that skills ending in punctuation (e.g. "c++",
    "c#") can still match. Spaces inside a multi-word skill match any run of
    whitespace, so "machine\\nlearning" still counts as "machine learning".

    Args:
        txt_block: text to scan; empty or None yields an empty set.
        skills: optional iterable of skill names; defaults to the
            module-level MASTER_SKILLS.

    Returns:
        set of skill names (as spelled in the vocabulary) found in the text.
    """
    if not txt_block:
        return set()
    vocabulary = MASTER_SKILLS if skills is None else skills

    low = txt_block.lower()
    found = set()
    for sk in vocabulary:
        words = sk.lower().split()
        if not words:
            # An empty skill would match everywhere; ignore it.
            continue
        body = r"\s+".join(re.escape(word) for word in words)
        if re.search(rf"(?<!\w){body}(?!\w)", low):
            found.add(sk)
    return found


def parse_date_range(date_str):
    """
    Convert a date-range string into (start, end) datetimes.

    Accepts ranges such as "Jan 2020 – Jun 2022", "2020–Present" or
    "March 2019 to May 2020". The two sides may be separated by an ASCII
    hyphen, any of the common Unicode dashes, or the word "to".

    Each side is tried against "%b %Y", "%B %Y" and "%Y" in that order.

    Returns:
        (start, end) where either element is None when that side is missing,
        is "present"/"current", or cannot be parsed. An empty input gives
        (None, None).
    """
    if not date_str:
        return None, None

    # U+2010..U+2015 covers hyphen, non-breaking hyphen, figure/en/em dash and
    # horizontal bar; U+2212 is the minus sign. PDF extraction emits several.
    parts = [
        p.strip()
        for p in re.split(
            r"\s*[-\u2010-\u2015\u2212]\s*|\s+to\s+", date_str, flags=re.IGNORECASE
        )
    ]

    def parse_one(p):
        if not p or p.lower() in {"present", "current"}:
            return None
        for fmt in ("%b %Y", "%B %Y", "%Y"):
            try:
                return datetime.strptime(p, fmt)
            except ValueError:
                # Wrong format for this candidate; try the next one.
                continue
        return None

    start = parse_one(parts[0]) if parts else None
    end = parse_one(parts[1]) if len(parts) > 1 else None
    return start, end


def compute_months(a, b):
    """
    Return the number of calendar months from datetime ``a`` to ``b``.

    Only year and month are compared; days are ignored.

    Args:
        a: start datetime; a falsy value yields 0.
        b: end datetime; a falsy value means "now".

    Returns:
        Non-negative int. A range whose end precedes its start (for example a
        mis-parsed or reversed date range) counts as 0 rather than as a
        negative duration that would reduce the caller's running total.
    """
    if not a:
        return 0
    if not b:
        b = datetime.now()
    return max(0, (b.year - a.year) * 12 + (b.month - a.month))


def extract_experience(txt_block):
    """
    Turn an experience section into a list of roles plus a month total.

    The section is cut wherever two or more newlines occur in a row; each
    piece is treated as one role and its first non-empty line is the header.
    A header shaped like "Title, Company (DateRange)" is unpacked into its
    three parts; anything else is kept whole as the title with no company
    and no dates.

    Returns:
      roles: list of {"title", "company", "months"} dicts
      total_months: sum of the months of every role that had a date range
    """
    header_re = re.compile(r"^(.*?),\s*(.*?)\s*\((.*?)\)$")
    parsed_roles = []
    month_sum = 0

    for block in re.split(r"\n{2,}", txt_block):
        stripped = (piece.strip() for piece in block.split("\n"))
        content_lines = [piece for piece in stripped if piece]
        if not content_lines:
            continue

        first_line = content_lines[0]
        hit = header_re.match(first_line)
        if hit is None:
            title, company, span = first_line, None, None
        else:
            title, company, span = hit.groups()

        duration = 0
        if span:
            began, finished = parse_date_range(span)
            duration = compute_months(began, finished)
            month_sum += duration

        parsed_roles.append({"title": title, "company": company, "months": duration})

    return parsed_roles, month_sum


def extract_education(txt_block, degree_keywords=None):
    """
    Find degree keywords and four-digit years in an education section.

    Args:
        txt_block: text of the education section.
        degree_keywords: optional iterable of lower-case keywords; defaults
            to the module-level DEGREE_KEYWORDS.

    Returns:
      degrees: keywords that occur (as case-insensitive substrings) in the
          text, in vocabulary order
      years: every 19xx/20xx year found, as ints, in order of appearance
    """
    keywords = DEGREE_KEYWORDS if degree_keywords is None else degree_keywords
    low = txt_block.lower()
    degrees = [d for d in keywords if d in low]

    # The century group must be non-capturing: with a capturing group
    # re.findall returns only the group ("19"/"20"), not the whole year.
    years = [int(tok) for tok in re.findall(r"\b(?:19|20)\d{2}\b", txt_block)]
    return degrees, years


def parse_cv(cv_path):
    """
    Build a profile dict from the CV file:
      {
        "skills": set([...]),
        "degrees": [...],
        "edu_years": [...],
        "roles": [ {title, company, months}, ... ],
        "total_months": int
      }

    Section detection and role splitting both depend on line structure
    (headers are matched at the start of a line, roles are separated by
    blank lines), so they run on text that still has its newlines. Only the
    skill scan uses the flattened, whitespace-collapsed text.
    """
    raw = load_cv_text(cv_path)

    # Normalise line endings but keep the lines themselves. Flattening first
    # would leave at most one "line start" for the header regex and no blank
    # lines for extract_experience to split on.
    structured = raw.replace("\r\n", "\n").replace("\r", "\n")
    segs = segment_sections(structured)

    # Skills: look in SKILLS section + entire text as fallback
    flat = clean_text(raw)
    skills_block = segs.get("SKILLS", "") + " " + flat
    skills = extract_skills(skills_block)

    # Education
    edu_block = segs.get("EDUCATION", "")
    degrees, edu_years = extract_education(edu_block)

    # Experience: prefer the explicit "Work Experience" heading when present.
    exp_block = segs.get("WORK EXPERIENCE", "") or segs.get("EXPERIENCE", "")
    roles, total_months = extract_experience(exp_block)

    return {
        "skills": skills,
        "degrees": degrees,
        "edu_years": edu_years,
        "roles": roles,
        "total_months": total_months
    }


# ----------------------------
# 2. Desired-Direction Matching
# ----------------------------

# Maps each supported career direction (lower-case) to the title phrases that
# count as a match for it. Add further directions or phrases as needed.
DIRECTION_KEYWORDS = {
    "data analyst": [
        "data analyst",
        "business intelligence",
        "reporting analyst",
    ],
    "data scientist": [
        "data scientist",
        "machine learning engineer",
        "ml engineer",
    ],
    "backend engineer": [
        "backend engineer",
        "backend developer",
        "server engineer",
    ],
}


def matches_direction(job_title, desired_direction):
    """
    Tell whether a job title fits the desired career direction.

    The direction is lower-cased and looked up in DIRECTION_KEYWORDS; an
    unknown direction falls back to itself as the only phrase. The title
    matches when any phrase occurs in it as a case-insensitive substring.
    """
    wanted = desired_direction.lower()
    title = job_title.lower()
    for phrase in DIRECTION_KEYWORDS.get(wanted, [wanted]):
        if phrase in title:
            return True
    return False


# ----------------------------
# 3. Job Fetching (Indeed API)
# ----------------------------

# Indeed API publisher key. Set the INDEED_PUBLISHER_ID environment variable
# rather than editing this file, so the key never ends up in version control
# or in a shared notebook. The placeholder is used when the variable is unset.
INDEED_PUBLISHER_ID = os.environ.get("INDEED_PUBLISHER_ID") or "YOUR_INDEED_API_KEY"


def fetch_indeed_jobs(api_key, query, location=""):
    """
    Fetch job postings from the Indeed publisher search endpoint (v2).

    Args:
        api_key: Indeed publisher key.
        query: free-text search string (e.g. a job title).
        location: optional location filter.

    Returns:
        A list of posting dicts with the keys "jobtitle", "company",
        "snippet", "url", "apply_method" (always "web"), "apply_url" (same as
        "url") and "location". Any network error, non-2xx status or
        non-JSON body is printed and results in an empty list.

    NOTE(review): the captured run shows api.indeed.com failing to resolve;
    this legacy endpoint may have been retired. Confirm before relying on it.
    """
    url = "https://api.indeed.com/ads/apisearch"
    params = {
        "publisher": api_key,
        "v": "2",
        "q": query,
        "l": location,
        "format": "json",
        "limit": 50
    }
    try:
        resp = requests.get(url, params=params, timeout=10)
        # Surface 4xx/5xx as errors instead of trying to parse an error page.
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # The exception text embeds the full request URL, which contains the
        # publisher key. Mask it before printing.
        reason = str(e)
        if api_key:
            reason = reason.replace(str(api_key), "***")
        print(f"[Job Fetch] Failed to fetch Indeed jobs: {reason}")
        return []

    results = data.get("results", []) if isinstance(data, dict) else []
    postings = []
    for job in results:
        if not isinstance(job, dict):
            continue
        postings.append({
            "jobtitle": job.get("jobtitle", ""),
            "company": job.get("company", ""),
            "snippet": job.get("snippet", ""),
            "url": job.get("url", ""),
            # The Indeed API doesn’t always provide a direct “email to apply” field.
            # You may need to scrape the actual job page (not covered here).
            "apply_method": "web",
            "apply_url": job.get("url", ""),
            "location": job.get("formattedLocation", ""),
        })
    return postings


# ----------------------------
# 4. Scoring Functions
# ----------------------------

def parse_posting_requirements(posting_text):
    """
    Return the skills a posting asks for.

    Deliberately simple: it only looks for the known skill vocabulary in the
    posting text by delegating to extract_skills.
    """
    required = extract_skills(posting_text)
    return required


def score_job(posting, profile, desired_direction):
    """
    Score a single job posting against your CV profile and desired direction.

    Scoring:
      * +20 when the job title matches the desired direction.
      * +10 for every required skill that is also in the profile.
      * up to +15 for experience: the full 15 when the profile meets the
        first "<N> years" requirement found in the posting text, otherwise a
        proportional share. Nothing is added when no requirement is stated.

    Args:
        posting: dict; "jobtitle", "snippet" and "description" are read and
            may be missing or None.
        profile: dict with "skills" (iterable of skill names) and
            "total_months" (number).
        desired_direction: direction name passed to matches_direction.

    Returns (score:int, req_skills:set).
    """
    score = 0
    # `or ""` guards against keys that are present but set to None.
    jt = posting.get("jobtitle") or ""

    # 1) Title match
    if matches_direction(jt, desired_direction):
        score += 20

    # 2) Skill match
    snippet = posting.get("snippet") or ""
    description = posting.get("description") or ""
    combined_text = snippet + " " + description
    req_skills = parse_posting_requirements(combined_text)
    matched = set(profile["skills"]) & req_skills
    score += 10 * len(matched)

    # 3) Experience requirement (if any). Search the same combined text as
    #    the skill match, so a requirement stated only in the full
    #    description is not missed.
    exp_match = re.search(r"(\d+)\+?\s+years?", combined_text.lower())
    if exp_match:
        req_years = int(exp_match.group(1))
        your_years = profile["total_months"] / 12.0
        if req_years <= 0 or your_years >= req_years:
            # "0 years" is always satisfied; this also avoids dividing by 0.
            score += 15
        else:
            frac = max(0.0, your_years / req_years)
            score += int(frac * 15)

    return score, req_skills


# ----------------------------
# 5. Email-Based Application
# ----------------------------

# SMTP settings. The sender address and password are read from the
# APPLY_EMAIL / APPLY_EMAIL_PASSWORD environment variables so that real
# credentials never have to be written into this file; the placeholders below
# are used only when the variables are unset.
SMTP_SERVER = "smtp.gmail.com"
SMTP_PORT = 587
YOUR_EMAIL = os.environ.get("APPLY_EMAIL") or "youremail@example.com"
YOUR_PASSWORD = os.environ.get("APPLY_EMAIL_PASSWORD") or "your_app_password_here"  # e.g., Gmail App Password


# MIME (maintype, subtype) for the CV formats this script can load.
_CV_MIME_TYPES = {
    ".pdf": ("application", "pdf"),
    ".docx": ("application", "vnd.openxmlformats-officedocument.wordprocessingml.document"),
    ".txt": ("text", "plain"),
}


def send_email_application(posting, profile, cv_path, personal_info):
    """
    Compose and send an application email with the CV attached.

    Args:
        posting: dict with 'jobtitle', optionally 'company', and the
            recipient under 'apply_email' (preferred) or 'email'.
        profile: dict with 'skills' (iterable) and 'total_months' (number).
        cv_path: path of the CV file to attach.
        personal_info: dict with 'full_name', 'email' and 'phone';
            'linkedin' is optional.

    Returns:
        None. Nothing is sent (and a message is printed) when the posting has
        no recipient address, when the CV cannot be read, or when the SMTP
        exchange fails.
    """
    to_addr = posting.get("apply_email") or posting.get("email") or None
    if not to_addr:
        print(f"[Email Apply] No email found for {posting['jobtitle']} @ {posting.get('company','')}")
        return

    # Read the attachment first: an application without a CV is not worth
    # sending, and a missing file should not crash the whole run.
    try:
        with open(cv_path, "rb") as f:
            data = f.read()
    except OSError as e:
        print(f"[Email Apply] Could not read CV '{cv_path}': {e}")
        return

    job_title = posting["jobtitle"]
    company = posting.get("company", "")
    subject = f"Application for {job_title} at {company}"

    # Sort so the skill list reads the same on every run (sets are unordered).
    skills_text = ', '.join(sorted(profile['skills']))
    years = int(profile['total_months'] // 12)

    body = f"""
Hello,

My name is {personal_info['full_name']}, and I am writing to apply for the position of {job_title} at {company}. Based on my background in {skills_text} and over {years} years of experience, I believe I am a strong fit.

I have attached my résumé for your review. Please let me know if you need any further information.

Thank you for your time and consideration.

Sincerely,
{personal_info['full_name']}
{personal_info['phone']}
{personal_info['email']}
{personal_info.get('linkedin','')}
    """.strip()

    msg = EmailMessage()
    msg["From"] = YOUR_EMAIL
    msg["To"] = to_addr
    msg["Subject"] = subject
    msg.set_content(body)

    # Attach the CV with a content type that matches its real format; unknown
    # extensions fall back to a generic binary type rather than posing as DOCX.
    fname = os.path.basename(cv_path)
    ext = os.path.splitext(fname)[1].lower()
    maintype, subtype = _CV_MIME_TYPES.get(ext, ("application", "octet-stream"))
    msg.add_attachment(data, maintype=maintype, subtype=subtype, filename=fname)

    try:
        with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as smtp:
            smtp.ehlo()
            smtp.starttls()
            smtp.login(YOUR_EMAIL, YOUR_PASSWORD)
            smtp.send_message(msg)
        print(f"[Email Apply] Sent application email to {to_addr} for {job_title} @ {company}")
    except Exception as e:
        print(f"[Email Apply] Failed to send email to {to_addr}: {e}")


# ----------------------------
# 6. Web-Form Autofill (Selenium)
# ----------------------------

def _fill_form_field(driver, by, locator, value, label):
    """Best-effort: type `value` into the first element matching the locator.

    Returns True on success. A missing or non-interactable element is
    reported and skipped so one absent field does not abort the application.
    """
    if not value:
        return False
    try:
        driver.find_element(by, locator).send_keys(value)
        return True
    except Exception as e:
        print(f"[Web Apply] Skipped {label} field ({type(e).__name__}).")
        return False


def autofill_web_form(posting, profile, cv_path, personal_info):
    """
    Use Selenium (headless Chrome) to fill a typical “Apply Now” form.

    Args:
        posting: dict with 'apply_url' pointing to the job page; 'jobtitle'
            and 'company' are used in messages.
        profile: parsed CV profile (not used by this function).
        cv_path: path of the CV file to upload.
        personal_info: dict with 'full_name', 'email', 'phone', 'linkedin'.

    Returns:
        None. Each field is filled on a best-effort basis; the browser is
        always closed, including when the page fails to load.
    """
    url = posting.get("apply_url")
    if not url:
        print(f"[Web Apply] No apply_url for {posting['jobtitle']} @ {posting.get('company','')}")
        return

    chrome_opts = Options()
    chrome_opts.add_argument("--headless")
    chrome_opts.add_argument("--disable-gpu")
    chrome_opts.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=chrome_opts)
    try:
        # Navigation happens inside the try so that a failed page load still
        # reaches driver.quit() and does not leave a Chrome process behind.
        driver.get(url)
        time.sleep(2)  # Adjust or use explicit waits if needed

        # Example element lookups—these are site-specific!
        # Inspect the page manually to find correct name/id/xpath for each field.
        fields = [
            (By.NAME, "applicant_name", personal_info.get("full_name", ""), "name"),
            (By.NAME, "applicant_email", personal_info.get("email", ""), "email"),
            (By.NAME, "applicant_phone", personal_info.get("phone", ""), "phone"),
            (By.NAME, "linkedin_url", personal_info.get("linkedin", ""), "LinkedIn"),
            (By.XPATH, "//input[@type='file']", os.path.abspath(cv_path), "CV upload"),
        ]
        for by, locator, value, label in fields:
            _fill_form_field(driver, by, locator, value, label)

        # Submit button
        try:
            submit_btn = driver.find_element(By.XPATH, "//button[contains(text(),'Submit') or @type='submit']")
            submit_btn.click()
        except Exception:
            print(f"[Web Apply] Could not find/submit form on {url}.")
        else:
            time.sleep(2)
            print(f"[Web Apply] Applied to {posting['jobtitle']} @ {posting.get('company','')} via web form.")
    except Exception as e:
        print(f"[Web Apply] Error during form autofill: {e}")
    finally:
        driver.quit()


# ----------------------------
# 7. Main Orchestration
# ----------------------------


# 1) Specify your CV path and desired direction here:
cv_path = "/Users/xinruyu/Downloads/xinru.cv.pdf"        # ← change to your actual CV file
desired_direction = "Data Analyst"     # ← change to whatever role you want, e.g. “Backend Engineer”
location_filter = "Germany"              # ← optional location string for Indeed search
max_jobs_to_apply = 5                   # ← top N to auto-apply

# 2) Parse your CV into a profile
profile = parse_cv(cv_path)
print(f"[CV Parse] Skills: {profile['skills']}")
print(f"[CV Parse] Total experience: {profile['total_months']//12} years, Roles: {len(profile['roles'])}")
print(f"[CV Parse] Degrees: {profile['degrees']} (Years: {profile['edu_years']})\n")

# 3) Fetch job postings from Indeed
print(f"[Job Fetch] Searching Indeed for '{desired_direction}' in '{location_filter}' …")
jobs = fetch_indeed_jobs(INDEED_PUBLISHER_ID, desired_direction, location_filter)
print(f"[Job Fetch] Retrieved {len(jobs)} postings.\n")

# If no jobs found, bail out:
if not jobs:
    print("[Job Fetch] No jobs found. Exiting.")
else:
    # 4) Score each job and pick top N
    scored_jobs = []
    for post in jobs:
        score, req_sk = score_job(post, profile, desired_direction)
        scored_jobs.append((score, post, req_sk))
    scored_jobs.sort(key=lambda x: x[0], reverse=True)
    top_jobs = scored_jobs[:max_jobs_to_apply]

    print(f"[Ranking] Top {len(top_jobs)} job matches:")
    for i, (score, job, req_sk) in enumerate(top_jobs, start=1):
        print(f"{i}. {job['jobtitle']} @ {job['company']} → Score: {score}")
        print(f"   Location: {job.get('location','')}")
        print(f"   Required skills: {req_sk}")
        print(f"   Apply via: {job.get('apply_method','web')} → {job.get('apply_url','n/a')}\n")

    # 5) Personal info for applications (fill in your own details)
    # NOTE(review): these are real-looking personal details stored in source;
    # consider loading them from a local, untracked config file instead.
    personal_info = {
        "full_name": "Xinru Yu",                              # ← fill in
        "email": "xinru.yu@gmx.com",                      # ← fill in
        "phone": "+49 15222448839",                             # ← fill in
        "linkedin": "https://www.linkedin.com/in/xinru-yu-484955228/"      # ← optional
    }

    # 6) Loop through top matches and apply automatically
    if not os.path.isfile(cv_path):
        # Without a readable CV the web form would be submitted with no
        # attachment and the profile above is empty, so do not apply.
        print(f"[Apply] CV file '{cv_path}' not found; skipping automatic applications.")
    else:
        for score, job, _ in top_jobs:
            if job.get("apply_method") == "email":
                send_email_application(job, profile, cv_path, personal_info)
                time.sleep(2)  # delay between emails
            else:
                # “apply_method” is “web” by default
                autofill_web_form(job, profile, cv_path, personal_info)
                time.sleep(2)  # delay between form submissions



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[CV Load] Failed to extract PDF '/Users/xinruyu/Downloads/xinru.cv.pdf': [Errno 2] No such file or directory: '/Users/xinruyu/Downloads/xinru.cv.pdf'
[CV Parse] Skills: set()
[CV Parse] Total experience: 0 years, Roles: 0
[CV Parse] Degrees: [] (Years: [])

[Job Fetch] Searching Indeed for 'Data Analyst' in 'Germany' …
[Job Fetch] Failed to fetch Indeed jobs: HTTPSConnectionPool(host='api.indeed.com', port=443): Max retries exceeded with url: /ads/apisearch?publisher=YOUR_INDEED_API_KEY&v=2&q=Data+Analyst&l=Germany&format=json&limit=50 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f982f39a950>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))
[Job Fetch] Ret