In [None]:
import google.generativeai as genai
import json
import os
# Configure the Gemini client from the GEMINI_API_KEY environment variable.
api_key = os.getenv("GEMINI_API_KEY")

if api_key:
    genai.configure(api_key=api_key)
    print("Gemini API key configured successfully from environment variable.")
else:
    # Without a key the client is left unconfigured; only a message is printed.
    print("Error: GEMINI_API_KEY environment variable not found.")
    # You might want to raise an exception or exit the script here
    # raise ValueError("GEMINI_API_KEY environment variable not found.")

In [None]:
def ask_gemini(prompt, model="gemini-2.5-flash", temperature=0):
    """
    Send a single prompt to a Gemini model and return the reply text.

    Args:
        prompt: Content handed to ``generate_content``.
        model: Name of the Gemini model to instantiate.
        temperature: Sampling temperature placed in the generation config.

    Returns:
        The ``text`` attribute of the response object.
    """
    gemini_model = genai.GenerativeModel(model_name=model)
    generation_config = genai.types.GenerationConfig(temperature=temperature)
    reply = gemini_model.generate_content(prompt, generation_config=generation_config)
    return reply.text

In [None]:
import csv 
import time 

import json

def generate_field_answers(fields, resume_json, delay=0.5):
    """
    Ask Gemini for an answer to every form field and return annotated copies.

    Args:
        fields: Iterable of dicts describing form fields. ``field_name`` is
            required; ``field_type``, ``value`` and ``options`` are optional
            and default to an empty value when absent.
        resume_json: JSON-serialisable resume data embedded in each prompt.
        delay: Seconds to sleep after each Gemini call (light rate limiting).
            Pass ``0`` to disable the pause.

    Returns:
        A list of shallow copies of the input dicts, each with an added
        ``generated_answer`` string. The answer is ``""`` when the Gemini call
        raises.

    Raises:
        KeyError: If a field has no ``field_name``.
    """
    # The resume is identical for every field, so serialise it once.
    resume_text = json.dumps(resume_json)

    answers = []
    for field in fields:
        question = field["field_name"]
        field_type = field.get("field_type", "")
        current_value = field.get("value", "")
        options = field.get("options") or []
        if isinstance(options, str):
            # A bare string would otherwise be joined character by character.
            options = [options]
        options_text = "; ".join(str(option) for option in options)

        prompt = f"""
You are an expert LinkedIn Easy Apply assistant that auto-fills form fields intelligently.

🎯 **Your goal**: Provide the most accurate and recruiter-friendly answer for each field using the candidate’s resume data.

🧠 **Input Context**
- Resume JSON: {resume_text}
- Field question: {question}
- Field type: {field_type}
- Current value: {current_value}
- Options (if any): {options_text}

---

### 🧩 **Core Principles**
1️⃣ Favor the candidate by highlighting skills, experience, and achievements.  
2️⃣ Favor the recruiter by ensuring the response looks relevant, confident, and professional.  
3️⃣ Always produce an answer that can be directly inserted into a form field — no commentary, no extra text.

---

### 🧠 **Intelligent Behavior Rules**

#### 📊 **1. Numerical / Experience-based questions**
- If the question asks:
  - “How many years of experience”, “Experience in”, or similar → output ONLY a number.
  - Example: "Experience in Gen-AI?" → `2`
  - Example: "How many years of experience in Python?" → `3`
- Use fractional years:
  - 2 years 3 months → 2
  - 2 years 8 months → 3
- Don’t include words like “years”, “yrs”, or “months” — just the number.

#### 💰 **2. Salary or compensation**
- If question involves salary, CTC, or pay expectations → return a **range** like:
  - `100000 - 200000`
- Use realistic and market-aligned numbers based on resume experience.

#### 💬 **3. Text / Paragraph fields**
- If question expects a descriptive answer (like “Why should we hire you?” or “Tell us about yourself”):
  - Write 1–3 sentences that sound professional and natural.
  - Use resume highlights (skills, projects, experience) to make it personal.
  - Example: "I'm a software engineer with strong expertise in Gen-AI, automation, and large-scale system design."

#### 🎯 **4. Select / Multi-select / Radio / Checkbox**
- Select the **most relevant option(s)** aligned with the candidate’s skills, job role, or experience.
- Return **only** the selected option(s) text, not an explanation.

#### 🧾 **5. Missing or unclear info**
- If resume doesn’t explicitly provide the answer:
  - Infer a realistic, professional value.
  - Avoid placeholders like “N/A”, “sample”, “not applicable”.

#### ⚡ **6. Strict output format**
- Return only the final value — no explanations, quotes, or formatting.
- The answer must be:
  - a number → for numeric questions
  - a text phrase → for open-ended questions
  - a valid option → for dropdowns/radio/multi-select
  - a salary range → for pay-related fields

---

### 🧩 **Output Expectation**
Return ONLY the answer text (no markdown, no commentary).
"""

        try:
            answer = ask_gemini(prompt)
        except Exception as e:
            # Best effort: one failed field must not abort the whole form.
            print(f"❌ Error generating answer for field '{question}': {e}")
            answer = ""

        field_copy = field.copy()
        field_copy["generated_answer"] = (answer or "").strip()
        answers.append(field_copy)
        if delay:
            time.sleep(delay)  # rate-limit Gemini calls lightly
    return answers


# -----------------------------
# Save generated answers CSV
# -----------------------------
def save_answers_to_csv(fields_with_answers, filename="easy_apply_answers.csv"):
    """
    Append generated answers to a CSV file, writing the header when needed.

    Args:
        fields_with_answers: Iterable of field dicts. Only the columns
            ``field_name``, ``field_type``, ``value``, ``options`` and
            ``generated_answer`` are written; any other key (for example the
            ``element`` handle) is ignored. Missing columns are written as
            empty strings.
        filename: Path of the CSV file to append to.

    The input dicts are not modified.
    """
    columns = ["field_name", "field_type", "value", "options", "generated_answer"]

    # An existing but empty file still needs a header row.
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    with open(filename, "a", newline="", encoding="utf-8") as f:
        # extrasaction="ignore": field dicts may carry extra keys that would
        # otherwise make DictWriter raise ValueError.
        writer = csv.DictWriter(f, fieldnames=columns, extrasaction="ignore")
        if needs_header:
            writer.writeheader()
        for row in fields_with_answers:
            record = {column: row.get(column, "") for column in columns}
            options = row.get("options") or []
            if isinstance(options, str):
                # Keep a plain string intact instead of joining its characters.
                options = [options]
            record["options"] = "; ".join(str(option) for option in options)
            writer.writerow(record)
    print(f"✅ Answers appended to {filename}")


In [None]:
import time
import pandas as pd
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ==== Country domain mapping ====
# Maps a country name to the subdomain prefix that build_indeed_url places in
# front of ".indeed.com". The empty string (United States) makes
# build_indeed_url fall back to the "www" host.
indeed_domains = {
    "Argentina": "ar", "Australia": "au", "Austria": "at", "Bahrain": "bh", "Belgium": "be",
    "Brazil": "br", "Canada": "ca", "Chile": "cl", "China": "cn", "Colombia": "co",
    "Costa Rica": "cr", "Czech Republic": "cz", "Denmark": "dk", "Ecuador": "ec", "Egypt": "eg",
    "Finland": "fi", "France": "fr", "Germany": "de", "Greece": "gr", "Hong Kong": "hk",
    "Hungary": "hu", "India": "in", "Indonesia": "id", "Ireland": "ie", "Israel": "il",
    "Italy": "it", "Japan": "jp", "Kuwait": "kw", "Luxembourg": "lu", "Malaysia": "my",
    "Mexico": "mx", "Morocco": "ma", "Netherlands": "nl", "New Zealand": "nz", "Nigeria": "ng",
    "Norway": "no", "Oman": "om", "Pakistan": "pk", "Panama": "pa", "Peru": "pe",
    "Philippines": "ph", "Poland": "pl", "Portugal": "pt", "Qatar": "qa", "Romania": "ro",
    "Saudi Arabia": "sa", "Singapore": "sg", "South Africa": "za", "South Korea": "kr",
    "Spain": "es", "Sweden": "se", "Switzerland": "ch", "Taiwan": "tw", "Thailand": "th",
    "Turkey": "tr", "Ukraine": "ua", "United Arab Emirates": "ae", "United Kingdom": "uk",
    "United States": "", "Uruguay": "uy", "Venezuela": "ve", "Vietnam": "vn"
}

# ==== Build Indeed URL ====
def build_indeed_url(job_title, location, country="India", start=0):
    """
    Build an Indeed job-search URL for the given query.

    Args:
        job_title: Search keywords; URL-encoded into the ``q`` parameter.
        location: Location text; URL-encoded into the ``l`` parameter.
        country: Key into ``indeed_domains``. Unknown countries and the empty
            subdomain (United States) use ``www.indeed.com``.
        start: Result offset placed in the ``start`` parameter.

    Returns:
        The full search URL as a string.
    """
    # Local import keeps this cell independent of imports made in later cells.
    from urllib.parse import quote_plus

    country_code = indeed_domains.get(country, "")
    base_url = f"https://{country_code}.indeed.com/jobs" if country_code else "https://www.indeed.com/jobs"
    # quote_plus still turns spaces into "+", but also escapes characters such
    # as "&", "+", "#" and "," that would otherwise corrupt the query string.
    job_title_encoded = quote_plus(job_title)
    location_encoded = quote_plus(location)
    return f"{base_url}?q={job_title_encoded}&l={location_encoded}&from=searchOnDesktopSerp&start={start}"

# ==== Load preferred title and location from JSON ====
def get_preferred_job_and_location(json_path="resumes/Yeswanth_Yerra_CV_structured.json"):
    """
    Read the preferred job title and location from a structured resume JSON.

    Args:
        json_path: Path to the JSON file.

    Returns:
        ``(preferred_title, preferred_location)``. The title falls back to
        ``"Software Engineer"`` and the location to ``"India"`` when the key
        is missing or empty.

    The location is looked up under ``preferred_location`` first and then
    under ``preferred_job_location``, which is the key ``save_to_json`` writes.
    """
    # The structured file is written as UTF-8, so read it the same way.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    preferred_title = data.get("preferred_title") or "Software Engineer"
    preferred_location = (
        data.get("preferred_location")
        or data.get("preferred_job_location")
        or "India"
    )
    return preferred_title, preferred_location

# ==== Modular scraper function ====
def scrape_indeed_jobs(job_title=None, location=None, country="India", max_pages=3, save_csv=True, output_file="indeed_jobs.csv"):
    """
    Scrape Indeed search results with headless Chrome.

    Args:
        job_title: Search keywords. When empty, the preferred title from the
            structured resume JSON is used.
        location: Search location. When empty, the preferred location from the
            structured resume JSON is used.
        country: Country name passed to ``build_indeed_url``.
        max_pages: Maximum number of result pages to visit (10 results/page
            offset). Scraping stops early when a page shows no job cards.
        save_csv: When true and at least one job was collected, write the
            rows that have a title, company, location and description to
            ``output_file``.
        output_file: CSV path used when ``save_csv`` is true.

    Returns:
        A list of dicts with the keys ``Title``, ``Company``, ``Location``,
        ``Description``, ``Apply Type`` and ``Apply Link``. Values that could
        not be read are ``"N/A"`` (or ``None`` for a missing link). The list
        includes rows that the CSV export drops.
    """
    # Only fall back to the resume JSON for the argument(s) the caller omitted,
    # so an explicitly supplied title or location is never overwritten.
    if not job_title or not location:
        preferred_title, preferred_location = get_preferred_job_and_location()
        job_title = job_title or preferred_title
        location = location or preferred_location

    print(f"🔍 Scraping Indeed for '{job_title}' jobs in '{location}', Country: {country}")

    # ==== Setup Chrome driver ====
    chrome_options = Options()
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--headless=new")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    jobs = []

    # try/finally guarantees the browser is shut down even if scraping fails.
    try:
        for page in range(max_pages):
            start = page * 10
            search_url = build_indeed_url(job_title, location, country, start)
            print(f"\nDEBUG: Processing page {page+1} | URL: {search_url}")

            driver.get(search_url)
            time.sleep(3)

            # ==== Wait for job cards ====
            try:
                job_cards = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.job_seen_beacon"))
                )
                print(f"DEBUG: Found {len(job_cards)} job cards on page {page+1}")
            except Exception:
                print(f"No job cards found on page {page+1}, stopping.")
                break

            for idx, card in enumerate(job_cards, start=1):
                print(f"\nDEBUG: Processing job card #{idx} on page {page+1}")

                # Title. Reset title_elem for every card so a failed lookup can
                # never leave the previous card's element behind.
                title_elem = None
                try:
                    title_elem = card.find_element(By.CSS_SELECTOR, "h2 > a")
                    title = title_elem.text
                    job_link = title_elem.get_attribute("href")
                except Exception:
                    title_elem = None
                    title = "N/A"
                    job_link = None

                # Company
                try:
                    company_elem = card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']")
                    company = company_elem.text
                except Exception:
                    company = "N/A"

                # Location
                try:
                    location_elem = card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']")
                    location_text = location_elem.text
                except Exception:
                    location_text = "N/A"

                # ==== Click to load job description in right pane ====
                job_description = "N/A"
                if title_elem is not None:
                    try:
                        driver.execute_script("arguments[0].scrollIntoView();", title_elem)
                        title_elem.click()
                        # NOTE(review): this waits only for the element's presence;
                        # confirm the pane has refreshed for the clicked card.
                        job_desc_elem = WebDriverWait(driver, 5).until(
                            EC.presence_of_element_located((By.ID, "jobDescriptionText"))
                        )
                        job_description = job_desc_elem.text
                    except Exception:
                        job_description = "N/A"

                # ==== Detect Apply Type and Apply Link ====
                try:
                    apply_button = driver.find_element(By.CSS_SELECTOR, "span.indeed-apply-status-not-applied button")
                    apply_type = "Apply Now"
                    apply_link = apply_button.get_attribute("onclick") or job_link
                except Exception:
                    apply_type = "Apply on Company Site"
                    apply_link = job_link

                jobs.append({
                    "Title": title,
                    "Company": company,
                    "Location": location_text,
                    "Description": job_description,
                    "Apply Type": apply_type,
                    "Apply Link": apply_link
                })
    finally:
        driver.quit()

    # ==== Save to CSV ====
    if save_csv and jobs:
        # Rows missing any of the core fields are excluded from the export only.
        jobs_complete = (
            pd.DataFrame(jobs)
            .replace("N/A", pd.NA)
            .dropna(subset=["Title", "Company", "Location", "Description"])
        )
        jobs_complete.to_csv(output_file, index=False)
        print(f"\n✅ Scraping complete. Saved to {output_file}")

    return jobs




In [3]:
import time
import random
import logging
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib.parse


class LinkedInJobScraper:
    """
    Scrape public LinkedIn job-search results with Selenium and BeautifulSoup.

    A Chrome WebDriver is started in ``__init__``. Call ``close()`` when done,
    or use the instance as a context manager, so the browser is shut down::

        with LinkedInJobScraper() as scraper:
            jobs = scraper.scrape_jobs("Data Engineer")
    """

    def __init__(self, headless=True):
        """Store the headless flag and start the Chrome driver."""
        self.headless = headless
        self.driver = None
        self.setup_driver()

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()
        return False

    def close(self):
        """Quit the WebDriver if one is running. Safe to call more than once."""
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()

    def setup_driver(self):
        """Create the Chrome WebDriver with the scraper's option set."""
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-infobars")
        options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        if self.headless:
            options.add_argument("--headless=new")

        self.driver = webdriver.Chrome(options=options)
        # NOTE(review): this script runs once, right after start-up; confirm
        # whether the override needs to persist across later page loads.
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        )
        logging.info("Chrome WebDriver initialized successfully")

    # ---------------------------
    def _get_time_filter(self, days: int) -> str:
        """Map a look-back window in days (1, 3, 7, 30) to the ``f_TPR`` value.

        Any other value falls back to the 7-day filter.
        """
        return {1: "r86400", 3: "r259200", 7: "r604800", 30: "r2592000"}.get(days, "r604800")

    def build_search_url(self, job_title: str, location: str, experience_level=None, time_posted=None):
        """
        Build the LinkedIn job-search URL.

        Args:
            job_title: Keywords, percent-encoded into ``keywords``.
            location: Location, percent-encoded into ``location``.
            experience_level: Accepted for interface compatibility; it is not
                added to the URL.
            time_posted: ``f_TPR`` value; omitted from the URL when falsy.
        """
        base_url = "https://www.linkedin.com/jobs/search/"
        params = {
            "keywords": urllib.parse.quote(job_title),
            "location": urllib.parse.quote(location),
            "f_TPR": time_posted
        }
        # Parameters with an empty value are left out of the query string.
        query = "&".join([f"{k}={v}" for k, v in params.items() if v])
        return f"{base_url}?{query}"

    # ---------------------------
    def scrape_job_details(self, job_url: str):
        """
        Open a job detail page and extract:
        - Apply type (Easy Apply or Apply)
        - Employment type
        - Job description

        Returns a dict with the keys ``apply_link``, ``apply_type``,
        ``employment_type`` and ``job_description``. Values that cannot be
        found are ``"Unknown"`` for the apply type and ``None`` otherwise.
        """
        self.driver.get(job_url)
        time.sleep(3)  # Let the page load

        # Apply type detection
        # Wait for apply button or code tag dynamically
        apply_type = "Unknown"
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "button.top-card-layout__cta--primary, code#applyUrl")
                )
            )
        except Exception:
            # Best effort: parse whatever has rendered by now.
            pass

        # Parse once, after the wait, so the snapshot includes the apply controls.
        soup = BeautifulSoup(self.driver.page_source, "html.parser")

        # First, check code tag
        code_tag = soup.find("code", id="applyUrl")
        if code_tag and code_tag.text.strip():
            apply_type = "Apply (Offsite)"
        else:
            # Then check button attribute
            apply_button = soup.find("button", class_="top-card-layout__cta--primary")
            if apply_button:
                dt_name = apply_button.get("data-tracking-control-name", "").lower()
                if "offsite" in dt_name:
                    apply_type = "Apply (Offsite)"
                elif "onsite" in dt_name:
                    apply_type = "Easy Apply"

        # Employment type
        employment_type = None
        criteria_items = soup.select("ul.description__job-criteria-list li.description__job-criteria-item")
        for item in criteria_items:
            header = item.find("h3", class_="description__job-criteria-subheader")
            if header and header.text.strip() == "Employment type":
                span = item.find("span", class_="description__job-criteria-text--criteria")
                if span:
                    employment_type = span.text.strip()
                break

        # Job description
        job_description = None
        desc_div = soup.find("div", class_="description__text--rich")
        if desc_div:
            markup_div = desc_div.find("div", class_="show-more-less-html__markup")
            if markup_div:
                job_description = markup_div.get_text(separator="\n", strip=True)

        return {
            "apply_link": job_url,  # keep job url as fallback
            "apply_type": apply_type,
            "employment_type": employment_type,
            "job_description": job_description
        }

    # ---------------------------
    def scrape_jobs(self, job_title: str, location: str = "India", pages: int = 1,
                    experience_level: str = None, days_back: int = 7,
                    csv_filename: str = "csv/linkedin_jobs.csv"):
        """
        Search LinkedIn, visit every listed job and save the results to CSV.

        Args:
            job_title: Search keywords.
            location: Search location.
            pages: Number of scroll / "Show more" rounds used to load results.
            experience_level: Forwarded to ``build_search_url``.
            days_back: Look-back window passed to ``_get_time_filter``.
            csv_filename: Output CSV path; its directory is created if missing.

        Returns:
            A list of dicts with ``title``, ``company``, ``location`` and the
            keys returned by ``scrape_job_details``. Cards that fail to parse
            are skipped with a warning.
        """
        import os  # local import: this cell does not import os itself

        time_filter = self._get_time_filter(days_back)
        search_url = self.build_search_url(job_title, location, experience_level, time_filter)
        self.driver.get(search_url)
        time.sleep(4)

        logging.info(f"Scraping '{job_title}' jobs in {location} (pages={pages})")

        # Scroll and click "Show more" to load more jobs
        for _ in range(pages):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(3, 5))
            try:
                btn = WebDriverWait(self.driver, 3).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Show more')]"))
                )
                btn.click()
            except Exception:
                # The button is optional; scrolling alone may have loaded more.
                pass

        # Parse loaded page (snapshot taken before the driver navigates away).
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        job_cards = soup.find_all("div", class_="base-card")
        logging.info(f"Found {len(job_cards)} job cards")

        jobs_data = []
        for card in job_cards:
            try:
                title = card.find("h3", class_="base-search-card__title").get_text(strip=True)
                company = card.find("h4", class_="base-search-card__subtitle").get_text(strip=True)
                location_text = card.find("span", class_="job-search-card__location").get_text(strip=True)
                job_url_elem = card.find("a", class_="base-card__full-link")
                if not job_url_elem:
                    continue
                job_url = job_url_elem.get("href")

                # Scrape details from job page
                details = self.scrape_job_details(job_url)

                job_data = {
                    "title": title,
                    "company": company,
                    "location": location_text,
                    **details
                }
                jobs_data.append(job_data)

            except Exception as e:
                logging.warning(f"Skipping a job due to error: {e}")
                continue

        # Save to CSV
        if jobs_data:
            # Create the target directory first so a missing "csv/" folder does
            # not throw away the scraped results.
            target_dir = os.path.dirname(csv_filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            keys = list(jobs_data[0].keys())
            with open(csv_filename, "w", newline="", encoding="utf-8") as f:
                dict_writer = csv.DictWriter(f, fieldnames=keys)
                dict_writer.writeheader()
                dict_writer.writerows(jobs_data)
            logging.info(f"Saved {len(jobs_data)} jobs to {csv_filename}")

        return jobs_data





In [2]:
import fitz
import re
import json
import logging
import unicodedata
from typing import Dict, List


# Logger Setup
# Configures the root logger at INFO level with a timestamped message format.

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


# Regex Patterns

# E-mail address: local part, "@", domain, and a final label of 2+ letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Phone-like run: optional "+", a digit, at least eight digits / whitespace /
# hyphens / parentheses, and a closing digit.
PHONE_RE = re.compile(r"(\+?\d[\d\s\-\(\)]{8,}\d)")
# Profile link: optional http(s):// and www., one of the listed names followed
# by a dot, then everything up to the next whitespace or comma (case-insensitive).
LINK_RE = re.compile(
    r"(?:https?://)?(?:www\.)?(?:linkedin|github|portfolio|medium|personal|behance)\.[^\s,]+",
    re.IGNORECASE,
)


# Text Utilities

def clean_text(text: str) -> str:
    """
    Normalise resume text to printable ASCII with tidy whitespace.

    Steps:
      1. NFKD-normalise so accented letters decompose to base letter + mark.
      2. Replace bullet / separator characters and tabs with a space.
      3. Drop every remaining character outside printable ASCII and newline.
      4. Trim spaces around newlines, collapse runs of spaces and blank lines.

    Args:
        text: Input text; ``None`` or ``""`` yields ``""``.

    Returns:
        The cleaned text with single newlines preserved as line separators.
    """
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text)
    # Separators must become spaces BEFORE the ASCII filter below; otherwise the
    # non-ASCII bullets and dashes are deleted outright and neighbouring words
    # are glued together.
    text = re.sub(r"[•●–~►|#\t]+", " ", text)
    text = re.sub(r"[^\x20-\x7E\n]+", "", text)
    # Collapse horizontal whitespace only, so that newlines survive and
    # downstream line splitting keeps working.
    text = re.sub(r" *\n *", "\n", text)
    text = re.sub(r" {2,}", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()


# Font Analysis

def calculate_average_font_size(doc: fitz.Document) -> float:
    """
    Return the mean span font size across the document.

    Every span inside blocks whose ``type`` is 0 contributes one sample
    (a span without a ``size`` counts as 0). Returns 12 when the document
    contains no such spans.
    """
    def iter_span_sizes():
        # Walk pages -> type-0 blocks -> lines -> spans, yielding each size.
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block.get("type") != 0:
                    continue
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        yield span.get("size", 0)

    size_sum, span_count = 0, 0
    for size in iter_span_sizes():
        size_sum += size
        span_count += 1

    if not span_count:
        return 12
    return size_sum / span_count


# Name Extraction

def extract_name_from_font(page: fitz.Page) -> str:
    """
    Detect the candidate's name as the text set in the largest font on a page.

    For each line in the page's type-0 blocks, the largest font size among its
    non-blank spans is compared with the best seen so far. The winning line
    contributes all of its spans that share that size, so a name split over
    several spans is returned whole.

    Args:
        page: Page to inspect (the caller passes the first page).

    Returns:
        The candidate name with whitespace normalised, or ``""`` when the page
        has no non-blank text spans.
    """
    max_font = 0
    name_candidate = ""
    for block in page.get_text("dict")["blocks"]:
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            spans = line.get("spans", [])
            # Blank spans must not win: an empty span in a large font would
            # otherwise produce an empty name.
            visible_sizes = [
                span.get("size", 0) for span in spans if span.get("text", "").strip()
            ]
            if not visible_sizes:
                continue
            line_max = max(visible_sizes)
            if line_max > max_font:
                max_font = line_max
                same_size_text = "".join(
                    span.get("text", "") for span in spans if span.get("size", 0) == line_max
                )
                name_candidate = " ".join(same_size_text.split())
    logging.info(f"🧠 Name candidate (largest font): {name_candidate}")
    return name_candidate


# Contact Info Extraction

def extract_contact_info(text: str) -> Dict:
    """
    Extract e-mail addresses, phone numbers and profile links from text.

    Phone candidates are kept only when they contain 9 to 15 digits and their
    first four digits do not look like a year of the form ``20xx``.

    Args:
        text: Text to scan.

    Returns:
        A dict with the keys ``emails``, ``phones`` and ``links``. Each value
        is a list without duplicates, in order of first appearance.
    """
    emails = EMAIL_RE.findall(text)
    raw_phones = [m.group(0).strip() for m in PHONE_RE.finditer(text)]
    links = LINK_RE.findall(text)

    phones = []
    for p in raw_phones:
        digits = re.sub(r"\D", "", p)
        if len(digits) < 9 or len(digits) > 15:
            continue
        if re.match(r"20\d{2}", digits[:4]):  # avoid years
            continue
        phones.append(p.strip())

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the order differ from one run to the next.
    return {
        "emails": list(dict.fromkeys(emails)),
        "phones": list(dict.fromkeys(phones)),
        "links": list(dict.fromkeys(links)),
    }


# Section Extraction

def extract_sections_from_resume(pdf_path: str, headings: List[str] = None) -> Dict:
    """
    Split a resume PDF into sections keyed by heading.

    A line is treated as a heading when it matches one of ``headings``
    (whole line, case-insensitive) or when its largest span font exceeds
    1.5 times the document's average font size. Lines before the first
    heading are ignored.

    Args:
        pdf_path: Path of the PDF to read.
        headings: Regex fragments for known headings; a default list of common
            resume headings is used when ``None``.

    Returns:
        A dict mapping the title-cased heading to the cleaned section text.
        When a heading occurs more than once, its text is merged in document
        order. Returns ``{}`` when the PDF cannot be opened.
    """
    if headings is None:
        headings = [
            r"Objective", r"Summary", r"Education", r"Experience", r"Work Experience",
            r"Professional Experience", r"Projects", r"Skills", r"Technical Skills",
            r"Certifications", r"Internship", r"Achievements", r"Hobbies", r"Interests"
        ]

    heading_pattern = re.compile(r"^\s*(" + r"|".join(headings) + r")\s*$", re.IGNORECASE)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.error(f" Failed to open PDF: {e}")
        return {}

    # heading -> raw lines; collecting lines first lets repeated headings
    # accumulate instead of the later occurrence overwriting the earlier one.
    section_lines = {}
    current_heading = None

    try:
        avg_font = calculate_average_font_size(doc)
        logging.info(f" Average font size: {avg_font:.2f}")

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block.get("type") != 0:
                    continue
                for line in block.get("lines", []):
                    spans = line.get("spans", [])
                    line_text = "".join(span.get("text", "") for span in spans).strip()
                    if not line_text:
                        continue

                    max_font = max((span.get("size", 0) for span in spans), default=0)
                    is_heading = bool(heading_pattern.match(line_text)) or max_font > 1.5 * avg_font

                    if is_heading:
                        current_heading = line_text.title()
                        section_lines.setdefault(current_heading, [])
                    elif current_heading:
                        section_lines[current_heading].append(line_text)
    finally:
        # Release the file handle whether or not parsing succeeded.
        doc.close()

    return {
        heading: clean_text("\n".join(lines))
        for heading, lines in section_lines.items()
    }


# Combine All Logic

def extract_resume_data(pdf_path: str) -> Dict:
    """
    Extract personal info and sections from a resume PDF.

    The first page supplies the name (largest font), the contact details and a
    cleaned copy of its first ten text lines. The section text comes from
    ``extract_sections_from_resume``.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A dict with a ``"Personal Info"`` entry (``name``, ``emails``,
        ``phones``, ``links``, ``raw``) followed by one entry per section.
    """
    doc = fitz.open(pdf_path)
    try:
        first_page = doc[0]
        first_page_text = first_page.get_text("text")
        name = extract_name_from_font(first_page)
    finally:
        # Everything needed has been read; release the file handle.
        doc.close()

    contact_info = extract_contact_info(first_page_text)

    # Join the header lines as text; str() on the list would put the list's
    # brackets and quotes into the "raw" field.
    header_lines = first_page_text.split("\n")[0:10]

    personal_info = {
        "name": name,
        "emails": contact_info.get("emails", []),
        "phones": contact_info.get("phones", []),
        "links": contact_info.get("links", []),
        "raw": clean_text("\n".join(header_lines))
    }

    sections = extract_sections_from_resume(pdf_path)

    result = {"Personal Info": personal_info}
    result.update(sections)
    return result


# Split Multiline Sections (No Regex)

def split_multiline_sections(data: dict) -> dict:
    """
    Turn every multi-line string value into a list of non-empty lines.

    Nested dicts are processed recursively. Strings without a newline and all
    other value types are left untouched. ``data`` is modified in place and
    also returned.
    """
    for section in list(data):
        content = data[section]

        if isinstance(content, dict):
            data[section] = split_multiline_sections(content)
            continue

        if not isinstance(content, str) or "\n" not in content:
            continue

        trimmed = (piece.strip() for piece in content.split("\n"))
        data[section] = [piece for piece in trimmed if piece]

    return data


# Save to JSON

def save_to_json(pdf_path: str, data: Dict,
                 preferred_title: str = "Machine Learning Engineer",
                 preferred_job_location: str = "India"):
    """
    Write structured resume data next to the PDF as ``<name>_structured.json``.

    Args:
        pdf_path: Path of the source PDF; only used to derive the output path.
        data: Resume data to save. The keys ``preferred_title`` and
            ``preferred_job_location`` are set on this dict before writing.
        preferred_title: Value stored under ``preferred_title``.
        preferred_job_location: Value stored under ``preferred_job_location``.
    """
    import os  # local import: this cell does not import os itself

    data["preferred_title"] = preferred_title
    data["preferred_job_location"] = preferred_job_location

    # Swap only the final extension. str.replace(".pdf", ...) would rewrite
    # every occurrence in the path, and for a path without a lowercase ".pdf"
    # it would return the path unchanged, so the JSON would overwrite the PDF.
    base_path, _extension = os.path.splitext(pdf_path)
    output_path = base_path + "_structured.json"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    logging.info(f"✅ Extracted structured resume saved to: {output_path}")




In [None]:
def apply_easy_apply_jobs(csv_path, resume_json_path, resume_drive_url):
    """Submit LinkedIn "Easy Apply" applications for the jobs listed in a CSV.

    Rows whose ``apply_type`` column equals "easy apply" (case-insensitive) are
    selected. A Chrome session is started and logged in to LinkedIn (saved
    cookies first, then the LINKEDIN_EMAIL / LINKEDIN_PASSWORD environment
    variables). Each job's ``apply_link`` is opened and its Easy Apply dialog
    is walked step by step: fields are filled, a resume is uploaded when a
    step asks for one, and the Submit button is clicked at the end.

    Args:
        csv_path: CSV file with the columns ``apply_type``, ``title``,
            ``company``, ``location`` and ``apply_link``.
        resume_json_path: Structured resume JSON handed to the answer generator.
        resume_drive_url: Google Drive share URL containing ``/d/<file_id>/``;
            the file is downloaded whenever a resume upload is requested.

    Returns:
        None.

    Raises:
        RuntimeError: If password login is required but the credentials are
            not set in the environment.
    """
    import os
    import time
    import csv
    import pickle
    import json
    import requests
    import tempfile
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.support.ui import WebDriverWait, Select
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import NoSuchElementException
    from webdriver_manager.chrome import ChromeDriverManager

    # -----------------------------
    # Configuration
    # -----------------------------
    LINKEDIN_EMAIL = os.getenv("LINKEDIN_EMAIL")
    LINKEDIN_PASSWORD = os.getenv("LINKEDIN_PASSWORD")
    COOKIE_FILE = "linkedin_cookies.pkl"
    CSV_PATH = csv_path
    RESUME_DRIVE_URL = resume_drive_url
    DOWNLOAD_TIMEOUT_SECONDS = 30
    # Upper bound on dialog steps per job so a form that refuses to advance
    # (e.g. because of validation errors) cannot loop forever.
    MAX_FORM_STEPS = 15
    NEXT_BUTTON_XPATH = "//button[contains(., 'Next') or contains(., 'Continue') or contains(., 'Review')]"
    SUBMIT_BUTTON_XPATH = "//button[contains(., 'Submit') or contains(., 'Done')]"
    TEXT_LABEL_XPATH = "ancestor::div[@data-test-single-line-text-form-component]//label"
    SELECT_LABEL_XPATH = "ancestor::div[@data-test-form-element]//label"

    # -----------------------------
    # Load resume + CSV (before launching the browser, so bad paths fail fast)
    # -----------------------------
    # The structured resume is written as UTF-8 with ensure_ascii=False, so it
    # must be read back as UTF-8 regardless of the platform default encoding.
    with open(resume_json_path, "r", encoding="utf-8") as f:
        resume_data = json.load(f)

    with open(CSV_PATH, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        jobs = [
            row for row in reader
            if (row.get("apply_type") or "").strip().lower() == "easy apply"
        ]

    if not jobs:
        print("ℹ️ No Easy Apply jobs found in the CSV. Nothing to do.")
        return

    # NOTE: the helpers below use `driver` and `wait`, which are bound further
    # down, before any helper is called.

    def scroll_and_click(element, pause=0.5):
        """Scroll the element into view and click it through JavaScript."""
        driver.execute_script("arguments[0].scrollIntoView(true);", element)
        time.sleep(pause)
        driver.execute_script("arguments[0].click();", element)

    # -----------------------------
    # Login functions
    # -----------------------------
    def try_login_with_cookies():
        """Return True when the saved cookie file restores a logged-in session."""
        if not os.path.exists(COOKIE_FILE):
            return False

        driver.get("https://www.linkedin.com")
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only use a
            # cookie file this script wrote itself.
            with open(COOKIE_FILE, "rb") as f:
                cookies = pickle.load(f)
            for cookie in cookies:
                driver.add_cookie(cookie)
        except Exception as exc:
            print(f"❌ Could not load saved cookies: {exc}")
            return False

        driver.refresh()
        time.sleep(3)
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "img.global-nav__me-photo")))
        except Exception:
            print("❌ Cookies expired or invalid")
            return False
        print("✅ Logged in using cookies")
        return True

    def login_with_credentials():
        """Log in through the login form and save the session cookies."""
        if not LINKEDIN_EMAIL or not LINKEDIN_PASSWORD:
            raise RuntimeError(
                "LINKEDIN_EMAIL and LINKEDIN_PASSWORD environment variables must be set to log in."
            )
        driver.get("https://www.linkedin.com/login")
        username_input = wait.until(EC.presence_of_element_located((By.ID, "username")))
        username_input.send_keys(LINKEDIN_EMAIL)
        password_input = driver.find_element(By.ID, "password")
        password_input.send_keys(LINKEDIN_PASSWORD)
        password_input.send_keys(Keys.RETURN)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "img.global-nav__me-photo")))
        with open(COOKIE_FILE, "wb") as f:
            pickle.dump(driver.get_cookies(), f)
        print("✅ Logged in and cookies saved")

    # -----------------------------
    # Resume download with '-drive'
    # -----------------------------
    def filename_from_content_disposition(header, fallback):
        """Extract the plain ``filename=`` value from a Content-Disposition header."""
        marker = "filename="
        if marker not in header:
            return fallback
        value = header.split(marker, 1)[1].strip()
        if value.startswith('"'):
            # Quoted value: stop at the closing quote so a trailing
            # `; filename*=...` parameter is not glued onto the name.
            value = value[1:].split('"', 1)[0]
        else:
            value = value.split(";", 1)[0].strip()
        # The name comes from a remote server; keep only its last path component.
        return os.path.basename(value) or fallback

    def download_resume_from_drive(drive_url):
        """Download the Drive file to the temp directory and return its local path."""
        if "/d/" not in drive_url:
            raise ValueError(f"Unrecognised Google Drive URL (expected '/d/<file_id>/'): {drive_url}")
        file_id = drive_url.split("/d/")[1].split("/")[0]
        download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

        # A timeout keeps a stalled connection from hanging the whole run.
        with requests.get(download_url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS) as resp:
            if resp.status_code != 200:
                raise Exception(f"Failed to download resume, status code {resp.status_code}")

            cd = resp.headers.get("content-disposition", "")
            original_name = filename_from_content_disposition(cd, f"resume_{file_id}.pdf")

            base, ext = os.path.splitext(original_name)
            new_name = f"{base}-drive{ext}"
            temp_path = os.path.join(tempfile.gettempdir(), new_name)

            try:
                with open(temp_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            except Exception:
                # Do not leave a partial download behind
                if os.path.exists(temp_path):
                    os.remove(temp_path)
                raise

        return temp_path

    def upload_resume_if_requested():
        """Upload the resume when the current step exposes a hidden file input."""
        try:
            upload_input = driver.find_element(By.CSS_SELECTOR, "input[type='file'].hidden")
        except NoSuchElementException:
            return  # this step does not ask for a resume

        print("📎 Found resume upload input, uploading resume...")
        local_resume = None
        try:
            local_resume = download_resume_from_drive(RESUME_DRIVE_URL)
            upload_input.send_keys(local_resume)
            time.sleep(2)
            print("✅ Resume uploaded successfully.")
        except Exception as exc:
            print(f"⚠️ Resume upload failed: {exc}")
        finally:
            # Remove the temporary copy whether or not the upload worked
            if local_resume and os.path.exists(local_resume):
                os.remove(local_resume)

    # -----------------------------
    # Easy Apply click
    # -----------------------------
    def click_easy_apply():
        """Open the Easy Apply dialog; return a status string describing the outcome."""
        try:
            easy_apply_button = driver.find_element(By.CSS_SELECTOR, "button.jobs-apply-button")
            if "applied" in easy_apply_button.text.strip().lower():
                print("ℹ️ Already applied. Skipping this job.")
                return "Already applied"
            scroll_and_click(easy_apply_button, pause=1)
            print("✅ Easy Apply clicked!")
            return "Clicked Easy Apply"
        except Exception:
            print("ℹ️ Easy Apply button not found or job not available. Skipping.")
            return "Not available"

    # -----------------------------
    # Extract Easy Apply fields
    # -----------------------------
    def label_text(element, label_xpath):
        """Return the stripped label text; raises if the element has no such label."""
        return element.find_element(By.XPATH, label_xpath).text.strip()

    def extract_easy_apply_fields():
        """Collect labelled input, textarea and select controls of the open dialog.

        Controls whose label cannot be located are skipped.
        """
        fields = []
        try:
            form_container = wait.until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "div.jobs-easy-apply-modal"))
            )
        except Exception:
            return fields  # dialog not visible: nothing to extract

        for inp in form_container.find_elements(By.TAG_NAME, "input"):
            try:
                question_text = label_text(inp, TEXT_LABEL_XPATH)
                field_type = inp.get_attribute("type")
                if field_type in ["checkbox", "radio", "file"]:
                    value = inp.is_selected()
                else:
                    value = inp.get_attribute("value")

                options = []
                if field_type in ["radio", "checkbox"]:
                    # Every control sharing the name belongs to the same group
                    name_attr = inp.get_attribute("name")
                    group = form_container.find_elements(By.NAME, name_attr)
                    options = [el.get_attribute("aria-label") for el in group if el.get_attribute("aria-label")]

                fields.append({
                    "field_name": question_text,
                    "field_type": field_type,
                    "value": value,
                    "options": options,
                    "element": inp
                })
            except Exception:
                continue

        for ta in form_container.find_elements(By.TAG_NAME, "textarea"):
            try:
                fields.append({
                    "field_name": label_text(ta, TEXT_LABEL_XPATH),
                    "field_type": "textarea",
                    "value": ta.get_attribute("value"),
                    "options": [],
                    "element": ta
                })
            except Exception:
                continue

        for sel in form_container.find_elements(By.TAG_NAME, "select"):
            try:
                question_text = label_text(sel, SELECT_LABEL_XPATH)
                options = [opt.text for opt in sel.find_elements(By.TAG_NAME, "option")]
                fields.append({
                    "field_name": question_text,
                    "field_type": "select",
                    "value": sel.get_attribute("value"),
                    "options": options,
                    "element": sel
                })
            except Exception:
                continue

        return fields

    # -----------------------------
    # Fill Easy Apply fields
    # -----------------------------
    def fill_easy_apply_fields(fields):
        """Write each field's ``generated_answer`` into its form control (best effort)."""
        for field in fields:
            try:
                el = field.get("element")
                if not el:
                    continue
                generated = str(field.get("generated_answer") or "").strip()
                f_type = field["field_type"]

                if f_type in ["text", "textarea"] or el.get_attribute("contenteditable") == "true":
                    if generated:
                        # Set the value through JS and fire input/change so the
                        # page's own listeners see the new value.
                        driver.execute_script("""
                            arguments[0].focus();
                            arguments[0].value = arguments[1];
                            arguments[0].dispatchEvent(new Event('input', { bubbles: true }));
                            arguments[0].dispatchEvent(new Event('change', { bubbles: true }));
                        """, el, generated)
                        time.sleep(0.2)

                elif f_type == "select" and generated:
                    try:
                        Select(el).select_by_visible_text(generated)
                    except Exception:
                        # Fallback for custom dropdowns rendered as <li> items
                        el.click()
                        time.sleep(0.3)
                        # Pick the quote character the answer does not contain
                        # so an apostrophe cannot break the XPath literal.
                        quote = '"' if "'" in generated else "'"
                        option = el.find_element(
                            By.XPATH, f".//li[normalize-space(text())={quote}{generated}{quote}]"
                        )
                        option.click()
                        time.sleep(0.2)
            except Exception:
                continue

    # -----------------------------
    # Dummy answer generator
    # -----------------------------
    # NOTE(review): every field is answered with the literal "Sample answer"
    # and the application is then submitted. This helper was previously named
    # generate_field_answers and shadowed the notebook-level function of the
    # same name. TODO: wire in the real generator once its return shape is
    # confirmed to carry "generated_answer" per field.
    def generate_placeholder_answers(fields, resume_json):
        for field in fields:
            field["generated_answer"] = "Sample answer"
        return fields

    # -----------------------------
    # Launch Chrome
    # -----------------------------
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    wait = WebDriverWait(driver, 10)

    try:
        # -----------------------------
        # Login flow
        # -----------------------------
        if not try_login_with_cookies():
            login_with_credentials()

        print(f"\n🚀 Starting auto-apply for {len(jobs)} Easy Apply jobs...")

        # -----------------------------
        # Main job loop
        # -----------------------------
        for idx, job in enumerate(jobs, 1):
            print(
                f"\n🎯 Processing job {idx}/{len(jobs)}: "
                f"{job.get('title')} at {job.get('company')} ({job.get('location')})"
            )
            apply_link = (job.get("apply_link") or "").strip()
            if not apply_link:
                print("⚠️ Row has no apply_link. Skipping.")
                continue
            driver.get(apply_link)
            time.sleep(3)

            if click_easy_apply() != "Clicked Easy Apply":
                continue

            step = 1
            while step <= MAX_FORM_STEPS:
                print(f"➡️ Step {step}...")
                fields = extract_easy_apply_fields()

                if not fields:
                    print("ℹ️ No fields detected. Checking for resume upload or next step...")
                    upload_resume_if_requested()

                    try:
                        next_button = driver.find_element(By.XPATH, NEXT_BUTTON_XPATH)
                        if next_button.is_enabled():
                            scroll_and_click(next_button)
                            print("➡️ Proceeded to next step.")
                            time.sleep(2)
                            step += 1
                            continue
                    except Exception:
                        print("⚠️ No next button found. Assuming final review.")
                        break

                    break  # next button present but disabled

                fields_with_answers = generate_placeholder_answers(fields, resume_data)
                fill_easy_apply_fields(fields_with_answers)

                try:
                    next_button = wait.until(
                        EC.presence_of_element_located((By.XPATH, NEXT_BUTTON_XPATH))
                    )
                    if "disabled" in (next_button.get_attribute("class") or ""):
                        break
                    scroll_and_click(next_button)
                    time.sleep(2)
                    step += 1
                except Exception:
                    break
            else:
                # Reached only when the loop ran out of steps without a break
                print(f"⚠️ Gave up after {MAX_FORM_STEPS} steps; the form is not advancing.")

            try:
                submit_button = wait.until(
                    EC.presence_of_element_located((By.XPATH, SUBMIT_BUTTON_XPATH))
                )
                scroll_and_click(submit_button)
                print("✅ Application submitted successfully!")
                time.sleep(2)
            except Exception:
                print("⚠️ Could not find Submit button, skipped.")
    finally:
        # Close the browser even when login or a page load raises
        driver.quit()

    print("\n🎉 All Easy Apply jobs processed successfully!")


In [8]:
# ===============================
# 🚀 Full AI Career Pipeline Test
# ===============================
# End-to-end driver cell. Runs four stages in order: resume extraction,
# Indeed scraping, LinkedIn scraping, and LinkedIn Easy Apply submission.
# Every function and class called here is defined in other cells of this
# notebook, and those cells must be executed before this one.

# NOTE(review): Path is imported but not used anywhere in this cell.
from pathlib import Path

# --- File Paths ---
pdf_resume_path = "resumes/Yeswanth_Yerra_CV.pdf"
# Must match the file save_to_json writes: <pdf path without .pdf>_structured.json
structured_json_path = pdf_resume_path.replace(".pdf", "_structured.json")
linkedin_csv_path = "csv/linkedin_jobs.csv"
indeed_csv_path = "csv/indeed_jobs.csv"

# --- Google Drive resume URL (for Easy Apply upload) ---
# apply_easy_apply_jobs takes the file id from the segment after "/d/".
resume_drive_url = "https://drive.google.com/file/d/1ZCnnE0SHsyPqZpZBbnWz5npLppl7UZXh/view?usp=drive_link"

# ===============================
# 🧩 1. Resume Extraction
# ===============================
# Parse the PDF, turn multiline section strings into lists, then persist the
# result as JSON for the later stages to read.
print("\n🧠 Extracting structured resume data...")
resume_data = extract_resume_data(pdf_resume_path)
resume_data = split_multiline_sections(resume_data)
save_to_json(pdf_resume_path, resume_data)
print(f"✅ Resume extraction complete → {structured_json_path}")

# ===============================
# 🌍 2. Indeed Job Scraping
# ===============================
# NOTE(review): indeed_jobs is not read again in this cell; the call is made
# with save_csv=True and output_file=indeed_csv_path.
print("\n🌍 Starting Indeed scraping...")
indeed_jobs = scrape_indeed_jobs(max_pages=2, save_csv=True, output_file=indeed_csv_path)
print(f"✅ Indeed scraping complete → {indeed_csv_path}")

# ===============================
# 💼 3. LinkedIn Job Scraping
# ===============================
print("\n💼 Starting LinkedIn scraping...")
linkedin_scraper = LinkedInJobScraper(headless=True)
# Presumably reads the preferred_title / preferred_job_location keys that
# save_to_json stored in the structured JSON — TODO confirm against the helper.
preferred_title, preferred_location = get_preferred_job_and_location(structured_json_path)
linkedin_jobs = linkedin_scraper.scrape_jobs(
    job_title=preferred_title,
    location=preferred_location,
    pages=1,
    days_back=7,
    csv_filename=linkedin_csv_path
)
print(f"✅ LinkedIn scraping complete → {linkedin_csv_path}")

# ===============================
# 🤖 4. Auto-Apply on LinkedIn (Easy Apply)
# ===============================
# NOTE(review): the saved output of this cell ends with
# "NameError: name 'apply_easy_apply_jobs' is not defined", so the cell that
# defines apply_easy_apply_jobs had not been run in that session.
print("\n🤖 Starting LinkedIn auto-apply (Easy Apply only)...")
apply_easy_apply_jobs(
    csv_path=linkedin_csv_path,
    resume_json_path=structured_json_path,
    resume_drive_url=resume_drive_url
)
print("\n🎯 Pipeline completed successfully!")


2025-10-28 14:32:32,839 - INFO - 🧠 Name candidate (largest font): Yeswanth Yerra
2025-10-28 14:32:32,848 - INFO -  Average font size: 9.63
2025-10-28 14:32:32,856 - INFO - ✅ Extracted structured resume saved to: resumes/Yeswanth_Yerra_CV_structured.json
2025-10-28 14:32:32,909 - INFO - Get LATEST chromedriver version for google-chrome
2025-10-28 14:32:32,978 - INFO - Get LATEST chromedriver version for google-chrome



🧠 Extracting structured resume data...
✅ Resume extraction complete → resumes/Yeswanth_Yerra_CV_structured.json

🌍 Starting Indeed scraping...
🔍 Scraping Indeed for 'Machine Learning Engineer' jobs in 'India', Country: India


2025-10-28 14:32:33,047 - INFO - Driver [/home/acer/.wdm/drivers/chromedriver/linux64/141.0.7390.122/chromedriver-linux64/chromedriver] found in cache



DEBUG: Processing page 1 | URL: https://in.indeed.com/jobs?q=Machine+Learning+Engineer&l=India&from=searchOnDesktopSerp&start=0
DEBUG: Found 16 job cards on page 1

DEBUG: Processing job card #1 on page 1

DEBUG: Processing job card #2 on page 1

DEBUG: Processing job card #3 on page 1

DEBUG: Processing job card #4 on page 1

DEBUG: Processing job card #5 on page 1

DEBUG: Processing job card #6 on page 1

DEBUG: Processing job card #7 on page 1

DEBUG: Processing job card #8 on page 1

DEBUG: Processing job card #9 on page 1

DEBUG: Processing job card #10 on page 1

DEBUG: Processing job card #11 on page 1

DEBUG: Processing job card #12 on page 1

DEBUG: Processing job card #13 on page 1

DEBUG: Processing job card #14 on page 1

DEBUG: Processing job card #15 on page 1

DEBUG: Processing job card #16 on page 1

DEBUG: Processing page 2 | URL: https://in.indeed.com/jobs?q=Machine+Learning+Engineer&l=India&from=searchOnDesktopSerp&start=10
DEBUG: Found 16 job cards on page 2

DEBUG

2025-10-28 14:33:04,116 - INFO - Chrome WebDriver initialized successfully
2025-10-28 14:33:09,742 - INFO - Scraping 'Machine Learning Engineer' jobs in India (pages=1)
2025-10-28 14:33:17,839 - INFO - Found 70 job cards
2025-10-28 14:38:12,761 - INFO - Saved 70 jobs to csv/linkedin_jobs.csv


✅ LinkedIn scraping complete → csv/linkedin_jobs.csv

🤖 Starting LinkedIn auto-apply (Easy Apply only)...


NameError: name 'apply_easy_apply_jobs' is not defined