### LinkedIn Jobs Scraping


In [2]:
!pip install selenium
!pip install beautifulsoup4
!pip install pandas
!pip install undetected-chromedriver
!pip install selenium-stealth
!pip install selenium-wire

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Using cached trio-0.31.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.10.5 (from selenium)
  Using cached certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions<5.0,>=4.15.0 (from selenium)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting attrs>=23.2.0 (from trio<1.0,>=0.31.0->selenium)
  Using cached attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading selenium-4.38.0-py3-none-any.whl 

In [None]:
import time
import random
import logging
import csv
import urllib.parse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class LinkedInJobScraper:
    """Collect LinkedIn job-search results with a Selenium-driven Chrome.

    The scraper loads a search-results page, reads the job cards with
    BeautifulSoup, then opens each job link in a new tab (in batches of
    ``tabs_per_batch``) to read the description and employment type, and
    finally writes everything to a CSV file.

    The instance can be used as a context manager so the browser is always
    shut down::

        with LinkedInJobScraper() as scraper:
            scraper.scrape_jobs("Data Engineer")
    """

    def __init__(self, headless=True, tabs_per_batch=3):
        """Store the options and start Chrome immediately.

        Args:
            headless: Run Chrome with ``--headless=new`` when true.
            tabs_per_batch: Number of job tabs opened at once while fetching
                descriptions. Values below 1 are treated as 1 when batching.
        """
        self.headless = headless
        self.tabs_per_batch = tabs_per_batch
        self.driver = None
        self.setup_driver()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    # ---------------------------
    def setup_driver(self):
        """Create the Chrome WebDriver and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-infobars")
        options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        if self.headless:
            options.add_argument("--headless=new")

        self.driver = webdriver.Chrome(options=options)
        # NOTE(review): this script runs once, against whatever document is
        # loaded right after start-up; confirm whether the override is still
        # in effect after later navigations.
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        logging.info("Chrome WebDriver initialized successfully")

    # ---------------------------
    def _get_time_filter(self, days: int) -> str:
        """Map a look-back window in days to the ``f_TPR`` value.

        Only 1, 3, 7 and 30 are known; anything else falls back to 7 days.
        """
        return {1: "r86400", 3: "r259200", 7: "r604800", 30: "r2592000"}.get(days, "r604800")

    def _get_experience_filter(self, experience_level: str) -> str:
        """Map an experience-level label to its numeric code.

        Unknown labels fall back to ``"1"``. This helper is not called by
        ``build_search_url`` at the moment.
        """
        exp_map = {
            "Internship": "1",
            "Entry Level": "2",
            "Associate": "3",
            "Mid-Senior": "4",
            "Director": "5"
        }
        return exp_map.get(experience_level, "1")

    def build_search_url(self, job_title: str, location: str, experience_level=None, time_posted=None):
        """Build the job-search URL.

        Args:
            job_title: Search keywords; percent-encoded into ``keywords``.
            location: Location text; percent-encoded into ``location``.
            experience_level: Accepted for interface compatibility; it is not
                added to the query.
            time_posted: Pre-built ``f_TPR`` value; omitted when falsy.

        Returns:
            The full search URL as a string.
        """
        base_url = "https://www.linkedin.com/jobs/search/"
        params = {
            "keywords": urllib.parse.quote(job_title),
            "location": urllib.parse.quote(location),
            "f_TPR": time_posted,
        }
        # Parameters with an empty/None value are left out of the query.
        query = "&".join(f"{k}={v}" for k, v in params.items() if v)
        url = f"{base_url}?{query}"
        logging.info(f"Search URL built: {url}")
        return url

    # ---------------------------
    @staticmethod
    def _safe_text(elem, default="N/A"):
        """Return ``elem.get_text(strip=True)``, or ``default`` when ``elem`` is None."""
        if elem is None:
            return default
        return elem.get_text(strip=True)

    def _scrape_description_from_tab(self):
        """Read the description and employment type from the current tab.

        Returns:
            A dict with ``description`` and ``job_type``; both are ``"N/A"``
            when the element is missing or any error occurs.
        """
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            soup = BeautifulSoup(self.driver.page_source, "html.parser")

            # Job description: try the known containers in order.
            desc_elem = (
                soup.find("div", class_="show-more-less-html__markup") or
                soup.find("div", class_="description__text") or
                soup.find("div", id="job-details")
            )
            description = desc_elem.get_text(" ", strip=True) if desc_elem else "N/A"

            # Job type: prefer the criteria item headed "Employment type";
            # if no such item exists, fall back to the first criteria text.
            job_type = "N/A"
            for item in soup.find_all("li", class_="description__job-criteria-item"):
                header = item.find("h3")
                value = item.find("span", class_="description__job-criteria-text")
                if header and value and header.get_text(strip=True) == "Employment type":
                    job_type = value.get_text(strip=True)
                    break
            else:
                first_criteria = soup.find("span", class_="description__job-criteria-text")
                if first_criteria:
                    job_type = first_criteria.get_text(strip=True)

            return {"description": description, "job_type": job_type}

        except Exception as e:
            logging.warning(f"Error scraping description: {e}")
            return {"description": "N/A", "job_type": "N/A"}

    # ---------------------------
    def _load_more_results(self, pages):
        """Scroll to the bottom ``pages`` times, clicking 'Show more' when present."""
        for _ in range(pages):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(3, 5))
            try:
                btn = WebDriverWait(self.driver, 3).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Show more')]"))
                )
                btn.click()
            except Exception as exc:
                # Best effort: the button is optional. ``Exception`` (rather
                # than a bare except) lets Ctrl-C still interrupt the run.
                logging.debug("No clickable 'Show more' button: %s", exc)

    def _parse_job_card(self, card):
        """Turn one result card into a job dict, or None when it has no link."""
        link_elem = card.find("a", class_="base-card__full-link")
        if not link_elem:
            return None
        link = link_elem.get("href")
        if not link:
            return None

        # Extract apply link/type directly from job card
        apply_type = "N/A"
        apply_link = "N/A"
        easy_apply_btn = card.find("button", class_="jobs-apply-button")
        external_apply_link = card.find("a", {"data-tracking-control-name": "public_jobs_apply-link-offsite"})
        if easy_apply_btn:
            apply_type = "Easy Apply"
            apply_link = link
        elif external_apply_link:
            apply_type = "Apply"
            apply_link = external_apply_link.get("href", "N/A")

        return {
            "title": self._safe_text(card.find("h3")),
            "company": self._safe_text(card.find("h4")),
            "location": self._safe_text(card.find("span", class_="job-search-card__location")),
            "link": link,
            "apply_type": apply_type,
            "apply_link": apply_link,
            "description": "Pending...",
            "job_type": "Pending..."
        }

    def _fetch_descriptions(self, jobs):
        """Open each job link in a new tab, batch by batch, and fill in details.

        Each job dict in ``jobs`` is updated in place. Tabs opened for a
        batch are closed again even when scraping that batch raises.
        """
        main_handle = self.driver.current_window_handle
        batch_size = max(1, self.tabs_per_batch)  # a step of 0 would break range()

        for i in range(0, len(jobs), batch_size):
            batch = jobs[i:i + batch_size]
            opened = []  # (job, window handle) pairs for this batch

            try:
                # open new tabs
                for job in batch:
                    known = set(self.driver.window_handles)
                    # Pass the URL as a script argument instead of splicing it
                    # into the JavaScript source, so quotes cannot break it.
                    self.driver.execute_script("window.open(arguments[0], '_blank');", job["link"])
                    time.sleep(random.uniform(1.5, 2.5))
                    new_handles = [h for h in self.driver.window_handles if h not in known]
                    if new_handles:
                        opened.append((job, new_handles[0]))
                    else:
                        logging.warning(f"Could not open a tab for: {job['title'][:60]}")
                        job.update({"description": "N/A", "job_type": "N/A"})

                time.sleep(random.uniform(3, 5))

                # scrape each tab
                for job, handle in opened:
                    self.driver.switch_to.window(handle)
                    job.update(self._scrape_description_from_tab())
                    logging.info(f"Scraped description for: {job['title'][:60]}")
            finally:
                # close batch tabs and return to the results window
                for handle in self.driver.window_handles:
                    if handle != main_handle:
                        self.driver.switch_to.window(handle)
                        self.driver.close()
                self.driver.switch_to.window(main_handle)

            time.sleep(random.uniform(3, 6))

    def scrape_jobs(self, job_title: str, location: str = "India", pages: int = 1,
                experience_level: str = None, days_back: int = 7,
                csv_filename: str = "linkedin_jobs.csv"):
        """Run a search, collect every job card, fetch details and save a CSV.

        Args:
            job_title: Search keywords.
            location: Location text for the search.
            pages: Number of scroll / 'Show more' rounds on the results page.
            experience_level: Forwarded to ``build_search_url``, which does
                not add it to the query.
            days_back: Look-back window; see ``_get_time_filter``.
            csv_filename: Path of the CSV file to write.

        Returns:
            The list of job dicts that was written to the CSV.
        """
        time_filter = self._get_time_filter(days_back)
        search_url = self.build_search_url(job_title, location, experience_level, time_filter)
        self.driver.get(search_url)
        time.sleep(4)

        logging.info(f"Scraping '{job_title}' jobs in {location} (pages={pages})")

        self._load_more_results(pages)

        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        job_cards = soup.find_all("div", class_="base-card")
        logging.info(f"Found {len(job_cards)} job cards")

        jobs = []
        for card in job_cards:
            job_data = self._parse_job_card(card)
            if job_data is None:
                continue

            # ---------------- Debug Print ----------------
            print("üîπ Job Found:")
            print(f"  Title      : {job_data['title']}")
            print(f"  Company    : {job_data['company']}")
            print(f"  Location   : {job_data['location']}")
            print(f"  Link       : {job_data['link']}")
            print(f"  Apply Type : {job_data['apply_type']}")
            print(f"  Apply Link : {job_data['apply_link']}")
            print(f"  Job Type   : {job_data['job_type']}")
            print("-" * 60)

            jobs.append(job_data)

        logging.info(f"Collected {len(jobs)} job links")

        # ---------------- Batch description scraping ----------------
        logging.info(f"Fetching job descriptions in batches of {self.tabs_per_batch}...")
        self._fetch_descriptions(jobs)

        # ---------------- Save to CSV ----------------
        self.save_to_csv(jobs, csv_filename)
        logging.info(f"Saved {len(jobs)} jobs to {csv_filename}")
        return jobs


    # ---------------------------
    @staticmethod
    def save_to_csv(jobs, filename):
        """Write ``jobs`` (a list of dicts) to ``filename`` as UTF-8 CSV.

        The header comes from the keys of the first job. Nothing is written
        when ``jobs`` is empty.
        """
        if not jobs:
            logging.warning("No jobs to save.")
            return
        keys = jobs[0].keys()
        with open(filename, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(jobs)

    # ---------------------------
    def close(self):
        """Quit the browser if one was started."""
        if self.driver:
            self.driver.quit()
            logging.info("WebDriver closed.")


# ---------------- Test ----------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    scraper = LinkedInJobScraper(headless=True, tabs_per_batch=3)
    try:
        jobs = scraper.scrape_jobs(
            job_title="Machine Learning Engineer",
            location="India",
            pages=1,
            days_back=1,
            csv_filename="linkedin_jobs.csv"
        )
    finally:
        # Always shut the browser down, including when the run fails or is
        # interrupted (e.g. KeyboardInterrupt), so no Chrome process is left behind.
        scraper.close()


2025-10-27 19:20:40,385 - INFO - Chrome WebDriver initialized successfully
2025-10-27 19:20:40,385 - INFO - Search URL built: https://www.linkedin.com/jobs/search/?keywords=Machine%20Learning%20Engineer&location=India&f_TPR=r86400
2025-10-27 19:20:45,864 - INFO - Scraping 'Machine Learning Engineer' jobs in India (pages=1)
2025-10-27 19:20:53,288 - INFO - Found 70 job cards
2025-10-27 19:20:53,301 - INFO - Collected 70 job links
2025-10-27 19:20:53,302 - INFO - Fetching job descriptions in batches of 3...


üîπ Job Found:
  Title      : Data Scientist - AI
  Company    : Weekday AI (YC W21)
  Location   : Navi Mumbai, Maharashtra, India
  Link       : https://in.linkedin.com/jobs/view/data-scientist-ai-at-weekday-ai-yc-w21-4332041338?position=1&pageNum=0&refId=ZuwAJta1SXtpwE9CybSHdQ%3D%3D&trackingId=31KH1vohvoVKJfYYMNrIgA%3D%3D
  Apply Type : N/A
  Apply Link : N/A
  Job Type   : Pending...
------------------------------------------------------------
üîπ Job Found:
  Title      : Machine Learning Engineer
  Company    : Infosys
  Location   : Bengaluru East, Karnataka, India
  Link       : https://in.linkedin.com/jobs/view/machine-learning-engineer-at-infosys-4318878931?position=2&pageNum=0&refId=ZuwAJta1SXtpwE9CybSHdQ%3D%3D&trackingId=R6kzrL9e9UfuO6fglf5KgQ%3D%3D
  Apply Type : N/A
  Apply Link : N/A
  Job Type   : Pending...
------------------------------------------------------------
üîπ Job Found:
  Title      : Software Engineer
  Company    : HID
  Location   : Chennai, Tamil Nad

KeyboardInterrupt: 

In [4]:
import time
import random
import logging
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib.parse


class LinkedInJobScraper:
    """Collect LinkedIn job-search results with a Selenium-driven Chrome.

    The scraper loads a search-results page, reads the job cards with
    BeautifulSoup, visits every job's detail page in the same window to read
    the apply type, employment type and description, and writes the result
    to a CSV file.
    """

    def __init__(self, headless=True):
        """Store the options and start Chrome immediately.

        Args:
            headless: Run Chrome with ``--headless=new`` when true.
        """
        self.headless = headless
        self.driver = None
        self.setup_driver()

    def setup_driver(self):
        """Create the Chrome WebDriver and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-infobars")
        options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
        if self.headless:
            options.add_argument("--headless=new")

        self.driver = webdriver.Chrome(options=options)
        # NOTE(review): this script runs once, against whatever document is
        # loaded right after start-up; confirm whether the override is still
        # in effect after later navigations.
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        )
        logging.info("Chrome WebDriver initialized successfully")

    # ---------------------------
    def _get_time_filter(self, days: int) -> str:
        """Map a look-back window in days to the ``f_TPR`` value.

        Only 1, 3, 7 and 30 are known; anything else falls back to 7 days.
        """
        return {1: "r86400", 3: "r259200", 7: "r604800", 30: "r2592000"}.get(days, "r604800")

    def build_search_url(self, job_title: str, location: str, experience_level=None, time_posted=None):
        """Build the job-search URL.

        Args:
            job_title: Search keywords; percent-encoded into ``keywords``.
            location: Location text; percent-encoded into ``location``.
            experience_level: Accepted for interface compatibility; it is not
                added to the query.
            time_posted: Pre-built ``f_TPR`` value; omitted when falsy.

        Returns:
            The full search URL as a string.
        """
        base_url = "https://www.linkedin.com/jobs/search/"
        params = {
            "keywords": urllib.parse.quote(job_title),
            "location": urllib.parse.quote(location),
            "f_TPR": time_posted
        }
        # Parameters with an empty/None value are left out of the query.
        query = "&".join(f"{k}={v}" for k, v in params.items() if v)
        return f"{base_url}?{query}"

    # ---------------------------
    def scrape_job_details(self, job_url: str):
        """Open a job detail page and extract its details.

        Args:
            job_url: URL of the job detail page; loaded in the current window.

        Returns:
            A dict with:
            - ``apply_link``: ``job_url`` itself (kept as a fallback),
            - ``apply_type``: ``"Apply (Offsite)"``, ``"Easy Apply"`` or
              ``"Unknown"``,
            - ``employment_type``: text of the "Employment type" criteria
              item, or None,
            - ``job_description``: description text, or None.
        """
        self.driver.get(job_url)
        time.sleep(3)  # Let the page load

        # Wait for apply button or code tag dynamically
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "button.top-card-layout__cta--primary, code#applyUrl")
                )
            )
        except Exception as exc:
            # Best effort: neither element showing up just leaves the apply
            # type as "Unknown". ``Exception`` (rather than a bare except)
            # lets Ctrl-C still interrupt the run.
            logging.debug("Apply element not found for %s: %s", job_url, exc)

        # Parse once, after the wait, so the snapshot includes late content.
        soup = BeautifulSoup(self.driver.page_source, "html.parser")

        # Apply type detection: first the code tag, then the button attribute
        apply_type = "Unknown"
        code_tag = soup.find("code", id="applyUrl")
        if code_tag and code_tag.text.strip():
            apply_type = "Apply (Offsite)"
        else:
            apply_button = soup.find("button", class_="top-card-layout__cta--primary")
            if apply_button:
                dt_name = apply_button.get("data-tracking-control-name", "").lower()
                if "offsite" in dt_name:
                    apply_type = "Apply (Offsite)"
                elif "onsite" in dt_name:
                    apply_type = "Easy Apply"

        # Employment type
        employment_type = None
        criteria_items = soup.select("ul.description__job-criteria-list li.description__job-criteria-item")
        for item in criteria_items:
            header = item.find("h3", class_="description__job-criteria-subheader")
            if header and header.text.strip() == "Employment type":
                span = item.find("span", class_="description__job-criteria-text--criteria")
                if span:
                    employment_type = span.text.strip()
                break

        # Job description
        job_description = None
        desc_div = soup.find("div", class_="description__text--rich")
        if desc_div:
            markup_div = desc_div.find("div", class_="show-more-less-html__markup")
            if markup_div:
                job_description = markup_div.get_text(separator="\n", strip=True)

        return {
            "apply_link": job_url,  # keep job url as fallback
            "apply_type": apply_type,
            "employment_type": employment_type,
            "job_description": job_description
        }

    # ---------------------------
    @staticmethod
    def _save_to_csv(jobs_data, csv_filename):
        """Write ``jobs_data`` to ``csv_filename``, creating its folder if needed."""
        import os

        # The default path lives in a "csv/" folder that may not exist yet;
        # without this, open() fails only after all scraping has been done.
        parent = os.path.dirname(csv_filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

        keys = jobs_data[0].keys()
        with open(csv_filename, "w", newline="", encoding="utf-8") as f:
            dict_writer = csv.DictWriter(f, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(jobs_data)

    def scrape_jobs(self, job_title: str, location: str = "India", pages: int = 1,
                    experience_level: str = None, days_back: int = 7,
                    csv_filename: str = "csv/linkedin_jobs.csv"):
        """Run a search, scrape every job found and save the result as CSV.

        Args:
            job_title: Search keywords.
            location: Location text for the search.
            pages: Number of scroll / 'Show more' rounds on the results page.
            experience_level: Forwarded to ``build_search_url``, which does
                not add it to the query.
            days_back: Look-back window; see ``_get_time_filter``.
            csv_filename: Path of the CSV file to write; missing parent
                folders are created.

        Returns:
            The list of job dicts. Jobs whose card or detail page raised an
            error are skipped with a warning.
        """
        time_filter = self._get_time_filter(days_back)
        search_url = self.build_search_url(job_title, location, experience_level, time_filter)
        self.driver.get(search_url)
        time.sleep(4)

        logging.info(f"Scraping '{job_title}' jobs in {location} (pages={pages})")

        # Scroll and click "Show more" to load more jobs
        for _ in range(pages):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(3, 5))
            try:
                btn = WebDriverWait(self.driver, 3).until(
                    EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Show more')]"))
                )
                btn.click()
            except Exception as exc:
                # Best effort: the button is optional.
                logging.debug("No clickable 'Show more' button: %s", exc)

        # Parse loaded page
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        job_cards = soup.find_all("div", class_="base-card")
        logging.info(f"Found {len(job_cards)} job cards")

        jobs_data = []
        for card in job_cards:
            try:
                title = card.find("h3", class_="base-search-card__title").get_text(strip=True)
                company = card.find("h4", class_="base-search-card__subtitle").get_text(strip=True)
                location_text = card.find("span", class_="job-search-card__location").get_text(strip=True)
                job_url_elem = card.find("a", class_="base-card__full-link")
                if not job_url_elem:
                    continue
                job_url = job_url_elem.get("href")
                if not job_url:
                    # A link without an href cannot be visited.
                    continue

                # Scrape details from job page
                details = self.scrape_job_details(job_url)

                jobs_data.append({
                    "title": title,
                    "company": company,
                    "location": location_text,
                    **details
                })

            except Exception as e:
                logging.warning(f"Skipping a job due to error: {e}")
                continue

        # Save to CSV
        if jobs_data:
            self._save_to_csv(jobs_data, csv_filename)
            logging.info(f"Saved {len(jobs_data)} jobs to {csv_filename}")

        return jobs_data

    # ---------------------------
    def close(self):
        """Quit the browser if one was started."""
        if self.driver:
            self.driver.quit()
            logging.info("WebDriver closed.")


# ===============================
# Example usage
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scraper = LinkedInJobScraper(headless=False)
    try:
        jobs = scraper.scrape_jobs("Machine Learning Engineer", location="India", pages=3)
    finally:
        # Always shut the browser down, including when the run fails or is
        # interrupted, so no Chrome process is left behind.
        if scraper.driver:
            scraper.driver.quit()
    for idx, job in enumerate(jobs, 1):
        print(f"{idx}. {job['title']} | {job['company']} | {job['location']} | {job['apply_type']} | {job['employment_type']}")


INFO:root:Chrome WebDriver initialized successfully
INFO:root:Scraping 'Machine Learning Engineer' jobs in India (pages=3)
INFO:root:Found 70 job cards
INFO:root:Saved 70 jobs to csv/linkedin_jobs.csv


1. Data Scientist | Microsoft | Hyderabad, Telangana, India | Unknown | Full-time
2. Data Scientist | BitGo | Bengaluru, Karnataka, India | Unknown | Full-time
3. Software Engineer | AXA XL | Bengaluru East, Karnataka, India | Unknown | Full-time
4. Data Scientist | AB InBev GCC India | Bengaluru, Karnataka, India | Easy Apply | Full-time
5. Data Scientist - Junior | Schaeffler | Pune, Maharashtra, India | Unknown | Full-time
6. Software Engineer | Microsoft | Hyderabad, Telangana, India | Unknown | Full-time
7. Data Scientist | Deloitte | Gurugram, Haryana, India | Unknown | Full-time
8. Software Engineer | Microsoft | Hyderabad, Telangana, India | Unknown | Full-time
9. Data Scientist - Spendscape | McKinsey & Company | Gurugram, Haryana, India | Unknown | Full-time
10. Associate - Data Scientist-Data Science-Data Scientist | EXL | Bengaluru, Karnataka, India | Easy Apply | Full-time
11. Software Engineer | Microsoft | Bengaluru, Karnataka, India | Unknown | Full-time
12. Software En