In [49]:
import requests
import pandas as pd
import random
from bs4 import BeautifulSoup
import math
import time

In [63]:
def get_with_backoff(url, max_retries=6, timeout=20):
    """
    Retries on 429 (rate limit) with exponential backoff + jitter.
    Respects Retry-After header when present.
    Also retries on 5xx and some network errors.
    """
    for attempt in range(max_retries):
        try:
            r = session.get(url, timeout=timeout)

            # Debug (optional)
            print(r.status_code, r.headers.get("Retry-After"))

            # Handle rate limiting
            if r.status_code == 429:
                retry_after = r.headers.get("Retry-After")
                if retry_after:
                    # Retry-After can be seconds. If it's a date, this may fail -> fallback.
                    try:
                        wait = float(retry_after)
                    except ValueError:
                        wait = (2 ** attempt) + random.uniform(0.5, 1.5)
                else:
                    wait = (2 ** attempt) + random.uniform(0.5, 1.5)

                print(f"429 rate-limited. Sleeping {wait:.2f}s then retrying... ({attempt+1}/{max_retries})")
                time.sleep(wait)
                continue

            # Retry transient server errors
            if 500 <= r.status_code < 600:
                wait = (2 ** attempt) + random.uniform(0.5, 1.5)
                print(f"{r.status_code} server error. Sleeping {wait:.2f}s then retrying... ({attempt+1}/{max_retries})")
                time.sleep(wait)
                continue

            # Raise for other non-200s (403, 404, etc.)
            r.raise_for_status()
            return r

        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError) as e:
            wait = (2 ** attempt) + random.uniform(0.5, 1.5)
            print(f"Network error: {e}. Sleeping {wait:.2f}s then retrying... ({attempt+1}/{max_retries})")
            time.sleep(wait)

    raise RuntimeError(f"Failed after {max_retries} retries: {url}")

In [88]:
# Data that the user will be able to change
jobTitle = "Junior Python"
location = "Spain"
# Limit it to 1 type
workType = "en remoto"
# WiLL default to 10
numberOfJobs = 5

In [90]:
job_id_list = set()
jobTitleFormatted = jobTitle.replace(' ', "%20")

pageNumber = 0
pageNumberObj =  math.ceil(numberOfJobs / 10)
numberResults = ""

wt_map = {
    "en remoto": "2",
    "hibrido": "3",
    "presencial": "1"
}
wt_value = wt_map.get(workType)
WT = f"f_WT={wt_value}"

while pageNumber < pageNumberObj:
    # Form the link
    jobs_list_link = f"https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={jobTitleFormatted}&location={location}&{WT}{numberResults}"
    print(jobs_list_link)
    # Save the response in text and format it to get all job posts
    response_jlg = requests.get(jobs_list_link)
    jobs_list_data = response_jlg.text
    jobs_list_soup = BeautifulSoup(jobs_list_data, "html.parser")
    jobs_page = jobs_list_soup.find_all("li")

    # Get all job ids from the posts
    for job in jobs_page:
        base_card_div = job.find("div", {"class": "base-card"})
        job_id = base_card_div.get("data-entity-urn").split(":")[3]
        job_id_list.add(job_id)

    pageNumber += 1
    numberResults = f"&start={pageNumber * 10}" 
# Debug
print(job_id_list)

https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Junior%20Python&location=Spain&f_WT=2
{'4344139135', '4272412678', '4343901472', '4371175870', '4344988192', '4344099336', '4342452527', '4373555973', '4370527191', '4368612475'}


In [94]:
jobs_list = []
session = requests.Session()

for job_id in job_id_list:
    job_especifics_link = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    try:
        response_je = get_with_backoff(job_especifics_link, max_retries=6)
    except Exception as e:
        print(f"Skipping job_id {job_id} due to request failure: {e}")
        continue

    job_soup = BeautifulSoup(response_je.text, "html.parser")
    job_info = {"job_id": job_id}

    # Job Title (raise if missing)
    tag = job_soup.select_one("a.topcard__link h2.top-card-layout__title")
    if not tag:
        print(f"Missing job title for job_id {job_id} â€” skipping")
        continue
    job_info["job_title"] = tag.get_text(strip=True)
    
    # Company Info (raise if missing)
    tag = job_soup.find("a", class_="topcard__org-name-link")
    if not tag:
        print(f"Missing company tag for job_id {job_id} â€” skipping")
        continue
    job_info["company_name"] = tag.get_text(strip=True)
    job_info["company_link"] = tag.get("href")
    
    # Workplace access (optional)
    job_info["workplace_access"] = workType

    # Job Type (optional)
    tag = job_soup.select_one("span.posted-time-ago__text")
    job_info["work_type"] = tag.get_text(strip=True) if tag else None

    # Job Level and wory schedule (optional)
    criteria_items = job_soup.select("li.description__job-criteria-item")
    
    job_info["seniority_level"] = None
    job_info["employment_type"] = None
    
    for item in criteria_items:
        header = item.select_one("h3.description__job-criteria-subheader")
        value = item.select_one("span.description__job-criteria-text")
    
        if not header or not value:
            continue
    
        header_text = header.get_text(strip=True)
    
        if header_text == "Seniority level":
            job_info["seniority_level"] = value.get_text(strip=True)
    
        elif header_text == "Employment type":
            job_info["employment_type"] = value.get_text(strip=True)
        
    # Number of applicants (optional)
    tag = job_soup.select_one("figcaption.num-applicants__caption")
    job_info["num_applicants"] = tag.get_text(strip=True) if tag else None

    # Time posted (optional)
    tag = job_soup.select_one("span.posted-time-ago__text")
    job_info["time_posted"] = tag.get_text(strip=True) if tag else None


    jobs_list.append(job_info)

    # Gentle pacing between job detail requests (helps reduce 429s)
    time.sleep(random.uniform(1.0, 2.5))

print("Scraping ended")

200 None
200 None
200 None
200 None
200 None
200 None
200 None
200 None
200 None
200 None
Scraping ended


In [95]:
jobs_df = pd.DataFrame(jobs_list)
jobs_df

Unnamed: 0,job_id,job_title,company_name,company_link,workplace_access,work_type,seniority_level,employment_type,num_applicants,time_posted
0,4344139135,Junior Software Developer - Observability,Canonical,https://uk.linkedin.com/company/canonical?trk=...,en remoto,2 months ago,Entry level,Full-time,,2 months ago
1,4272412678,Full Stack ( Python + Vue) en remoto,SlashMobility,https://es.linkedin.com/company/slashmobility?...,en remoto,6 months ago,Not Applicable,Full-time,,6 months ago
2,4343901472,Junior Software Developer - Observability,Canonical,https://uk.linkedin.com/company/canonical?trk=...,en remoto,2 months ago,Entry level,Full-time,Be among the first 25 applicants,2 months ago
3,4371175870,Desarrollador/a Backend â€“ Python/Java â€“ IA y C...,TheWhiteam,https://es.linkedin.com/company/the-white-team...,en remoto,4 days ago,Not Applicable,Full-time,,4 days ago
4,4344988192,Junior Software Developer - Observability,Canonical,https://uk.linkedin.com/company/canonical?trk=...,en remoto,1 month ago,Entry level,Full-time,Be among the first 25 applicants,1 month ago
5,4344099336,Junior Software Developer - Observability,Canonical,https://uk.linkedin.com/company/canonical?trk=...,en remoto,2 months ago,Entry level,Full-time,,2 months ago
6,4342452527,Join Our Talent Pipeline for QA Engineers!,Nuvolar Works,https://es.linkedin.com/company/nuvolarworks?t...,en remoto,2 months ago,Mid-Senior level,Full-time,,2 months ago
7,4373555973,Desarrollador/a Junior Python con BigQuery,Abalia,https://es.linkedin.com/company/abalia?trk=pub...,en remoto,58 minutes ago,Mid-Senior level,Full-time,Be among the first 25 applicants,58 minutes ago
8,4370527191,Desarrollador de Python,Axpe Consulting,https://es.linkedin.com/company/axpe-consultin...,en remoto,1 week ago,Mid-Senior level,Full-time,,1 week ago
9,4368612475,ðŸ¤–ðŸ“Š INGENIERO/A DE DATOS Junior Big Data SQL PY...,Quental,https://es.linkedin.com/company/quental?trk=pu...,en remoto,1 week ago,Associate,Full-time,Over 200 applicants,1 week ago
