In [None]:
import requests
from bs4 import BeautifulSoup
import math
import csv
import re
from tqdm import tqdm

# Guest job-search endpoint template. The three `{}` placeholders are filled
# positionally via str.format with: search keywords, location, and the `start`
# result offset used for pagination.
target_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={}&location={}&start={}"


def text_clean(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines and tabs count as whitespace, so paragraph breaks are flattened
    as well and the result contains no line breaks. Leading and trailing
    whitespace is reduced to a single space, not stripped.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with each maximal whitespace run replaced by ``" "``.
    """
    # One pass is enough: `\s` already matches newlines and tabs, so squeezing
    # newline or tab runs separately beforehand cannot change the result.
    return re.sub(r"\s+", " ", text)


# Browser-like request headers sent with every request in this notebook.
# NOTE(review): advertising "br" lets the server answer with Brotli-compressed
# bodies; requests only decodes those when a Brotli package is installed —
# confirm it is available in this environment, or drop "br".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Accumulators filled by the scraping loops
job_ids = []
job_data = []

# Search settings (tune these instead of editing the loop below)
SEARCH_KEYWORDS = "senior Data scientist"
SEARCH_LOCATION = "remote"
JOBS_PER_PAGE = 25  # the `start` offset advances by this many results per request
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs forever

# Calculate number of pages to scrape
total_jobs = 200  # Example number
pages = math.ceil(total_jobs / JOBS_PER_PAGE)

# Get all job IDs, keeping first-seen order and dropping repeats so that a
# posting listed on more than one page is only fetched once later on.
seen_job_ids = set()
for page in tqdm(range(pages)):
    res = requests.get(
        target_url.format(SEARCH_KEYWORDS, SEARCH_LOCATION, page * JOBS_PER_PAGE),
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    if not res.ok:
        # Error responses carry no job cards; skip the page explicitly.
        continue
    soup = BeautifulSoup(res.text, "html.parser")

    for job in soup.find_all("li"):
        # List items without a job card, or cards without a URN, are skipped
        # through explicit checks rather than a catch-all `except`.
        card = job.find("div", {"class": "base-card"})
        urn = card.get("data-entity-urn") if card is not None else None
        if not urn:
            continue
        # The job ID is the last colon-separated segment of the URN.
        job_id = urn.split(":")[-1]
        if job_id not in seen_job_ids:
            seen_job_ids.add(job_id)
            job_ids.append(job_id)

# Get details for each job
job_details_url = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}"

# Output field names and the label text that prefixes the matching criteria
# item on the page (parallel lists). Defined once, outside the loop.
field_names = ["level", "employment_type", "job_function", "industries"]
field_labels = [
    "Seniority level",
    "Employment type",
    "Job function",
    "Industries",
]

for job_id in tqdm(job_ids):
    # 30 s timeout: without one a stalled connection would hang the loop forever.
    resp = requests.get(job_details_url.format(job_id), headers=headers, timeout=30)
    if not resp.ok:
        # Error responses do not contain a posting to parse.
        continue
    soup = BeautifulSoup(resp.text, "html.parser")

    job_info = {}
    try:
        # The company name is read from the alt text of the logo image inside
        # the first link of the top card.
        job_info["company"] = (
            soup.find("div", {"class": "top-card-layout__card"})
            .find("a")
            .find("img")
            .get("alt")
        )
        job_info["title"] = (
            soup.find("div", {"class": "top-card-layout__entity-info"})
            .find("a")
            .text.strip()
        )

        criteria_list = soup.find(
            "ul", {"class": "description__job-criteria-list"}
        ).find_all("li")

        # Default every criteria field to "" (this also fixes the column order),
        # then fill each one by matching its label rather than its position, so
        # a posting that omits one criterion does not shift the others.
        job_info.update(dict.fromkeys(field_names, ""))
        for item in criteria_list:
            item_text = item.text.strip()
            for field, label in zip(field_names, field_labels):
                if item_text.startswith(label):
                    job_info[field] = item_text[len(label):].strip()
                    break

        job_info["description"] = text_clean(
            soup.find(
                "div", {"class": "description__text description__text--rich"}
            ).text.strip()
        )
    except AttributeError:
        # A required element was missing (`find` returned None), so the posting
        # is skipped. Only this error is caught, so Ctrl-C and real bugs still
        # surface.
        continue
    job_data.append(job_info)

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:06<00:00,  1.20it/s]
100%|██████████| 80/80 [00:31<00:00,  2.53it/s]


In [159]:
import pandas as pd

# job_data is a list of per-posting dicts, so build the frame with the plain
# constructor (from_dict is meant for dict input).
df = pd.DataFrame(job_data)
# "utf-8-sig" writes a leading byte-order mark; the row index is not exported.
df.to_csv("linkedin_jobs.csv", index=False, encoding="utf-8-sig")

## Load vector store

In [6]:
from langchain_openai import OpenAIEmbeddings
from genu.Job_agent.config import HEADERS, LINKEDIN_JOB_SEARCH_PARAMS, PERSIST_PATH
from langchain.vectorstores import FAISS

# Load the persisted FAISS index from PERSIST_PATH, resolved relative to the
# parent of the current working directory, with OpenAI embeddings as the
# embedding function.
# NOTE(review): presumably the embedding model must match the one used when the
# index was built — confirm.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# stored docstore (see the LangChain FAISS.load_local docs); only load index
# files that were created by a trusted source.
loaded_vectorstore = FAISS.load_local(
    f"../{PERSIST_PATH}",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
# Sanity check: report how many entries the index-to-docstore-id mapping holds.

print("FAISS vectorstore count:", len(loaded_vectorstore.index_to_docstore_id))

FAISS vectorstore count: 30


In [7]:
# Query the store for the 2 best matches to the phrase; as the last expression
# of the cell, the returned list of documents is displayed as the cell output.
loaded_vectorstore.similarity_search("price optimization", k=2)

[Document(id='07c0b42d-5c95-49f5-b9f1-cd269f2024a0', metadata={'company': 'Optum', 'title': 'Senior Data Analyst, Quality - Remote', 'level': 'Mid-Senior level', 'employment_type': 'Full-time', 'job_function': 'Information Technology', 'industries': 'Hospitals and Health Care, Non-profit Organizations, and Government Administration', 'posted_time': '3 days ago', 'applicants': '78 applicants', 'parsing_link': 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/4235771207', 'job_posting_link': 'https://www.linkedin.com/jobs/view/senior-data-analyst-quality-remote-at-optum-4235771207?trk=public_jobs_topcard-title', 'job_id': '4235771207', 'date': '2025-05-26'}, page_content='Senior Data Analyst, Quality - Remote \n Optum \n Optum is a global organization that delivers care, aided by technology to help millions of people live healthier lives. The work you do with our team will directly improve health outcomes by connecting people with the care, pharmacy benefits, data and resources th

In [8]:
from pprint import pprint

# Pretty-print the page content of the first hit only; 5 results are requested
# but just index 0 is shown.
pprint(loaded_vectorstore.similarity_search("price optimization", k=5)[0].page_content)

('Senior Data Analyst, Quality - Remote \n'
 ' Optum \n'
 ' Optum is a global organization that delivers care, aided by technology to '
 'help millions of people live healthier lives. The work you do with our team '
 'will directly improve health outcomes by connecting people with the care, '
 'pharmacy benefits, data and resources they need to feel their best. Here, '
 'you will find a culture guided by inclusion, talented peers, comprehensive '
 'benefits and career development opportunities. Come make an impact on the '
 'communities we serve as you help us advance health optimization on a global '
 'scale. Join us to start Caring. Connecting. Growing together.As a Senior '
 'Data Analyst in the Quality Performance Analytics team in Optum, you will be '
 'working with data that contributes to clinical quality gap closures, and the '
 'improvement of Medicare Stars & HEDIS measures. Responsibilities include '
 'reporting and analysis, data validations, consulting between internal and