In [None]:
import requests
from bs4 import BeautifulSoup
import math
import csv
import re
from tqdm import tqdm

# Guest job-search endpoint template. The three `{}` placeholders are filled,
# in order, with the search keywords, the location, and the `start` offset
# used for pagination.
target_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={}&location={}&start={}"


def text_clean(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and any other whitespace characters are all treated the
    same way, so the result is a single-line string. Leading and trailing
    whitespace is reduced to one space rather than removed; callers that want
    it gone should ``strip()`` first.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each whitespace run replaced by exactly one space.
    """
    # A single pass is enough: earlier newline/tab-specific substitutions would
    # be overwritten by this one anyway, since "\s" already matches both.
    return re.sub(r"\s+", " ", text)


# Browser-like request headers passed as `headers=` to the scraping requests
# in this cell.
# NOTE(review): "Accept-Encoding" advertises "br"; presumably the environment
# can decode Brotli responses (requests relies on an optional brotli package
# for that) — confirm, or drop "br" from the list.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Accumulators: job IDs found by the search, and one dict of parsed fields per
# job posting (filled by the job-details loop further down).
job_ids = []
job_data = []

# Search parameters
search_keywords = "senior Data scientist"
search_location = "remote"
# Seconds to wait for a response; without a timeout a stalled connection
# would block the cell indefinitely.
request_timeout = 30

# Calculate number of pages to scrape (25 jobs per page)
# NOTE(review): the recorded run collected 80 IDs from 8 pages, i.e. about 10
# per page — confirm that a page size of 25 does not skip results.
jobs_per_page = 25
total_jobs = 200  # Example number
pages = math.ceil(total_jobs / jobs_per_page)

# Get all job IDs
for page_index in tqdm(range(pages)):
    try:
        res = requests.get(
            target_url.format(
                search_keywords, search_location, page_index * jobs_per_page
            ),
            headers=headers,
            timeout=request_timeout,
        )
    except requests.RequestException as exc:
        # Best effort: report the failed page and keep going with the rest.
        tqdm.write(f"Skipping search page {page_index}: request failed ({exc})")
        continue

    if not res.ok:
        # An error response carries no listings worth parsing.
        tqdm.write(f"Skipping search page {page_index}: HTTP {res.status_code}")
        continue

    soup = BeautifulSoup(res.text, "html.parser")
    jobs_on_page = soup.find_all("li")

    for job in jobs_on_page:
        # Not every <li> is a job card; skip the ones without the card div or
        # without the URN attribute instead of relying on a bare `except`,
        # which would also swallow KeyboardInterrupt.
        card = job.find("div", {"class": "base-card"})
        urn = card.get("data-entity-urn") if card is not None else None
        if not urn:
            continue
        # The job ID is the last colon-separated component of the URN.
        job_ids.append(urn.split(":")[-1])

# Get details for each job
job_details_url = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}"
# Seconds to wait for a response; without a timeout a stalled connection
# would block the cell indefinitely.
request_timeout = 30

# Output keys and the label text that identifies each one inside the
# job-criteria list. Defined once here because they do not change per job.
field_names = ["level", "employment_type", "job_function", "industries"]
field_labels = [
    "Seniority level",
    "Employment type",
    "Job function",
    "Industries",
]

for job_id in tqdm(job_ids):
    try:
        resp = requests.get(
            job_details_url.format(job_id),
            headers=headers,
            timeout=request_timeout,
        )
    except requests.RequestException as exc:
        # Best effort: report the failed posting and move on to the next one.
        tqdm.write(f"Skipping job {job_id}: request failed ({exc})")
        continue

    soup = BeautifulSoup(resp.text, "html.parser")

    job_info = {}
    try:
        job_info["company"] = (
            soup.find("div", {"class": "top-card-layout__card"})
            .find("a")
            .find("img")
            .get("alt")
        )
        job_info["title"] = (
            soup.find("div", {"class": "top-card-layout__entity-info"})
            .find("a")
            .text.strip()
        )

        # Text of every <li> in the criteria list
        criteria_list = soup.find(
            "ul", {"class": "description__job-criteria-list"}
        ).find_all("li")
        criteria_texts = [item.text for item in criteria_list]

        # Match each field by its label rather than by position, so a posting
        # that omits one criterion does not shift the remaining values into
        # the wrong columns. Fields whose label is absent are left empty.
        for field, label in zip(field_names, field_labels):
            labelled = next(
                (entry for entry in criteria_texts if label in entry), None
            )
            job_info[field] = (
                labelled.replace(label, "").strip() if labelled is not None else ""
            )

        job_info["description"] = text_clean(
            soup.find(
                "div", {"class": "description__text description__text--rich"}
            ).text.strip()
        )
    except AttributeError:
        # A `find(...)` returned None: the page lacks an expected element
        # (layout change, removed posting, error page). Skip this posting.
        tqdm.write(f"Skipping job {job_id}: expected page element not found")
        continue

    job_data.append(job_info)

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:06<00:00,  1.20it/s]
100%|██████████| 80/80 [00:31<00:00,  2.53it/s]


In [159]:
import pandas as pd

# One row per scraped posting (job_data is a list of per-job dicts), written
# out without the index column; "utf-8-sig" prefixes the file with a BOM.
df = pd.DataFrame(job_data)
df.to_csv("linkedin_jobs.csv", encoding="utf-8-sig", index=False)

## Load vector store

In [None]:
from langchain_openai import OpenAIEmbeddings
from genu.Job_agent.config import HEADERS, LINKEDIN_JOB_SEARCH_PARAMS, PERSIST_PATH
from langchain.vectorstores import FAISS
from genu.Job_agent.vectorestore import vectorstore_to_dataframe

# Re-open the FAISS index saved under PERSIST_PATH, resolved one directory up
# from the notebook's working directory.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data from that directory — only point this at index files you
# created yourself or otherwise trust.
vectorstore_dir = f"../{PERSIST_PATH}"
embedding_model = OpenAIEmbeddings()
loaded_vectorstore = FAISS.load_local(
    vectorstore_dir,
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Report how many entries the store's index_to_docstore_id mapping holds.
document_count = len(loaded_vectorstore.index_to_docstore_id)
print("FAISS vectorstore count:", document_count)

FAISS vectorstore count: 180


In [11]:
# Convert the persisted store into a table and show it as the cell output.
persist_dir = f"../{PERSIST_PATH}"
vectorstore_to_dataframe(persist_dir)

Loaded FAISS index with 180 documents.
Created DataFrame with shape: (180, 13)


Unnamed: 0,content,company,title,level,employment_type,job_function,industries,posted_time,applicants,parsing_link,job_posting_link,job_id,date
0,Data Science Manager \n HCA Healthcare \n Job ...,HCA Healthcare,Data Science Manager,Mid-Senior level,Full-time,Engineering and Information Technology,"Technology, Information and Media",3 days ago,,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/data-scienc...,4222341754,2025-05-26
1,Data Scientist \n STG Logistics - Tropicana/Na...,STG Logistics - Tropicana/Naked Juice Div.,Data Scientist,Entry level,Part-time,"Accounting/Auditing, Advertising, and Administ...",Education Administration Programs and Governme...,3 days ago,103 applicants,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/data-scient...,4234370785,2025-05-26
2,Data Scientist \n STG Logistics - Tropicana/Na...,STG Logistics - Tropicana/Naked Juice Div.,Data Scientist,Entry level,Part-time,"Accounting/Auditing, Advertising, and Administ...",Education Administration Programs and Governme...,3 days ago,130 applicants,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/data-scient...,4234374284,2025-05-26
3,Data Scientist 3 \n Gormat \n Job Description:...,Gormat,Data Scientist 3,Entry level,Full-time,Engineering and Information Technology,"Hospitals and Health Care, Non-profit Organiza...",1 day ago,42 applicants,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/data-scient...,4237502589,2025-05-26
4,Data Scientist \n STG Logistics - Tropicana/Na...,STG Logistics - Tropicana/Naked Juice Div.,Data Scientist,Entry level,Part-time,"Accounting/Auditing, Advertising, and Administ...",Education Administration Programs and Governme...,3 days ago,146 applicants,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/data-scient...,4234374318,2025-05-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,"Data Scientist \n Render \n At Render, we're d...",Render,Data Scientist,Associate,Full-time,"Engineering, Information Technology, and Analyst","Hospitals and Health Care, Non-profit Organiza...",6 days ago,,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/data-scient...,4216706661,2025-05-26
176,Senior Clinical Bioinformatics Data Scientist ...,Lensa,Senior Clinical Bioinformatics Data Scientist,Mid-Senior level,Part-time,Engineering and Information Technology,"Hospitals and Health Care, Non-profit Organiza...",2 days ago,50 applicants,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/senior-clin...,4236792521,2025-05-26
177,People Tech - System Architect-Data Science Se...,PwC,People Tech - System Architect-Data Science Se...,Director,Full-time,Engineering and Information Technology,"Hospitals and Health Care, Non-profit Organiza...",3 days ago,,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/people-tech...,4233953533,2025-05-26
178,Principal Data Scientist \n Brahma Consulting ...,Brahma Consulting Group,Principal Data Scientist,Mid-Senior level,Contract,Analyst and Information Technology,IT Services and IT Consulting and Public Polic...,3 days ago,,https://www.linkedin.com/jobs-guest/jobs/api/j...,https://www.linkedin.com/jobs/view/principal-d...,4233744513,2025-05-26


In [24]:
# Fetch the two stored documents most similar to the query "forecasting" and
# display them (metadata plus page content) as the cell output.
forecasting_matches = loaded_vectorstore.similarity_search("forecasting", k=2)
forecasting_matches

[Document(id='7e27dee0-a8ef-4c28-a166-2e334ba2845f', metadata={'company': 'RandomTrees', 'title': 'IoT Data Scientist', 'level': 'Mid-Senior level', 'employment_type': 'Contract', 'job_function': 'Information Technology', 'industries': 'IT Services and IT Consulting', 'posted_time': '4 days ago', 'applicants': 'N/A', 'parsing_link': 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/4232805911', 'job_posting_link': 'https://www.linkedin.com/jobs/view/iot-data-scientist-at-randomtrees-4232805911?trk=public_jobs_topcard-title', 'job_id': '4232805911', 'date': '2025-05-26'}, page_content='IoT Data Scientist \n RandomTrees \n Building AI models for analyzing Oscillation FrequencyReal-time analytics & dashboarding for operational insights and predictive maintenanceCustom IoT solutions leveraging edge compute and cloud services. Show more Show less \n Information Technology \n IT Services and IT Consulting'),
 Document(id='9f016c67-523a-4cc0-943a-d80d7dbe431c', metadata={'company': 'Ha

In [29]:
from pprint import pprint

# Retrieve five matches for "forecast" and pretty-print the text of the first
# one returned.
forecast_hits = loaded_vectorstore.similarity_search("forecast", k=5)
pprint(forecast_hits[0].page_content)

('Director Data Science \n'
 ' Harnham \n'
 ' Director Data ScienceRemoteUp to $215kA primary financial services company '
 'focused on leveraging advanced technology, predictive analytics and data '
 'science to enhance customer experience and drive better financial '
 'outcomes.The RoleThis role is key for driving model development in the '
 'company, focused on developing predictive and prescriptive models for '
 'business impact within marketing, sales and operations.Lead the data science '
 'function, delivering on a roadmap of 10–12 production models annually across '
 'key business domains.Drive full model lifecycle: ideation, prototyping, '
 'stakeholder alignment, and delivery into production (in partnership with '
 'MLOps/engineering).Contribute 50% of your time to hands-on tasks, including '
 'writing and reviewing Python and SQL code, developing models, and performing '
 'exploratory analysis.Manage and grow a small but scaling team: currently 2 '
 'ICs (senior and regular 