In [None]:
import requests
from bs4 import BeautifulSoup
import math
import csv
import re
from tqdm import tqdm

# LinkedIn job-search URL (keywords: "senior data scientist").
# The string contains no "{}" replacement fields, so calling str.format()
# on it returns it unchanged.
target_url = (
    "https://www.linkedin.com/jobs/search/"
    "?currentJobId=4231638091"
    "&distance=25"
    "&geoId=90000724"
    "&keywords=senior%20data%20scientist"
    "&origin=JOBS_HOME_KEYWORD_HISTORY"
    "&refresh=true"
)


def text_clean(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and repeated spaces are all treated alike, so paragraph
    breaks do not survive. Leading/trailing whitespace is reduced to one
    space rather than stripped.

    The earlier separate passes over blank lines and tabs were removed: the
    final ``\\s+`` substitution rewrote everything they produced, so they
    never affected the result.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each whitespace run replaced by ``" "``.
    """
    return re.sub(r"\s+", " ", text)


# Browser-like HTTP headers sent with every request made below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    # Advertise only encodings that requests always decodes. "br" (Brotli) is
    # decoded only when an optional Brotli package is installed; without it a
    # Brotli-compressed body would reach the HTML parser still compressed.
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Accumulators filled by the scraping loops: posting IDs first, then one
# dict of parsed fields per posting.
job_ids, job_data = [], []

# How many result pages to request, at 25 postings per page.
total_jobs = 10  # example value
pages = math.ceil(total_jobs / 25)

# Collect the job ID of every posting card found on each search-results page.
for page in tqdm(range(pages)):
    # target_url has no "{}" fields, so the previous str.format() call returned
    # the same URL for every page and pagination never advanced. Send the
    # result offset as a query parameter instead; requests appends it to the
    # URL's existing query string.
    # NOTE(review): "start" is assumed to be the offset parameter that the
    # original three-argument format() call was aiming at — confirm.
    res = requests.get(
        target_url,
        params={"start": page * 25},
        headers=headers,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    soup = BeautifulSoup(res.text, "html.parser")

    for job in soup.find_all("li"):
        # Only <li> elements wrapping a "base-card" div are job postings.
        card = job.find("div", {"class": "base-card"})
        urn = card.get("data-entity-urn") if card is not None else None
        if not urn:
            # Not a job card, or the card carries no URN: nothing to record.
            continue
        # The job ID is whatever follows the last ":" in the URN.
        job_ids.append(urn.split(":")[-1])

# Fetch the detail page of every collected job ID and parse it into a dict
# that is appended to job_data.
job_details_url = "https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}"

# Output keys and the label text to strip from each job-criteria item, in the
# order the items typically appear on the page (matched by position).
field_names = ["level", "employment_type", "job_function", "industries"]
field_labels = [
    "Seniority level",
    "Employment type",
    "Job function",
    "Industries",
]

for job_id in tqdm(job_ids):
    resp = requests.get(
        job_details_url.format(job_id),
        headers=headers,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    soup = BeautifulSoup(resp.text, "html.parser")

    job_info = {}
    try:
        # Company name is read from the alt text of the logo image inside
        # the top card's first link.
        job_info["company"] = (
            soup.find("div", {"class": "top-card-layout__card"})
            .find("a")
            .find("img")
            .get("alt")
        )
        job_info["title"] = (
            soup.find("div", {"class": "top-card-layout__entity-info"})
            .find("a")
            .text.strip()
        )

        # All <li> items of the job-criteria list (looked up once).
        criteria_list = soup.find(
            "ul", {"class": "description__job-criteria-list"}
        ).find_all("li")

        # Fill each field from the item at the same position, removing its
        # label; fields beyond the end of the list become empty strings.
        for position, (field, label) in enumerate(zip(field_names, field_labels)):
            if position < len(criteria_list):
                job_info[field] = (
                    criteria_list[position].text.replace(label, "").strip()
                )
            else:
                job_info[field] = ""

        job_info["description"] = text_clean(
            soup.find(
                "div", {"class": "description__text description__text--rich"}
            ).text.strip()
        )
        job_data.append(job_info)
    except AttributeError:
        # A find() above returned None, i.e. the page lacks an expected
        # element. Skip this posting. Catching only AttributeError (instead
        # of a bare except) lets KeyboardInterrupt and real bugs surface.
        continue

100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
100%|██████████| 27/27 [00:10<00:00,  2.64it/s]


In [None]:
import pandas as pd

# Tabulate the scraped postings (one row per job_info dict) and write them to
# CSV without the index column, using the "utf-8-sig" encoding.
df = pd.DataFrame(job_data)
df.to_csv("linkedin_jobs.csv", encoding="utf-8-sig", index=False)

In [None]:
import pprint

# Pretty-print the value in the last column of the first row of df.
first_row_last_column = df.iloc[0, -1]
pprint.pprint(first_row_last_column)

('Optum is a global organization that delivers care, aided by technology to '
 'help millions of people live healthier lives. The work you do with our team '
 'will directly improve health outcomes by connecting people with the care, '
 'pharmacy benefits, data and resources they need to feel their best. Here, '
 'you will find a culture guided by inclusion, talented peers, comprehensive '
 'benefits and career development opportunities. Come make an impact on the '
 'communities we serve as you help us advance health optimization on a global '
 'scale. Join us to start Caring. Connecting. Growing together.As a Senior '
 'Data Analyst in the Quality Performance Analytics team in Optum, you will be '
 'working with data that contributes to clinical quality gap closures, and the '
 'improvement of Medicare Stars & HEDIS measures. Responsibilities include '
 'reporting and analysis, data validations, consulting between internal and '
 'external teams, proactively identifying performance i

In [72]:
import random
import time
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

from genu.Job_agent.config import LINKEDIN_JOB_SEARCH_URL, HEADERS
from genu.utils import text_clean

# Base path of the persisted vector store; the commented-out Chroma loader
# below appends "_chroma" to it.
persist_directory = "../data/job_data/vectorstore"

# # Later, to load the saved vector store:
# loaded_vectorstore = Chroma(
#     persist_directory=f"{persist_directory}_chroma",
#     embedding_function=OpenAIEmbeddings(),
# )

In [73]:
# # Get all documents
# all_docs = loaded_vectorstore.get()

# # Access the documents
# documents = all_docs["documents"]  # List of document contents
# metadatas = all_docs["metadatas"]  # List of document metadata
# ids = all_docs["ids"]  # List of document IDs

# # Print first few documents with their metadata
# for i in range(min(5, len(documents))):
#     print(f"Document {i+1} (ID: {ids[i]}):")
#     print(f"Content: {documents[i]}")
#     print(f"Metadata: {metadatas[i]}")
#     print("-" * 50)

# # Get total count
# print(f"Total documents: {len(documents)}")

In [74]:
# # Get similar documents to a query
# query = "Optum"
# similar_docs = loaded_vectorstore.similarity_search(
#     query, k=5
# )  # k is number of results

# # Print the retrieved documents
# for i, doc in enumerate(similar_docs):
#     print(f"Result {i+1}:")
#     print(f"Content: {doc.page_content}")
#     print(f"Metadata: {doc.metadata}")
#     print("-" * 50)

In [87]:
# Import FAISS from langchain_community, the same package this notebook
# already imports Chroma from; the "langchain.vectorstores" path is deprecated.
from langchain_community.vectorstores import FAISS

persist_directory = "data/job_data/vectorstore"

# Load the FAISS index saved at "../<persist_directory>_faiss".
# NOTE(review): allow_dangerous_deserialization=True permits loading pickled
# data, which can execute arbitrary code — only load index files that were
# produced by a trusted source.
loaded_vectorstore = FAISS.load_local(
    f"../{persist_directory}_faiss",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)

# Number of entries in the index-position -> docstore-ID mapping.
print("FAISS vectorstore count:", len(loaded_vectorstore.index_to_docstore_id))

FAISS vectorstore count: 113


In [88]:
# Display the two documents returned for the query text.
search_query = "price optimization"
loaded_vectorstore.similarity_search(search_query, k=2)

[Document(id='59fe6a1e-77dc-4dcc-bbfa-d9cfd139e911', metadata={'company': 'Quince', 'title': 'Senior Data Analyst- Merchandising', 'level': 'Mid-Senior level', 'employment_type': 'Full-time', 'job_function': 'Information Technology', 'industries': 'Technology, Information and Internet', 'link': 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/4220647907'}, page_content="Senior Data Analyst- Merchandising \n Quince \n OUR STORY Quince was started to challenge the existing idea that nice things should cost a lot. Our mission was simple: create an item of equal or greater quality than the leading luxury brands and sell them at a much lower price.OUR VALUES Customer First. Customer satisfaction is our highest priority.High Quality. True quality is a combination of premium materials and high production standards that everyone can feel good about.Essential design. We don't chase trends, and we don't sell everything. We're expert curators that find the very best and bring it to you at

In [89]:
from pprint import pprint
# Pretty-print the text of the first of five documents returned for the query.
top_matches = loaded_vectorstore.similarity_search("price optimization", k=5)
pprint(top_matches[0].page_content)

('Senior Data Analyst- Merchandising \n'
 ' Quince \n'
 ' OUR STORY Quince was started to challenge the existing idea that nice '
 'things should cost a lot. Our mission was simple: create an item of equal or '
 'greater quality than the leading luxury brands and sell them at a much lower '
 'price.OUR VALUES Customer First. Customer satisfaction is our highest '
 'priority.High Quality. True quality is a combination of premium materials '
 'and high production standards that everyone can feel good about.Essential '
 "design. We don't chase trends, and we don't sell everything. We're expert "
 'curators that find the very best and bring it to you at the lowest '
 'prices.Always a better deal. Through innovation and real price transparency '
 'we want to offer the best deal to both our customers and our factory '
 "partners.Environmentally and Socially conscious. We're committed to "
 'sustainable materials and sustainable production methods. That means a '
 'cleaner environment and fai