In [1]:
# Import BS4
from bs4 import BeautifulSoup
import requests
import re
import httpx
from IPython.display import display, HTML

def fetch_html_using_requests(url, timeout=10):
    """
    Fetches the given URL with a plain HTTP GET issued through Requests.

    :param url: The URL of the page to fetch.
    :param timeout: Seconds to wait for the server before giving up. Without a
        timeout Requests waits indefinitely, which would hang the notebook cell.
    :return: A ``(status_code, text)`` tuple taken from the response.
    :raises requests.exceptions.RequestException: Propagated unchanged from
        ``requests.get`` (connection errors, timeouts, invalid URLs, ...).
    """
    # No custom headers are sent by default. The entries below are kept as a
    # ready-made browser-like set; uncomment them if a site rejects the request.
    headers = {
        # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
        # "Accept-Encoding": "gzip, deflate, br",
        # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # "Connection": "keep-alive",
        # "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.status_code, response.text

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from bs4 import BeautifulSoup
import time

def fetch_html_using_webdriver(url, wait_seconds=5):
    """
    Loads the given URL in Edge through Selenium WebDriver and returns the page source.

    :param url: The URL of the page to scrape.
    :param wait_seconds: Fixed delay, in seconds, between navigating to the page
        and reading its source, to give the page time to finish loading.
    :return: The HTML content of the page as a string.
    """
    # Set up WebDriver options
    edge_options = Options()
    edge_options.add_argument("--disable-popup-blocking")  # Enable popups

    # Initialize EdgeDriver
    driver_path = '/usr/local/bin/msedgedriver'
    service = Service(driver_path)
    driver = webdriver.Edge(service=service, options=edge_options)

    try:
        # Navigate to the page
        driver.get(url)

        # Accept a JavaScript alert if one is showing. This is best-effort: any
        # ordinary failure (typically "no alert present") is ignored on purpose.
        # Only Exception is caught so Ctrl-C / kernel interrupts still propagate.
        try:
            alert = driver.switch_to.alert
            alert.accept()
        except Exception:
            pass

        # Wait for the page to load completely
        time.sleep(wait_seconds)

        # Only the rendered page source is returned; no HTTP status code is
        # read from the driver here.
        return driver.page_source
    finally:
        # Always close the browser, even when navigation or extraction fails
        driver.quit()

def fetch_html(url):
    """
    Retrieves a page's HTML, trying Requests first and WebDriver second.

    The Requests result is used only when its status code is 200; any other
    status, or any exception, falls through to the WebDriver attempt.

    :param url: The URL of the page to scrape.
    :return: The HTML content as a string, or None when both attempts fail.
    """
    # Long URLs are abbreviated in log messages: first 25 + "..." + last 25 chars.
    if len(url) > 50:
        label = url[:25] + "..." + url[-25:]
    else:
        label = url

    # Attempt 1: plain HTTP request
    try:
        code, page = fetch_html_using_requests(url)
        if code == 200:
            print(f"Fetched HTML {label} using Requests")
            return page
    except Exception as requests_error:
        print(f"Failed to fetch HTML {label} using Requests due to: {requests_error}")

    # Attempt 2: real browser via WebDriver
    try:
        page = fetch_html_using_webdriver(url)
        print(f"Fetched HTML {label} using WebDriver")
        return page
    except Exception as webdriver_error:
        print(f"Failed to fetch HTML {label} using WebDriver due to: {webdriver_error}")

    # Neither attempt produced a page
    return None

def to_bs4(html_content):
    """
    Parses HTML markup into a BeautifulSoup tree.

    :param html_content: The HTML content to parse.
    :return: BeautifulSoup object built with the "html.parser" parser.
    """
    parser_name = "html.parser"
    soup = BeautifulSoup(html_content, parser_name)
    return soup

In [2]:
# CSS selectors for each field of a LinkedIn job posting page.
_LINKEDIN_SELECTORS = {
    "company": "div.sub-nav-cta__sub-text-container > a",
    "location": "div.sub-nav-cta__sub-text-container > span",
    "job_title": "div.sub-nav-cta__text-container > h3.sub-nav-cta__header",
    "job_description": "div.description__text > section > div",
}

# CSS selectors for each field of an Indeed job posting page.
_INDEED_SELECTORS = {
    "company": "div[data-testid='inlineHeader-companyName']",
    "location": "div[data-testid='inlineHeader-companyLocation']",
    "job_title": "h1[data-testid='jobsearch-JobInfoHeader-title']",
    "job_description": "div#jobDescriptionText",
}


def _parse_job_posting(html: str, selectors: dict) -> dict:
    """
    Extracts job posting fields from HTML using the given per-field CSS selectors.

    :param html: The HTML content of the job posting page.
    :param selectors: Mapping with the keys "company", "location", "job_title"
        and "job_description", each holding a CSS selector.
    :return: Dict with the keys "job_title", "company", "location" and
        "job_description".
    :raises IndexError: If a selector matches nothing. The type is the same one
        the previous ``select(...)[0]`` lookup raised, but the message now names
        the missing field and its selector.
    """
    soup = to_bs4(html)

    def first_match(field):
        selector = selectors[field]
        element = soup.select_one(selector)
        if element is None:
            raise IndexError(
                f"No element found for '{field}' using selector: {selector}"
            )
        return element

    company = first_match("company").text.strip()
    location = first_match("location").text.strip()
    job_title = first_match("job_title").text.strip()

    # The description is assembled from the direct children of its container,
    # one per line; runs of newlines are then collapsed to a single newline.
    description_root = first_match("job_description")
    job_description = "\n".join(child.text for child in description_root.children)
    job_description = re.sub(r"\n+", "\n", job_description).strip()

    return {
        "job_title": job_title,
        "company": company,
        "location": location,
        "job_description": job_description,
    }


def parse_job_posting_linkedin(html: str):
    """
    Parses a LinkedIn job posting page.

    :param html: The HTML content of the page.
    :return: Dict with "job_title", "company", "location" and "job_description".
    :raises IndexError: If an expected element is not present in the page.
    """
    return _parse_job_posting(html, _LINKEDIN_SELECTORS)


def parse_job_posting_indeed(html: str):
    """
    Parses an Indeed job posting page.

    :param html: The HTML content of the page.
    :return: Dict with "job_title", "company", "location" and "job_description".
    :raises IndexError: If an expected element is not present in the page.
    """
    return _parse_job_posting(html, _INDEED_SELECTORS)

In [3]:
def import_from_url(url):
    """
    Fetches a job posting URL and parses it with the parser for its site.

    The site is chosen from the URL's hostname (a "linkedin" or "indeed" label,
    e.g. ``www.linkedin.com`` or ``ca.indeed.com``), not from the whole URL, so
    tracking parameters that mention another site cannot select the wrong
    parser. Unsupported URLs are rejected before any network access.

    :param url: The URL of the job posting.
    :return: The parsed job posting dict on success; otherwise a string starting
        with "Failed to import from url due to" followed by the error message.
    """
    from urllib.parse import urlparse

    try:
        host_labels = (urlparse(url).hostname or "").split(".")
        if "linkedin" in host_labels:
            parse = parse_job_posting_linkedin
        elif "indeed" in host_labels:
            parse = parse_job_posting_indeed
        else:
            raise ValueError("Unsupported job posting URL")

        html_content = fetch_html(url)
        if html_content is None:
            # fetch_html signals total failure with None; report that directly
            # instead of letting the parser fail on a None input.
            raise ValueError("Could not fetch HTML content")

        return parse(html_content)
    except Exception as e:
        return f"Failed to import from url due to {str(e)}"

In [7]:
# Manual smoke test: fetch one posting, parse it, and show both the parsed data
# and the raw page. Swap the active `url` (and the parser below) to try another
# sample.

# LinkedIn
# url = "https://www.linkedin.com/jobs/view/4023953957/?eBP=CwEAAAGSF707ObQCMY82VUMK3ANR3h_axbkGZjGbdhm7XAXPqohuwQsLyfHMwRmrGRNolcKSmAsFGjHSzQsLhp8wcKwqpYR8ok1ymzgYS9xrVQT3njRv2ClHmFQ_92EbPOZBxGraUzrf6-FkvN581EDJLnn6tZmINeNTYn72x3fo1PJCDQcWA1dT2dydE9PzM19Waa9wxvnRThQMOfRsc4kNNOZEPi8W_9wE0A7z8YXE7WF7_8jCa-OkNPUmuPHamOA9XhwP_HS5IzqxE2zUJDHNLhn2F2JqPbxXqohFV2OnQfBdiwc7J3lmCXpMbyRdCc9YxW0e4PaMs9EYXsj7zcHN5ZZ0TatQ0dOOgqVJAU3vfk5sB3IbJZxXLoPOERrsnzi74F8mgdGV0qHjJuhn_nT7VAZU-agf850Jy5HwXNt8OaksRAcs4TunapyfDJaRv"
url = "https://www.linkedin.com/jobs/view/4029494858/?eBP=NOT_ELIGIBLE_FOR_CHARGING&refId=5eWuEhVJjv1DTtpUZ%2B6zYA%3D%3D&trackingId=eDKfaKdvhTSNvWRbOJxCfg%3D%3D&trk=flagship3_search_srp_jobs"

# Indeed
# url = "https://ca.indeed.com/viewjob?jk=e2886f2049fab19d&tk=1i8fdfr3ii0ju801&from=hp&advn=6478378941413849&adid=436307881&ad=-6NYlbfkN0C6j33RemckVVehcfj4bL0-vd56yrrXbDcl1-irHbPaNGkNCbS2MpZXNWKfuAJAOpvZMJAmQvEEIQWiWUKVSXfp7Q11JlGVObDRcZ-3n0Q-DnZh0KUWUWSU14qyraLF8pafx1lWNutI29jW8vkjlakBEx4Hq_K1uK3oMIv8aBqqmV8fLQ-GkPluHF6oj4yQc1PrBuFyptLkU13EqUTaQeavPn6KQeH4Oeiwjz2ZFeV5ZMDD24LSas9gid-mxjpPgqnuf8r2CPy0DvyH1Apk3O49veJN848a8Qzm2ENjyuSxFS6MUyt6vTUUbf8CmGaG5GOfukfgQa1HronI21KHI9vEhvaxILgwvrrr1nvsUnYt8bmxw8tqJ8byB_kfG5vmuSYHivX7yrzut3ehu02HkBtSZ_RCCF0o0XkFgbEozjqqMCHNVBgL0HVg6-fc_LubczponczCPUQ8S6xnx93hGA3-a5QeUUajlfSX6sxAQairqeayAW1J6_viREudz3RlEO58hy3jOti8DdPyB7GPd7ONwJlkFTsw41jIzN78iYN69YzXrLba0YTDSUKNXQRdDRZR1FdLMNwaeA%3D%3D&pub=4a1b367933fd867b19b072952f68dceb&camk=nUmJqO2E8rglK4IRZXb6Lg%3D%3D&xkcb=SoC46_M37PSwq8gM8D0LbzkdCdPP&xpse=SoB-6_I37PTEPcRwRJ0ObzkdCdPP&xfps=74ac8182-cd45-4566-a835-b5ea17c552ff&vjs=3"
# url = "https://ca.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0C6j33RemckVVehcfj4bL0-vd56yrrXbDcl1-irHbPaNGkNCbS2MpZXNWKfuAJAOpvZMJAmQvEEIQWiWUKVSXfp7Q11JlGVObDRcZ-3n0Q-DsTtJNo8reDwWP9tEtqDVGp4gak-6_Dbny8WC_RCU2FxssDW2xKMsjNKXWY4OXoeMXkoUpGFUgxdl6keyf3aw-b1S6j-6PXHxYwav3kZTKixfqS2AFIfvIhdRJLJz_Le9ZRu-3Oe5EfN2kXuMTk0OaNWOalZF-Y27JHbaYCCLYko1CsV6tevlQFPv7kUPnnED7901yQZ12rZtjTB-xs79JGSk-DEjKsczPSVUpHa86Pl5-S9JeJX8JuJ2d6QrlYOjrvjSE0dhHBz7Wwyz9FB9Hqx4j5Vi8fsJz_0OFwtjSne4rgUQk9hPddYgXU0TMFkurgagKOrXo8xP2q9n9ueF1__T8GFaj60YcJ9wkQEk1R4VdP7R2vj44XJz84hNkumSl9L7qtirQBVYUA9bIck_7O2FHlooCKaUMO58ggIiscczhwXM6Hu8HPWAUkJd_gT5ELIYHLSWsehO2fOvGzVLX24eihxlnh7wj-neaPwpr4NkgpBBCKq32BxwRW6BtYLiP_FojaR3cT67xaVbbl_51Zex_wDBKn1RJS9tYyNVF1qtsN4fotEI44Fbm72HuU9sg%3D%3D&xkcb=SoC46_M37PSwq8gM8D0LbzkdCdPP&camk=nUmJqO2E8rglK4IRZXb6Lg%3D%3D&from=hp&tk=1i8fdfr3ii0ju801&jsa=242&vjs=3"

html_content = fetch_html(url)
if html_content is None:
    # fetch_html returns None when both the Requests and WebDriver attempts
    # fail; stop with a clear message rather than passing None to the parser
    # and to HTML() below.
    raise RuntimeError(f"Could not fetch HTML for {url}")

job_posting_data = parse_job_posting_linkedin(html_content)
# job_posting_data = parse_job_posting_indeed(html_content)

print(job_posting_data)
# Debugging aid: renders the fetched page's markup inline in the cell output.
display(HTML(html_content))

Fetched HTML https://www.linkedin.com/...flagship3_search_srp_jobs using Requests
{'job_title': 'Software Developer [Co-op]', 'company': 'Varian', 'location': 'Winnipeg, Manitoba, Canada', 'job_description': "Together, we can beat cancer.\nAt Varian, a Siemens Healthineers Company, we bring together the world's best talent to realize our vision of a world without fear of cancer. Together, we work passionately to develop and deliver easy-to-use, efficient oncology solutions.\nWe are part of an incredible community of scientists, clinicians, developers, researchers, professionals, and skilled specialists pushing the boundaries of what’s possible, to improve people’s lives around the world. We embrace a culture of inclusivity in which the power and potential of every individual can be unleashed. We spark ideas that lead to positive impact and continued success.\nIf you want to be part of this important mission, we want to hear from you.\nIn your role as Cloud Software Developer at Varian,