# Install and Import 

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import random
import requests
import csv

# Mapping for States / Provinces

In [2]:
# State Mapping for ease of use
state_mapping = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    'DC': 'Washington D.C.',
    'AB': 'Alberta', 
    'BC': 'British Columbia', 
    'MB': 'Manitoba', 
    'NB': 'New Brunswick', 
    'NL': 'Newfoundland and Labrador', 
    'NS': 'Nova Scotia', 
    'NT': 'Northwest Territories', 
    'NU': 'Nunavut', 
    'ON': 'Ontario', 
    'PE': 'Prince Edward Island', 
    'QC': 'Quebec', 
    'SK': 'Saskatchewan', 
    'YT': 'Yukon'
}

# Initialize the Webpage

In [3]:
def initialize_webpage(base_url):
    driver = webdriver.Chrome()
    driver.get(base_url)    
    return driver

# Accept Cookies

In [4]:
def accept_cookies(driver, wait):
    try:
        cookie_accept_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')] | //button[contains(text(), 'OK')]")))
        cookie_accept_button.click()
    except Exception as e:
        print(f"An error occurred while accepting cookies: {str(e)}")

# GDPR Alert Check

In [5]:
def gdpr_alert_check(driver, wait):
    try:
        gdpr_alert = driver.find_element(By.ID, "gdpr-alert")
        if gdpr_alert.is_displayed():
            gdpr_button = gdpr_alert.find_element(By.TAG_NAME, "button")
            gdpr_button.click()
        wait.until(EC.presence_of_element_located((By.ID, "search-results-list")))
    except Exception as e:
        print(f"An error occurred while checking GDPR alert: {str(e)}")

# Prep the webpage for use

In [6]:
def load_items(driver, fixed_wait_time):    
    wait = WebDriverWait(driver, fixed_wait_time)
    accept_cookies(driver, wait)
    gdpr_alert_check(driver,wait)    
    job_list = []
    return driver, wait, job_list

# Wait for correct page to load

In [7]:
def wait_for_page(driver, page_number):
    try:
        WebDriverWait(driver, 10).until(
            EC.text_to_be_present_in_element((By.ID, "search-results"), f"data-current-page={page_number}")
        )
        print(f"Page {page_number} loaded successfully.")
    except Exception as e:
        print(f"An error occurred while waiting for page {page_number}: {str(e)}")

# Get page contents and update CSV

In [8]:
def get_page_contents_and_update_csv(driver, job_list, wait, page_number):
    try:
        # Wait for the correct page to load
        wait_for_page(driver, page_number)

        # Extract the data-current-page attribute from the search-results section
        current_page = driver.find_element(By.ID, "search-results").get_attribute("data-current-page")

        # Find the job list element by its ID
        job_list_element = driver.find_element(By.ID, "search-results-list")

        # Use BeautifulSoup to parse the HTML content of the job list element
        soup = BeautifulSoup(job_list_element.get_attribute("innerHTML"), 'lxml')
        
        # Find all list items within the parsed job list element
        job_items = soup.find_all('li')  # Define job_items here

        for job_item in job_items:
            job_name = job_item.find('h2').text.strip()
            job_location = job_item.find('span', {'class': 'job-location'}).text.strip()
            job_link = job_item.find('a')['href']

            city, state_abbr = job_location.split(', ')
            state = state_mapping.get(state_abbr, state_abbr)

            url = f"https://careers.chevron.com{job_link}"
            job_list.append([job_name, url, city, state, current_page, page_number])

        # Save the job information to a CSV file with pipe character as delimiter
        with open('job_information.csv', 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter='|')  # Use '|' as the delimiter

            # Write header row
            csv_writer.writerow(['Job Name', 'URL', 'City', 'State', 'Current Page', 'Page Number'])

            # Write job information rows
            csv_writer.writerows(job_list)
        
    except Exception as e:
        print(f"An error occurred while scraping page {page_number}: {str(e)}")   

# Determine if there is a next page and click if needed

In [9]:
def determine_and_click_next_page(driver, wait):
    try:
        next_page_link = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@class='next']")))
        if 'disabled' in next_page_link.get_attribute("class"):
            return False
        
        # Scroll the next_page_link into view
        actions = ActionChains(driver)
        actions.move_to_element(next_page_link).perform()
        
        # Click the "Next Page" link
        next_page_link.click()
        return True
    except Exception as e:
        print(f"An error occurred while determining and clicking the next page: {str(e)}")
        return False

# Run the above

In [13]:
if __name__ == "__main__":
    # Provide the base_url
    base_url = "https://careers.chevron.com/search-jobs/United%20States/35016/2/6252001/39x76/-98x5/50/2"
    
    # Obtain the html text from the website
    html_text = requests.get(base_url).text

    # Store the information as soup
    soup = BeautifulSoup(html_text, 'lxml')
    
    fixed_wait_time = 10
    driver, wait, job_list = load_items(initialize_webpage(base_url), fixed_wait_time)  # Load items and get wait object
    page_number = 1
    total_pages = int(soup.find('section', {'id': 'search-results'})['data-total-pages'])  # Extract total pages

    # Loop to scrape data from pages
    for page_number in range(1, total_pages + 1):
        get_page_contents_and_update_csv(driver, job_list, wait, page_number)
        determine_and_click_next_page(driver, wait)

    # Close the webpage
    driver.quit()

An error occurred while accepting cookies: Message: element click intercepted: Element is not clickable at point (878, 1252)
  (Session info: chrome=117.0.5938.132)
Stacktrace:
	GetHandleVerifier [0x00007FF6925D7892+54818]
	(No symbol) [0x00007FF692546AC2]
	(No symbol) [0x00007FF6923FDA3B]
	(No symbol) [0x00007FF6924447CB]
	(No symbol) [0x00007FF692442B99]
	(No symbol) [0x00007FF692440968]
	(No symbol) [0x00007FF69243FA23]
	(No symbol) [0x00007FF69243571F]
	(No symbol) [0x00007FF69245EAAA]
	(No symbol) [0x00007FF692435036]
	(No symbol) [0x00007FF69245ECC0]
	(No symbol) [0x00007FF6924775A2]
	(No symbol) [0x00007FF69245E883]
	(No symbol) [0x00007FF692433691]
	(No symbol) [0x00007FF6924348D4]
	GetHandleVerifier [0x00007FF69293B992+3610402]
	GetHandleVerifier [0x00007FF692991860+3962352]
	GetHandleVerifier [0x00007FF692989D4F+3930847]
	GetHandleVerifier [0x00007FF692673646+693206]
	(No symbol) [0x00007FF692551628]
	(No symbol) [0x00007FF69254D934]
	(No symbol) [0x00007FF69254DA62]
	(No sym