In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from lxml import html
import random
import requests
import csv
import re

# Act: Initialize lists

In [2]:
# Initialize lists to store the extracted data
job_names = []
total_openings_list = []
job_description_list = []
required_qualifications_list = []
preferred_qualifications_list = []
relocation_options_list = []
international_consideration_list = []

# Def: Extract Job Name

In [3]:
def read_csv_and_extract_job_names(csv_file_path):
    with open(csv_file_path, 'r') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter='|')
        header = next(csv_reader)  # Skip the header row
        for row in csv_reader:
            job_names.append(row[0])  # Assuming the job names are in the first column

    return job_names

# Def: Extract URLs

In [4]:
def read_csv_and_extract_urls(csv_file_path):
    urls = []
    with open(csv_file_path, 'r') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter='|')
        header = next(csv_reader)  # Skip the header row
        for row in csv_reader:
            urls.append(row[1])  # Assuming the job URLs are in the second column

    return urls

# Def: Retrieve HTML content from a URL

In [5]:
def get_job_html(job_url):
    response = requests.get(job_url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to retrieve HTML content for job URL: {job_url}")
        return None

# Def: Extract Description Text

In [6]:
def extract_description_text(job_html):
    soup = BeautifulSoup(job_html, 'html.parser')

    # Try finding the first div with class 'ats-description ajd_job-details__ats-description'
    description_div = soup.find('div', class_='ats-description ajd_job-details__ats-description')

    if not description_div:
        # If not found, try finding a div with class 'job-description'
        job_description_div = soup.find('div', class_='job-description')
        
        if job_description_div:
            # If 'job-description' div is found, look for 'ats-description' within it
            description_div = job_description_div.find('div', class_='ats-description')
        else:
            description_div = job_description_div            

    # Check if a div element was found
    if description_div:
        # Extract all the text content under the div
        description_text = description_div.get_text(separator=' ')  # Use space as a separator
        description_text = ' '.join(description_text.split())  # Remove extra spaces and newline characters
        return description_text.strip()
    else:
        return "N/A - Description Text from job_html"

# Def: Total Openings

In [7]:
def extract_total_openings(description_text):
    # Define a regular expression pattern to match the total openings
    pattern = r"Total Number of Openings\s*(\d+)"

    # Use re.search to find the pattern in the description text
    match = re.search(pattern, description_text)

    if match:
        total_openings = match.group(1)  # Extract the captured number

        # Remove the information about "Total Number of Openings" and the captured number
        description_text = re.sub(pattern, "", description_text).strip()

        return total_openings, description_text
    else:
        return "N/A - Extract Total Openings", description_text  # Return default values if not found

# Def: Job Description and Responsibilities

In [8]:
def extract_job_description(description_text):
    # Define a list of potential delimiters to split by
    delimiters = [
        "Required Qualifications:",
        "Required Qualifications",
        "Required qualifications",
        "Required Skills and Experience:",
        "JOB QUALIFICATIONS:",
        "Requirements",
        "Relocation Options:",
    ]

    for delimiter in delimiters:
        parts = description_text.split(delimiter, 1)
        if len(parts) > 1:
            job_description = parts[0].strip()  # Text before the delimiter
            description_text = delimiter + parts[1].strip()  # Text after the delimiter
            return job_description, description_text

    # If none of the delimiters were found, return the "N/A" message
    return "N/A - Extract Job Description", "N/A - Extract Job Description"

# Def: Qualifications

In [9]:
def extract_qualifications(description_text):
    # Check if "Relocation Options:" exists in the description text
    if "Relocation Options:" in description_text:
        # Split the description text by "Relocation Options:"
        parts = description_text.split("Relocation Options:")
        qualifications = parts[0].strip()  # Text before Relocation Options
        description_text = parts[1].strip()  # Text after Relocation Options
    else:
        # Find the index of the first instance of "Chevron"
        chevron_index = description_text.lower().find("chevron")
        if chevron_index != -1:
            # Split the description text at the first instance of "Chevron"
            qualifications = description_text[:chevron_index].strip()
            description_text = description_text[chevron_index:].strip()
        else:
            qualifications = "N/A - Extract Qualifications"

    return qualifications, description_text

# Def: Relocation Options

In [10]:
def extract_relocation_options(description_text):
    # Split the description text by "International Considerations:"
    parts = description_text.split("International Considerations:")
    if len(parts) > 1:
        relocation_options = parts[0].strip()  # Text before Relocation Options
        description_text = parts[1].strip()  # Text after Relocation Options
        
        return relocation_options, description_text
    else:
        return "N/A - Extract Relocation", description_text

# Def: International Consideration

In [11]:
def extract_international_considerations(description_text):
    # Define a list of potential delimiters to split by
    delimiters = [
        "IMPORTANT NOTE:",
        "Regulatory Disclosure for US Positions",
        "Flexible Working",
        "Chevron offers"
    ]

    for _ in range(2):  # Repeat the process twice
        for delimiter in delimiters:
            if delimiter in description_text:
                # Split the description text by the delimiter
                parts = description_text.split(delimiter)
                international_considerations = parts[0].strip()  # Text before the delimiter
                description_text = delimiter + parts[1].strip()  # Text after the delimiter
                return international_considerations, description_text

    # If none of the delimiters were found, return the "N/A" message
    return "N/A - Extract International Considerations", description_text

# Action: Get the URLs

In [16]:
# Define the CSV file path for reading and writing
csv_file_path = 'job_information.csv'

# Read the CSV file and extract URLs
urls = read_csv_and_extract_urls(csv_file_path)

# Print the number of URLs read
print(f"Number of URLs read: {len(urls)}")

Number of URLs read: 69


# Action: Create the job_html_list (Time Consuming)

In [17]:
# Create a list to store the job_html
job_html_list = []

# Iterate through the URLs and extract job_html for each job URL
for job_url in urls:
    # Get HTML content from the job URL
    job_html = get_job_html(job_url)

    # Append the job_html to the list along with the URL
    job_html_list.append([job_url, job_html])

Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/houston/administrative-assistant/38138/55025779888
Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/midland/instrumentation-electrical-and-power-technician/38138/54987842384
Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/midland/mcbu-reliability-specialist/38138/54942306592
Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/houston/senior-ccus-policy-advisor/38138/54905477120
Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/pascagoula/machinist-maintenance-mechanic-trainee/38138/54888146736
Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/pascagoula/instrument-and-electrical-maintenance-mechanic-trainee/38138/54888145344
Failed to retrieve HTML content for job URL: https://careers.chevron.com/job/bakersfield/land-representative/38138/54759402864
Failed to retrieve HTML con

# Test: Test Subject

In [14]:
# Define the CSV file path
# csv_file_path = 'job_information.csv'

# Extract job names and URLs using the provided functions
# job_names = read_csv_and_extract_job_names(csv_file_path)
# urls = read_csv_and_extract_urls(csv_file_path)
# url = 'https://careers.chevron.com/job/albert-lea/biodiesel-operator-loader/38138/55270585744'

# Get the job HTML
job_html = get_job_html(urls[67])
print("URL:", urls[67])
print()

# Extract data from HTML
description_text = extract_description_text(job_html)
print("Description_text:", description_text)
print()

# Call the function to extract total openings
total_openings, description_text = extract_total_openings(description_text)
print("Total Openings:", total_openings)
print()

# Call the function to extract the updated description text
job_description, description_text = extract_job_description(description_text)
print("Job Description:", job_description)
print()

# Call the function to extract the required and preferred qualifications
qualifications, description_text = extract_qualifications(description_text)
print("Qualifications:", qualifications)
print()

# Call the function to extract the relocation options
relocation_options, description_text = extract_relocation_options(description_text)
print("Relocation Options:", relocation_options)
print()

# Call the function to extract the relocation options
international_considerations, description_text = extract_international_considerations(description_text)
print("International Considerations:", international_considerations)
print()

URL: https://careers.chevron.com/job/houston/senior-application-engineer/38138/43808365120

Description_text: Chevron is accepting online applications for the position Senior Application Engineer Join our Team Chevron’s strategy is straight-forward: be a leader in efficient and lower carbon production of traditional energy, in high demand today and for decades to come, while growing lower carbon businesses that will be a bigger part of the future. To achieve these goals, we’ll build on the assets, experience, capabilities, and relationships we’ve developed over 140 years to incubate and grow new business. Technology will play a crucial role in unlocking ever cleaner and more affordable sources of energy. Chevron is seeking innovative, technology professionals with a desire to thrive in the global digital environment and help us lead the global energy transition. An IT career at Chevron offers you the opportunity to work in a technical environment with a global reach. You’ll find that w

# Action: Extract the data from the HTML

In [15]:
# Create a list to store updated rows
updated_rows = []

# Iterate through job_html_list and extract information
for job_url, job_html in job_html_list:
    # Extract data from HTML
    description_text = extract_description_text(job_html)

    # Call the function to extract total openings
    total_openings, description_text = extract_total_openings(description_text)

    # Call the function to extract the updated description text
    job_description, description_text = extract_job_description(description_text)

    # Call the function to extract the required and preferred qualifications
    qualifications, description_text = extract_qualifications(description_text)

    # Call the function to extract the relocation options
    relocation_options, description_text = extract_relocation_options(description_text)

    # Call the function to extract the international considerations
    international_considerations, _ = extract_international_considerations(description_text)

    # Append the extracted data to the list
    updated_row = [job_url, total_openings, job_description, qualifications, relocation_options, international_considerations]
    updated_rows.append(updated_row)
    
    # Remove all instances of "\n" from the elements of updated_rows
    for i, row in enumerate(updated_rows):
        updated_rows[i] = [item.replace("\n", " ") if isinstance(item, str) else item for item in row]

TypeError: object of type 'NoneType' has no len()

# Action: Save to the updated_job_information.csv

In [None]:
import csv

def save_updated_rows_to_csv(updated_rows, output_csv_file_path):
    """
    Save the updated rows to a CSV file.

    Args:
        updated_rows (list): List of lists containing updated row data.
        output_csv_file_path (str): Path to the output CSV file.
    """
    # Open the CSV file for writing
    with open(output_csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter='|')

        # Iterate through the updated rows and write them to the CSV file
        for row in updated_rows:
            csv_writer.writerow(row)

# Usage example
output_csv_file_path = 'updated_job_information.csv'
save_updated_rows_to_csv(updated_rows, output_csv_file_path)
print("Data saved to CSV file:", output_csv_file_path)