# JOB OFFERS DATA (part 1: extraction of the job ID and URL)

### 1. Setting up for web scraping.

In [1]:
# Load necessary libraries.
import time
import selenium 
import pandas as pd
import os
from datetime import datetime
from time import sleep
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [2]:
# Shared configuration and mutable state used by the helper functions below.

# Human-readable job titles; the URL query form swaps each space for '+'.
job_titles = ['registered nurse', 'electrician', 'data analyst']
jobs = [title.replace(' ', '+') for title in job_titles]

# Search URL template: first placeholder is the query, second the `start` offset.
pagination_url = (
    'https://ie.indeed.com/jobs'
    '?q={}&l=Dublin%2C+County+Dublin'
    '&radius=25&filter=0&sort=date&start={}'
)

# Per-job state: filled by the scraping helpers and reset between searches.
job_list = []
max_iter_pgs = 0

# Output location (machine-specific absolute path) and per-job file names.
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df_name = ['df_rn', 'df_e', 'df_da']
csv_file_name = [
    'data_jobads_rn.csv',
    'data_jobads_e.csv',
    'data_jobads_da.csv',
]

In [3]:
# Define a custom function that verifies how many positions are available for the specified job and how many pages can be iterated.
def get_job_info(job_to_look, job_print):
    """Report how many vacancies are listed for a job and derive the page count.

    Opens the first results page for the query in an incognito Chrome
    session, reads the text of the job-count element, stores the derived
    number of pages in the global ``max_iter_pgs`` and prints a summary.

    Args:
        job_to_look: Query string substituted into the first placeholder of
            ``pagination_url`` (e.g. ``'data+analyst'``).
        job_print: Label printed (upper-cased) as the heading of the summary.

    Returns:
        None. The result is published through the global ``max_iter_pgs``.

    Raises:
        ValueError: If the first token of the job-count text has no digits.
        Any exception raised by Selenium while loading the page or locating
        the element propagates; the browser is closed first.
    """
    # Only `max_iter_pgs` is rebound here; `pagination_url` is merely read.
    global max_iter_pgs

    # NOTE(review): 15 is presumably the number of ads shown per results
    # page, while scrape_job_details advances `start` by 10 - confirm that
    # the two values are meant to differ.
    results_per_page = 15

    # Set up Chrome webdriver options.
    option = webdriver.ChromeOptions()
    option.add_argument("--incognito")

    # Date shown in the summary, and the start of the timing window.
    current_date = datetime.now().date().strftime('%B %d, %Y')
    start = time.time()

    # Initialize Chrome driver.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
    try:
        driver.get(pagination_url.format(job_to_look, 0))

        # Random pause before reading the page.
        sleep(randint(4, 9))
        job_number = driver.find_element(By.CLASS_NAME, 'jobsearch-JobCountAndSortPane-jobCount').text
    finally:
        # Always close the browser, even when the lookup above fails.
        driver.quit()
    end = time.time()

    # Keep only the digits of the leading token so that values such as
    # "1,234" or "1,000+" parse instead of crashing int().
    count_token = job_number.split(' ')[0]
    digits = ''.join(ch for ch in count_token if ch.isdigit())
    if not digits:
        raise ValueError(f'Could not read a job count from {job_number!r}.')
    max_iter_pgs = int(digits) // results_per_page

    # Print results.
    print(f'{job_print.upper()}')
    print(f'Total number of vacancies available in Dublin area on {current_date}: {job_number}.')
    print('Maximum number of iterable pages for the search:', max_iter_pgs, 'pages')
    print('\n')
    print('Action was completed in:', end - start, 'seconds.')

In [4]:
# Define a custom function that will extract data from the web page and collect the information for the specified job in a list.
def scrape_job_details(job_to_look):
    """Scrape title, id, link and posting date of every ad for one job query.

    Visits ``max_iter_pgs`` result pages (``start`` offsets 0, 10, 20, ...)
    and appends one ``[title, id, link, date]`` row per ad to the global
    ``job_list``. Afterwards prints the number of collected rows, the first
    two rows as a sanity check, and the elapsed time.

    Args:
        job_to_look: Query string substituted into the first placeholder of
            ``pagination_url`` (e.g. ``'data+analyst'``).

    Returns:
        None. Rows are appended to the global ``job_list`` in place.

    Raises:
        Any exception raised by Selenium while loading a page or locating
        the result container, title or link propagates; the browser is
        closed first and rows collected so far remain in ``job_list``.
    """
    # `job_list` is mutated in place and `max_iter_pgs` / `pagination_url`
    # are only read, so no `global` declarations are required.

    # Start of the timing window.
    start = time.time()

    # Set up Chrome WebDriver.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        sleep(randint(4, 9))

        # Loop through the result pages and extract the details of each ad.
        for i in range(max_iter_pgs):
            driver.get(pagination_url.format(job_to_look, i * 10))

            # Random pause before reading the page.
            sleep(randint(4, 9))

            job_page = driver.find_element(By.ID, 'mosaic-jobResults')
            job_posts = job_page.find_elements(By.CLASS_NAME, 'job_seen_beacon')

            for job_post in job_posts:
                job_title = job_post.find_element(By.CLASS_NAME, 'jobTitle')

                # Look the anchor up once; it carries both the id and the link.
                anchor = job_title.find_element(By.CSS_SELECTOR, 'a')
                job_id = anchor.get_attribute("id")
                job_link = anchor.get_attribute("href")

                # The date is best-effort: fall back to a placeholder on failure.
                try:
                    job_date = job_post.find_element(By.CLASS_NAME, 'date').text
                except Exception:
                    job_date = 'not available'

                # Append job details to the job_list.
                job_list.append([job_title.text, job_id, job_link, job_date])
    finally:
        # Always close the browser, even when a page fails mid-scrape.
        driver.quit()
    end = time.time()

    # Check results.
    print(f'{job_to_look.upper()} JOB ADS')
    print(f'The total count of scraped job vacancies: {len(job_list)} jobs.\n')
    for x in range(min(2, len(job_list))):
        print(f'JOB AD NO.{x + 1}:')
        print(job_list[x][0])
        print(job_list[x][1])
        print(job_list[x][2])
        print(job_list[x][3])
    print('\n')

    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [5]:
# Define a custom function that creates a new DataFrame from the scraped job list, transforms the data, and exports it as a CSV file.
def df_create_export_csv(new_df, csv):
    """Build a DataFrame from the scraped rows, tidy it and write it as CSV.

    The rows are read from the global ``job_list``. Titles are lower-cased
    and every ``'Posted\\n'`` occurrence is removed from the date text. The
    file is written to ``os.path.join(directory, csv)`` without the index.

    Args:
        new_df: Ignored; kept so existing calls keep working. The frame is
            always built from ``job_list``.
        csv: File name of the CSV to create inside the global ``directory``.

    Returns:
        None. The result is the file on disk plus a printed confirmation.
    """
    # Both globals are only read here, so no `global` declarations are needed.
    column = ['title', 'id', 'link', 'date']
    frame = pd.DataFrame(job_list, columns=column)

    # Vectorised clean-up instead of a per-row loop.
    frame['title'] = frame['title'].str.lower()
    # regex=False: treat 'Posted\n' as literal text, not as a pattern.
    frame['date'] = frame['date'].str.replace('Posted\n', '', regex=False)

    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv)

    # Export the DataFrame to CSV file.
    frame.to_csv(file_path, index=False)

    print(f"The raw data was transformed and exported successfully as {file_path}.")

### 2. Web scraping
**registered nurse ads**

In [6]:
# Check the job availability on Indeed.com for the registered-nurse query.
# Prints the vacancy count and stores the derived page count in the global
# `max_iter_pgs`, which the scraping call below loops over.
get_job_info(jobs[0], job_titles[0])

REGISTERED NURSE
Total number of vacancies available in Dublin area on January 10, 2024: 580 jobs.
Maximum number of iterable pages for the search: 38 pages


Action was completed in: 14.394474983215332 seconds.


In [7]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
# The number of pages visited is the current value of `max_iter_pgs`, so the
# `get_job_info` call above must have run first.
scrape_job_details(jobs[0])

REGISTERED+NURSE JOB ADS
The total count of scraped job vacancies: 564 jobs.

JOB AD NO.1:
Assistant Director of Nursing
sj_3c7e64c7996bb9d6
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BI8IerbVtBNz9lxt-gGwgL1yqzg7rkL65oBt9kXiGchRfMu5HD70ENCeZDeMdLuF3Q6xzcJEOdJs3kZF1vPa7hsSvWDw387HGAhy09ybLyjuHgMbirifct4XvSysIpp9rS6Ba6AU4LkoNkAgvufXadgt9JcsvxlhJuDKpnvDOFeBo5dzQWo7pFgkc2ydoCMEtW2iWHYK8s0NBZGIKl4r7fRa5BSkndStcyPVvOs74vHKeTHQ7SUXy9Jvocb78gEg_dj47DKiNE4DkFJO0_gAKH8J7zdVJ2_SA0SWcKExPMVuj8cvElgzB1sNgtlr6Tj_tvJYbJguf9j_ZbW40QusqIW71ACZtcIEQtNDB2ACq92fxa2jeykoUuFHgXBCVybxWvDPNxEKC8ZR-CQ2U-uxIe24BspMOf_Q7ubq8y7Rn2TC_ww1c3qeBtg0-HG1nayqXJlu8_Q8JQxpi2Orw0JMwSvDbknG-Palp8oXVzt3wQSEGXv7POm22gus8PSdFTRhqn53K7WFof92B7UKnzoXDCOCUog7ovhXz-0jUTkR7jGF9BISOe6IYI7eSukOK5IeKuwIwAbEZoHgZeUmjLEFn634S8b_7FwUAq9oF0v_2kip74rvbHu1veWC9xeqPIVLR7LE43bTRiSpyNt1sryfSapz_GPLdBSSY=&xkcb=SoAG6_M3G50qpcR9Np0LbzkdCdPP&p=0&fvj=1&vjs=3
Posted
Just posted
JOB AD NO.2:
Clinical Nurse Manager (CNM)
sj_358f1f68cde928c4
ht

In [8]:
# Transform the scraped data and export it as a CSV file.
# The frame is built from the global `job_list`; the first argument is only a
# label, and the second is the file name created inside `directory`.
df_create_export_csv(df_name[0], csv_file_name[0])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn.csv.


In [9]:
# Clear the per-job state so the next search starts with an empty result
# list and a page count of zero.
job_list = []
max_iter_pgs = 0

**electrician ads**

In [10]:
# Check the job availability on Indeed.com for the electrician query.
# Prints the vacancy count and stores the derived page count in the global
# `max_iter_pgs`, which the scraping call below loops over.
get_job_info(jobs[1], job_titles[1])

ELECTRICIAN
Total number of vacancies available in Dublin area on January 10, 2024: 166 jobs.
Maximum number of iterable pages for the search: 11 pages


Action was completed in: 15.139153242111206 seconds.


In [11]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
# The number of pages visited is the current value of `max_iter_pgs`, so the
# `get_job_info` call above must have run first.
scrape_job_details(jobs[1])

ELECTRICIAN JOB ADS
The total count of scraped job vacancies: 155 jobs.

JOB AD NO.1:
Industrial Electrician
sj_7cba7a465e6641fe
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AnEYSss4eDbTLySY2p2efwzP3CilC5Xfeyf166GCcN9JJMj_EWM2A3xodspN8Pi50AQZK4e0X589pQszonXkMkH0RQ88koQVn7ZT1ssPBpQ_NkkqQhHLy474KArTvD5gBqTeIgUZnCJu_dKf5oe4HAkKcLEP2oyZYldxw_5f6gvn14xcRPMRza8-QyIqzv0byDd4PvDXXzJE4qOMdLC5mFAHmK1MYBMw5p7DiJwb-0l9Wn2nPXXR0fXfDw3ZGgpTK2fEod5hefk8SHbbBTg5XXqH-cqkCUwZQ9x2EWDy0u4UzQIublT1xvmjgu6FKChkkSwYiOeLdBpClFaBomH_21iQ0g5LHJAjhpQKbe5Rsk1qWO2CAoxB8uzG0hGry5nE_Kk10xn7b1s6IhQqZOo55YeLj-GzbQ2w-DTtb3Sp4X3C8eZfASRrQDZn5dl8gS8f63CC-Vla0OSTSqrKEexz5CHQ9huC6i-HpqLL5cp0JRmXoTInszOBMVKdAONBilh2S8z-AKr5sldSdsgOg8u-rA4qQggC-Z_S6yGMKpGlEqi3SmevB2y9AV74UJrhYkGLBICcE3LTwSgJcgH953pBiTYB-Ykz2oCw3HwOcSIklmmHLJtX1pEGpD9CH-Kc449-wE79tKz3lqJA==&xkcb=SoBz6_M3G51AfAR9Np0LbzkdCdPP&p=0&fvj=1&vjs=3
Posted
Just posted
JOB AD NO.2:
Apprentice Electrician
job_a6937c5385359d51
https://ie.indeed.com/rc/clk?jk=a6937c5

In [12]:
# Transform the scraped data and export it as a CSV file.
# The frame is built from the global `job_list`; the first argument is only a
# label, and the second is the file name created inside `directory`.
df_create_export_csv(df_name[1], csv_file_name[1])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e.csv.


In [13]:
# Clear the per-job state so the next search starts with an empty result
# list and a page count of zero.
job_list = []
max_iter_pgs = 0

**data analyst ads**

In [14]:
# Check the job availability on Indeed.com for the data-analyst query.
# Prints the vacancy count and stores the derived page count in the global
# `max_iter_pgs`, which the scraping call below loops over.
get_job_info(jobs[2], job_titles[2])

DATA ANALYST
Total number of vacancies available in Dublin area on January 10, 2024: 342 jobs.
Maximum number of iterable pages for the search: 22 pages


Action was completed in: 12.003286123275757 seconds.


In [15]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
# The number of pages visited is the current value of `max_iter_pgs`, so the
# `get_job_info` call above must have run first.
scrape_job_details(jobs[2])

DATA+ANALYST JOB ADS
The total count of scraped job vacancies: 315 jobs.

JOB AD NO.1:
Inventory Analyst
sj_d72c4cf42121ca22
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AnxfxssoujI3GvYsxvmNCzxzm6VnPtwq4vmbotio0JvQP4qXO3kiNsl1yUmVNF84MzBzmk8-uNcalZeVGMR2tlbw0j0ghDB32zhGcWEZiFsh6Zhjvcky-uhasSq90-Q5tQuwoeDbw8SltNynq_LNJvZUMRCpbvjFC48w4dMABSo4ebojJMmZ--dHZHBiOPE135_T6Xa7ud4X-cfbHmQ6gOKJv1AWhNFU4c5f5OSZMb91QEb925KxbCkocbzbpn9dQx3uhvO9W8KGpcJhKtOfLETuEJsfe-UuHiydeNE0qvY3PL0wvF7dhZ_XawpnVmj6C9DhBC5cnxriFAefUXHrWNQmj-WT9vVpoL95aOJdIc7s7y1ypjJI3yNWLyvdOVs6umuVRUmwN_uqs6ePa9Dc4h8SHkPM_GPMXsZLziJ3wN7iIsmpT6cHqk2BA4lWfDKw0S5d6TckE8Xzv-8UyHiDSQaOsejKEjKLi1uYZCRQAnHUSP5nZLrZdqQ18ZggJQSwBOJNIXuo3CsOYtW0UYGcMiBfWpdPuCYCdmtzEAcKghJZr_S1MYHgV56H-H6wpXyZ44S6SmdX4gn64qpFPnTP6sWpx5E6Pd6TpjCYvf9GoRuaeJDPvU4xDHpCQpEE2ih4frPZwuYx3gDPR5XMbklqnoNQaAOZZ4X18oOwIhhmHa4kVpQor0_f9mKTUHw5oxri0AkpEu-qcHoQ==&xkcb=SoDk6_M3G51RjlQHEx0LbzkdCdPP&p=0&fvj=0&vjs=3
Posted
Just posted
JOB AD NO.2:
Defect Control Engineer

In [16]:
# Transform the scraped data and export it as a CSV file.
# The frame is built from the global `job_list`; the first argument is only a
# label, and the second is the file name created inside `directory`.
df_create_export_csv(df_name[2], csv_file_name[2])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da.csv.
