# JOB OFFER'S DATA (part 1: extraction of the job id and url)

### 1. Setting up for web scraping.

In [1]:
# Load necessary libraries.
import time
import selenium 
import pandas as pd
import os
from datetime import datetime
from time import sleep
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [2]:
# Shared notebook state: search terms, scraped rows, URL template and export targets.

# Human-readable job titles; the URL search terms are the same titles with '+' instead of spaces.
job_titles = ['registered nurse', 'electrician', 'data analyst']
jobs = [title.replace(' ', '+') for title in job_titles]

# Rows collected by the scraper, and the page count computed for the current job.
job_list = list()
max_iter_pgs = 0

# Search URL template: first placeholder is the search term, second is the `start` offset.
pagination_url = (
    'https://ie.indeed.com/jobs'
    '?q={}&l=Dublin%2C+County+Dublin&radius=25&filter=0&sort=date&start={}'
)

# Export location and per-job names (DataFrame label and CSV file name share an index with `jobs`).
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df_name = ['df_rn', 'df_e', 'df_da']
csv_file_name = [
    'data_jobads_rn_20jan.csv',
    'data_jobads_e_20jan.csv',
    'data_jobads_da_20jan.csv',
]

In [3]:
# Define a custom function that verifies how many positions are available for the specified job and how many pages can be iterated.
def get_job_info(job_to_look, job_print):
    """Read the vacancy count for one search term and derive the page count.

    Loads ``pagination_url`` formatted with ``job_to_look`` and start offset 0
    in an incognito Chrome session, reads the text of the job-count element,
    stores the resulting number of pages in the global ``max_iter_pgs`` and
    prints a short summary including the elapsed time.

    Args:
        job_to_look: Search term inserted into ``pagination_url``
            (e.g. ``'data+analyst'``).
        job_print: Job title printed (upper-cased) as the summary header.

    Raises:
        ValueError: If the first space-separated token of the job-count text
            contains no digits.
    """
    # Only max_iter_pgs is rebound here; pagination_url is merely read.
    global max_iter_pgs

    # Set up Chrome webdriver options.
    option = webdriver.ChromeOptions()
    option.add_argument("--incognito")

    # Specify the date.
    current_date = datetime.now().date().strftime('%B %d, %Y')
    start = time.time()

    # Initialize Chrome driver.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
    try:
        driver.get(pagination_url.format(job_to_look, 0))

        # Random pause before reading the page.
        sleep(randint(4, 9))
        job_number = driver.find_element(By.CLASS_NAME, 'jobsearch-JobCountAndSortPane-jobCount').text
    finally:
        # Always close the browser, even when the page or the element is missing.
        driver.quit()
    end = time.time()

    # Keep only the digits of the first token so that values such as
    # '1,234' or '1,000+' parse as well as a plain '625'.
    count_digits = ''.join(ch for ch in job_number.split(' ')[0] if ch.isdigit())
    if not count_digits:
        raise ValueError(f'Could not read a job count from {job_number!r}.')

    # NOTE(review): 15 is presumably the number of ads per results page - confirm.
    # Floor division means a trailing partial page is not counted.
    max_iter_pgs = int(count_digits) // 15

    # Print results.
    print(f'{job_print.upper()}')
    print(f'Total number of vacancies available in Dublin area on {current_date}: {job_number}.')
    print('Maximum number of iterable pages for the search:', max_iter_pgs, 'pages')
    print('\n')
    print('Action was completed in:', end - start, 'seconds.')

In [4]:
# Define a custom function that will extract data from the web-page and create a table with the information for the specified job.
def scrape_job_details(job_to_look):
    """Scrape the result pages for one search term into the global ``job_list``.

    Visits ``max_iter_pgs`` result pages (start offsets 0, 10, 20, ...) built
    from ``pagination_url`` and, for every ad found on a page, appends
    ``[title text, anchor id, anchor href, date text]`` to ``job_list``.
    Afterwards prints the number of collected rows, the first two rows and
    the elapsed time.

    Args:
        job_to_look: Search term inserted into ``pagination_url``
            (e.g. ``'data+analyst'``).
    """
    # Imported locally because the file's top-level imports do not include these.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    start = time.time()

    # Set up Chrome WebDriver.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    try:
        sleep(randint(4, 9))

        # Loop through each job posting through the pages and extract job details.
        for i in range(max_iter_pgs):
            driver.get(pagination_url.format(job_to_look, i * 10))
            sleep(randint(4, 9))

            job_page = driver.find_element(By.ID, 'mosaic-jobResults')
            job_posts = job_page.find_elements(By.CLASS_NAME, 'job_seen_beacon')

            for job_post in job_posts:
                job_title = job_post.find_element(By.CLASS_NAME, 'jobTitle')

                # Look the anchor up once; it carries both the id and the link.
                anchor = job_title.find_element(By.CSS_SELECTOR, 'a')
                job_id = anchor.get_attribute("id")
                job_link = anchor.get_attribute("href")

                # The date is optional: fall back to a placeholder when it cannot be read.
                try:
                    job_date = job_post.find_element(By.CLASS_NAME, 'date').text
                except (NoSuchElementException, StaleElementReferenceException):
                    job_date = 'not available'

                # Append job details to the job_list.
                job_list.append([job_title.text, job_id, job_link, job_date])
    finally:
        # Always close the browser, even if a page fails part-way through.
        driver.quit()
    end = time.time()

    # Check results.
    print(f'{job_to_look.upper()} JOB ADS')
    print(f'The total count of scraped job vacancies: {len(job_list)} jobs.\n')
    for x in range(min(2, len(job_list))):
        print(f'JOB AD NO.{x + 1}:')
        print(job_list[x][0])
        print(job_list[x][1])
        print(job_list[x][2])
        print(job_list[x][3])
    print('\n')

    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [5]:
# Define a custom function that creates a new DataFrame of given job title, transforms the data, exports it as a CSV file.
def df_create_export_csv(new_df, csv):
    """Build a DataFrame from the global ``job_list``, clean it and write it as CSV.

    The rows of ``job_list`` become the columns ``title``, ``id``, ``link`` and
    ``date``. Titles are lower-cased and the ``'Posted\\n'`` prefix text is
    removed from the dates. The result is written, without the index, to
    ``csv`` inside the global ``directory``, and a confirmation is printed.

    Args:
        new_df: Not used as input; kept so existing calls keep working. The
            DataFrame is always built from ``job_list``.
        csv: File name of the CSV file to create inside ``directory``.
    """
    # Create a new pandas DataFrame from the scraped rows.
    frame = pd.DataFrame(job_list, columns=['title', 'id', 'link', 'date'])

    # Column-wise clean-up replaces the former row-by-row loop
    # (which also lower-cased the title twice).
    frame['title'] = frame['title'].str.lower()
    # regex=False: treat 'Posted\n' as literal text, like str.replace does.
    frame['date'] = frame['date'].str.replace('Posted\n', '', regex=False)

    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv)

    # Export the DataFrame to CSV file.
    frame.to_csv(file_path, index=False)

    print(f"The raw data was transformed and exported successfully as {file_path}.")

### 2. Web scraping
**registered nurse ads**

In [6]:
# Registered nurse: read the vacancy count and set the number of pages to scrape.
get_job_info(job_to_look=jobs[0], job_print=job_titles[0])

REGISTERED NURSE
Total number of vacancies available in Dublin area on January 20, 2024: 625 jobs.
Maximum number of iterable pages for the search: 41 pages


Action was completed in: 14.182044982910156 seconds.


In [7]:
# Registered nurse: collect the ads from Indeed.com into the global 'job_list'.
scrape_job_details(job_to_look=jobs[0])

REGISTERED+NURSE JOB ADS
The total count of scraped job vacancies: 612 jobs.

JOB AD NO.1:
Cardiac Staff Nurse
job_1df6cdf12a7ff3b4
https://ie.indeed.com/rc/clk?jk=1df6cdf12a7ff3b4&bb=KDvhOqIgqZ5NSFT5QjrOO23C8PptS_z7nVKCwpR56LoW4Fzx2imYf-7VMaZlwzzxoISVquSka--EG3V8ehIRKA5OjDsy4MsGmD2jxDyABCk%3D&xkcb=SoBL67M3FTSZ_k07Kh0LbzkdCdPP&fccid=c6715a18e860f1f6&cmp=White-Label-Management&ti=Cardiac+Nurse&vjs=3
Posted
Today
JOB AD NO.2:
Theatre Staff Nurse
job_8dac085957f00f90
https://ie.indeed.com/rc/clk?jk=8dac085957f00f90&bb=KDvhOqIgqZ5NSFT5QjrOO6vLP6RzbBCm65ZCHQdoTmzEiqePkoPCU2j6-hqwGtfBBeRpxgNS3TG6UM9DKg08LpgcfZy1hPi48cPuJnbG5Go%3D&xkcb=SoD_67M3FTSZ_k07Kh0KbzkdCdPP&fccid=c6715a18e860f1f6&cmp=White-Label-Management&ti=Theatre+Nurse&vjs=3
Posted
Today


The extraction was completed in: 5.0 minutes and 17.33529567718506 seconds.


In [8]:
# Registered nurse: clean the scraped rows and write them to the CSV file.
df_create_export_csv(new_df=df_name[0], csv=csv_file_name[0])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn_20jan.csv.


In [9]:
# Clear the page count and the scraped rows before the next job title.
max_iter_pgs, job_list = 0, list()

**electrician ads**

In [10]:
# Electrician: read the vacancy count and set the number of pages to scrape.
get_job_info(job_to_look=jobs[1], job_print=job_titles[1])

ELECTRICIAN
Total number of vacancies available in Dublin area on January 20, 2024: 151 jobs.
Maximum number of iterable pages for the search: 10 pages


Action was completed in: 16.498794078826904 seconds.


In [11]:
# Electrician: collect the ads from Indeed.com into the global 'job_list'.
scrape_job_details(job_to_look=jobs[1])

ELECTRICIAN JOB ADS
The total count of scraped job vacancies: 145 jobs.

JOB AD NO.1:
Maintenance Technician (Electrical) - Dublin 3 (AM18158)
sj_8406b1af34f03d2b
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0D8963UVSHQvE14Gz87xEjFAazKwo8pwGt2AcCk0nHr42kDSuf3Mi8ZyaVg53Qijir90_Qm1Q588QQ6lWUwXDJ9nXC3sedbrJN4vxMujoOrQL0P5BIeuVMeMEdWqKYoybDsEv-NLXMbDBTSK3axGohD5OOQrNi0BrvJrdlam0i1_kdtCLwFtT0UwVETkTSJlFdYUqkYowxlhmhAAj-DGUAn7ppL5nftu46BXs_hX00-9myD5T9LcnIx0xATWT8kBCkAa9MKK5MVU73xK5joXIIrlK4_owpuEg6_dglcR6u1G9NK9ZcNoXIjFV_vw7Uo4_IHMJiqhZYm7LMtoUx3ufx8oQy9ThJmsA7VBhClDeIpp8xhycYf4v5k-EJpZMRRqatfYIwToH6y777i3bX6gAwoMwpv7xgE-c_FxeGCt2b2oLnnll1S7XHF09wGR4NK3vwGgitHCOBZ_gbYOjei-V6sAz3rlads2XRQM5oZm7RxVje364Sk_9DZGjkx2w3APZDc_9giWsD5eLo_v6HYUzj24SuUMZpbjnAcJQ1t0JNstpTSIUm4z1Ua68G8dxFp6KYBWx0eThN_BgxtUGIJq4WfUmYQmdk0yOsnpwLehOHXJeYIK5CTNfixZKZy3a8wFJJLnsAjCnq-YCMcIMhT6q50dxVjdvBwjvrSlso9yXZDN3bd1HRBIk0Z&xkcb=SoC86_M3FTSwrK07Mx0LbzkdCdPP&p=0&fvj=1&vjs=3
Posted
Today
JOB AD NO.2:
Electrician
sj_

In [12]:
# Electrician: clean the scraped rows and write them to the CSV file.
df_create_export_csv(new_df=df_name[1], csv=csv_file_name[1])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e_20jan.csv.


In [13]:
# Clear the page count and the scraped rows before the next job title.
max_iter_pgs, job_list = 0, list()

**data analyst ads**

In [14]:
# Data analyst: read the vacancy count and set the number of pages to scrape.
get_job_info(job_to_look=jobs[2], job_print=job_titles[2])

DATA ANALYST
Total number of vacancies available in Dublin area on January 20, 2024: 346 jobs.
Maximum number of iterable pages for the search: 23 pages


Action was completed in: 17.141117811203003 seconds.


In [15]:
# Data analyst: collect the ads from Indeed.com into the global 'job_list'.
scrape_job_details(job_to_look=jobs[2])

DATA+ANALYST JOB ADS
The total count of scraped job vacancies: 334 jobs.

JOB AD NO.1:
Data Analyst
job_3ffe8294304602bd
https://ie.indeed.com/rc/clk?jk=3ffe8294304602bd&bb=Lda2kDTxpK_Bw6ZiyREAuz9Bque7xHJNVLFtjAnziN5Rikp1PLpDzD63vAnfHjd2GGsoOHslDLTNc1jHq4XAFOv-nHBFjj1w299F_1-gEsA%3D&xkcb=SoC367M3FTTCt1WHMZ0LbzkdCdPP&fccid=e74773ca4b4eccf9&vjs=3
Posted
Just posted
JOB AD NO.2:
Data Analyst
job_84870afff879430a
https://ie.indeed.com/rc/clk?jk=84870afff879430a&bb=Lda2kDTxpK_Bw6ZiyREAu1JmhJayD93Zna4KCNZPIeMyOckg4Cld4HDuA1FJ7Bl4ftI0c8POrW2ax59bVi2U8_kCNm04f0fL7UBE3IYXDWg%3D&xkcb=SoAD67M3FTTCt1WHMZ0KbzkdCdPP&fccid=855f48b961808012&vjs=3
Posted
Today


The extraction was completed in: 3.0 minutes and 3.578531503677368 seconds.


In [16]:
# Data analyst: clean the scraped rows and write them to the CSV file.
df_create_export_csv(new_df=df_name[2], csv=csv_file_name[2])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da_20jan.csv.
