# JOB OFFER'S DATA (part 2: extraction of the job description)

### 1. Setting up for web scraping.

In [1]:
# Load necessary libraries.
import pandas as pd
import time
import os
from time import sleep
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Notebook-wide settings and shared state.

# Shared accumulator: get_job_details() appends one description text per job ad.
job_details = list()

# Job-ad CSV files, one per keyword (same order as `keywords` below).
csv_files = [
    'data_jobads_rn.csv',
    'data_jobads_e.csv',
    'data_jobads_da.csv',
]
# Newer counterparts of the files above; these are the ones filtered and scraped here.
csv_files_20jan = [
    'data_jobads_rn_20jan.csv',
    'data_jobads_e_20jan.csv',
    'data_jobads_da_20jan.csv',
]

# Folder that the updated CSV files are written to.
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'

# Labels handed to the helper functions alongside the CSV names.
df_name = [
    'df_rn',
    'df_e',
    'df_da',
]
df_name_20jan = [
    'df_rn_20jan',
    'df_e_20jan',
    'df_da_20jan',
]

# Date quoted in the summary message printed by remove_duplicates().
first_date = 'January 10, 2024'

# Search keywords, used in printed messages only.
keywords = [
    'REGISTERED NURSE',
    'ELECTRICIAN',
    'DATA ANALYST',
]

In [3]:
# Define a custom function to remove duplicate job ads.
def remove_duplicates(csv_new, csv_old, df_new, df_old, key_word):
    """Keep only the rows of the newer CSV whose 'id' is absent from the older CSV.

    The surviving rows are written to ``directory``/``csv_new`` and a short
    summary is printed.

    Parameters
    ----------
    csv_new : str
        Path of the newer job-ad CSV. It is read exactly as given, so a
        relative path resolves against the current working directory.
    csv_old : str
        Path of the older job-ad CSV to compare against; read the same way.
    df_new, df_old
        Accepted for compatibility with existing calls; the values passed in
        are never read.
    key_word : str
        Search keyword, shown in the printed summary only.

    Returns
    -------
    None
    """
    latest_ads = pd.read_csv(csv_new, index_col=None)
    earlier_ids = pd.read_csv(csv_old, index_col=None)[['id']]

    # indicator=True adds a '_merge' column telling, per row, whether the id
    # was found only in the newer file ('left_only') or in both files.
    tagged = latest_ads.merge(earlier_ids, on='id', how='left', indicator=True)
    unseen = tagged['_merge'] == 'left_only'
    df_new = tagged.loc[unseen].drop(columns='_merge')

    # Save the filtered ads under the project directory.
    file_path = os.path.join(directory, csv_new)
    df_new.to_csv(file_path, index=False)

    print(f'The raw data was updated successfully as {file_path}.')
    print(f'There are {df_new.shape[0]} new job ads added since {first_date} with the keyword <{key_word}>.')

In [4]:
# Define a custom function to extract job details from the available hyperlinks.
def get_job_details(csv):
    """Open every job-ad link listed in a CSV file and collect the description text.

    Each description is appended to the global list ``job_details`` in row
    order. A summary, one randomly chosen description and the elapsed time
    are printed at the end.

    Parameters
    ----------
    csv : str
        Path of the job-ad CSV file. The link is taken from the third column
        by position (assumed to hold the ad URL -- TODO confirm against the
        file written in part 1).

    Returns
    -------
    None

    Notes
    -----
    ``driver.find_element`` raises when a page has no element with id
    'jobDescriptionText'. The loop then stops, and the descriptions gathered
    so far stay in ``job_details``.
    """
    # Declare global variables.
    global job_details

    # Set up Chrome webdriver options: connect to a Chrome instance that is
    # already listening for remote debugging at this address.
    option = Options()
    option.add_experimental_option('debuggerAddress', 'localhost:0820')

    # Specify the start time.
    start = time.time()

    # Initialize Chrome driver.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)

    # Access each hyperlink, retrieve information about the job, and store it
    # in the global variable 'job_details'.
    df_ads = pd.read_csv(csv, index_col=None)
    total_rows = df_ads.shape[0]

    for x in range(total_rows):
        link = df_ads.iat[x, 2]
        driver.get(link)
        # Random pause so the page can load and requests are spaced out.
        sleep(randint(2, 4))

        job_page = driver.find_element(By.ID, 'jobDescriptionText')
        job_details.append(job_page.text)
        sleep(randint(2, 4))

    # Specify the end time.
    end = time.time()

    # Check results.
    print(f'Total number of extracted job ads details: {len(job_details)}.\n')
    # randint() includes BOTH endpoints, so the upper bound must be the last
    # valid index; skip the example entirely when nothing was collected.
    if job_details:
        print('EXAMPLE:')
        print(job_details[randint(0, len(job_details) - 1)], '\n')
    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [5]:
# Define a custom function to rewrite extracted information into existing csv files.
def df_create_export_csv(new_df, csv):
    """Attach the scraped descriptions to a job-ad CSV and save the result.

    The CSV is read, the global list ``job_details`` is stored as a
    'job_description' column, and the frame is written to
    ``directory``/``csv``.

    Parameters
    ----------
    new_df
        Accepted for compatibility with existing calls; the value passed in
        is never read.
    csv : str
        Path of the job-ad CSV file. It is read exactly as given, so a
        relative path resolves against the current working directory.

    Returns
    -------
    None
    """
    # job_details needs one entry per CSV row for the column assignment to work.
    new_df = pd.read_csv(csv, index_col=None).assign(job_description=job_details)

    # Export the DataFrame under the project directory.
    file_path = os.path.join(directory, csv)
    new_df.to_csv(file_path, index=False)

    print(f"The raw data was rewritten to existing file and successfully exported as {file_path}.")

### 2. Removing duplicates and web scraping.
**registered nurse ads**

In [6]:
# Keep only the registered-nurse ads whose id is not already in the earlier CSV (csv_files[0]).
remove_duplicates(csv_files_20jan[0], csv_files[0], df_name_20jan[0], df_name[0], keywords[0])

The raw data was updated successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn_20jan.csv.
There are 194 new job ads added since January 10, 2024 with the keyword <REGISTERED NURSE>.


In [7]:
# Scrape the job details: one description per row of the registered-nurse CSV,
# appended to the global list job_details.
get_job_details(csv_files_20jan[0])

Total number of extracted job ads details: 194.

EXAMPLE:
Description:
Cpl Healthcare are seeking a Staff Nurse to join an excellent Ophthalmology Clinic
Our client is seeking a Staff Nurse to join their growing team. This clinic specializes in eye surgery and procedures on an outpatient basis. Prior ophthalmology experience not required as training will be provided
Shift Pattern: 4x10hour shifts per week
Applicant Requirements
NMBI Registered General Nurse
Previous experience in an acute surgical environment desirable
Good teamwork skills
Willingness to learn
Excellent clinical skills
Excellent communication skills

EMAIL: louise.omeara@cplhealthcare.com
Ref.no.:
JO-2307-519025
Locations:
Dublin
Salary:
€33000 - €50000
Employment type:
Full Time;
Tags:
Clinic Nurse,Day Nurse,ENT,Eye,Laser Surgery,Nurse,Nursing,Ophthalmology,Surgical

EMAIL: louise.omeara@cplhealthcare.com
Ref.no.:
JO-2307-519025
Locations:
Dublin
Salary:
€33000 - €50000
Employment type:
Full Time;
Tags:
Clinic Nurse,D

In [8]:
# Update the extracted data and save the changes: job_details becomes the
# 'job_description' column of the registered-nurse CSV.
# NOTE: the first argument is only a label; df_create_export_csv never reads it.
df_create_export_csv(df_name[0], csv_files_20jan[0])

The raw data was rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn_20jan.csv.


In [9]:
# Reset variable: get_job_details appends to this shared list, so empty it
# before scraping the next keyword.
job_details = []

**electrician ads**

In [10]:
# Keep only the electrician ads whose id is not already in the earlier CSV (csv_files[1]).
remove_duplicates(csv_files_20jan[1], csv_files[1], df_name_20jan[1], df_name[1], keywords[1])

The raw data was updated successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e_20jan.csv.
There are 54 new job ads added since January 10, 2024 with the keyword <ELECTRICIAN>.


In [11]:
# Scrape the job details: one description per row of the electrician CSV,
# appended to the global list job_details.
get_job_details(csv_files_20jan[1])

Total number of extracted job ads details: 54.

EXAMPLE:
Maintenance Electrician required Dublin, Salary 50k – 55k+ Bonus.
Your new Company
This company is part of one of the UK & Irelands largest water companies who provides water and recycling services to over 6 million customers in England. Operating for over 20 years in the Irish market they currently operate one of Europe’s largest wastewater treatment plants, at which they currently treat over 50% of Ireland’s wastewater. The are a proven leader in the provision of innovative water, wastewater, and resource recycling solutions for a range of sectors which include municipal, industrial, and commercial industries in Ireland.
From design and engineering to construction, through to site operation and management, they have a proven track record in the provision of Lean water and wastewater solutions that increase efficiency, reduce carbon footprint, and minimize operational cost for our customers.
Your new role
This role enquires you 

In [12]:
# Update the extracted data and save the changes: job_details becomes the
# 'job_description' column of the electrician CSV.
# NOTE: the first argument is only a label; df_create_export_csv never reads it.
df_create_export_csv(df_name[1], csv_files_20jan[1])

The raw data was rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e_20jan.csv.


In [13]:
# Reset variable: get_job_details appends to this shared list, so empty it
# before scraping the next keyword.
job_details = []

**data analyst ads**

In [14]:
# Keep only the data-analyst ads whose id is not already in the earlier CSV (csv_files[2]).
remove_duplicates(csv_files_20jan[2], csv_files[2], df_name_20jan[2], df_name[2], keywords[2])

The raw data was updated successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da_20jan.csv.
There are 85 new job ads added since January 10, 2024 with the keyword <DATA ANALYST>.


In [15]:
# Scrape the job details: one description per row of the data-analyst CSV,
# appended to the global list job_details.
get_job_details(csv_files_20jan[2])

Total number of extracted job ads details: 85.

EXAMPLE:
Job Title: Strategy Analyst- Fintech
Sector: Fintech
Location: Dublin/Hybrid
Salary: DOE plus benefits

Our Client

Our client is an award-winning Fintech company headquartered in Dublin. Due to huge growth, there is a newly created Analyst opportunity within the Strategy function.

Why should you apply?

This is an extremely varied role working with the strategy Director on key company growth projects. This role will put you at the centre of decision making in a team responsible for driving the growth of new products in new markets (including US, Europe, and Asia). There is real scope for professional growth here, visibility of your achievements on the company’s success, and the chance to work in a collaborative and open environment.

Who should apply?

You will be a data-driven individual with at least 3 years’ experience within Consulting, Strategy, or Transformation, as well as:
Professional Services, Financial Services or Te

In [16]:
# Update the extracted data and save the changes: job_details becomes the
# 'job_description' column of the data-analyst CSV.
# NOTE: the first argument is only a label; df_create_export_csv never reads it.
df_create_export_csv(df_name[2], csv_files_20jan[2])

The raw data was rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da_20jan.csv.
