TEMUULEN Bulgan
### [A COMPARATIVE EVALUATION OF TEXT REPRESENTATION TECHNIQUES FOR CONTENT-BASED JOB RECOMMENDATION SYSTEM](https://github.com/temulenbd/jrs)
#### `PART I: JOB OFFERS' DATASET` 
#### *This part of the project includes the coding for collecting the job offers dataset through web scraping and the subsequent processing of the collected data.*

# I.I Job offers' data collection (web scraping)

## 10th JAN, 2024. extraction of the job ID and URL

### 1. Setting up for web scraping.

**MODULE**

In [1]:
# Load nessesary libraries.
import time
import pandas as pd
import os
import re
from datetime import datetime, timedelta
from time import sleep
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import warnings
warnings.filterwarnings('ignore')

**GLOBAL VARIABLE**

In [2]:
# Define global variables.
jobs = ['registered+nurse', 'electrician', 'data+analyst']
job_titles = ['registered nurse', 'electrician', 'data analyst']
job_list = []
pagination_url = 'https://ie.indeed.com/jobs?q={}&l=Dublin%2C+County+Dublin&radius=25&filter=0&sort=date&start={}'
max_iter_pgs = int()
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df_name = ['df_rn', 'df_e', 'df_da']
csv_file_name = ['data_jobads_rn.csv', 'data_jobads_e.csv', 'data_jobads_da.csv']

### 2. Web scraping

**PREPARATION**

In [3]:
# Define a custom function that verifies how many positions are available for the specified job and how many pages can be iterated.
def get_job_info(job_to_look, job_print):
    
    # Declare global variables.
    global pagination_url
    global max_iter_pgs
    job = job_to_look
    
    # Set up Chrome webdriver options.
    option= webdriver.ChromeOptions()
    option.add_argument("--incognito")
    
    # Specify the date.
    current_date = datetime.now().date().strftime('%B %d, %Y')
    start = time.time()
    
    # Initialize Chrome driver
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
    driver.get(pagination_url.format(job, 0))

    sleep(randint(4, 9))
    job_number = driver.find_element(By.CLASS_NAME,'jobsearch-JobCountAndSortPane-jobCount').text
    max_iter_pgs=int(job_number.split(' ')[0]) // 15 

    # Close the WebDriver.
    driver.quit()
    end = time.time()

    # Print results.
    print(f'{job_print.upper()}')
    print(f'Total number of vacancies available in Dublin area on {current_date}: {job_number}.')
    print('Maximum number of iterable pages for the search:', max_iter_pgs, 'pages')
    print('\n')
    print('Action was completed in:', end - start, 'seconds.')

In [4]:
# Define a costum function that will extract data from the web-page and create a table with the information for the specified job.
def scrape_job_details(job_to_look):
    
    # Declare global variables.
    global max_iter_pgs
    global job_list
    global job_titles
    global pagination_url
    job =  job_to_look
    
    # Specify the date.
    start = time.time()
    
    # Set up Chrome WebDriver.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    sleep(randint(4, 9))

    # Loop through each job posting through the pages and extract job details.
    for i in range(0, max_iter_pgs):
        driver.get(pagination_url.format(job, i * 10))
        sleep(randint(4, 9))

        job_page = driver.find_element(By.ID, 'mosaic-jobResults')
        job_posts = job_page.find_elements(By.CLASS_NAME, 'job_seen_beacon')

        for job_post in job_posts:
            job_title = job_post.find_element(By.CLASS_NAME, 'jobTitle')
            job_id = job_title.find_element(By.CSS_SELECTOR, 'a').get_attribute("id")
            job_link = job_title.find_element(By.CSS_SELECTOR, 'a').get_attribute("href")

            try:
                job_date = job_post.find_element(By.CLASS_NAME, 'date').text
            except Exception as e:
                job_date = 'not available'

            # Append job details to the job_list.
            job_list.append([job_title.text, job_id, job_link, job_date])
    
    # Close the WebDriver.
    driver.quit()
    end = time.time()

    # Check results.
    print(f'{job.upper()} JOB ADS')
    print(f'The total count of scraped job vacancies: {len(job_list)} jobs.\n')
    for x in range(min(2, len(job_list))):
        print(f'JOB AD NO.{x + 1}:')
        print(job_list[x][0])
        print(job_list[x][1])
        print(job_list[x][2])
        print(job_list[x][3])
    print('\n')

    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [5]:
# Define a costum function that creates a new DataFrame of given job title, transforms the data, exports it as a CSV file.
def df_create_export_csv(new_df, csv):
    
    # Declare global variables.
    global job_list
    global directory
    
    # Create a new pandas Dataframe with the given job title.
    column = ['title', 'id', 'link', 'date']
    new_df = pd.DataFrame(job_list, columns=column)

    # Loop through each row in the DataFrame for data transformation.
    for x in range(int(new_df.shape[0])):
        new_df.iat[x, 0] = new_df.iat[x, 0].lower()
        new_df.iat[x, 3] = new_df.iat[x, 3].replace('Posted\n', '')
        new_df.iat[x, 0] = new_df.iat[x, 0].lower()

    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv)
    
    # Export the DataFrame to CSV file.
    # new_df.to_csv(file_path, index=False)
    
    print(f"The raw data was transformed and exported successfully as {file_path}.")

**SCRAPING**

*registered nurse ads*

In [6]:
# Check the job availability on Indeed.com.
get_job_info(jobs[0], job_titles[0])

REGISTERED NURSE
Total number of vacancies available in Dublin area on January 10, 2024: 580 jobs.
Maximum number of iterable pages for the search: 38 pages


Action was completed in: 14.394474983215332 seconds.


In [7]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
scrape_job_details(jobs[0])

REGISTERED+NURSE JOB ADS
The total count of scraped job vacancies: 564 jobs.

JOB AD NO.1:
Assistant Director of Nursing
sj_3c7e64c7996bb9d6
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BI8IerbVtBNz9lxt-gGwgL1yqzg7rkL65oBt9kXiGchRfMu5HD70ENCeZDeMdLuF3Q6xzcJEOdJs3kZF1vPa7hsSvWDw387HGAhy09ybLyjuHgMbirifct4XvSysIpp9rS6Ba6AU4LkoNkAgvufXadgt9JcsvxlhJuDKpnvDOFeBo5dzQWo7pFgkc2ydoCMEtW2iWHYK8s0NBZGIKl4r7fRa5BSkndStcyPVvOs74vHKeTHQ7SUXy9Jvocb78gEg_dj47DKiNE4DkFJO0_gAKH8J7zdVJ2_SA0SWcKExPMVuj8cvElgzB1sNgtlr6Tj_tvJYbJguf9j_ZbW40QusqIW71ACZtcIEQtNDB2ACq92fxa2jeykoUuFHgXBCVybxWvDPNxEKC8ZR-CQ2U-uxIe24BspMOf_Q7ubq8y7Rn2TC_ww1c3qeBtg0-HG1nayqXJlu8_Q8JQxpi2Orw0JMwSvDbknG-Palp8oXVzt3wQSEGXv7POm22gus8PSdFTRhqn53K7WFof92B7UKnzoXDCOCUog7ovhXz-0jUTkR7jGF9BISOe6IYI7eSukOK5IeKuwIwAbEZoHgZeUmjLEFn634S8b_7FwUAq9oF0v_2kip74rvbHu1veWC9xeqPIVLR7LE43bTRiSpyNt1sryfSapz_GPLdBSSY=&xkcb=SoAG6_M3G50qpcR9Np0LbzkdCdPP&p=0&fvj=1&vjs=3
Posted
Just posted
JOB AD NO.2:
Clinical Nurse Manager (CNM)
sj_358f1f68cde928c4
ht

In [8]:
# Transform the scraped data and export it as a CSV file.
df_create_export_csv(df_name[0], csv_file_name[0])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn.csv.


In [9]:
# Reset variables.
max_iter_pgs = int()
job_list = []

*electrician ads*

In [10]:
# Check the job availability on Indeed.com.
get_job_info(jobs[1], job_titles[1])

ELECTRICIAN
Total number of vacancies available in Dublin area on January 10, 2024: 166 jobs.
Maximum number of iterable pages for the search: 11 pages


Action was completed in: 15.139153242111206 seconds.


In [11]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
scrape_job_details(jobs[1])

ELECTRICIAN JOB ADS
The total count of scraped job vacancies: 155 jobs.

JOB AD NO.1:
Industrial Electrician
sj_7cba7a465e6641fe
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AnEYSss4eDbTLySY2p2efwzP3CilC5Xfeyf166GCcN9JJMj_EWM2A3xodspN8Pi50AQZK4e0X589pQszonXkMkH0RQ88koQVn7ZT1ssPBpQ_NkkqQhHLy474KArTvD5gBqTeIgUZnCJu_dKf5oe4HAkKcLEP2oyZYldxw_5f6gvn14xcRPMRza8-QyIqzv0byDd4PvDXXzJE4qOMdLC5mFAHmK1MYBMw5p7DiJwb-0l9Wn2nPXXR0fXfDw3ZGgpTK2fEod5hefk8SHbbBTg5XXqH-cqkCUwZQ9x2EWDy0u4UzQIublT1xvmjgu6FKChkkSwYiOeLdBpClFaBomH_21iQ0g5LHJAjhpQKbe5Rsk1qWO2CAoxB8uzG0hGry5nE_Kk10xn7b1s6IhQqZOo55YeLj-GzbQ2w-DTtb3Sp4X3C8eZfASRrQDZn5dl8gS8f63CC-Vla0OSTSqrKEexz5CHQ9huC6i-HpqLL5cp0JRmXoTInszOBMVKdAONBilh2S8z-AKr5sldSdsgOg8u-rA4qQggC-Z_S6yGMKpGlEqi3SmevB2y9AV74UJrhYkGLBICcE3LTwSgJcgH953pBiTYB-Ykz2oCw3HwOcSIklmmHLJtX1pEGpD9CH-Kc449-wE79tKz3lqJA==&xkcb=SoBz6_M3G51AfAR9Np0LbzkdCdPP&p=0&fvj=1&vjs=3
Posted
Just posted
JOB AD NO.2:
Apprentice Electrician
job_a6937c5385359d51
https://ie.indeed.com/rc/clk?jk=a6937c5

In [12]:
# Transform the scraped data and export it as a CSV file.
df_create_export_csv(df_name[1], csv_file_name[1])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e.csv.


In [13]:
# Reset variables.
max_iter_pgs = int()
job_list = []

*data analyst ads*

In [14]:
# Check the job availability on Indeed.com.
get_job_info(jobs[2], job_titles[2])

DATA ANALYST
Total number of vacancies available in Dublin area on January 10, 2024: 342 jobs.
Maximum number of iterable pages for the search: 22 pages


Action was completed in: 12.003286123275757 seconds.


In [15]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
scrape_job_details(jobs[2])

DATA+ANALYST JOB ADS
The total count of scraped job vacancies: 315 jobs.

JOB AD NO.1:
Inventory Analyst
sj_d72c4cf42121ca22
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AnxfxssoujI3GvYsxvmNCzxzm6VnPtwq4vmbotio0JvQP4qXO3kiNsl1yUmVNF84MzBzmk8-uNcalZeVGMR2tlbw0j0ghDB32zhGcWEZiFsh6Zhjvcky-uhasSq90-Q5tQuwoeDbw8SltNynq_LNJvZUMRCpbvjFC48w4dMABSo4ebojJMmZ--dHZHBiOPE135_T6Xa7ud4X-cfbHmQ6gOKJv1AWhNFU4c5f5OSZMb91QEb925KxbCkocbzbpn9dQx3uhvO9W8KGpcJhKtOfLETuEJsfe-UuHiydeNE0qvY3PL0wvF7dhZ_XawpnVmj6C9DhBC5cnxriFAefUXHrWNQmj-WT9vVpoL95aOJdIc7s7y1ypjJI3yNWLyvdOVs6umuVRUmwN_uqs6ePa9Dc4h8SHkPM_GPMXsZLziJ3wN7iIsmpT6cHqk2BA4lWfDKw0S5d6TckE8Xzv-8UyHiDSQaOsejKEjKLi1uYZCRQAnHUSP5nZLrZdqQ18ZggJQSwBOJNIXuo3CsOYtW0UYGcMiBfWpdPuCYCdmtzEAcKghJZr_S1MYHgV56H-H6wpXyZ44S6SmdX4gn64qpFPnTP6sWpx5E6Pd6TpjCYvf9GoRuaeJDPvU4xDHpCQpEE2ih4frPZwuYx3gDPR5XMbklqnoNQaAOZZ4X18oOwIhhmHa4kVpQor0_f9mKTUHw5oxri0AkpEu-qcHoQ==&xkcb=SoDk6_M3G51RjlQHEx0LbzkdCdPP&p=0&fvj=0&vjs=3
Posted
Just posted
JOB AD NO.2:
Defect Control Engineer

In [16]:
# Transform the scraped data and export it as a CSV file.
df_create_export_csv(df_name[2], csv_file_name[2])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da.csv.


## 20th JAN, 2024

In [None]:
# Define global variables.
jobs = ['registered+nurse', 'electrician', 'data+analyst']
job_titles = ['registered nurse', 'electrician', 'data analyst']
job_list = []
pagination_url = 'https://ie.indeed.com/jobs?q={}&l=Dublin%2C+County+Dublin&radius=25&filter=0&sort=date&start={}'
max_iter_pgs = int()
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df_name = ['df_rn', 'df_e', 'df_da']
csv_file_name = ['data_jobads_rn_20jan.csv', 'data_jobads_e_20jan.csv', 'data_jobads_da_20jan.csv']

In [None]:
# Define a custom function that verifies how many positions are available for the specified job and how many pages can be iterated.
def get_job_info(job_to_look, job_print):
    
    # Declare global variables.
    global pagination_url
    global max_iter_pgs
    job = job_to_look
    
    # Set up Chrome webdriver options.
    option= webdriver.ChromeOptions()
    option.add_argument("--incognito")
    
    # Specify the date.
    current_date = datetime.now().date().strftime('%B %d, %Y')
    start = time.time()
    
    # Initialize Chrome driver
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=option)
    driver.get(pagination_url.format(job, 0))

    sleep(randint(4, 9))
    job_number = driver.find_element(By.CLASS_NAME,'jobsearch-JobCountAndSortPane-jobCount').text
    max_iter_pgs=int(job_number.split(' ')[0]) // 15 

    # Close the WebDriver.
    driver.quit()
    end = time.time()

    # Print results.
    print(f'{job_print.upper()}')
    print(f'Total number of vacancies available in Dublin area on {current_date}: {job_number}.')
    print('Maximum number of iterable pages for the search:', max_iter_pgs, 'pages')
    print('\n')
    print('Action was completed in:', end - start, 'seconds.')

In [None]:
# Define a costum function that will extract data from the web-page and create a table with the information for the specified job.
def scrape_job_details(job_to_look):
    
    # Declare global variables.
    global max_iter_pgs
    global job_list
    global job_titles
    global pagination_url
    job =  job_to_look
    
    # Specify the date.
    start = time.time()
    
    # Set up Chrome WebDriver.
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    sleep(randint(4, 9))

    # Loop through each job posting through the pages and extract job details.
    for i in range(0, max_iter_pgs):
        driver.get(pagination_url.format(job, i * 10))
        sleep(randint(4, 9))

        job_page = driver.find_element(By.ID, 'mosaic-jobResults')
        job_posts = job_page.find_elements(By.CLASS_NAME, 'job_seen_beacon')

        for job_post in job_posts:
            job_title = job_post.find_element(By.CLASS_NAME, 'jobTitle')
            job_id = job_title.find_element(By.CSS_SELECTOR, 'a').get_attribute("id")
            job_link = job_title.find_element(By.CSS_SELECTOR, 'a').get_attribute("href")

            try:
                job_date = job_post.find_element(By.CLASS_NAME, 'date').text
            except Exception as e:
                job_date = 'not available'

            # Append job details to the job_list.
            job_list.append([job_title.text, job_id, job_link, job_date])
    
    # Close the WebDriver.
    driver.quit()
    end = time.time()

    # Check results.
    print(f'{job.upper()} JOB ADS')
    print(f'The total count of scraped job vacancies: {len(job_list)} jobs.\n')
    for x in range(min(2, len(job_list))):
        print(f'JOB AD NO.{x + 1}:')
        print(job_list[x][0])
        print(job_list[x][1])
        print(job_list[x][2])
        print(job_list[x][3])
    print('\n')

    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [None]:
# Define a costum function that creates a new DataFrame of given job title, transforms the data, exports it as a CSV file.
def df_create_export_csv(new_df, csv):
    
    # Declare global variables.
    global job_list
    global directory
    
    # Create a new pandas Dataframe with the given job title.
    column = ['title', 'id', 'link', 'date']
    new_df = pd.DataFrame(job_list, columns=column)

    # Loop through each row in the DataFrame for data transformation.
    for x in range(int(new_df.shape[0])):
        new_df.iat[x, 0] = new_df.iat[x, 0].lower()
        new_df.iat[x, 3] = new_df.iat[x, 3].replace('Posted\n', '')
        new_df.iat[x, 0] = new_df.iat[x, 0].lower()

    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv)
    
    # Export the DataFrame to CSV file.
    # new_df.to_csv(file_path, index=False)
    
    print(f"The raw data was transformed and exported successfully as {file_path}.")

### 2. Web scraping
**registered nurse ads**

In [None]:
# Check the job availability on Indeed.com.
get_job_info(jobs[0], job_titles[0])

REGISTERED NURSE
Total number of vacancies available in Dublin area on January 20, 2024: 625 jobs.
Maximum number of iterable pages for the search: 41 pages


Action was completed in: 14.182044982910156 seconds.


In [None]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
scrape_job_details(jobs[0])

REGISTERED+NURSE JOB ADS
The total count of scraped job vacancies: 612 jobs.

JOB AD NO.1:
Cardiac Staff Nurse
job_1df6cdf12a7ff3b4
https://ie.indeed.com/rc/clk?jk=1df6cdf12a7ff3b4&bb=KDvhOqIgqZ5NSFT5QjrOO23C8PptS_z7nVKCwpR56LoW4Fzx2imYf-7VMaZlwzzxoISVquSka--EG3V8ehIRKA5OjDsy4MsGmD2jxDyABCk%3D&xkcb=SoBL67M3FTSZ_k07Kh0LbzkdCdPP&fccid=c6715a18e860f1f6&cmp=White-Label-Management&ti=Cardiac+Nurse&vjs=3
Posted
Today
JOB AD NO.2:
Theatre Staff Nurse
job_8dac085957f00f90
https://ie.indeed.com/rc/clk?jk=8dac085957f00f90&bb=KDvhOqIgqZ5NSFT5QjrOO6vLP6RzbBCm65ZCHQdoTmzEiqePkoPCU2j6-hqwGtfBBeRpxgNS3TG6UM9DKg08LpgcfZy1hPi48cPuJnbG5Go%3D&xkcb=SoD_67M3FTSZ_k07Kh0KbzkdCdPP&fccid=c6715a18e860f1f6&cmp=White-Label-Management&ti=Theatre+Nurse&vjs=3
Posted
Today


The extraction was completed in: 5.0 minutes and 17.33529567718506 seconds.


In [None]:
# Transform the scraped data and export it as a CSV file.
df_create_export_csv(df_name[0], csv_file_name[0])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn_20jan.csv.


In [None]:
# Reset variables.
max_iter_pgs = int()
job_list = []

**electrician ads**

In [None]:
# Check the job availability on Indeed.com.
get_job_info(jobs[1], job_titles[1])

ELECTRICIAN
Total number of vacancies available in Dublin area on January 20, 2024: 151 jobs.
Maximum number of iterable pages for the search: 10 pages


Action was completed in: 16.498794078826904 seconds.


In [None]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
scrape_job_details(jobs[1])

ELECTRICIAN JOB ADS
The total count of scraped job vacancies: 145 jobs.

JOB AD NO.1:
Maintenance Technician (Electrical) - Dublin 3 (AM18158)
sj_8406b1af34f03d2b
https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0D8963UVSHQvE14Gz87xEjFAazKwo8pwGt2AcCk0nHr42kDSuf3Mi8ZyaVg53Qijir90_Qm1Q588QQ6lWUwXDJ9nXC3sedbrJN4vxMujoOrQL0P5BIeuVMeMEdWqKYoybDsEv-NLXMbDBTSK3axGohD5OOQrNi0BrvJrdlam0i1_kdtCLwFtT0UwVETkTSJlFdYUqkYowxlhmhAAj-DGUAn7ppL5nftu46BXs_hX00-9myD5T9LcnIx0xATWT8kBCkAa9MKK5MVU73xK5joXIIrlK4_owpuEg6_dglcR6u1G9NK9ZcNoXIjFV_vw7Uo4_IHMJiqhZYm7LMtoUx3ufx8oQy9ThJmsA7VBhClDeIpp8xhycYf4v5k-EJpZMRRqatfYIwToH6y777i3bX6gAwoMwpv7xgE-c_FxeGCt2b2oLnnll1S7XHF09wGR4NK3vwGgitHCOBZ_gbYOjei-V6sAz3rlads2XRQM5oZm7RxVje364Sk_9DZGjkx2w3APZDc_9giWsD5eLo_v6HYUzj24SuUMZpbjnAcJQ1t0JNstpTSIUm4z1Ua68G8dxFp6KYBWx0eThN_BgxtUGIJq4WfUmYQmdk0yOsnpwLehOHXJeYIK5CTNfixZKZy3a8wFJJLnsAjCnq-YCMcIMhT6q50dxVjdvBwjvrSlso9yXZDN3bd1HRBIk0Z&xkcb=SoC86_M3FTSwrK07Mx0LbzkdCdPP&p=0&fvj=1&vjs=3
Posted
Today
JOB AD NO.2:
Electrician
sj_

In [None]:
# Transform the scraped data and export it as a CSV file.
df_create_export_csv(df_name[1], csv_file_name[1])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e_20jan.csv.


In [None]:
# Reset variables.
max_iter_pgs = int()
job_list = []

**data analyst ads**

In [None]:
# Check the job availability on Indeed.com.
get_job_info(jobs[2], job_titles[2])

DATA ANALYST
Total number of vacancies available in Dublin area on January 20, 2024: 346 jobs.
Maximum number of iterable pages for the search: 23 pages


Action was completed in: 17.141117811203003 seconds.


In [None]:
# Scrape the data available on Indeed.com and save into 'job_list' variable.
scrape_job_details(jobs[2])

DATA+ANALYST JOB ADS
The total count of scraped job vacancies: 334 jobs.

JOB AD NO.1:
Data Analyst
job_3ffe8294304602bd
https://ie.indeed.com/rc/clk?jk=3ffe8294304602bd&bb=Lda2kDTxpK_Bw6ZiyREAuz9Bque7xHJNVLFtjAnziN5Rikp1PLpDzD63vAnfHjd2GGsoOHslDLTNc1jHq4XAFOv-nHBFjj1w299F_1-gEsA%3D&xkcb=SoC367M3FTTCt1WHMZ0LbzkdCdPP&fccid=e74773ca4b4eccf9&vjs=3
Posted
Just posted
JOB AD NO.2:
Data Analyst
job_84870afff879430a
https://ie.indeed.com/rc/clk?jk=84870afff879430a&bb=Lda2kDTxpK_Bw6ZiyREAu1JmhJayD93Zna4KCNZPIeMyOckg4Cld4HDuA1FJ7Bl4ftI0c8POrW2ax59bVi2U8_kCNm04f0fL7UBE3IYXDWg%3D&xkcb=SoAD67M3FTTCt1WHMZ0KbzkdCdPP&fccid=855f48b961808012&vjs=3
Posted
Today


The extraction was completed in: 3.0 minutes and 3.578531503677368 seconds.


In [None]:
# Transform the scraped data and export it as a CSV file.
df_create_export_csv(df_name[2], csv_file_name[2])

The raw data was transformed and exported successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da_20jan.csv.


# I.II Job offers' data collection (extraction of the job description)

## 10th JAN, 2024

### 1. Setting up for web scraping.

In [None]:
# Define global variables.
job_details = []
csv_files = ['data_jobads_rn.csv', 'data_jobads_e.csv', 'data_jobads_da.csv']
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df_name = ['df_rn', 'df_e', 'df_da']

In [None]:
# Define a custom function to extract job details from the available hyperlinks.
def get_job_details(csv):
    
    # Declare global variables.
    global job_details
    
    # Set up Chrome webdriver options.
    option=Options()
    option.add_experimental_option('debuggerAddress', 'localhost:0820')
    
    # Specify the start time.
    start = time.time()
    
    # Initialize Chrome driver.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
    
    # Access each hyperlink, retrieve information about the job, and store it in the global variable 'job_details'.
    df_ads = pd.read_csv(csv, index_col=None)
    total_rows = df_ads.shape[0]
    
    for x in range(0, total_rows):
        link = df_ads.iat[x, 2]
        driver.get(link)
        sleep(randint(2, 4))
    
        job_page = driver.find_element(By.ID, 'jobDescriptionText')
        job_details.append(job_page.text)
        sleep(randint(2, 4))
        
    # Specify the end time.
    end = time.time()
    
    # Check results.
    print(f'Total number of extracted data: {len(job_details)}.\n')
    print('EXAMPLE:')
    print(job_details[randint(0, total_rows)], '\n')
    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [None]:
# Define a costum function to rewrite extracted information into existing csv files.
def df_create_export_csv(new_df, csv):

    # Declare global variables.
    global job_details
    global directory
    
    # Create a new pandas Dataframe using the ads csv file.
    new_df = pd.read_csv(csv, index_col=None)
    new_df['job_description'] = job_details
    
    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv)
    
    # Export the DataFrame to CSV file.
    # new_df.to_csv(file_path, index=False)
    
    print(f"The raw data was  rewritten to existing file and successfully exported as {file_path}.")

### 2. Web scraping
**registered nurse ads**

In [None]:
# Scrape the job details.
get_job_details(csv_files[0])

Total number of extracted data: 564.

EXAMPLE:
We are now hiring Nurses in Dublin
Allied and Clinical are now Recruiting Nurses (all grades welcome) for Agency shifts in Tallaght Hospital.
Excellent opportunity for Nurses looking for a better work life balance, career change or extra shifts.
At Allied and Clinical, our healthcare professionals come first. We strive to treat each applicant as a person with their own career path and ambitions by being professional, warm, and approachable.
Why Choose Allied and Clinical?
-Choose your own working schedule.
-Competitive rates of pay in line with pay scales (€16.67-€57.00 per hour)
-Choice of different healthcare facilities.
-Gain experience in private/public healthcare facilities.
-One to One consultancy.
-Refer a friend bonus scheme
-Free Uniform.
-Free mandatory training.
-Free Life support training.
-Free Fit to work
Schedule:
Day Shift
Night Shift
12 HR Shift
8 HR Shift
Experience
-Registered on the Irish live register (NMBI). Must have

In [None]:
# Update the extracted data and save the changes.
df_create_export_csv(df_name[0], csv_files[0])

The raw data was  rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn.csv.


In [None]:
# Reset variable.
job_details = []

**electrician ads**

In [None]:
# Scrape the job details.
get_job_details(csv_files[1])

Total number of extracted data: 155.

EXAMPLE:
General information
Organisation
Egis is an international player active in the consulting, construction engineering and mobility service sectors.
We design and operate intelligent infrastructure and buildings capable of responding to the climate emergency and helping to achieve more balanced, sustainable and resilient territorial development.
With operations in 120 countries, Egis places the expertise of its 18,000 employees at the disposal of its clients and develops cutting-edge innovation accessible to all projects.
Improving people's quality of life and supporting communities in their social and economic development, whilst drastically reducing carbon emissions and achieving vital 2050 net zero targets, that's our purpose.
Reference
2023-9904
Position description
Job title
Maintenance Technician M/F
Contract type
Permanent contract
Business Line specific context
POSITION:
Maintenance Technician

COMPANY:
Egis Road & Tunnel Operation

P

In [None]:
# Update the extracted data and save the changes.
df_create_export_csv(df_name[1], csv_files[1])

The raw data was  rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e.csv.


In [None]:
# Reset variable.
job_details = []

**data analyst ads**

In [None]:
# Scrape the job details.
get_job_details(csv_files[2])

Total number of extracted data: 315.

EXAMPLE:
As a member of the Accounting Operations team, you will be responsible for assisting day-to-day operations and continuous improvement initiatives for the Accounting department. Your main tasks will include gathering data, performing analysis, and supporting projects and initiatives as needed.

A typical day might include the following:
Perform ad hoc requests related to reporting and data analysis to assist other team members, management, and audit requests.
Support the analysis and implementation of accounting operation functions relating to new technology projects throughout the organization.
Ensure quality control over the financial transactions and financial reporting.
Demonstrating knowledge of technical accounting standards under IFRS and US GAAP, and its application to tasks at hand.
Support month-end and year-end close processes.
Prepare documentation and Standard Operating Procedures (SOPs) for processes, enhancements and projects

In [None]:
# Update the extracted data and save the changes.
df_create_export_csv(df_name[2], csv_files[2])

The raw data was  rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da.csv.


## 20th JAN, 2024

### 1. Setting up for web scraping.

In [None]:
# Define global variables.
job_details = []
csv_files = ['data_jobads_rn.csv', 'data_jobads_e.csv', 'data_jobads_da.csv']
csv_files_20jan = ['data_jobads_rn_20jan.csv', 'data_jobads_e_20jan.csv', 'data_jobads_da_20jan.csv']
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df_name = ['df_rn', 'df_e', 'df_da']
df_name_20jan = ['df_rn_20jan', 'df_e_20jan', 'df_da_20jan']
first_date = 'January 10, 2024'
keywords = ['REGISTERED NURSE', 'ELECTRICIAN', 'DATA ANALYST']

In [None]:
# Define a custom function to remove duplicate job ads.
def remove_duplicates(csv_new, csv_old, df_new, df_old, key_word):
    
    # Declare global variables.
    global directory
    
    df_new = pd.read_csv(csv_new, index_col=None)
    df_old = pd.read_csv(csv_old, index_col=None)
    
    merged_df = pd.merge(df_new, df_old[['id']], on='id', how='left', indicator=True)

    # Filter rows where the job ID is not present in both DataFrames.
    df_new = merged_df[merged_df['_merge'] == 'left_only']

    # Drop the indicator column.
    df_new = df_new.drop('_merge', axis=1)
    
    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv_new)
    
    # Update the existing DataFrame.
    df_new.to_csv(file_path, index=False)
    
    print(f'The raw data was updated successfully as {file_path}.')
    print(f'There are {df_new.shape[0]} new job ads added since {first_date} with the keyword <{key_word}>.')

In [None]:
# Define a custom function to extract job details from the available hyperlinks.
def get_job_details(csv):
    
    # Declare global variables.
    global job_details
    
    # Set up Chrome webdriver options.
    option=Options()
    option.add_experimental_option('debuggerAddress', 'localhost:0820')
    
    # Specify the start time.
    start = time.time()
    
    # Initialize Chrome driver.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=option)
    
    # Access each hyperlink, retrieve information about the job, and store it in the global variable 'job_details'.
    df_ads = pd.read_csv(csv, index_col=None)
    total_rows = df_ads.shape[0]
    
    for x in range(0, total_rows):
        link = df_ads.iat[x, 2]
        driver.get(link)
        sleep(randint(2, 4))
    
        job_page = driver.find_element(By.ID, 'jobDescriptionText')
        job_details.append(job_page.text)
        sleep(randint(2, 4))
        
    # Specify the end time.
    end = time.time()
    
    # Check results.
    print(f'Total number of extracted job ads details: {len(job_details)}.\n')
    print('EXAMPLE:')
    print(job_details[randint(0, total_rows)], '\n')
    print(f'The extraction was completed in: {(end - start) // 60} minutes and {(end - start) % 60} seconds.')

In [None]:
# Define a costum function to rewrite extracted information into existing csv files.
def df_create_export_csv(new_df, csv):

    # Declare global variables.
    global job_details
    global directory
    
    # Create a new pandas Dataframe using the ads csv file.
    new_df = pd.read_csv(csv, index_col=None)
    new_df['job_description'] = job_details
    
    # Create the file path for CSV export.
    file_path = os.path.join(directory, csv)
    
    # Export the DataFrame to CSV file.
    new_df.to_csv(file_path, index=False)
    
    print(f"The raw data was rewritten to existing file and successfully exported as {file_path}.")

### 2. Remove duplicates and web scraping.
**registered nurse ads**

In [None]:
remove_duplicates(csv_files_20jan[0], csv_files[0], df_name_20jan[0], df_name[0], keywords[0])

The raw data was updated successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn_20jan.csv.
There are 194 new job ads added since January 10, 2024 with the keyword <REGISTERED NURSE>.


In [None]:
# Scrape the job details.
get_job_details(csv_files_20jan[0])

Total number of extracted job ads details: 194.

EXAMPLE:
Description:
Cpl Healthcare are seeking a Staff Nurse to join an excellent Ophthalmology Clinic
Our client is seeking a Staff Nurse to join their growing team. This clinic specializes in eye surgery and procedures on an outpatient basis. Prior ophthalmology experience not required as training will be provided
Shift Pattern: 4x10hour shifts per week
Applicant Requirements
NMBI Registered General Nurse
Previous experience in an acute surgical environment desirable
Good teamwork skills
Willingness to learn
Excellent clinical skills
Excellent communication skills

EMAIL: louise.omeara@cplhealthcare.com
Ref.no.:
JO-2307-519025
Locations:
Dublin
Salary:
€33000 - €50000
Employment type:
Full Time;
Tags:
Clinic Nurse,Day Nurse,ENT,Eye,Laser Surgery,Nurse,Nursing,Ophthalmology,Surgical

EMAIL: louise.omeara@cplhealthcare.com
Ref.no.:
JO-2307-519025
Locations:
Dublin
Salary:
€33000 - €50000
Employment type:
Full Time;
Tags:
Clinic Nurse,D

In [None]:
# Update the extracted data and save the changes.
df_create_export_csv(df_name[0], csv_files_20jan[0])

The raw data was rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_rn_20jan.csv.


In [None]:
# Reset variable.
job_details = []

**electrician ads**

In [None]:
remove_duplicates(csv_files_20jan[1], csv_files[1], df_name_20jan[1], df_name[1], keywords[1])

The raw data was updated successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e_20jan.csv.
There are 54 new job ads added since January 10, 2024 with the keyword <ELECTRICIAN>.


In [None]:
# Scrape the job details.
get_job_details(csv_files_20jan[1])

Total number of extracted job ads details: 54.

EXAMPLE:
Maintenance Electrician required Dublin, Salary 50k – 55k+ Bonus.
Your new Company
This company is part of one of the UK & Irelands largest water companies who provides water and recycling services to over 6 million customers in England. Operating for over 20 years in the Irish market they currently operate one of Europe’s largest wastewater treatment plants, at which they currently treat over 50% of Ireland’s wastewater. The are a proven leader in the provision of innovative water, wastewater, and resource recycling solutions for a range of sectors which include municipal, industrial, and commercial industries in Ireland.
From design and engineering to construction, through to site operation and management, they have a proven track record in the provision of Lean water and wastewater solutions that increase efficiency, reduce carbon footprint, and minimize operational cost for our customers.
Your new role
This role enquires you 

In [None]:
# Update the extracted data and save the changes.
df_create_export_csv(df_name[1], csv_files_20jan[1])

The raw data was rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_e_20jan.csv.


In [None]:
# Reset variable.
job_details = []

**data analyst ads**

In [None]:
remove_duplicates(csv_files_20jan[2], csv_files[2], df_name_20jan[2], df_name[2], keywords[2])

The raw data was updated successfully as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da_20jan.csv.
There are 85 new job ads added since January 10, 2024 with the keyword <DATA ANALYST>.


In [None]:
# Scrape the job details.
get_job_details(csv_files_20jan[2])

Total number of extracted job ads details: 85.

EXAMPLE:
Job Title: Strategy Analyst- Fintech
Sector: Fintech
Location: Dublin/Hybrid
Salary: DOE plus benefits

Our Client

Our client is an award-winning Fintech company headquartered in Dublin. Due to huge growth, there is a newly created Analyst opportunity within the Strategy function.

Why should you apply?

This is an extremely varied role working with the strategy Director on key company growth projects. This role will put you at the centre of decision making in a team responsible for driving the growth of new products in new markets (including US, Europe, and Asia). There is real scope for professional growth here, visibility of your achievements on the company’s success, and the chance to work in a collaborative and open environment.

Who should apply?

You will be a data-driven individual with at least 3 years’ experience within Consulting, Strategy, or Transformation, as well as:
Professional Services, Financial Services or Te

In [None]:
# Update the extracted data and save the changes.
df_create_export_csv(df_name[2], csv_files_20jan[2])

The raw data was rewritten to existing file and successfully exported as C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project\data_jobads_da_20jan.csv.


# I.III Job offers' data processing (cleaning and transforming)

### 1. Setting up

In [None]:
# Define global variables.
label_value = ['registered_nurse', 'electrician', 'data_analyst', 'registered_nurse', 'electrician', 'data_analyst']
csv_file_name = ['data_jobads_rn.csv', 'data_jobads_e.csv', 'data_jobads_da.csv', 'data_jobads_rn_20jan.csv', 'data_jobads_e_20jan.csv', 'data_jobads_da_20jan.csv']
data_frame = ['df1', 'df2', 'df3', 'df4', 'df5', 'df6']
to_remove = ['salary', 'schedule', 'benefit', 'location', 'job type', 'office', 'tag', 'employment type', 'email', 'ref.no', 
             'contact name', 'job ref', 'offer in return', 'job title', 'received by', 'signature date', '______', 'block capitals']
date_of_download = ['January 10, 2024', 'January 20, 2024']
before_30_days = ['before December 11, 2023', 'before December 21, 2023']
directory = r'C:\Users\temulenbd\OneDrive\Desktop\learn\github_repo\cct\capstone_project'
df = pd.DataFrame()

In [None]:
# Define a custom function to import a CSV file, add a label column, and return the DataFrame.
def import_and_label(csv, value):
    
    # Declare global variables.
    global df
    
    # Read the CSV file into a DataFrame.
    new_df = pd.read_csv(csv, index_col=None)
    
    # Add a new column 'label' with the specified value.
    new_df['label'] = value
    
    # Concatenate the existing DataFrame 'df' with the new DataFrame.
    df = pd.concat([df, new_df], ignore_index=True)
    
    print('The values of the <'+ csv + '> file were successfully added to the <df>.')

In [None]:
# Define a custom function to replace 'Just posted' or 'Today' with 'January 10, 2024'.
def replace_just_posted_10th(date):
    global date_of_download
    date_download = date_of_download[0]
    return date_download if 'Just posted' in str(date) or 'Today' in str(date) else date

# Define a custom function to replace 'Just posted' or 'Today' with 'January 20, 2024'.
def replace_just_posted_20th(date):
    global date_of_download
    date_download = date_of_download[1]
    return date_download if 'Just posted' in str(date) or 'Today' in str(date) else date

In [None]:
# Define a custom function to extract and replace values with relevant information.
def remove_replace_elements_10th(date):
    global date_of_download
    if date and ('day' in str(date) or 'days' in str(date)):
        days_ago = int(re.search(r'(\d+) (?:day|days) ago', str(date)).group(1))
        new_date = datetime.strptime(date_of_download[0], '%B %d, %Y') - timedelta(days=days_ago)
        return new_date.strftime('%B %d, %Y')
    else:
        return date
    
# Define a custom function extract and replace with relevant information.    
def remove_replace_elements_20th(date):
    global date_of_download
    if date and ('day' in str(date) or 'days' in str(date)):
        days_ago = int(re.search(r'(\d+) (?:day|days) ago', str(date)).group(1))
        new_date = datetime.strptime(date_of_download[1], '%B %d, %Y') - timedelta(days=days_ago)
        return new_date.strftime('%B %d, %Y')
    else:
        return date

In [None]:
# Define the costum function to find and remove unnecessary or private information.
def remove_elements_with_colon(column_value, remove_value):
        
    # Convert the value to lowercase.
    column_value = column_value.lower()
    
    # Split each value of the column into a list using '\n' as a separator.
    elements = column_value.split('\n')
    
    # Find the index of the element containing given value.
    index_of_element = next((i for i, elements in enumerate(elements) if remove_value in elements and ':' in elements), None)
    
    # Check if the first conditions are present in the text.
    if index_of_element is not None:
        next_colon_index = next((j for j in range(index_of_element + 1, len(elements)) if ':' in elements[j]), None)
        
        # Add an extra condition to check the second conditions are present in the text.
        if next_colon_index is not None:
            del elements[index_of_element:next_colon_index]
            return '\n'.join(elements)
        else:
            return column_value
        
    else:
        return column_value

In [None]:
# Define the costum function to find and remove unnecessary or private information.
def remove_unnec_lines(column_value, remove_value):
    # Split each value of the column into a list using '\n' as a separator.
    elements = column_value.split('\n')

    # Function to check if a line contains an email address, phone number, or link
    def is_unwanted_line(line):
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        phone_pattern = r'\b\d+[-.\s+]?\d+[-.\s+]?\d+\b'
        link_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

        return re.search(email_pattern, line) or re.search(phone_pattern, line) or re.search(link_pattern, line)

    # Remove lines containing the specified remove_value
    index_of_element = next((i for i, element in enumerate(elements) if remove_value in element), None)
    if index_of_element is not None:
        elements.pop(index_of_element)

    # Remove lines containing emails, phone numbers, and links
    elements = [element for element in elements if not is_unwanted_line(element)]

    return '\n'.join(elements)


### 2. CLEANING AND TRANSFORMATION

**january 10, 2024**

In [None]:
# Merge csv file from 10th of January into one DataFrame.
for x in range(3):
    import_and_label(csv_file_name[x], label_value[x])

print('--------------------------------')
rows = df.shape[0]
columns = df.shape[1]
print(f'The newly created <df> has {rows} rows and {columns} columns.')

The values of the <data_jobads_rn.csv> file were successfully added to the <df>.
The values of the <data_jobads_e.csv> file were successfully added to the <df>.
The values of the <data_jobads_da.csv> file were successfully added to the <df>.
--------------------------------
The newly created <df> has 1034 rows and 6 columns.


In [None]:
# Remove duplicate rows based on the 'id' column and keep the original rows.
df = df.drop_duplicates(subset='id', keep='first')

print(f'There are {rows-df.shape[0]} duplicate rows were removed from the <df>')
print(f'Now, the <df> contains {df.shape[0]} rows and {df.shape[1]} columns.')

There are 114 duplicate rows were removed from the <df>
Now, the <df> contains 920 rows and 6 columns.


In [None]:
# Extract and replace values of the 'date' column.
df['date'] = df['date'].replace(['not available', 'Hiring ongoing', None], 'unknown').replace(['Posted 30+ days ago'], before_30_days[0]).apply(replace_just_posted_10th).apply(remove_replace_elements_10th)

# Save the values and reset the <df>.
df_10th = df.copy()
df = pd.DataFrame()

# Assuming df is your DataFrame and 'date' is the column
date_counts = df_10th['date'].value_counts()

# Print the counts for each unique value in the 'date' column
print(date_counts)

before December 11, 2023    425
unknown                      72
January 08, 2024             71
December 20, 2023            44
January 05, 2024             40
December 22, 2023            37
January 10, 2024             30
January 09, 2024             28
January 03, 2024             28
January 04, 2024             21
January 06, 2024             18
December 13, 2023            14
December 19, 2023            12
December 14, 2023            10
December 21, 2023            10
December 23, 2023             8
December 12, 2023             8
December 16, 2023             7
December 30, 2023             6
January 02, 2024              6
December 15, 2023             5
January 07, 2024              5
December 29, 2023             3
December 18, 2023             2
January 01, 2024              2
December 28, 2023             2
December 26, 2023             2
December 31, 2023             1
December 11, 2023             1
December 24, 2023             1
December 27, 2023             1
Name: da

**january 20, 2024**

In [None]:
# Merge all csv file into one DataFrame
for x in range(3, 6):
    import_and_label(csv_file_name[x], label_value[x])

print('--------------------------------')
rows = df.shape[0]
columns = df.shape[1]
print(f'The newly created <df> has {rows} rows and {columns} columns.')

The values of the <data_jobads_rn_20jan.csv> file were successfully added to the <df>.
The values of the <data_jobads_e_20jan.csv> file were successfully added to the <df>.
The values of the <data_jobads_da_20jan.csv> file were successfully added to the <df>.
--------------------------------
The newly created <df> has 333 rows and 6 columns.


In [None]:
# Remove duplicate rows based on the 'id' column and keep the original rows.
df = df.drop_duplicates(subset='id', keep='first')

print(f'There are {rows-df.shape[0]} duplicate rows were removed from the <df>')
print(f'Now, the <df> contains {df.shape[0]} rows and {df.shape[1]} columns.')

There are 14 duplicate rows were removed from the <df>
Now, the <df> contains 319 rows and 6 columns.


In [None]:
# Extract and replace values of the 'date' column.
df['date'] = df['date'].replace(['not available', 'Hiring ongoing', None], 'unknown').replace(['Posted 30+ days ago'], before_30_days[1]).apply(replace_just_posted_20th).apply(remove_replace_elements_20th)

# Save the values and reset the <df>.
df_20th = df.copy()
df = pd.DataFrame()

# Assuming df is your DataFrame and 'date' is the column
date_counts = df_20th['date'].value_counts()

# Print the counts for each unique value in the 'date' column
print(date_counts)

January 18, 2024            61
unknown                     40
before December 21, 2023    35
January 20, 2024            32
January 17, 2024            30
January 19, 2024            29
January 16, 2024            26
January 12, 2024            19
January 13, 2024            14
January 11, 2024             9
January 10, 2024             8
January 15, 2024             7
January 09, 2024             3
January 14, 2024             1
December 22, 2023            1
January 06, 2024             1
January 02, 2024             1
January 03, 2024             1
December 23, 2023            1
Name: date, dtype: int64


**combining and finalizing the process in merged data frame**

In [None]:
# Merge two dataframes into one.
df = pd.concat([df_10th, df_20th], ignore_index=True)
rows = df.shape[0]
columns = df.shape[1]

print(f'The newly created <df> has {rows} rows and {columns} columns.')

The newly created <df> has 1239 rows and 6 columns.


In [None]:
# Remove duplicate rows based on the 'id' column and keep the original rows.
df = df.drop_duplicates(subset='link', keep='first')

print(f'There are {rows-df.shape[0]} duplicate rows were removed from the <df>')
print(f'Now, the <df> contains {df.shape[0]} rows and {df.shape[1]} columns.')

There are 0 duplicate rows were removed from the <df>
Now, the <df> contains 1239 rows and 6 columns.


In [None]:
# Remove any private and unnecessary information from the job details.
for val_rem in to_remove:
    df['job_description'] = df['job_description'].apply(lambda x: remove_elements_with_colon(x, val_rem)
                                                        ).apply(lambda x:remove_unnec_lines(x, val_rem))

print('RANDOM EXAMPLES:\n')
print(df.iat[randint(0, 1034), 4])
print('---------')
print(df.iat[randint(0, 1034), 4])

RANDOM EXAMPLES:

senior analyst – trade spend management

glanbia

join this dynamic team focused on delivering better nutrition for every step of life’s journey

the opportunity

the senior analyst – trade spend management will be responsible for leading the development of a trade spend deals glossary and associated processes for tracking, accounting for and management of trade spend and deductions relating to gpn europe’s sports nutrition brands. with initial focus on the uk region, and subsequently expanded to other european markets.

this position will be part of gpn’s europe finance team, reporting into the finance director for uk & ireland.
proactively absorb learnings from the trade spend deals glossary and management processes developed within the us business, and use this to map phases of development for the uk (phase 1), european and international businesses
operate as the finance owner of uk and ireland trade spend process and deductions for sports nutrition and lifestyle b

In [None]:
# Remove all unnecessary symbols.
df['job_description'] = df['job_description'].str.replace(r'[^:;,.\s\w]', '', regex=True)

In [None]:
 # Create the file path for CSV export.
file_path = os.path.join(directory, 'data_jobads_final.csv')
    
# Export the DataFrame to CSV file.
df.to_csv(file_path, index=False)