# In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import os
import re
from IPython.display import clear_output

def fetch_job_data(url, timeout=30):
    """Collect organization and job links from a merojob.com listing page.

    Args:
        url: Address of the page that contains the job cards.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of ``[organization_name, organization_href, job_title,
        job_position_href]`` rows, one per job link found inside a job card.
        The list is empty when the page does not answer with status 200 or
        when no job card is found.
    """
    # Local import so the block does not depend on a file-level import.
    from urllib.parse import urljoin

    site_root = "https://www.merojob.com"

    # Without a timeout a stalled connection would block the run forever.
    res = requests.get(url, timeout=timeout)
    if res.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {res.status_code}")
        return []

    soup = BeautifulSoup(res.content, 'html.parser')
    job_data = []

    job_cards = soup.find_all('div', class_='col-md-4 border-right border-bottom job-card')
    for job_card in job_cards:
        organization_name_meta = job_card.find('meta', itemprop='name')
        organization_name = organization_name_meta.get('content', '') if organization_name_meta else ''

        organization_href_tag = job_card.find('a', class_='h6 mb-1')
        organization_path = organization_href_tag.get('href') if organization_href_tag else None
        # urljoin copes with relative and already-absolute hrefs alike.
        organization_href = urljoin(site_root, organization_path) if organization_path else ''

        for position_tag in job_card.find_all('a', class_='job_title hover-primary'):
            position_path = position_tag.get('href')
            if not position_path:
                # A link without a target cannot be followed later on, and
                # would otherwise be recorded as the bare site root.
                continue

            job_position = position_tag.get('title', '')
            # Titles look like "<position> - Apply Before ..."; keep only
            # the position part when that suffix is present.
            match = re.search(r'(.+?) - Apply Before', job_position)
            job_title = match.group(1).strip() if match else job_position

            job_position_href = urljoin(site_root, position_path)
            job_data.append([organization_name, organization_href, job_title, job_position_href])

    return job_data

def save_job_data_to_csv(job_data, filename):
    """Write the scraped job rows to ``filename``, replacing any old content.

    Args:
        job_data: Iterable of rows; each row is an iterable of cell values.
        filename: Path of the CSV file to create or overwrite.

    The file starts with a fixed four-column header row, followed by the
    rows of ``job_data`` in order. A confirmation line is printed afterwards.
    """
    header = ['Organization Name', 'Organization Link', 'Job Position', 'Position Link']
    # newline='' lets the csv module control the line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows([header, *job_data])
    print(f'CSV file "{filename}" has been successfully created.')

def _join_list_items(anchor):
    """Return the ``<li>`` texts of the first ``<ul>`` after ``anchor``.

    The texts are joined with newlines. ``None`` is returned when ``anchor``
    is ``None`` or no ``<ul>`` follows it.
    """
    if anchor is None:
        return None
    listing = anchor.find_next('ul')
    if listing is None:
        return None
    return '\n'.join(item.text for item in listing.find_all('li'))


def extract_job_info(url, timeout=30):
    """Download a job detail page and extract its fields into a dict.

    Args:
        url: Address of the job detail page.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A dict mapping field names to strings, or ``None`` when the page
        could not be retrieved (request error or a status other than 200).
        ``'Job Title'`` and ``'Deadline Date'`` are always present; every
        other key is added only when the matching page section is found, so
        the key set can differ from one job to the next.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report a network failure the same way as a bad status code so one
        # unreachable page does not abort the caller's whole loop.
        print(f"Failed to retrieve the webpage. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    job_info = {}

    title_tag = soup.find('h1', itemprop='title')
    job_info['Job Title'] = title_tag.text if title_tag else 'N/A'

    deadline_tag = soup.find('meta', itemprop='validThrough')
    job_info['Deadline Date'] = deadline_tag.get('content', 'N/A') if deadline_tag else 'N/A'

    # Walk down one level at a time: any of these containers may be absent,
    # and chaining .find() on a missing one raises AttributeError.
    card_group = soup.find('div', class_='card-group')
    card_body = card_group.find('div', class_='card-body') if card_group else None
    basic_info_table = card_body.find('table') if card_body else None
    if basic_info_table is not None:
        for row in basic_info_table.find_all('tr'):
            cells = row.find_all('td')
            # The first cell is used as the key and the third as the value
            # (presumably label / separator / value); skip shorter rows.
            if len(cells) < 3:
                continue
            job_info[cells[0].get_text()] = cells[2].get_text().strip()

    job_specification = soup.find('h3', string='Job Specification')
    job_specification_table = job_specification.find_next('table') if job_specification else None
    if job_specification_table is not None:
        education = job_specification_table.find('span', itemprop='educationRequirements')
        job_info['Education Level'] = education.text if education else 'N/A'

        experience = job_specification_table.find('span', itemprop='experienceRequirements')
        job_info['Experience Required'] = experience.text if experience else 'Not specified'

        skills_span = job_specification_table.find('span', itemprop='skills')
        if skills_span:
            badges = skills_span.find_all('span', class_='badge badge-light border rounded p-1')
            job_info['Professional Skills'] = ', '.join(tag.text.strip() for tag in badges)
        else:
            job_info['Professional Skills'] = 'N/A'

    # Free-text sections: each is a heading followed by a bullet list.
    list_sections = (
        ('Qualifications and Experience', soup.find('strong', string='Qualification & Experience:')),
        ('Skills', soup.find('strong', string='Skills:')),
        ('Job Description', soup.find('h3', string='Job Description')),
    )
    for field, heading in list_sections:
        joined = _join_list_items(heading)
        if joined is not None:
            job_info[field] = joined

    return job_info

def save_job_infos_to_csv(job_infos, filename):
    """Append job dictionaries to a CSV file, keeping values under their keys.

    Args:
        job_infos: List of dicts mapping column names to values. The dicts
            may have different key sets.
        filename: Path of the CSV file to create or append to.

    When the file is missing or empty, a header is written first. It is the
    union of all keys in ``job_infos``, in first-seen order. When the file
    already has a header, rows are aligned to it. Keys absent from a row are
    written as empty cells. Keys that the existing header does not contain
    cannot be stored without rewriting the file, so they are skipped and
    reported on stdout.
    """
    if not job_infos:
        print("No job information to save.")
        return

    # dict.fromkeys de-duplicates while preserving first-seen order.
    batch_columns = list(dict.fromkeys(key for job_info in job_infos for key in job_info))

    existing_header = []
    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        with open(filename, newline='', encoding='utf-8') as csvfile:
            existing_header = next(csv.reader(csvfile), [])

    fieldnames = existing_header or batch_columns
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        # DictWriter places each value by key; writing dict.values()
        # positionally misaligns rows whose key sets differ.
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='', extrasaction='ignore')
        if not existing_header:
            writer.writeheader()
        writer.writerows(job_infos)

    skipped = [column for column in batch_columns if column not in fieldnames]
    if skipped:
        print(f'Columns missing from the header of "{filename}" were skipped: {", ".join(skipped)}')

    print(f'CSV file "{filename}" has been successfully updated.')

def main():
    """Scrape the merojob.com front page, then every job page it links to.

    The listing rows are saved to ``links_to_jobs.csv``. That file is read
    back with pandas, each ``Position Link`` is fetched in turn, and the
    details that could be extracted are saved to ``job_informations.csv``.
    """
    base_url = "https://www.merojob.com/"
    job_links_csv = "links_to_jobs.csv"
    job_infos_csv = "job_informations.csv"

    save_job_data_to_csv(fetch_job_data(base_url), job_links_csv)

    links = pd.read_csv(job_links_csv)
    total = len(links)
    job_infos = []
    for position, link in enumerate(links['Position Link'], start=1):
        # Replace the previous progress line in the notebook output.
        clear_output(wait=True)
        print(f"Processing job {position} of {total}")
        details = extract_job_info(link)
        if details:
            job_infos.append(details)

    save_job_infos_to_csv(job_infos, job_infos_csv)

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

# Output of the run above:
# Processing job 196 of 196
# CSV file "job_informations.csv" has been successfully updated.
