<a href="https://colab.research.google.com/github/wwangwe/labour-market-analysis/blob/working/Web_Scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Real-time Kenyan Labour Market Analysis

## Web Scraping



In [1]:
import json
import time
from random import randint

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# Pool of request headers. Each entry carries a different browser
# User-Agent string so that requests can be sent with varying identities.
headers = [
    {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    },
    {
        # Spaces restored after "10_14_5)" and "Gecko)": without them the
        # string is not a well-formed Safari User-Agent.
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    },
    {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    },
    {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/601.7.8 (KHTML, like Gecko)',
    },
    {
        'User-Agent':
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 [FBAN/FBIOS;FBDV/iPhone9,1;FBMD/iPhone;FBSN/iOS;FBSV/13.5.1;FBSS/2;FBID/phone;FBLC/en_US;FBOP/5]',
    },
]


In [3]:
def header(headers: list) -> dict:
    """Pick one header at random.

    Args:
        headers (list): Candidate headers to choose from.

    Returns:
        dict: The element of ``headers`` found at a randomly drawn index.
    """
    last_index = len(headers) - 1
    # randint is inclusive on both ends, hence the "- 1" above.
    return headers[randint(0, last_index)]



In [4]:
def prepare_soup(url: str) -> 'BeautifulSoup':
    """Download a page and parse it into a Beautiful Soup object.

    The request is sent with a randomly chosen header from the module-level
    ``headers`` list and a 5 second timeout.

    Args:
        url (str): Link to jobs page.

    Raises:
        ValueError: Raised when the response status code is not 200.
            Errors raised by ``requests.get`` itself (e.g. a timeout) are
            not caught here and propagate unchanged.

    Returns:
        soup: Browsable bs4 object.
    """
    # The second positional parameter of requests.get is ``params`` (the
    # query string), so the header dict has to be passed by keyword to be
    # sent as HTTP headers.
    response = requests.get(url, headers=header(headers), timeout=5)
    status_code = response.status_code
    if status_code != 200:
        # Single formatted string: passing the code as a second argument
        # made the message render as a tuple.
        raise ValueError(f"Soup Not Created! Status Code: {status_code}")
    return BeautifulSoup(response.text, 'html.parser')


In [5]:
def fetch_jobs(url: str) -> list:
    """Collect the job detail urls listed on one jobs page.

    The urls are read from the first ``application/ld+json`` script tag of
    the page: its JSON content is expected to hold an ``itemListElement``
    list whose entries each carry a ``url`` key.

    Args:
        url (str): Current page with jobs.

    Returns:
        list: Job detail urls, in the order they appear in the JSON data.
    """
    page = prepare_soup(url)
    ld_json_script = page.find_all("script", type="application/ld+json")[0]
    listing = json.loads(ld_json_script.text)
    return [entry['url'] for entry in listing['itemListElement']]


In [6]:
def _element_text(locate) -> str:
    """Return the text of the element found by ``locate``, or 'None'.

    Args:
        locate (callable): Zero-argument callable that looks up one element.

    Returns:
        str: ``locate().text``, or the string 'None' when the lookup hits a
            missing element (AttributeError on ``None``) or a result list
            that is too short (IndexError).
    """
    try:
        return locate().text
    except (AttributeError, IndexError):
        return 'None'


def fetch_job_details(soup: 'BeautifulSoup') -> dict:
    """Fetch details for each job.

    Each dictionary contains details about only one job. Every field is
    looked up independently; a field whose selector no longer matches is
    stored as the string 'None' instead of aborting the whole job.

    Args:
        soup (BeautifulSoup): Browsable bs4 object.

    Returns:
        dict: Dictionary of job details with the keys ``title``,
            ``job_function``, ``location``, ``industry``, ``description``
            and ``qualifications`` (in that order).
    """
    # Lookups are wrapped in lambdas so they run inside _element_text,
    # where both failure modes (missing tag, short find_all list) are caught.
    locators = {
        'title':
        lambda: soup.find('h1', 'job-header__title'),
        'job_function':
        lambda: soup.find('div', 'hide-under-lg').find_all('h2')[1],
        'location':
        lambda: soup.find('a', 'job-header__location'),
        'industry':
        lambda: soup.find('span', 'job-header__location').find('a'),
        'description':
        lambda: soup.find_all(
            'div', 'customer-card__content-segment')[0].find('p'),
        'qualifications':
        lambda: soup.find('div', 'description-content__content'),
    }
    return {field: _element_text(locate) for field, locate in locators.items()}



In [7]:
# Base url of the job listings; main() appends "?page=<n>" to it.
url = "https://www.brightermonday.co.ke/jobs"

def main(url, start_page=50, last_page=50):
    """Scrape job details from consecutive listing pages.

    For every page from ``start_page`` to ``last_page`` the page url is
    printed, the job detail urls on it are fetched, and each job page is
    downloaded and parsed. A random pause of 1 to 5 seconds is taken
    between two listing pages.

    Args:
        url (str): Base listing url; ``?page=<n>`` is appended to it.
        start_page (int): First page number to scrape. Defaults to 50.
        last_page (int): Last page number to scrape, inclusive. Defaults
            to 50, so calling ``main(url)`` scrapes page 50 only.

    Returns:
        job_data (list): One dictionary of job details per scraped job,
            accumulated over all requested pages.
    """
    # Created once, outside the page loop, so results from earlier pages
    # are kept when more than one page is scraped.
    job_data = []
    for page in range(start_page, last_page + 1):
        current_url = f'{url}?page={page}'
        print(current_url)
        for job_url in fetch_jobs(current_url):
            # prepare_soup either returns a soup or raises, so no None
            # check is needed before parsing.
            job_data.append(fetch_job_details(prepare_soup(job_url)))
        if page < last_page:
            # Pause only when another page follows.
            time.sleep(randint(1, 5))
    return job_data
        
# Run the scraper. Kept as a bare expression so the notebook displays the
# returned list of job dictionaries as the cell output.
main(url)

https://www.brightermonday.co.ke/jobs?page=50


[{'description': '\nThe role holder will ensure the 24/7 stability, integrity, and efficient operation of the Software Systems and appliances that support core business, through proven communication, analytical, problem-solving skills, and innovation to help identify, communicate, and resolve issues to maximize the benefit of IT systems investments for the entire KCB Group.\n',
  'industry': '\nBanking, Finance & Insurance\n',
  'job_function': '\nEngineering & Technology\n',
  'location': '\nNairobi\n',
  'qualifications': 'Key Responsibilities:Collaborate with other IT staff to ensure smooth and reliable\noperation of software and systems for fulfilling business objectives and\nprocesses.Implement and adhere to regular systems maintenance policies and\nprocedures, including change request mechanisms and update schedules to\nensure optimum uptime and service availability.Conduct research on software systems products to justify recommendations and to support purchasing efforts.Particip