In [1]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd

## Extract all articles from the job board

In [2]:
# Page size requested from the job board on every search call.
RECORDS_PER_PAGE = 50


def get_url(offset):
    """Build the search-results URL for the page that starts at ``offset``.

    ``offset`` is interpolated verbatim into the ``pipelineOffset`` query
    parameter; ``RECORDS_PER_PAGE`` fills ``pipelineRecordsPerPage``.
    """
    return (
        "https://mycareer.hsbc.com/en_GB/external/SearchJobs/"
        f"?pipelineRecordsPerPage={RECORDS_PER_PAGE}&pipelineOffset={offset}"
    )

In [3]:
# Text the site shows in place of a job title once the offset runs past the
# last posting.
error_message = "There are no jobs that match your search criteria. Please amend your filters and try again"


def end_of_articles(articles):
    """Tell whether a page of articles marks the end of the listing.

    Args:
        articles: sequence of article elements from one results page; each
            element is expected to expose an ``h3`` attribute with ``.text``.

    Returns:
        ``True`` when the page is empty or when the first article's ``<h3>``
        text (stripped of spaces and newlines) equals ``error_message``;
        ``False`` otherwise, including when the first article has no ``<h3>``.
    """
    # An empty page has nothing left to collect; treat it as the end instead
    # of failing on articles[0].
    if not articles:
        return True
    heading = articles[0].h3
    # Without a heading the article cannot be the "no jobs" placeholder.
    if heading is None:
        return False
    return heading.text.strip(" \n\r") == error_message

In [4]:
def get_articles(offset, timeout=30):
    """Download one results page and return its ``<article>`` elements.

    Args:
        offset: value passed to ``get_url`` to select the page.
        timeout: seconds to wait for the server before giving up; forwarded
            to the HTTP request so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        The result of ``find_all("article")`` on the
        ``section__content__results`` container of the page.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        RuntimeError: if the page has no ``section__content__results``
            container, so no articles can be located.
    """
    url = get_url(offset)
    # `re` is this notebook's alias for `requests`, not the stdlib regex module.
    response = re.get(url, timeout=timeout)
    # Fail on HTTP errors rather than trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    results = soup.find("div", {"class": "section__content__results"})
    if results is None:
        raise RuntimeError(
            f"Results container 'section__content__results' not found at {url}"
        )
    return results.find_all("article")

In [6]:
# Walk the paginated listing one request at a time, accumulating every
# article until a page is empty or shows the "no jobs" placeholder.
total_articles = []
offset = 0
articles = get_articles(offset)
print("Postings obtenidos: ")
# `articles and ...` stops cleanly on an empty page instead of indexing into it.
while articles and not end_of_articles(articles):
    print(offset, end=" ")
    total_articles.extend(articles)
    # Step by the page size used in the request URL so that changing
    # RECORDS_PER_PAGE cannot skip or duplicate postings.
    offset += RECORDS_PER_PAGE
    articles = get_articles(offset)

Postings obtenidos: 
0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 2600 2650 2700 2750 2800 2850 2900 2950 3000 3050 3100 3150 3200 3250 3300 3350 3400 

## Extract data from each article

In [7]:
# Job Post items container
def get_data_container(article):
    """Return the ``item__container`` elements of an article's subtitle block.

    Looks up the element with class ``article__header__text__subtitle`` inside
    ``article`` and returns every descendant with class ``item__container``.
    """
    subtitle = article.find(attrs={"class": "article__header__text__subtitle"})
    return subtitle.find_all(attrs={"class": "item__container"})

In [8]:
# Extracting functions
def _strip_ws(text):
    """Trim leading/trailing spaces, newlines and carriage returns."""
    return text.strip(" \n\r")


def get_article_title(article):
    """Return the stripped text of the link inside the article's ``<h3>``."""
    link = article.find("h3").find("a")
    return _strip_ws(link.string)


def get_article_location(container):
    """Return the stripped text of the ``location`` element in item 0."""
    location = container[0].find(attrs={"class": "location"})
    return _strip_ws(location.string)


def get_article_sector(container):
    """Return the stripped text of the first ``<span>`` in item 1."""
    return _strip_ws(container[1].find("span").string)


def get_article_hours(container):
    """Return the stripped text of the first ``<span>`` in item 2."""
    return _strip_ws(container[2].find("span").string)


def get_article_worktype(container):
    """Return the stripped text of the first ``<span>`` in item 3."""
    return _strip_ws(container[3].find("span").string)


def get_article_dates(container):
    """Return the date strings found in the first ``<span>`` of item 4.

    All spaces are removed, the text is split on a blank line followed by a
    dash, and each resulting piece is stripped of surrounding newlines.
    """
    raw = container[4].find("span").string
    pieces = raw.replace(" ", "").split("\n\n-")
    return [_strip_ws(piece) for piece in pieces]


def get_job_link(article):
    """Return the ``href`` of the first link in the article's actions block."""
    actions = article.find(attrs={"class": "article__header__actions"})
    return actions.a["href"]

In [9]:
def get_data_from_articles(articles):
    """Convert article elements into a list of plain record dicts.

    Each record has the keys ``location``, ``sector``, ``hours``,
    ``worktype`` and ``dates``, filled by the matching ``get_article_*``
    extractor applied to the article's item containers.
    """

    def _to_record(article):
        # Fetch the item containers once and feed them to every extractor.
        items = get_data_container(article)
        return {
            "location": get_article_location(items),
            "sector": get_article_sector(items),
            "hours": get_article_hours(items),
            "worktype": get_article_worktype(items),
            "dates": get_article_dates(items),
        }

    return [_to_record(article) for article in articles]

In [10]:
# NOTE(review): the slice drops the last collected article before parsing;
# the reason is not evident from this notebook, so confirm it is intended.
articles_to_parse = total_articles[:-1]
df = pd.DataFrame(get_data_from_articles(articles_to_parse))

In [11]:
# Each "dates" cell holds a list; copy its first two elements into their own
# scalar columns.
df["from_date"] = df["dates"].str.get(0)
df["to_date"] = df["dates"].str.get(1)

In [12]:
# The list-valued column is no longer needed once it has been split out.
del df["dates"]

In [13]:
# Bare last expression: the notebook renders the frame below this cell.
df

Unnamed: 0,location,sector,hours,worktype,from_date,to_date
0,"Hyderabad, India",Technology,Permanent - Full Time,Office Working,19-Sep-2022,31-Dec-2025
1,"Sydney, Australia",Procurement,Permanent - Full Time,Hybrid Working,19-Sep-2022,03-Oct-2022
2,"Shanghai, Mainland China",Branch and Retail Banking,Permanent - Full Time,Office Working,19-Sep-2022,30-Nov-2022
3,"Singapore, Singapore",Insurance,Permanent - Full Time,Hybrid Working,19-Sep-2022,03-Oct-2022
4,"Hyderabad, India",Technology,Permanent - Full Time,Office Working,19-Sep-2022,31-Dec-2025
...,...,...,...,...,...,...
3405,"Kowloon Bay, Hong Kong",Call Centre,Permanent - Full Time,Office Working,05-Dec-2021,31-Dec-2025
3406,"Beijing, Mainland China",Insurance,Fixed Term - Full Time,Office Working,01-Dec-2021,31-Dec-2025
3407,"Kowloon Bay, Hong Kong",Call Centre,Permanent - Full Time,Office Working,18-Nov-2021,31-Dec-2025
3408,"Paris, France",Branch and Retail Banking,Permanent - Full Time,Office Working,28-Sep-2021,30-Sep-2022


In [14]:
# Persist the scraped postings; the row index is written as the first column.
df.to_csv("hsbc_jobs_data.csv", index=True)