In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm

In [2]:
def cook(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

def process_date(date):
    if "ngày" in date:
        date = datetime.today() - timedelta(days = int(date[2]))
    else:
        date = datetime.today()
    date = date.date().strftime('%d/%m/%Y')
    return date


def add_job_data(job_id, job, data):
    data["Job ID"].append(job_id)

    title = job.find("a", class_="job-link -no-underline -desktop-only show-job-description").text
    data["Title"].append(title)

    company = job.find("span", class_="job-company").text
    data["Company"].append(company)

    location = job.find("a", class_="job-location clickable-link").text
    data["Location"].append(location)

    date = job.find("span", class_="job-listed-date").text.split()
    date = process_date(date)
    data["Date"].append(date)

    link = "https://www.jobstreet.vn" + job.find("a", href=True)["href"]
    data["Link"].append(link)

    soup = cook(link)
    description = soup.find("div", class_="-desktop-no-padding-top", id="job-description-container").text.replace("\n", "")
    data["Description"].append(description)


def get_job_data(job_list):
    data = {"Job ID": [], "Title": [], "Company": [], "Location": [], "Date": [], "Description": [], "Link": []}

    job_id = 0
    for job_name in tqdm(job_list):
        job_url = "https://www.jobstreet.vn/j?q=" + job_name
        soup = cook(job_url)
        last_page = int(soup.find("div", class_="search-results-page-number").text.split()[3])

        for page in tqdm(range(1, last_page + 1)):
            page_url = job_url + "&p=" + str(page)
            soup = cook(page_url)
            jobs = soup.find_all("div", class_="job-card result sponsored-job premium-job spon-top") + soup.find_all("div", class_="job-card result organic-job")

            for job in jobs:
                job_id += 1
                add_job_data(job_id, job, data)

    df = pd.DataFrame(data=data)
    df = df.drop_duplicates()
    return df

In [3]:
job_list = ["data science", "AI engineer", "data engineer", "data analysis", "software engineer"]
df = get_job_data(job_list)

100%|██████████| 50/50 [06:38<00:00,  7.98s/it]
100%|██████████| 25/25 [02:16<00:00,  5.44s/it]
100%|██████████| 50/50 [06:35<00:00,  7.91s/it]
100%|██████████| 50/50 [06:29<00:00,  7.79s/it]
100%|██████████| 50/50 [06:32<00:00,  7.86s/it]
100%|██████████| 5/5 [28:35<00:00, 343.13s/it]


In [6]:
print(df.shape[0])
df.head()

3131


Unnamed: 0,Job ID,Title,Company,Location,Date,Description,Link
0,1,Data Management (IT Background),S International Logistic VN,"Quận Cầu Giấy, Hà Nội",25/06/2024,"RESPONSIBILITIES: Designing, developing, and i...",https://www.jobstreet.vn/vi%E1%BB%87c/Data-Man...
1,2,KỸ SƯ DỰ ÁN (DIGITAL TRANSFORMATION PE),ESEC,Hồ Chí Minh,13/06/2024,KỸ SƯ DỰ ÁN (DIGITAL TRANSFORMATION PE) Part ...,https://www.jobstreet.vn/vi%E1%BB%87c/K%E1%BB%...
2,3,"[Khối Công Nghệ] Data Scientist (Data Analyst,...",ABBANK - Ngân Hàng TMCP An Bình,Hà Nội,05/07/2024,Bằng cấp: - Tốt nghiệp hệ chính quy các Trường...,https://www.jobstreet.vn/vi%E1%BB%87c/Kh%E1%BB...
3,4,"Data Scientist (Data Analyst, Python)",ITviec,Hà Nội,26/06/2024,Top 3 reasons to join usLương/ thưởng hấp dẫnC...,https://www.jobstreet.vn/vi%E1%BB%87c/Data-Sci...
4,5,[AY 24-25] Faculty of Data Science,VIN UNIVERSITY,Việt Nam,05/07/2024,ABOUT VINUNIVERSITY VinUniversity (VinUni: htt...,https://www.jobstreet.vn/vi%E1%BB%87c/AY-24-25...


In [7]:
df.to_csv('job_description.csv')