#### Scraping Script

In [84]:
from bs4 import BeautifulSoup
import requests
import json
import os
import pandas as pd
import time
import random

In [99]:
def parse_one_page_vacancies(html_text: str) -> list:
    """
    Extract vacancy titles and links from a chunk of DOU listing HTML.

    Every element with class ``l-vacancy`` is inspected. Those that contain a
    ``.vt`` element contribute one ``{"title": ..., "link": ...}`` dict: the
    title is that element's stripped text, the link is its ``href`` attribute
    (``None`` when the attribute is absent). Elements without ``.vt`` are skipped.
    """
    document = BeautifulSoup(html_text, "html.parser")
    # Lazily look up the title element of each vacancy card.
    title_nodes = (
        card.select_one(".vt") for card in document.find_all(class_="l-vacancy")
    )
    return [
        {"title": node.get_text(strip=True), "link": node.get("href")}
        for node in title_nodes
        if node
    ]

In [129]:
def get_vacancies_list(url, how_many=100, auto_save=True, logs = True):
    """
    Load vacancy titles and links from a DOU listing page.

    The first batch is parsed from the listing page itself; further batches are
    requested from the ``xhr-load`` endpoint until at least ``how_many``
    vacancies are collected or the server has nothing new to return.

    Args:
        url: Listing URL, e.g. ``https://jobs.dou.ua/vacancies/?category=Python``.
            Its query string is forwarded to the ``xhr-load`` endpoint.
        how_many: Stop once this many vacancies are collected. Batches are kept
            whole, so the result may contain more than ``how_many`` items.
        auto_save: When true, dump everything collected so far to
            ``<last url segment>.json`` in the working directory after each batch.
        logs: When true, print the CSRF token and the running total.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts without duplicates.

    Raises:
        requests.HTTPError: If the listing page or a batch request fails.
        ValueError: If the listing page contains no CSRF token input.
    """
    all_vacancies = []
    seen = set()

    def _add_unseen(batch):
        # Append only vacancies that were not collected yet; return how many were added.
        added = 0
        for vacancy in batch:
            key = (vacancy["title"], vacancy["link"])
            if key not in seen:
                seen.add(key)
                all_vacancies.append(vacancy)
                added += 1
        return added

    base_url = url
    # partition() keeps everything after the first "?" and yields "" when there is no query.
    query = base_url.partition("?")[2]
    ajax_url = f"https://jobs.dou.ua/vacancies/xhr-load/?{query}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Referer": base_url,
    }
    session = requests.Session()
    session.headers.update(headers)

    # A timeout keeps one stalled connection from hanging the whole run.
    resp = session.get(base_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    _add_unseen(parse_one_page_vacancies(resp.text))

    token_input = soup.select_one("input[name=csrfmiddlewaretoken]")
    if token_input is None or not token_input.get("value"):
        raise ValueError(f"CSRF token not found on {base_url}")
    csrf_token = token_input["value"]
    if logs:
        print("CSRF token:", csrf_token)

    while len(all_vacancies) < how_many:
        time.sleep(2 + random.randint(1, 2) * random.random())
        data = {
            "csrfmiddlewaretoken": csrf_token,
            # The endpoint treats "count" as the number of vacancies already loaded.
            # A fixed +20 step made consecutive batches overlap and stored duplicates.
            "count": len(all_vacancies),
        }
        resp_2 = session.post(ajax_url, data=data, timeout=30)
        resp_2.raise_for_status()
        if not resp_2.text.strip():
            break
        text_2 = json.loads(resp_2.text)["html"]
        # Stop as soon as a batch brings nothing new; otherwise the loop would never end.
        if _add_unseen(parse_one_page_vacancies(text_2)) == 0:
            break

        # auto save
        if auto_save:
            with open(f"{url.split('=')[-1].split('/')[-1]}.json", "w") as f:
                json.dump(all_vacancies, f)

        if logs:
            print("downloaded:", len(all_vacancies))

    return all_vacancies

In [None]:
# DOU listing pages to scrape: one per category value, plus one free-text search.
urls = [
    f"https://jobs.dou.ua/vacancies/?category={category}"
    for category in (
        "Java",
        "Python",
        ".NET",
        "AI/ML",
        "PHP",
        "Golang",
        "iOS/macOS",
        "Android",
        "C%2B%2B",
        "QA",
        "Front%20End",
        "Project%20Manager",
        "Node.js",
        "Product%20Manager",
        "Design",
        "Sales",
        "Marketing",
        "DevOps",
    )
]
urls.append("https://jobs.dou.ua/vacancies/?search=govtech")

In [131]:
# Scrape every listing and store its vacancies as ./data/<name>.json.
# open(..., "w") does not create missing directories, so make sure ./data exists first.
os.makedirs("./data", exist_ok=True)
for url in urls:
    print(url.split('=')[-1])
    all_vacancies = get_vacancies_list(url, 100)
    # The file name is the last "/"-separated piece of the query value ("AI/ML" -> "ML").
    with open(f"./data/{url.split('=')[-1].split('/')[-1]}.json", "w", encoding="utf-8") as f:
        json.dump(all_vacancies, f)

AI/ML
CSRF token: sE9zavYDPPwAWJk9AhBqDb0qmK5WAVapjNRyUZIcmY3v72uCNyl9GBTqWOpwUUs6
downloaded: 60
downloaded: 100
PHP
CSRF token: O86iEvj1Nw0j3dknupOkDf5Uliv8OPkdHRut9ULoE7IHT3i21xTPXGcpnJxQeqTI
downloaded: 60
downloaded: 100
Golang
CSRF token: iEWYAyDI0K1vOZIr4y0HXbtxWNOAJmEouRS2rLStbDTLwrTc2Lekj87lWhJyfSF6
downloaded: 50
downloaded: 60
iOS/macOS
CSRF token: aCUp4VvJf8qXbYRGTQuVAtjVDLSsoeTpRe4KnrfF32KYv0Y9QJx8FoghHVpL4eIS
downloaded: 56
downloaded: 72
Android
CSRF token: swWCfLi48rYJA6G3UCBCYIOVlscPa0dIJ55Fp1zSYmzjfgWk3kZgWkVB2FFGdDT0
downloaded: 53
downloaded: 66
C%2B%2B
CSRF token: Fu1fqBjsBDdjQhUz2MMglL81GJYXqph5nXohl7TiqwPX0wNvwJlRSQuiF0SYj5AX
downloaded: 60
downloaded: 100
QA
CSRF token: yYP4cNiP41d0RqJ7PI1wlCetaH2FxN4Z15Emh7P5b2pJPlZ5daapFivJ3AgghxyY
downloaded: 60
downloaded: 100
Front%20End
CSRF token: RaomfHtgyGuggrzc9YsnQk1isCAQ2SY4Ill2O8SBmn7WZN0ZvAAdUWOsfuNQpFyj
downloaded: 60
downloaded: 100
Project%20Manager
CSRF token: 5t8PjOzfKRgobPb6xkL630erlaP6uqXvgOynlpQigv5t4i3o6p8

#### Vacancy Texts

In [170]:
import html2text

In [169]:
def extract_vacancy_text(url: str) -> str:
    """
    Download a vacancy page and return its description as Markdown.

    Args:
        url: Address of the vacancy page.

    Returns:
        The ``div`` with class ``b-typo vacancy-section`` converted to Markdown
        by ``html2text`` and stripped of surrounding whitespace; an empty string
        when the page has no such block.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36",
    }
    # A timeout keeps one stalled connection from hanging a long download loop.
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    container = soup.find("div", class_="b-typo vacancy-section")
    if container is None:
        # Without this guard str(None) is converted and the literal text "None" is returned.
        return ""

    return html2text.html2text(str(container)).strip()

In [175]:
from tqdm import tqdm

In [203]:
# Download the description of every scraped vacancy into ./texts/<name>/<index>.md,
# where <name> is the JSON file name without its extension and <index> is the
# vacancy's position in that file.
for file_name in os.listdir("./data"):
    if not file_name.endswith(".json"):
        continue  # skip anything that is not a JSON dump
    # splitext drops only the final ".json", so dotted names such as "Node.js" stay whole
    # (split(".")[0] would cut them at the first dot).
    name = os.path.splitext(file_name)[0]
    os.makedirs(f"./texts/{name}/", exist_ok=True)

    with open(f"./data/{file_name}", "r", encoding="utf-8") as f:
        vacancies = json.load(f)

    # Wrapping the list (not the enumerate object) lets tqdm know the total and show an ETA.
    for idx, vacancy in enumerate(tqdm(vacancies, desc=name)):
        link = vacancy["link"]
        text = extract_vacancy_text(link)

        # rate handle
        time.sleep(1 + random.randint(0, 2) * random.random())

        with open(f"./texts/{name}/{idx}.md", "w", encoding="utf-8") as fw:
            fw.write(text)

72it [02:11,  1.83s/it]
100it [03:03,  1.84s/it]
100it [02:59,  1.80s/it]
100it [03:03,  1.84s/it]
100it [02:58,  1.78s/it]
100it [03:07,  1.88s/it]
100it [02:51,  1.72s/it]
100it [02:59,  1.80s/it]
100it [02:58,  1.79s/it]
100it [02:54,  1.74s/it]


#### Fix Indexes and Unicode

In [211]:
# Re-save every JSON file from ./data-done into ./data, adding a positional "index"
# to each item and writing non-ASCII characters as-is (ensure_ascii=False) instead
# of \uXXXX escapes.
os.makedirs("data", exist_ok=True)  # open(..., "w") does not create missing directories
for file_name in os.listdir("./data-done"):
    if not file_name.endswith(".json"):
        continue  # skip anything that is not a JSON dump

    with open(f"data-done/{file_name}", "r", encoding="utf-8") as f:
        data = json.load(f)

    # The index matches the position used for the ./texts/<name>/<index>.md file names.
    for j, item in enumerate(data):
        item["index"] = j

    with open(f"data/{file_name}", "w", encoding="utf-8") as fl:
        json.dump(data, fl, ensure_ascii=False, indent=2)

    print(file_name)

Android.json
Cpp.json
Design.json
DevOps.json
dotNET.json
FrontEnd.json
Golang.json
govtech.json
Java.json
macOS.json
Marketing.json
ML.json
Node.json
PHP.json
ProductManager.json
ProjectManager.json
Python.json
QA.json
Sales.json
