In [1]:
import requests

# Request headers sent with the hh.ru page fetches made through urllib3 in the
# scraping functions of this notebook. The User-Agent imitates a desktop Chrome browser.
# NOTE(review): 'utf-8' is a character set, not an HTTP content-coding; Accept-Encoding
# normally carries values such as 'gzip' or 'identity' - confirm this header is intentional.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Accept-Encoding': 'utf-8'
}

In [2]:
def recurrent_area_search(areas: list):
    """
        Flatten a nested list of area records into (id, name) pairs.

        areas: list
            - Area dictionaries, each with 'id' and 'name' keys and, optionally,
              an 'areas' key holding a list of child areas in the same format

        returns:
            List of (id, name) tuples in depth-first order: every area is
            immediately followed by all of its descendants
    """
    collected = []
    for node in areas:
        collected.append((node['id'], node['name']))
        # descend only when the node actually carries a non-empty child list
        has_children = 'areas' in node and len(node['areas']) > 0
        if has_children:
            collected += recurrent_area_search(node['areas'])
    return collected

# Fetch the area tree from the HH API and build a lookup of
# lowercase area name -> area id (the id format HH expects in search URLs).
# The timeout keeps the cell from hanging indefinitely if the API is unreachable,
# and raise_for_status() surfaces an HTTP error here instead of a confusing
# failure later while walking an error payload.
areas_response = requests.get('https://api.hh.ru/areas', timeout=30)
areas_response.raise_for_status()

# If two areas share the same lowercase name, the one visited last wins.
area_codes = {
    name.lower(): area_id
    for area_id, name in recurrent_area_search(areas_response.json())
}
list(area_codes.items())[:5]

[('россия', '113'),
 ('республика марий эл', '1620'),
 ('виловатово', '4228'),
 ('волжск', '1621'),
 ('звенигово', '1622')]

In [67]:
import urllib3
from bs4 import BeautifulSoup
import time
import random
import dask

def _build_resume_search_url(query: str, area_code: str, page: int) -> str:
    """ Assemble the hh.ru resume-search URL for one page of results. """
    from urllib.parse import quote, urlencode

    # A list of pairs (not a dict) because 'education_level' is repeated.
    params = [
        ('text', query),
        ('logic', 'normal'),
        ('pos', 'position,education,keywords'),
        ('exp_period', 'all_time'),
        ('exp_company_size', 'any'),
        ('exp_industry', 'any'),
        ('area', area_code),
        ('relocation', 'living_or_relocation'),
        ('age_from', ''),
        ('age_to', ''),
        ('gender', 'unknown'),
        ('salary_from', ''),
        ('salary_to', ''),
        ('currency_code', 'RUR'),
        ('education_level', 'secondary'),
        ('education_level', 'special_secondary'),
        ('education_level', 'unfinished_higher'),
        ('education_level', 'bachelor'),
        ('education_level', 'master'),
        ('education_level', 'higher'),
        ('education_level', 'candidate'),
        ('order_by', 'relevance'),
        ('search_period', 0),
        ('items_on_page', 20),
        ('no_magic', 'false'),
        ('page', page),
    ]
    # urlencode percent-encodes every value, so non-ASCII or reserved characters
    # in the query (e.g. '&', '=', spaces) cannot corrupt the URL.
    return 'https://hh.ru/search/resume?' + urlencode(params, quote_via=quote)


@dask.delayed
def lookup_resumes(query: str, location: str, page: int):
    """ 
        Function for looking up candidates on hh.ru

        query: str
            - Search query that is contained in candidates' resume description
        location: str
            - Location name in lowercase (e.g. 'москва') for searching for candidates;
              must be a key of the module-level area_codes mapping
        page: int
            - Page number for the search. Each page contains 20 resumes

        returns:
            list of resume ids as strings

        raises:
            KeyError if location is not present in area_codes
    """

    http = urllib3.PoolManager()

    # forming a url for accessing page with resumes
    # (the page number is sent as a proper 'page=<n>' parameter)
    lookup_url = _build_resume_search_url(query, area_codes[location], page)

    # requesting page with resumes with given query
    resp = http.request(
        method='GET',
        url=lookup_url,
        headers=headers
    )

    # searching for resumes on HTML page
    soup = BeautifulSoup(resp.data.decode('utf-8', 'ignore'), "html.parser")
    resumes = soup.find_all("div", {"data-resume-id": True})
    # the id is the third '/'-separated piece of the card link, query string stripped
    resume_ids = [r.find('a')['href'].split('?')[0].split('/')[2] for r in resumes]

    # random pause so that parallel tasks do not hit the site in lockstep
    time.sleep(random.uniform(0, 1.5))

    return resume_ids

In [70]:
# Build one delayed lookup_resumes task per result page (pages 0-4, 20 resumes per
# page according to the lookup_resumes docstring) and run them on dask's threaded scheduler.
# The computed value is indexed with [0] by the next cell to reach the per-page lists.
# NOTE(review): 5 pages x 20 would give 100 ids, while the recorded output shows 80 -
# confirm the intended page count.

delayed_candidate_ids = [lookup_resumes(query='аналитик', location='москва', page=page) for page in range(5)]
candidate_ids = dask.compute(delayed_candidate_ids, scheduler="threading")

In [75]:
# dask.compute() hands back a tuple with one entry per computed argument, so the
# per-page lists live in candidate_ids[0]. Flatten them into one list of ids in a
# single pass (sum(lists, []) re-copies the accumulated list at every step).
# The isinstance guard keeps the cell idempotent: once candidate_ids is already the
# flat list, re-running this cell leaves it untouched instead of failing.
if isinstance(candidate_ids, tuple):
    candidate_ids = [cid for page_ids in candidate_ids[0] for cid in page_ids]

In [76]:
# Sanity check: print how many resume ids were collected, then display the first ten.
print(len(candidate_ids))
candidate_ids[:10]

80


['db593959000130b0f90039ed1f534c77687a4c',
 '0bc73da80008adceb80039ed1f577641336d34',
 '75f393310001eae8210039ed1f32496e324f41',
 'c3f95a87000787a0290039ed1f645038385173',
 'd477109d000853a1730039ed1f546d6b644f32',
 'bb70511d0002ea688f0039ed1f7232366b6352',
 '19d29c99000412d2da0039ed1f733548444f34',
 '0f378a6d0008f078690039ed1f4b5136627a48',
 '918275d900050be90d0039ed1f486e7a645547',
 '3143db6c000728ae3e0039ed1f65506d4e416e']

In [62]:
import re

def process_column_data(columns, first_key: str, second_key: str):
    """ 
        Function that converts BeautifulSoup object containing HTML with rows of two columns into 
        dictionary with desired first key for the value in the first column and second key for the
        value in the second column

        columns
            - BeautifulSoup object containing data with rows of columns, or None
        first_key: str
            - Key for the value in the first column
        second_key: str
            - Key for the value in the second column

        returns:
            List (each element represents a row) of dictionaries (each represents columns in the row).
            Empty list when columns is None. When the number of extracted cells is odd they cannot
            be paired, so every cell becomes its own row with first_key set to None.
    """

    if columns is None:
        return []

    # The first 'bloko-column' div is skipped; the remaining ones are reduced to
    # plain text by stripping tags, and non-breaking spaces become regular spaces.
    cells = [
        re.sub('\\xa0', ' ', re.sub(r'<[^>]*>', '', str(el)))
        for el in columns.find_all('div', {'class': 'bloko-column'})[1:]
    ]

    if len(cells) % 2 == 0:
        # Cells alternate first column / second column, so pair even positions with
        # the odd position that directly follows each of them.
        return [
            {first_key: first, second_key: second}
            for first, second in zip(cells[::2], cells[1::2])
        ]

    return [{first_key: None, second_key: cell} for cell in cells]

@dask.delayed
def get_resume_details(resume_id: str):
    """ 
        Returns prettified details on candidate's resume

        resume_id: str
            - Candidate's resume ID, that we get from lookup_resumes() 

        returns:
            Dictionary with information on the candidate, or None when the fetched
            page contains no 'resume-applicant' block. Fields whose markup is
            missing are None (or an empty list for the column-based sections).
    """

    http = urllib3.PoolManager()
    lookup_url = f'https://hh.ru/resume/{resume_id}'
    resp = http.request(
        method='GET',
        url=lookup_url,
        headers=headers
    )

    # getting resume's information
    soup = BeautifulSoup(resp.data.decode('utf-8', 'ignore'), "html.parser")
    resume = soup.find('div', {'class': 'resume-applicant'})

    # No resume block on the page: report the resume as unavailable.
    if resume is None:
        return None

    agesoup = resume.find('span', {'data-qa': 'resume-personal-age'})
    sexsoup = resume.find('span', {'data-qa': 'resume-personal-gender'})
    areasoup = resume.find('span', {'data-qa': 'resume-personal-address'})
    dlicensesoup = resume.find('div', {'data-qa': 'resume-block-driver-experience'})
    jobpossoup = resume.find('li', {'data-qa': 'resume-block-position-specialization'})
    skillssoup = resume.find('div', {'data-qa': 'skills-table'})
    aboutsoup = resume.find('div', {'data-qa': 'resume-block-skills-content'})
    langsoup = resume.find('div', {'data-qa': 'resume-block-languages'})
    extrasoup = resume.find('div', {'data-qa': 'resume-block-additional'})

    # Age is the first run of digits in the age element; None when the element is
    # missing or holds no digits (instead of raising IndexError).
    age_digits = re.findall(r'\d+', agesoup.text) if agesoup is not None else []

    # The licence text sits in the second 'resume-block-container' of the driver
    # block; None when the block is missing or has fewer than two containers.
    licence_blocks = dlicensesoup.find_all('div', {'class': 'resume-block-container'}) \
        if dlicensesoup is not None else []
    d_licence = re.sub('\\xa0', ' ', licence_blocks[1].text) if len(licence_blocks) > 1 else None

    return {
        'age': age_digits[0] if age_digits else None,
        'sex': sexsoup.text if sexsoup is not None else None,
        'area': areasoup.text if areasoup is not None else None,
        'd_licence': d_licence,
        'job_positions': jobpossoup.text if jobpossoup is not None else None,
        # the first <span> of the skills table is skipped
        'skills': [el.text for el in skillssoup.find_all('span')[1:]] if skillssoup is not None else None,
        'about': aboutsoup.text if aboutsoup is not None else None,
        'experience': process_column_data(resume.find('div', {'data-qa': 'resume-block-experience'}),'time','place'),
        'education': process_column_data(resume.find('div', {'data-qa': 'resume-block-education'}),'time','institute'),
        'languages': [el.text for el in langsoup.find_all('p')] if langsoup is not None else None,
        'courses': process_column_data(resume.find('div', {'data-qa': 'resume-block-additional-education'}),'time','course'), 
        'extra': [el.text for el in extrasoup.find_all('p')] if extrasoup is not None else None
    }

In [63]:
# Create one delayed get_resume_details task per collected resume id and run them
# all on dask's threaded scheduler. The computed value is indexed with [0] when the
# DataFrame is built, i.e. the list of per-resume results is its first element.
delayed_candidates = [get_resume_details(cid) for cid in candidate_ids]
candidates_details = dask.compute(delayed_candidates, scheduler="threading")

In [64]:
import pandas as pd

# get_resume_details returns None when a resume page has no resume block; drop those
# entries so that every row handed to pandas is a dict of resume fields.
candidates_df = pd.DataFrame(
    [details for details in candidates_details[0] if details is not None]
)
candidates_df.head()

Unnamed: 0,age,sex,area,d_licence,job_positions,skills,about,experience,education,languages,courses,extra
0,35,Мужчина,Москва,,Аналитик,"[MS Word, MS Excel, MS PowerPoint, 1С: Предпри...","работа в команде , коммуникабельный , быстро у...",[{'time': 'Ноябрь 2022 — по настоящее время10 ...,"[{'time': '2024', 'institute': 'SkyproАналитик...","[Русский — Родной, Английский — A1 — Начальный]",[],"[Гражданство: Россия, Разрешение на работу: Ро..."
1,22,Мужчина,Москва,Права категории B,Продуктовый аналитик,"[, SQL, Математическая статистика, Tеория веро...",Начал изучать и погружаться в аналитику данных...,[{'time': 'Январь 2023 — по настоящее время8 м...,"[{'time': '2022', 'institute': 'Московский ави...","[Русский — Родной, Английский — B1 — Средний]","[{'time': '2023', 'course': 'Курс ""Аналитик да...","[Гражданство: Россия, Разрешение на работу: Ро..."
2,32,Мужчина,Томск,,Аналитик,"[Adobe Photoshop, Ведение переговоров, Интерне...",Сейчас учусь на курсе Аналитик PRO от Changell...,[{'time': 'Сентябрь 2021 — по настоящее время2...,"[{'time': '2019', 'institute': 'Российский Гос...","[Русский — Родной, Английский — A2 — Элементар...","[{'time': '2024', 'course': 'Аналитик PROChang...","[Гражданство: Россия, Разрешение на работу: Ро..."
3,25,Женщина,Москва,Права категории B,Аналитик,"[Уверенное пользование MS Word, Excel, Power P...",• Заинтересована в развитии и работе в области...,"[{'time': 'Апрель 2019 — Июнь 20193 месяца', '...","[{'time': '2019', 'institute': 'Санкт-Петербур...","[Русский — Родной, Английский — B2 — Средне-пр...",[],"[Гражданство: Россия, Разрешение на работу: Ро..."
4,31,Женщина,Москва,,Аналитик,"[Математический анализ, Математическая статист...","Работала аналитиком многие годы, в прошлом год...",[{'time': 'Февраль 2022 — по настоящее время1 ...,"[{'time': '2013', 'institute': 'Московский гос...","[Русский — Родной, Английский — B2 — Средне-пр...","[{'time': '2020', 'course': 'Яндекс.ПрактикумЯ...","[Гражданство: Россия, Разрешение на работу: Ро..."
