In [14]:
import requests

# Request headers passed to the urllib3 requests made by lookup_resumes()
# and get_resume_details().
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    # Accept-Encoding negotiates the content *coding* (gzip, deflate, identity, ...),
    # not the character set, so 'utf-8' is not a valid value for it.
    # 'identity' explicitly asks for an uncompressed body.
    'Accept-Encoding': 'identity'
}

In [11]:
def recurrent_area_search(areas: list):
    """ 
        Flattens a nested list of areas into a flat list of (id, name) pairs

        areas: list
            - Area dictionaries. Each one has 'id' and 'name' keys and may have
              an 'areas' key holding child areas in the same format

        returns:
            List of (id, name) tuples in depth-first order: every area is
            directly followed by all of its descendants
    """
    pairs = []
    for node in areas:
        pairs.append((node['id'], node['name']))
        # a missing 'areas' key is treated like an empty list of children
        children = node['areas'] if 'areas' in node else []
        if len(children) > 0:
            pairs += recurrent_area_search(children)
    return pairs

# Getting area names and their codes in HH appropriate format.
# The timeout keeps this cell from hanging forever if api.hh.ru does not answer,
# and raise_for_status() fails loudly on an HTTP error instead of passing an
# error payload on to recurrent_area_search().
areas_response = requests.get('https://api.hh.ru/areas', timeout=30)
areas_response.raise_for_status()
area_tree = areas_response.json()

# Lowercased area name -> area id. Areas sharing the same lowercased name
# collapse into a single entry: the one produced last by recurrent_area_search() wins.
area_codes = {name.lower(): area_id for area_id, name in recurrent_area_search(area_tree)}
list(area_codes.items())[:5]

[('россия', '113'),
 ('республика марий эл', '1620'),
 ('виловатово', '4228'),
 ('волжск', '1621'),
 ('звенигово', '1622')]

In [8]:
from urllib import parse
import urllib3
from bs4 import BeautifulSoup
import time
import random

def lookup_resumes(query: str, location: str, page: int):
    """ 
        Function for looking up candidates on hh.ru

        query: str
            - Search query that is contained in candidates' resume description
        location: str
            - Location name (e.g. 'москва') for searching for candidates.
              It is lowercased before being looked up in `area_codes`
        page: int
            - Page number for the search. Each page contains 20 resumes

        returns:
            list of resume ids as strings. Result cards that have no link,
            or whose link has no id segment, are skipped

        raises:
            KeyError if the lowercased location is not a key of `area_codes`
    """

    # area_codes keys are lowercased when the mapping is built,
    # so the lookup is normalised the same way
    area_id = area_codes[location.lower()]
    urlencoded_query = parse.quote(query, safe='')

    # forming a url for accessing page with resumes
    lookup_url = 'https://hh.ru/search/resume?' \
    'exp_period=all_time&' \
    'L_lenient=true&' \
    'logic=normal&' \
    'pos=full_text&' \
    f'area={area_id}&' \
    'clusters=true&' \
    f'text={urlencoded_query}&' \
    f'page={page}'

    # requesting page with resumes with given query;
    # the context manager closes the pooled connections once the body is read
    with urllib3.PoolManager() as http:
        resp = http.request(
            method='GET',
            url=lookup_url,
            headers=headers
        )

    # searching for resumes on HTML page
    soup = BeautifulSoup(resp.data.decode('utf-8', 'ignore'), "html.parser")
    resume_ids = []
    for card in soup.find_all("div", {"data-resume-id": True}):
        link = card.find('a', href=True)
        if link is None:
            # a card without a link cannot be turned into a resume id
            continue
        # the id is taken as the third '/'-separated piece of the link path
        # (presumably '/resume/<id>'), with any query string removed first
        path_parts = link['href'].split('?')[0].split('/')
        if len(path_parts) > 2:
            resume_ids.append(path_parts[2])

    return resume_ids

In [74]:
# getting 100 candidates: 5 result pages with 20 resumes each
candidate_ids = []

for page_number in range(5):
    # the loop variable must be the page that is requested; with a fixed page
    # the same results are fetched five times and the list holds duplicates
    candidate_ids.extend(lookup_resumes('data', 'москва', page_number))
    # random pause between consecutive search requests
    time.sleep(random.uniform(0,3))

In [80]:
import re


def process_column_data(columns, first_key: str, second_key: str):
    """ 
        Function that converts BeautifulSoup object containing HTML with rows of two columns into 
        dictionary with desired first key for the value in the first column and second key for the
        value in the second column

        columns
            - BeautifulSoup object containing data with rows of columns, or None.
              Only its find_all() method is used
        first_key: str
            - Key for the value in the first column
        second_key: str
            - Key for the value in the second column

        returns:
            List (each element represents a row) of dictionaries (each represents columns in the row).
            Empty list when `columns` is None. A trailing cell without a partner is dropped
    """

    if columns is None:
        return []

    # the first 'bloko-column' div is not part of any row and is skipped
    cells = columns.find_all('div', {'class': 'bloko-column'})[1:]

    # strip the HTML tags and turn non-breaking spaces into regular ones
    texts = [re.sub(r'<[^>]*>', '', str(cell)).replace('\xa0', ' ') for cell in cells]

    # cells alternate first column / second column, so even positions are paired
    # with the odd position right after them
    return [
        {first_key: first, second_key: second}
        for first, second in zip(texts[0::2], texts[1::2])
    ]

def get_resume_details(resume_id: str, max_retries: int = 10):
    """ 
        Returns prettified details on candidate's resume

        resume_id: str
            - Candidate's resume ID, that we get from lookup_resumes() 
        max_retries: int
            - How many extra times the page is requested when the response does not
              contain the 'resume-applicant' block. A random pause of up to 3 seconds
              is made before every retry

        returns:
            Dictionary with information on the candidate. Fields that are not found
            on the page are None (or an empty list for 'experience', 'education'
            and 'courses')

        raises:
            RuntimeError if the 'resume-applicant' block is still missing after
            the last retry
    """

    lookup_url = f'https://hh.ru/resume/{resume_id}'
    attempts = max(max_retries, 0) + 1
    resume = None

    # A bounded loop replaces the former unbounded recursive retry; the context
    # manager closes the pooled connections when the requests are done
    with urllib3.PoolManager() as http:
        for attempt in range(attempts):
            if attempt > 0:
                time.sleep(random.uniform(0,3))

            resp = http.request(
                method='GET',
                url=lookup_url,
                headers=headers
            )

            # getting resume's information
            soup = BeautifulSoup(resp.data.decode('utf-8', 'ignore'), "html.parser")
            resume = soup.find('div', {'class': 'resume-applicant'})
            if resume is not None:
                break

    if resume is None:
        raise RuntimeError(
            f'resume {resume_id}: no resume-applicant block found after {attempts} attempt(s)'
        )

    def text_or_none(node):
        # text of a found element, None for a missing one
        return node.text if node is not None else None

    def paragraphs_or_none(node):
        # texts of all <p> elements inside a found element, None for a missing one
        return [el.text for el in node.find_all('p')] if node is not None else None

    agesoup = resume.find('span', {'data-qa': 'resume-personal-age'})
    sexsoup = resume.find('span', {'data-qa': 'resume-personal-gender'})
    areasoup = resume.find('span', {'data-qa': 'resume-personal-address'})
    dlicensesoup = resume.find('div', {'data-qa': 'resume-block-driver-experience'})
    jobpossoup = resume.find('li', {'data-qa': 'resume-block-position-specialization'})
    skillssoup = resume.find('div', {'data-qa': 'skills-table'})
    aboutsoup = resume.find('div', {'data-qa': 'resume-block-skills-content'})
    langsoup = resume.find('div', {'data-qa': 'resume-block-languages'})
    extrasoup = resume.find('div', {'data-qa': 'resume-block-additional'})

    # first run of digits in the age element; None when there are no digits at all
    age_digits = re.findall(r'\d+', agesoup.text) if agesoup is not None else []

    # the licence text is read from the second 'resume-block-container' div;
    # None when the block has fewer than two of them
    licence_parts = dlicensesoup.find_all('div', {'class': 'resume-block-container'}) \
        if dlicensesoup is not None else []

    return {
        'age': age_digits[0] if age_digits else None,
        'sex': text_or_none(sexsoup),
        'area': text_or_none(areasoup),
        'd_licence': licence_parts[1].text.replace('\xa0', ' ') if len(licence_parts) > 1 else None,
        'job_positions': text_or_none(jobpossoup),
        # the first <span> of the skills table is skipped
        'skills': [el.text for el in skillssoup.find_all('span')[1:]] if skillssoup is not None else None,
        'about': text_or_none(aboutsoup),
        'experience': process_column_data(resume.find('div', {'data-qa': 'resume-block-experience'}),'time','place'),
        'education': process_column_data(resume.find('div', {'data-qa': 'resume-block-education'}),'time','institute'),
        'languages': paragraphs_or_none(langsoup),
        'courses': process_column_data(resume.find('div', {'data-qa': 'resume-block-additional-education'}),'time','course'), 
        'extra': paragraphs_or_none(extrasoup)
    }

In [81]:
# one details dictionary per collected resume id, in the same order as candidate_ids
candidates_details = list(map(get_resume_details, candidate_ids))

In [82]:
import pandas as pd

# one row per candidate; the keys of the details dictionaries become the columns
candidates_df = pd.DataFrame(data=candidates_details)
candidates_df.head(n=5)

Unnamed: 0,age,sex,area,d_licence,job_positions,skills,about,experience,education,languages,courses,extra
0,22.0,Male,Saint Petersburg,Driver's license category B,Analyst,"[Python, Tableau, Airflow, Docker, SQL, scikit...",I work as an analyst at the Research Institute...,[{'time': 'November 2021 — currently1 year 10 ...,"[{'time': '2021', 'institute': 'Peter the Grea...","[Russian — Native, English — B1 — Intermediate]",[],"[Citizenship: Russia, Permission to work: Russ..."
1,,Male,Moscow,Driver's license category B,Data scientist,"[Fast-learning, proactivity,, Strong self-moti...","Fast-learning, proactivity\r\n\r\nDesire to wo...",[{'time': 'July 2016 — currently7 years 2 mont...,"[{'time': '2020', 'institute': 'Geekbrains.ru ...","[Russian — Native, English — B1 — Intermediate]",[],"[Citizenship: Russia, Permission to work: Russ..."
2,34.0,Мужчина,Москва,Имеется собственный автомобильПрава категории ...,"Программист, разработчик","[Работа с большим объемом информации, Высокая ...",,[{'time': 'Декабрь 2017 — по настоящее время5 ...,"[{'time': '2013', 'institute': 'Московский соц...","[Русский — Родной, Английский — B1 — Средний]","[{'time': '2023', 'course': 'Курс Data Science...","[Гражданство: Россия, Разрешение на работу: Ро..."
3,23.0,Женщина,Москва,,Другое,"[Работоспособность, Работа с большим объемом и...",,[],"[{'time': '2023', 'institute': 'Московский гос...","[Русский — Родной, Английский — C1 — Продвинутый]",[],"[Гражданство: Россия, Разрешение на работу: Ро..."
4,32.0,Male,Moscow,,Analyst,"[Analytical skills, Английский язык, MS Outloo...","I am very active, serious about my work, quick...",[{'time': 'August 2015 — currently8 years 1 mo...,"[{'time': '2017', 'institute': 'National Resea...","[Hindi — Native, English — C2 — Proficiency, P...",[],"[Citizenship: India, Permission to work: Russi..."
