In [75]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests

In [76]:
def parsing_hh(offer_text):
        
    if offer_text is None:
        return (np.nan, np.nan, np.nan)
    
    offer_text = offer_text.replace(chr(160), '')
    offer_text = offer_text.replace(chr(32), '')
    offer_text = offer_text.replace('.', '')
    
    digits = '1234567890'
    
    if offer_text[0] in digits:
        sep_pos = offer_text.find('-')
        min_offer = int(offer_text[:sep_pos])
        txt = offer_text[sep_pos+1:]
        
        for i in range(0, len(txt)):
            if txt[i] not in digits:
                sep_pos = i
                break
                
        max_offer = int(txt[:sep_pos])
        currency = txt[sep_pos:]
        
        return (min_offer, max_offer, currency)

    
    if offer_text[:2] == 'от':
        txt = offer_text[2:]  
        
        for i in range(0, len(txt)):
            if txt[i] not in digits:
                sep_pos = i
                break
                
        min_offer = int(txt[:sep_pos])
        max_offer = np.nan
        currency = txt[sep_pos:]
        
        return (min_offer, max_offer, currency)
    
    
    if offer_text[:2] == 'до':
        txt = offer_text[2:]  
        
        for i in range(0, len(txt)):
            if txt[i] not in digits:
                sep_pos = i
                break
                
        min_offer = np.nan
        max_offer = int(txt[:sep_pos])
        currency = txt[sep_pos:]
        
        return (min_offer, max_offer, currency)
    

    return (np.nan,np.nan,offer_text)

In [77]:
def parse_hh(search_request):

     
    vacancies = pd.DataFrame()
    
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

    domain_url = 'https://spb.hh.ru'
    local_url = '/search/vacancy'
    
    params = {'area' : '231',
              'clusters' : 'true',
              'enable_snippets' : 'true',
              'text' : search_request,
              'schedule' : 'remote'
             }

    is_last_page = False
    while not is_last_page:
        
        response = requests.get(domain_url + local_url, headers=headers, params=params).text      
        soup = bs(response, 'lxml')

        vacancys_list = soup.find_all('div', {'class':'vacancy-serp-item'})

        for vacancy in vacancys_list:

            tittle_tag = vacancy.find('a', {'data-qa':'vacancy-serp__vacancy-title'})
            tittle = tittle_tag.getText().strip()  
            href = tittle_tag['href']

            company = vacancy.find('a', {'data-qa':'vacancy-serp__vacancy-employer'}).getText().strip()

            offer_tag = vacancy.find('span', {'data-qa':'vacancy-serp__vacancy-compensation'})
            offer_str = None if offer_tag == None else offer_tag.getText()

            city = vacancy.find('span', {'data-qa':'vacancy-serp__vacancy-address'}).getText()

            date = vacancy.find('span', {'data-qa':'vacancy-serp__vacancy-date'}).getText()

            offer_min, offer_max, currency = parse_offer_hh(offer_str)

            temp_df = pd.DataFrame({'tittle' : [tittle],
                                    'site': ['hh.ru'],
                                    'company' : [company],
                                    'offer_min' : [offer_min],
                                    'offer_max' : [offer_max],
                                    'currency' : [currency],
                                    'href' : [href],
                                    'city' : [city],
                                    'date' : [date]
                                   })
            vacancies = vacancies.append(temp_df) 


        next_page_tag = soup.find('a', {'data-qa':'pager-next'})
        if next_page_tag == None:
            is_last_page = True
        else:
            local_url = next_page_tag['href']
            
    
    return vacancies

In [78]:
def parsing_superjob(offer_text):
        

    if offer_text == 'По договорённости':
        return (np.nan, np.nan, np.nan)
    
    digits = '1234567890'
    
    offer_text = offer_text.replace(chr(160), '')
    offer_text = offer_text.replace('руб./месяц', '')    

    
    if offer_text[0] in digits:
        sep_pos = offer_text.find('—')
        
        min_offer = int(offer_text[:sep_pos])
        max_offer = offer_text[sep_pos+1:]
        currency = 'руб'
        
        return (min_offer, max_offer, currency)
    

    if offer_text[:2] == 'от':
        txt = offer_text[2:]  
                
        min_offer = int(txt)
        max_offer = np.nan
        currency = 'руб'
        
        return (min_offer, max_offer, currency)
    

    if offer_text[:2] == 'до':
        txt = offer_text[2:]  
                
        min_offer = np.nan
        max_offer = int(txt)
        currency = 'руб'
        
        return (min_offer, max_offer, currency)
    

    if offer_text[0] in digits:
        
        min_offer = int(offer_text)
        max_offer = int(offer_text)
        currency = 'руб'
        
        return (min_offer, max_offer, currency)
    

    return (np.nan,np.nan,offer_text)

In [79]:
def parse_superjob(search_link):
   
    vacancies = pd.DataFrame()

    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

    domain_url = 'https://russia.superjob.ru/'
    

    is_last_page = False
    while not is_last_page:
        

        response = requests.get(domain_url + search_link, headers=headers).text    
        soup = bs(response, 'lxml')

        vacancys_list = soup.find_all('div', {'class':'_3zucV _1fMKr undefined _1NAsu'})

        for vacancy in vacancys_list:

            block = vacancy.find('div', {'class':'jNMYr GPKTZ _1tH7S'})
            if block == None:
                continue


            block_tittle = block.findChildren(recursive=False)[0].findChildren(recursive=False)[0]

            tittle = block_tittle.getText()
            href = block_tittle['href']

            offer_str = vacancy.find('div', {'class':'jNMYr GPKTZ _1tH7S'}).findChildren(recursive=False)[1].getText()

            block_company = vacancy.find('span', {'class':'f-test-text-vacancy-item-company-name'})
            if block_company == None:
                company = None
            else:
                company =  block_company.findChildren(recursive=False)[0].getText()

            blocks_date_city = vacancy.find('span', {'class':'f-test-text-company-item-location'})\
                                                                        .findChildren(recursive=False)

            date = blocks_date_city[0].getText()
            city = blocks_date_city[2].getText()

            offer_min, offer_max, currency = parse_offer_superjob(offer_str)


            temp_df = pd.DataFrame({'tittle' : [tittle],
                                    'site': ['superjob.ru'],
                                    'company' : [company],
                                    'offer_min' : [offer_min],
                                    'offer_max' : [offer_max],
                                    'currency' : [currency],
                                    'href' : [href],
                                    'city' : [city],
                                    'date' : [date],
                                   })
            vacancies = vacancies.append(temp_df)


        next_page_tag = soup.find('a', {'class':'f-test-button-dalshe'})
        if next_page_tag == None:
            is_last_page = True
        else:
            local_url = next_page_tag['href']

    
    return vacancies

In [80]:

vacancies = parse_hh(search_request_hh)

vacancies = vacancies.append(parse_superjob(search_request_superjob_link))

In [81]:
vacancies.shape

(6, 9)

In [82]:

vacancies

Unnamed: 0,tittle,site,company,offer_min,offer_max,currency,href,city,date
0,Talent Acquisition Specialist (IT Recruiting),hh.ru,Inca Digital Securities,3000.0,4000.0,USD,https://spb.hh.ru/vacancy/37597197?query=data%...,Санкт-Петербург,27 июня
0,Автор модуля NLP,hh.ru,SkillFactory,,,,https://spb.hh.ru/vacancy/37705566?query=data%...,Санкт-Петербург,26 июня
0,Senior Software Engineer (Python/Devops) for a...,hh.ru,Assaia International AG,2500.0,3500.0,USD,https://spb.hh.ru/vacancy/37675980?query=data%...,Санкт-Петербург,25 июня
0,Senior Product Manager (Ad tech),hh.ru,Oblivki.biz,250.0,,руб,https://spb.hh.ru/vacancy/37536648?query=data%...,Санкт-Петербург,17 июня
0,Data Scientist (NLP),hh.ru,M-soft,120000.0,150000.0,руб,https://spb.hh.ru/vacancy/37404715?query=data%...,Санкт-Петербург,9 июня
0,Фуллстек разработчик / Senior Software Developer,superjob.ru,SegmentStream,,,,/vakansii/fullstek-razrabotchik-33964334.html,"Москва, Белорусская",Вчера
