In [None]:
import requests
import json
import time
import os

 
def getPage(page = 0, company = '', timeout = 30):
    """Fetch one page of the vacancy search from the hh.ru API.

    The query sent to ``https://api.hh.ru/vacancies`` is fixed except for the
    two arguments below:
      * ``search_field`` - restricts the text search to the company name;
      * ``text`` - the search text, here the company name;
      * ``page`` - zero-based page index;
      * ``per_page`` - vacancies per page (100);
      * ``professional_role`` - list of role ids used as a filter
        (IT roles, per the original author's note).

    Args:
        page: Zero-based index of the result page. Defaults to the first page.
        company: Company name to search for.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block forever.

    Returns:
        The response body decoded to ``str``. The HTTP status is not checked
        here, so the text may be an error payload rather than a result page.
    """
    params = {
        'search_field': 'company_name',
        'text': company,
        'page': page,
        'per_page': 100,
        'professional_role': [156, 160, 10, 12, 150, 25, 165, 34, 36, 73, 155, 96, 164, 104, 157, 107, 112, 113, 148, 114, 116, 121, 124, 125, 126]
    }

    # The context manager closes the response even if decoding raises.
    with requests.get('https://api.hh.ru/vacancies', params, timeout=timeout) as req:
        # Decode the raw bytes so that Cyrillic text is readable in the result.
        return req.content.decode()


# Company names used as the search text, one request series per entry.
# Some employers appear under several spellings (e.g. 'додо' and 'dodo').
companies = [
    'positive technologies',
    'aviasales.ru',
    'ozon',
    'headhunter',
    'selectel',
    'redmadrobot',
    'лаборатория касперского',
    '2гис',
    'додо',
    'dodo',
    'яндекс',
    'huawei',
    'райффайзен',
    'nexign',
    'avito',
    'playrix',
    'точка',
    'vk',
    'ibs',
    'инфосистемы джет',
    'тинькофф',
    'контур',
    'газинформсервис',
    'альфа-банк',
    'skyeng',
    'северсталь',
    'московская биржа',
    'газпром нефть',
    'леруа мерлен',
    'x5',
    'росатом',
    'сибур',
    'нлмк',
    'газпром автоматизация',
    'крок',
    'центр финансовых технологий',
    'сбер',
    'softline',
    'иннотех',
    'гринатом',
    'мтс',
    'lamoda',
    'icl',
    'ланит',
    'рт лабс',
    'лига цифровой экономики',
    'газпромбанк',
    'tele2',
    'ай-теко',
    'м.видео-эльдорадо',
    'сбермаркет',
    'ростех',
    'сбермегамаркет',
    'тензор',
    'мегафон',
    'bell integrator',
    'втб',
    'барс груп',
    'росбанк',
    '1с',
    'открытие',
    'мтс банк',
    'билайн',
    'сибинтек',
    'отр',
    'лента',
    '1с-рарус',
    'rarus',
    '36.6',
    'центральный банк российской федерации',
    'ростелеком-солар',
    'эр-телеком',
    'первый бит',
    'ржд технологии',
    'ростелеком',
    'магнит',
    'wildberries',
    'почта россии',
    'почта банк',
    'dataart',
    'accenture',
    'haulmont',
    'деловые решения и технологии',
    'epam',
    'okko',
    'naumen',
]

# Collect the search results: at most 20 pages of 100 vacancies (2000) per company.
# Make sure the output folder exists; os.listdir below raises FileNotFoundError otherwise.
os.makedirs('./docs/pagination', exist_ok=True)

for company in companies:
    for page in range(0, 20):

        # Parse the response text into a Python dictionary
        jsObj = json.loads(getPage(page, company))

        # Files are saved to ./docs/pagination, relative to the working directory.
        # The current number of entries in that folder becomes the next file name.
        nextFileName = './docs/pagination/{}.json'.format(len(os.listdir('./docs/pagination')))

        # Write the response to a new file; the context manager closes it even on error
        with open(nextFileName, mode='w', encoding='utf8') as f:
            f.write(json.dumps(jsObj, ensure_ascii=False))

        # Pause for 1 second after every request so hh.ru is not overloaded.
        # This runs before the last-page check so that companies with a single
        # result page are throttled as well.
        time.sleep(1)

        # Stop once the last page has been read (fewer than 2000 vacancies).
        # A payload without 'pages' (e.g. an error response) is treated as having
        # no further pages instead of raising KeyError and aborting the whole run.
        if (jsObj.get('pages', 0) - page) <= 1:
            break

print('Search pages collected')

Search pages collected


In [None]:
import json
import os
import requests
import time

# Download the detailed description of every vacancy found in the saved search pages.
# Make sure the output folder exists before the first vacancy is written.
os.makedirs('./docs/vacancies', exist_ok=True)

page_num = 0
# Sorted for a reproducible processing order; entries that are not .json files
# (for example a checkpoint folder) are ignored instead of being opened.
for fl in sorted(os.listdir('./docs/pagination')):
    if not fl.endswith('.json'):
        continue

    vac_num = 0
    # Read the saved search page; the context manager closes the file
    with open('./docs/pagination/{}'.format(fl), encoding='utf8') as f:
        jsonText = f.read()

    # Convert the text into a dictionary
    jsonObj = json.loads(jsonText)
    if 'items' in jsonObj:
        # Go through the vacancies listed on this page
        for v in jsonObj['items']:
            # Ask the API for the detailed information on this vacancy
            print(page_num, vac_num, v['url'])
            # The timeout keeps one stalled connection from blocking the whole run
            with requests.get(v['url'], timeout=30) as req:
                succeeded = req.ok
                status = req.status_code
                data = req.content.decode()

            if succeeded:
                # Store the response in a json file named after the vacancy ID
                fileName = './docs/vacancies/{}.json'.format(v['id'])
                with open(fileName, mode='w', encoding='utf8') as f:
                    f.write(data)
            else:
                # Do not save an error payload under the vacancy ID: it would not
                # contain the fields read later when the table is built.
                print('Skipped vacancy {}: HTTP {}'.format(v['id'], status))

            time.sleep(0.25)
            vac_num += 1
        page_num += 1
print('Vacancies collected')

In [None]:
import pandas as pd
import json
import os
from sqlalchemy import engine as sql
from IPython import display

# Column lists for the vacancies table
IDs = [] # List of Job IDs
company = [] # Company (the whole 'employer' object from the API response)
names = [] # List of job titles
descriptions = [] # List of job descriptions
viewers = [] # Number of views (declared but never filled in this cell)
publication = [] # Date of publication
creation = [] # Date when the application for the vacancy was opened
prof = [] # Specialization (the 'professional_roles' list from the API response)
url = [] # Vacancy URL

# Keys that every saved vacancy must contain to become a table row
required_keys = ('id', 'employer', 'name', 'description', 'published_at',
                 'initial_created_at', 'professional_roles', 'alternate_url')

# List the folder once. Sorting makes the row order reproducible, and the
# .json filter skips entries that are not vacancy files (e.g. a checkpoint folder).
vacancy_files = sorted(name for name in os.listdir('./docs/vacancies') if name.endswith('.json'))

# Progress is shown as "processed / total", so remember the total and reset the counter
cnt_docs = len(vacancy_files)
i = 0
skipped_files = [] # Files that lack at least one required key

for fl in vacancy_files:

    # Read the file; the context manager closes it even if reading fails
    with open('./docs/vacancies/{}'.format(fl), encoding='utf8') as f:
        jsonText = f.read()

    # Convert the text of the file into a dictionary
    jsonObj = json.loads(jsonText)

    if all(key in jsonObj for key in required_keys):
        # Fill the column lists
        IDs.append(jsonObj['id'])
        company.append(jsonObj['employer'])
        names.append(jsonObj['name'])
        descriptions.append(jsonObj['description'])
        publication.append(jsonObj['published_at'])
        creation.append(jsonObj['initial_created_at'])
        prof.append(jsonObj['professional_roles'])
        url.append(jsonObj['alternate_url'])
    else:
        # For example an API error payload that was saved in place of a vacancy:
        # report it at the end instead of aborting the whole run with KeyError.
        skipped_files.append(fl)

    # Count the processed file, clear the cell output and show the progress
    i += 1
    display.clear_output(wait=True)
    display.display('Готово {} из {}'.format(i, cnt_docs))


# Build the pandas dataframe that holds all the collected information
df = pd.DataFrame({'id': IDs, 'employer': company, 'name': names, 'description': descriptions,
                   'published_at': publication, 'initial_created_at': creation, 'professional_roles': prof, 'alternate_url': url })


# Replace the progress output with the final message
display.clear_output(wait=True)
display.display('Vacancies uploaded to dataset')
if skipped_files:
    print('Skipped {} file(s) without the expected fields: {}'.format(len(skipped_files), skipped_files))

'Vacancies uploaded to dataset'

In [None]:
import datetime

# We create functions so that dates from the dataset are displayed in the correct format, convenient for further analysis
def date_modify(d):
  """Return the calendar date of timestamp ``d`` as ``DD.MM.YYYY``.

  ``d`` must look like ``2023-05-17T14:03:59+0300``; any other text makes
  ``strptime`` raise ``ValueError``. The date is the one written in the
  string itself, no timezone conversion is applied.
  """
  parsed = datetime.datetime.strptime(d, "%Y-%m-%dT%H:%M:%S%z")
  return format(parsed, "%d.%m.%Y")

def time_modify(d):
  """Return the time of day of timestamp ``d`` as ``HH:MM``.

  ``d`` must look like ``2023-05-17T14:03:59+0300``; any other text makes
  ``strptime`` raise ``ValueError``. Seconds and the UTC offset are dropped.
  """
  parsed = datetime.datetime.strptime(d, "%Y-%m-%dT%H:%M:%S%z")
  return format(parsed, "%H:%M")
  
def date_and_time(d):
  """Return timestamp ``d`` as ``DD.MM.YYYY HH:MM:SS``.

  ``d`` must look like ``2023-05-17T14:03:59+0300``; any other text makes
  ``strptime`` raise ``ValueError``. The UTC offset is parsed but not shown.
  """
  parsed = datetime.datetime.strptime(d, "%Y-%m-%dT%H:%M:%S%z")
  return format(parsed, "%d.%m.%Y %H:%M:%S")


# Flatten the nested API fields: the id of the first listed professional role
# and the name stored inside the employer object.
df['role_id'] = df['professional_roles'].apply(lambda roles: roles[0]['id'])
df['company_name'] = df['employer'].apply(lambda employer: employer['name'])

# For both timestamp columns derive a date, a time and a combined text column
# (published_date, published_time, published_DT, initial_date, ...).
for prefix, source in (('published', 'published_at'), ('initial', 'initial_created_at')):
  for suffix, formatter in (('_date', date_modify), ('_time', time_modify), ('_DT', date_and_time)):
    df[prefix + suffix] = df[source].apply(formatter)

# Stamp every row with the day on which this cell was run
today = pd.Timestamp.today()
df['parse_date'] = today.strftime("%d.%m.%Y")

In [None]:
# Show the assembled table: Jupyter renders the value of a cell's last expression
df

In [None]:
# Save the table as file.csv in the working directory
# (with pandas defaults the DataFrame index is written as the first column)
df.to_csv('file.csv') 

In [None]:
# Save the same table as an Excel workbook in the working directory
# NOTE(review): writing .xlsx presumably needs an Excel engine such as openpyxl installed - confirm
df.to_excel('file_excel.xlsx')