## BI - web scraping assignment

In [1]:
import re
import itertools
from requests import get

import pandas as pd
from bs4 import BeautifulSoup 

# Constants shared by the cells below.
site_root = 'https://nabory.kprm.gov.pl'

# Recruitment-results listing: pagesCnt=20, sort=1, every other filter left empty.
init_link = 'https://nabory.kprm.gov.pl/wyniki-naborow?AdResult%5BpagesCnt%5D=20&AdResult%5BisAdvancedMode%5D=&AdResult%5Bsort%5D=1&AdResult%5Bid%5D=&AdResult%5Bid_institution%5D=&AdResult%5Bid_institution_position%5D='

# Headers sent with every request; they carry a browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}

Let's begin crawling the site; we'll also check if we get a response back.

In [2]:
# Fetch the first page of results. Without a timeout the request could block
# forever if the server stops responding; 30 seconds is a generous upper bound.
response = get(init_link, headers=headers, timeout=30)
# Printing the response shows its status code, e.g. <Response [200]>.
print(response)

<Response [200]>


In [3]:
# Parse the listing page with Python's built-in HTML parser.
html_soup = BeautifulSoup(markup=response.text, features='html.parser')

After inspecting the HTML, we found the container that holds all the search results. Each row in it corresponds to a single job posting, so let's go through the rows and extract the link to every posting.

In [4]:
# Captures the value of the first href attribute in a result row. The capture is
# lazy (.*?) so it stops at the first '">' after the href instead of running on
# to the last '">' found on the same line.
pattern = r'(?<=href=")\s*(.*?)\s*(?=">)'
all_search_results = html_soup.find_all('li', class_='row')
search_results_links = []

for result in all_search_results:
    link_match = re.search(pattern, str(result), re.IGNORECASE)
    if link_match is None:
        # A row without a matching href would otherwise raise AttributeError
        # on .group(); skip it so one odd row does not abort the crawl.
        continue
    job_posting_link = link_match.group(1)
    # The scraped hrefs are site-relative, so prefix them with the site root.
    full_link = site_root + job_posting_link
    search_results_links.append(full_link)

search_results_links

['https://nabory.kprm.gov.pl/wyniki-naborow/podkarpackie/rzeszow/sekretarz,77610',
 'https://nabory.kprm.gov.pl/wyniki-naborow/lodzkie/skierniewice/starszy-referent,77455',
 'https://nabory.kprm.gov.pl/wyniki-naborow/pomorskie/gdansk/specjalista,77472',
 'https://nabory.kprm.gov.pl/wyniki-naborow/dolnoslaskie/lubin/referent,77398',
 'https://nabory.kprm.gov.pl/wyniki-naborow/zachodniopomorskie/szczecin/referent,77418',
 'https://nabory.kprm.gov.pl/wyniki-naborow/zachodniopomorskie/szczecin/referent,77425',
 'https://nabory.kprm.gov.pl/wyniki-naborow/warminsko-mazurskie/wegorzewo/inspektor-weterynaryjny,77315',
 'https://nabory.kprm.gov.pl/wyniki-naborow/mazowieckie/warszawa/inspektor,77264',
 'https://nabory.kprm.gov.pl/wyniki-naborow/mazowieckie/warszawa/glowny-specjalista,77265',
 'https://nabory.kprm.gov.pl/wyniki-naborow/dolnoslaskie/wroclaw/samodzielny-referent,77281',
 'https://nabory.kprm.gov.pl/wyniki-naborow/podkarpackie/strzyzow/oskarzyciel-skarbowy,77209',
 'https://nabory.k

Let's scrape the data using regexes and create a dictionary with each key corresponding to a single row.

In [5]:
# One regex per output column; the dict order defines the column order.
# The patterns are applied to the HTML as re-serialised by BeautifulSoup
# (str(single_page)), so they rely on that exact markup, including the
# newlines between tags.
patterns = {"numer": r'(?<=<span class="id">nr )\s*(\d*)', 
            "stanowisko": r'(?<=<h1 class="h1">)\s*(.*)\s*(?=</h1)', 
            "pracodawca": r'(?<=<strong class="h">Nazwa urzędu</strong>\n<p>)\s*(.*)\s*(?=</p>)', 
            "miejsce":  r'(?<=<p>[0-9]{2}-[0-9]{3})\s*(.*)\s*(?=</p)',
            "data_publikacji": r'(?<=Wprowadzono: <span>)\s*(.*)\s*(?=</span>)',
            "data_ogloszenia": r'(?<=<strong class="h">Data ogłoszenia</strong>\n<div class="box bor">\n<strong>)\s*(.*)\s*(?=</strong>)',  
            "data_wyniku": r'(?<=<strong class="h">Data wyniku</strong>\n<div class="box bor">\n<strong>)\s*(.*)\s*(?=</strong>)', 
            "status": r'(?<=<strong class="c"><span>)\s*(.*)\s*(?=</span></strong>)',  
            "link_ogloszenia": r'(?<=<a class="btn btn-b" href=")\s*(.*)\s*(?=">)'
            }

# Maps the position of each posting in search_results_links to its list of
# scraped values, ordered like the keys of `patterns`.
dict_data = {}

for index, link in enumerate(search_results_links):
    # The timeout prevents an unresponsive server from hanging the crawl.
    response = get(link, headers=headers, timeout=30)
    # Stop on HTTP errors instead of running the regexes over an error page.
    response.raise_for_status()
    single_page = BeautifulSoup(response.text, 'html.parser')
    # Serialise once per page rather than once per pattern.
    page_html = str(single_page)

    single_record = []
    for value in patterns.values():
        field_match = re.search(value, page_html, re.IGNORECASE)
        # A field missing from the page is stored as None, so a single
        # incomplete posting does not abort the whole run.
        single_record.append(field_match.group(1) if field_match else None)

    dict_data[index] = single_record

Let's create and transform our dataframe to the desired format.

In [6]:
# Suffix appended to the two scraped date columns below.
# NOTE(review): the month and year are hard-coded, so postings announced or
# resolved outside April 2021 would get a wrong date - confirm whether the
# month and year can be scraped from the page instead.
DATE_SUFFIX = '.04.2021'

# One row per posting, with columns in the order of the keys of `patterns`.
job_offers_df = pd.DataFrame.from_dict(dict_data, orient='index', columns=list(patterns.keys()))
# Put the results-page link at position 8, i.e. right before "link_ogloszenia".
job_offers_df.insert(8, "link_wynik", search_results_links)

# Literal replacement of the HTML line break; regex=False makes that explicit
# and independent of the pandas version's default.
job_offers_df["status"] = job_offers_df["status"].str.replace("<br/>", " ", regex=False)

# A scalar string broadcasts over the whole column, so no per-row list is needed.
job_offers_df["link_ogloszenia"] = site_root + job_offers_df["link_ogloszenia"]
for date_column in ("data_ogloszenia", "data_wyniku"):
    job_offers_df[date_column] = job_offers_df[date_column] + DATE_SUFFIX


In [7]:
# Preview the first three postings to sanity-check the transformed columns.
job_offers_df.head(n=3)

Unnamed: 0,numer,stanowisko,pracodawca,miejsce,data_publikacji,data_ogloszenia,data_wyniku,status,link_wynik,link_ogloszenia
0,77610,sekretarz,Wojewódzki Inspektorat Inspekcji Handlowej w R...,Rzeszów,28.04.2021,28.04.2021,28.04.2021,koniec naboru,https://nabory.kprm.gov.pl/wyniki-naborow/podk...,https://nabory.kprm.gov.pl/podkarpackie/rzeszo...
1,77455,starszy referent,Izba Administracji Skarbowej w Łodzi,Łódź,26.04.2021,26.04.2021,26.04.2021,koniec naboru,https://nabory.kprm.gov.pl/wyniki-naborow/lodz...,https://nabory.kprm.gov.pl/lodzkie/skierniewic...
2,77472,specjalista,Komenda Wojewódzka Policji w Gdańsku,Gdańsk,24.04.2021,24.04.2021,24.04.2021,koniec naboru,https://nabory.kprm.gov.pl/wyniki-naborow/pomo...,https://nabory.kprm.gov.pl/pomorskie/gdansk/sp...
