In [1]:
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
BASE_RSPP_URL = "https://rspp.ru"
RSPP_URL_REGISTR = f"{BASE_RSPP_URL}/activity/social/registr/"
REPORT_TYPES = {
    "ОУР": "Отчет по устойчивому развитию",
    "СО": "Социальный отчет",
    "ИО": "Интегрированный отчет",
    "ЭО": "Экологический отчёт",
}
RE_YEARS = re.compile("....")
FOLDER_TO_SAVE_REPORTS = "reports"

In [3]:
def parse_td(company_report_row):
    report_info = [td.find("div") for td in company_report_row.findAll("td", class_="register-table__colclass")]
    company_name = report_info[0].find("span").text
    sector = report_info[1].text
    years_list = RE_YEARS.findall(report_info[2].text)
    years = ", ".join(years_list)
    report_tag_a = report_info[3].find("a")
    report_link, report_type = report_tag_a["href"], report_tag_a.text.strip()
    return {
        "компания": company_name,
        "сектор": sector,
        "год": years,
        "тип отчета": report_type,
        "ссылка на отчет": report_link,
    }

In [4]:
def parse_rspp():
    rspp_response = requests.get(RSPP_URL_REGISTR)
    assert rspp_response.status_code < 400, "Не получилось подключиться к сайту РСПП"
    soup = BeautifulSoup(rspp_response.text, "lxml")
    company_report_filter = soup.find("div", class_="company-report-filter")
    report_table = company_report_filter.find("table", class_="table_scroll-mobile")
    headline, *company_reports_list = report_table.findAll("tr", class_="register-table__row")
    company_reports_dicts = [parse_td(company_report_td) for company_report_td in tqdm(company_reports_list)]
    return pd.DataFrame.from_dict(company_reports_dicts)

In [5]:
rspp_df = parse_rspp()
rspp_df.head()

100%|█████████████████████████████████████| 1408/1408 [00:00<00:00, 4122.24it/s]


Unnamed: 0,компания,сектор,год,тип отчета,ссылка на отчет
0,Центр Корпоративной Медицины,Здравоохранение и спорт,2022,ОУР,/download/b98f348b936bbc33387b248e52cb1f2a/
1,ДОМ.РФ,Финансы и страхование,2022,ОУР,/download/82e60c15b4c0ee6ede2e194aa31c69e1/
2,ПАО «Россети Ленэнерго»,Энергетика,2022,СО,/download/d8bc2c7e94424c65e37793a462b7807d1274...
3,"ОАО ""МРСК Урала""",Энергетика,2021,ИО,/download/af503e07dd6b861d1ed3048c36868cc9/
4,"ПАО ""Россети Сибирь""",Энергетика,2021,ИО,/download/57becde4be827f45bedf2a46f58d793a/


In [6]:
rspp_df.to_csv("rspp_reports.csv")

In [7]:
CHUNK_SIZE = 400


def download_report_pdf(link, filename, folder_to_save=FOLDER_TO_SAVE_REPORTS):
    download_url = f"{BASE_RSPP_URL}{link}"
    report_response = requests.get(download_url, stream=True)
    assert report_response.status_code == 200, "Не получилось скачать отчет с сайта РСПП"
    filename = f"{folder_to_save}/{filename}"
    with open(filename, "wb") as f:
        for chunk in report_response.iter_content(CHUNK_SIZE):
            f.write(chunk)

In [8]:
def download_reports_pdf(rspp_df):
    for index, report in tqdm(rspp_df.iterrows(), total=rspp_df.shape[0]):
        link = report["ссылка на отчет"]
        company = report["компания"].replace("/", "_")
        filename = f"{company}_{index}.pdf"
        try:
            download_report_pdf(link, filename)
        except Exception as e:
            print(e, BASE_RSPP_URL + link)
        time.sleep(2)

In [10]:
download_reports_pdf(rspp_df)

 10%|███▌                                  | 134/1408 [12:20<2:42:47,  7.67s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/bcf0b03f02c703c0e87276216105516e/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f0527c548e0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/bcf0b03f02c703c0e87276216105516e/


 43%|████████████████▍                     | 609/1408 [48:29<1:30:01,  6.76s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/f9a35061e70f6ab54880eb4a1338b94a/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f0527c548e0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/f9a35061e70f6ab54880eb4a1338b94a/


 45%|█████████████████▊                      | 629/1408 [49:59<57:44,  4.45s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/d15f2a563faad6c3e1c7265284e80d6f/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f0527c56500>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/d15f2a563faad6c3e1c7265284e80d6f/


 60%|██████████████████████▊               | 847/1408 [1:05:27<34:42,  3.71s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/d64aa92550ac153719bab75bff9ca470/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f0527c6d480>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/d64aa92550ac153719bab75bff9ca470/


 62%|███████████████████████▎              | 866/1408 [1:06:55<36:40,  4.06s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/076c48aa17016b05702d57d8c1a16742/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f052c729db0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/076c48aa17016b05702d57d8c1a16742/


 63%|███████████████████████▊              | 883/1408 [1:08:12<29:49,  3.41s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/dacd90434be9fca055d44b1183b85db5/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f0527c54580>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/dacd90434be9fca055d44b1183b85db5/


 73%|███████████████████████████          | 1030/1408 [1:21:16<26:33,  4.21s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/8bf6f57399bd55ee16c58c1997998ee2/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f052c35a0b0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/8bf6f57399bd55ee16c58c1997998ee2/


 78%|████████████████████████████▋        | 1094/1408 [1:25:33<23:55,  4.57s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/d4ec4daa936aec9355a2c1bd170134e3/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f052c35a260>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/d4ec4daa936aec9355a2c1bd170134e3/


 99%|████████████████████████████████████▌| 1391/1408 [1:50:16<00:59,  3.52s/it]

HTTPSConnectionPool(host='rspp.ru', port=443): Max retries exceeded with url: /download/257dd0099122b60e4d5579814798863b/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f052c359b40>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')) https://rspp.ru/download/257dd0099122b60e4d5579814798863b/


100%|█████████████████████████████████████| 1408/1408 [1:51:48<00:00,  4.76s/it]
