In [2]:
import os
import re
import time
import requests
import pandas as pd

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [3]:
def init_driver():
    """Create and return a headless Chrome WebDriver.

    Chrome is started with the ``--headless`` and ``--disable-gpu``
    command-line switches. The caller owns the returned driver and is
    responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu"):
        chrome_options.add_argument(flag)
    driver = webdriver.Chrome(options=chrome_options)
    return driver

In [None]:
def _fetch_sitemap(sitemap_url):
    """Download one sitemap; return the parsed XML, or None if the status is not 200."""
    res = requests.get(sitemap_url, timeout=10)
    if res.status_code != 200:
        return None
    return BeautifulSoup(res.content, 'xml')


def _tag_text(tag):
    """Return the whitespace-stripped text of a tag, or '' when the tag is missing."""
    return tag.text.strip() if tag is not None else ''


def _page_urls(soup):
    """Return the non-empty <loc> values of every <url> entry in a parsed sitemap."""
    found = (_tag_text(url_tag.find('loc')) for url_tag in soup.find_all('url'))
    return [page_url for page_url in found if page_url]


def get_sitemap_links(base_url, year='2025'):
    """Collect page URLs from the site's ``/sitemap.xml``.

    Two sitemap layouts are handled:

    * Sitemap index (``<sitemapindex>``): every child sitemap whose
      ``<lastmod>`` contains ``year`` is downloaded and all of its
      ``<url><loc>`` values are returned.
    * Flat sitemap: only ``<loc>`` values whose URL text contains ``year``
      are returned. Note that this branch filters on the URL itself, not on
      ``<lastmod>``, which mirrors the original behaviour.

    ``<loc>``/``<lastmod>`` text is stripped, because pretty-printed sitemaps
    commonly wrap the value in newlines and indentation that would otherwise
    end up inside the returned URLs.

    Args:
        base_url: Any URL on the target site; the sitemap is looked up at the
            site root via ``urljoin(base_url, '/sitemap.xml')``.
        year: Substring that must occur in ``<lastmod>`` (index layout) or in
            the URL (flat layout). Pass ``None`` or ``''`` to disable the
            filter. Defaults to ``'2025'``.

    Returns:
        A list of URL strings (possibly with duplicates). An empty list is
        returned when the sitemap answers with a non-200 status. Network and
        parse errors are printed, not raised; links collected before the
        error are still returned.
    """
    sitemap_index_url = urljoin(base_url, '/sitemap.xml')
    year_token = str(year) if year else ''
    all_links = []

    try:
        soup = _fetch_sitemap(sitemap_index_url)
        if soup is None:
            return []

        if soup.find('sitemapindex'):
            for sitemap in soup.find_all('sitemap'):
                child_sitemap_url = _tag_text(sitemap.find('loc'))
                lastmod = _tag_text(sitemap.find('lastmod'))
                if not child_sitemap_url:
                    continue
                # With a filter active, entries without <lastmod> are skipped too.
                if year_token and year_token not in lastmod:
                    continue
                try:
                    child_soup = _fetch_sitemap(child_sitemap_url)
                    if child_soup is not None:
                        all_links.extend(_page_urls(child_soup))
                except Exception as e:
                    # One broken child sitemap must not discard the others.
                    print(f"[child sitemap error] {child_sitemap_url} – {e}")
        else:
            all_links.extend(
                page_url for page_url in _page_urls(soup)
                if not year_token or year_token in page_url
            )

    except Exception as e:
        print(f"[sitemap error] {sitemap_index_url} – {e}")

    return all_links


def get_internal_links(driver, base_url):
    """Return the links on the current page that live under ``base_url``.

    Every ``<a href>`` in ``driver.page_source`` is resolved against
    ``base_url``. A link is kept only when it has the same scheme and host as
    ``base_url`` and its path starts with the path of ``base_url``. This
    replaces a plain substring test, which also accepted external URLs that
    merely contained the base URL (for example inside a redirect query
    string).

    URL fragments (``#section``) are removed so that anchors pointing into
    the same page do not produce separate entries.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.
        base_url: URL that defines the site section to stay within.

    Returns:
        A list of unique absolute URLs in the order they first appear on the
        page.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    base = urlparse(base_url)
    base_origin = (base.scheme, base.netloc.lower())

    # A dict keeps first-seen order while de-duplicating, which makes the
    # crawl order reproducible between runs.
    links = {}
    for a in soup.find_all('a', href=True):
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        target = urlparse(urljoin(base_url, a['href'].strip()))
        if (target.scheme, target.netloc.lower()) != base_origin:
            continue
        if not target.path.startswith(base.path):
            continue
        links.setdefault(target._replace(fragment='').geturl(), None)
    return list(links)


def download_pdfs(driver, base_url, company_inn_folder):
    """Download every PDF linked from the current page into ``company_inn_folder``.

    A link counts as a PDF when the *path* of its resolved URL ends with
    ``.pdf`` (case-insensitive), so links carrying a query string or fragment
    such as ``report.pdf?v=2`` are recognised. Links whose raw ``href`` ends
    with ``.pdf`` are accepted as well, as before.

    Each distinct URL is requested once per call. The response is written to
    disk only when the server answers with a success status; a failed
    download is reported with ``print`` and does not stop the remaining
    downloads.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.
        base_url: URL used to resolve relative links.
        company_inn_folder: Existing directory that receives the files. The
            file name is the last path segment of the URL, with ``.pdf``
            appended when it is missing; an existing file of the same name is
            overwritten.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    seen_urls = set()

    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        pdf_url = urljoin(base_url, href)
        url_path = urlparse(pdf_url).path

        looks_like_pdf = (
            url_path.lower().endswith('.pdf') or href.lower().endswith('.pdf')
        )
        if not looks_like_pdf or pdf_url in seen_urls:
            continue
        seen_urls.add(pdf_url)

        try:
            pdf_name = os.path.basename(url_path) or 'document'
            if not pdf_name.lower().endswith('.pdf'):
                pdf_name += '.pdf'
            pdf_path = os.path.join(company_inn_folder, pdf_name)

            r = requests.get(pdf_url, timeout=10)
            # Check the status before opening the file so an error page is
            # never stored under a .pdf name.
            r.raise_for_status()
            with open(pdf_path, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            print(f"PDF download error: {e}")


def get_page_text(driver):
    """Return the text of the page currently loaded in ``driver``.

    The HTML in ``driver.page_source`` is parsed, all ``<script>`` and
    ``<style>`` elements are removed, and the remaining text fragments are
    stripped and joined with single spaces.
    """
    document = BeautifulSoup(driver.page_source, 'html.parser')
    non_content_tags = document.find_all(["script", "style"])
    for tag in non_content_tags:
        tag.extract()
    visible_text = document.get_text(separator=' ', strip=True)
    return visible_text


def _company_folder_name(company, inn):
    """Build a folder name from company and INN with path-unsafe characters replaced."""
    raw_name = f"{company}_{inn}"
    # Path separators, quotes and the other characters rejected by common
    # filesystems become '_' so the name is always a single path component.
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', raw_name).strip()


def scrape_company(driver, company, url, inn, output_dir, page_delay=2):
    """Scrape the text and linked PDFs of one company's site.

    The pages to visit come from the site's sitemap when
    ``get_sitemap_links`` returns any links. Otherwise ``url`` is loaded and
    the pages are ``url`` itself followed by the internal links found on it.
    Each distinct link is visited once.

    Args:
        driver: Selenium-style driver with a ``get(url)`` method.
        company: Company name; stored in every record and used, together with
            ``inn``, to name the download folder.
        url: Start URL of the company's site.
        inn: Company identifier; stored in every record.
        output_dir: Directory under which the ``<company>_<inn>`` folder for
            PDFs is created. Characters that are not valid in file names are
            replaced with ``_``.
        page_delay: Seconds to wait after each page load before reading the
            page. Defaults to 2; pass 0 to disable the wait.

    Returns:
        A list of dicts with the keys ``company``, ``inn``, ``url`` and
        ``text``, one per page that was read successfully. Pages that fail to
        load are reported with ``print`` and skipped; if the start page
        itself cannot be loaded for link discovery, the list is empty.
    """
    company_inn_folder = os.path.join(output_dir, _company_folder_name(company, inn))
    os.makedirs(company_inn_folder, exist_ok=True)

    links = get_sitemap_links(url)
    if not links:
        try:
            driver.get(url)
            if page_delay:
                time.sleep(page_delay)
            # Build a new list; never mutate the helper's return value.
            links = [url] + get_internal_links(driver, url)
        except Exception as e:
            # An unreachable start page must not abort the remaining companies.
            print(f"Error visiting {url}: {e}")
            links = []

    company_data = []
    parsed_links = set()
    for link in links:
        if link in parsed_links:
            continue
        parsed_links.add(link)
        try:
            driver.get(link)
            if page_delay:
                time.sleep(page_delay)
            text = get_page_text(driver)
            download_pdfs(driver, link, company_inn_folder)
            company_data.append({
                'company': company,
                'inn': inn,
                'url': link,
                'text': text
            })
        except Exception as e:
            print(f"Error visiting {link}: {e}")

    return company_data

In [None]:
def main(input_path='templates/input_links.xlsx', sheet_name='Лист1',
         output_dir='downloaded_data', output_csv='scraped_output.csv'):
    """Scrape every company listed in the input workbook and save the text to CSV.

    The workbook must provide the columns ``company``, ``url`` and ``INN``.
    Each row is passed to ``scrape_company``; the collected records are
    written to ``output_csv`` without the index column.

    Args:
        input_path: Path of the Excel workbook with the companies to scrape.
        sheet_name: Name of the worksheet to read.
        output_dir: Directory for downloaded files; created if missing.
        output_csv: Path of the CSV file that receives the scraped text.

    The defaults reproduce the previously hard-coded locations, so calling
    ``main()`` without arguments behaves as before.
    """
    # Read the input before starting the browser, so a missing or broken
    # workbook cannot leave a Chrome process running.
    df = pd.read_excel(input_path, sheet_name=sheet_name)
    os.makedirs(output_dir, exist_ok=True)
    output_data = []

    driver = init_driver()
    try:
        for _, row in df.iterrows():
            company, url, inn = row['company'], row['url'], row['INN']
            print(f"Scraping {company} ({url})")
            output_data.extend(scrape_company(driver, company, url, inn, output_dir))
    finally:
        # Always shut the browser down, even when scraping raises.
        driver.quit()

    pd.DataFrame(output_data).to_csv(output_csv, index=False)

# Start the scraper only when this code is executed directly (as a script or
# notebook cell, where __name__ is '__main__'), not when it is imported.
if __name__ == '__main__':
    main()

Scraping Группа «Илим» (https://www.ilimgroup.ru/ustoychivoe-razvitie/)
