In [2]:
import os
import re
import time
import requests
import pandas as pd

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [3]:
def init_driver():
    """Create a Chrome WebDriver configured to run without a visible window.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless`` and
        ``--disable-gpu`` command-line switches.
    """
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu"):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

In [24]:
# def get_sitemap_links(base_url):
#     sitemap_index_url = urljoin(base_url, '/sitemap.xml')
#     all_links = []

#     try:
#         res = requests.get(sitemap_index_url, timeout=10)
#         if res.status_code != 200:
#             return []

#         soup = BeautifulSoup(res.content, 'xml')
#         if soup.find('sitemapindex'):
#             for sitemap in soup.find_all('sitemap'):
#                 loc = sitemap.find('loc')
#                 lastmod = sitemap.find('lastmod')
#                 if loc and lastmod and '2025' in lastmod.text:
#                     child_sitemap_url = loc.text
#                     try:
#                         child_res = requests.get(child_sitemap_url, timeout=10)
#                         if child_res.status_code == 200:
#                             child_soup = BeautifulSoup(child_res.content, 'xml')
#                             for url_tag in child_soup.find_all('url'):
#                                 loc_tag = url_tag.find('loc')
#                                 if loc_tag:
#                                     all_links.append(loc_tag.text)
#                     except Exception as e:
#                         print(f"[child sitemap error] {child_sitemap_url} – {e}")
#         else:
#             for url_tag in soup.find_all('url'):
#                 loc_tag = url_tag.find('loc')
#                 if loc_tag and '2025' in loc_tag.text:
#                     all_links.append(loc_tag.text)

#     except Exception as e:
#         print(f"[sitemap error] {sitemap_index_url} – {e}")

#     return all_links

# def get_sitemap_links(base_url):
#     sitemap_index_url = urljoin(base_url, '/sitemap.xml')
#     all_links = set()

#     try:
#         res = requests.get(sitemap_index_url, timeout=10)
#         if res.status_code != 200:
#             return []

#         soup = BeautifulSoup(res.content, 'xml')

#         def should_include(lastmod_text, loc_text):
#             # Allow pages modified in 2023 or 2024, or URLs that include these years
#             return any(year in (lastmod_text or '') for year in ['2023', '2024']) or \
#                    any(year in (loc_text or '') for year in ['2023', '2024'])

#         # Sitemap index: recursive
#         if soup.find('sitemapindex'):
#             for sitemap in soup.find_all('sitemap'):
#                 loc = sitemap.find('loc')
#                 lastmod = sitemap.find('lastmod')
#                 if loc and should_include(lastmod.text if lastmod else '', loc.text):
#                     child_sitemap_url = loc.text
#                     try:
#                         child_res = requests.get(child_sitemap_url, timeout=10)
#                         if child_res.status_code == 200:
#                             child_soup = BeautifulSoup(child_res.content, 'xml')
#                             for url_tag in child_soup.find_all('url'):
#                                 loc_tag = url_tag.find('loc')
#                                 lastmod_tag = url_tag.find('lastmod')
#                                 if loc_tag and should_include(lastmod_tag.text if lastmod_tag else '', loc_tag.text):
#                                     all_links.add(loc_tag.text)
#                     except Exception as e:
#                         print(f"[child sitemap error] {child_sitemap_url} – {e}")
#         else:
#             # It's a regular sitemap
#             for url_tag in soup.find_all('url'):
#                 loc_tag = url_tag.find('loc')
#                 lastmod_tag = url_tag.find('lastmod')
#                 if loc_tag and should_include(lastmod_tag.text if lastmod_tag else '', loc_tag.text):
#                     all_links.add(loc_tag.text)

#     except Exception as e:
#         print(f"[sitemap error] {sitemap_index_url} – {e}")

#     # Always ensure the main page is included
#     all_links.add(base_url.rstrip('/'))

#     return list(all_links)


def get_sitemap_links(base_url, filter_years=('2023', '2024', '2025')):
    """Collect page URLs listed in the site's ``/sitemap.xml``.

    The sitemap is fetched from the root of ``base_url``. If it is a sitemap
    index, every child sitemap it references is fetched (one level deep) and
    its ``<url>`` entries are read; otherwise the ``<url>`` entries of the
    document itself are read.

    Args:
        base_url: Any URL on the target site; ``/sitemap.xml`` is resolved
            against it.
        filter_years: Iterable of year strings. A URL passes the filter when
            any of them occurs in its ``<lastmod>`` text or in the URL itself.
            An empty/None value disables filtering.

    Returns:
        A list of unique URLs. The year-filtered set is returned when it holds
        more than three entries; otherwise every URL found is returned. The
        base URL (without trailing slash) is always added. If the sitemap
        request answers with a non-200 status an empty list is returned.
    """
    sitemap_index_url = urljoin(base_url, '/sitemap.xml')
    # Below this many filtered hits the year filter is considered too narrow.
    min_filtered_links = 3
    filtered_links = set()
    all_links = set()

    def should_include(lastmod_text, loc_text):
        if not filter_years:
            return True
        return any(year in lastmod_text or year in loc_text for year in filter_years)

    def is_sitemap_url(link):
        return link.endswith('.xml') or 'sitemap' in link.lower()

    def collect_urls(sitemap_soup):
        """Add every page URL of one parsed sitemap to the result sets."""
        for url_tag in sitemap_soup.find_all('url'):
            loc_tag = url_tag.find('loc')
            if loc_tag is None:
                continue
            # Pretty-printed sitemaps wrap the URL in whitespace/newlines.
            page_url = loc_tag.text.strip()
            if not page_url or is_sitemap_url(page_url):
                continue
            lastmod_tag = url_tag.find('lastmod')
            lastmod_text = lastmod_tag.text if lastmod_tag is not None else ''
            all_links.add(page_url)
            if should_include(lastmod_text, page_url):
                filtered_links.add(page_url)

    try:
        res = requests.get(sitemap_index_url, timeout=10)
        if res.status_code != 200:
            return []

        soup = BeautifulSoup(res.content, 'xml')

        if soup.find('sitemapindex'):
            # Sitemap index: follow each child sitemap (one level only).
            for sitemap in soup.find_all('sitemap'):
                loc = sitemap.find('loc')
                if loc is None:
                    continue
                child_sitemap_url = loc.text.strip()
                if not child_sitemap_url:
                    continue
                try:
                    child_res = requests.get(child_sitemap_url, timeout=10)
                    if child_res.status_code == 200:
                        collect_urls(BeautifulSoup(child_res.content, 'xml'))
                except Exception as e:
                    # One broken child sitemap must not abort the others.
                    print(f"[child sitemap error] {child_sitemap_url} – {e}")
        else:
            collect_urls(soup)

    except Exception as e:
        print(f"[sitemap error] {sitemap_index_url} – {e}")

    # Fallback to all links if filtering is too narrow
    final_links = filtered_links if len(filtered_links) > min_filtered_links else all_links

    # Always include base URL
    final_links.add(base_url.rstrip('/'))

    return list(final_links)




def is_valid_link(url):
    """Check with a HEAD request whether ``url`` answers without an error status.

    Redirects are followed and the request times out after 5 seconds.

    Returns:
        True when the final response status code is below 400; False when the
        status is 400 or higher or when the request itself fails (connection
        error, timeout, malformed URL, ...).
    """
    try:
        res = requests.head(url, allow_redirects=True, timeout=5)
        return res.status_code < 400
    except requests.RequestException:
        # Only request failures mean "invalid"; a bare except would also
        # swallow KeyboardInterrupt/SystemExit.
        return False


def get_internal_links(driver, base_url):
    """Return the unique internal links found on the page currently loaded in ``driver``.

    Every ``<a href>`` of ``driver.page_source`` is resolved against
    ``base_url``. A link counts as internal when it is an http(s) URL on the
    same host as ``base_url`` and its path starts with the path of
    ``base_url``. Fragments (``#section``) are dropped so that anchors into
    the same page do not produce separate entries.

    Args:
        driver: Object exposing ``page_source`` (the HTML to inspect).
        base_url: URL the links are resolved against and compared with.

    Returns:
        A list of absolute URLs in no particular order.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    base = urlparse(base_url)
    base_host = base.netloc.lower()
    base_path = base.path or '/'

    links = set()
    for anchor in soup.find_all('a', href=True):
        try:
            target = urlparse(urljoin(base_url, anchor['href'].strip()))
        except ValueError:
            # Malformed href (e.g. broken IPv6 literal) -- not a usable link.
            continue
        if target.scheme not in ('http', 'https'):
            continue  # mailto:, tel:, javascript:, ...
        # Compare parsed components instead of a substring test, which would
        # accept external URLs that merely contain the base URL somewhere.
        if target.netloc.lower() != base_host:
            continue
        if not (target.path or '/').startswith(base_path):
            continue
        links.add(target._replace(fragment='').geturl())
    return list(links)



# def get_internal_links(driver, base_url):
#     soup = BeautifulSoup(driver.page_source, 'html.parser')
#     links = set()
#     for a in soup.find_all('a', href=True):
#         href = a['href']
#         if not href.startswith('http'):
#             href = urljoin(base_url, href)
#         if base_url in href:
#             links.add(href)
#     return list(links)


# def download_pdfs(driver, base_url, company_inn_folder):
#     soup = BeautifulSoup(driver.page_source, 'html.parser')
#     for a in soup.find_all('a', href=True):
#         href = a['href']
#         if href.lower().endswith('.pdf'):
#             pdf_url = href if href.startswith('http') else urljoin(base_url, href)
#             try:
#                 pdf_name = os.path.basename(urlparse(pdf_url).path)
#                 pdf_path = os.path.join(company_inn_folder, pdf_name)
#                 r = requests.get(pdf_url, timeout=10)
#                 with open(pdf_path, 'wb') as f:
#                     f.write(r.content)
#             except Exception as e:
#                 print(f"PDF download error: {e}")

def download_pdfs(driver, base_url, company_inn_folder):
    """Download every PDF referenced by the page currently loaded in ``driver``.

    PDF references are taken from ``<a href>``, ``<iframe src>`` and
    ``<embed src>``. A reference is a PDF when the *path* of the resolved URL
    ends in ``.pdf``, so links carrying a query string or fragment
    (``file.pdf?v=2``, ``file.pdf#page=3``) are recognised too.

    Args:
        driver: Object exposing ``page_source`` (the HTML to inspect).
        base_url: URL used to resolve relative references.
        company_inn_folder: Target directory; created if missing.

    Files are named after the last path segment of the URL. A file that
    already exists in the folder is not downloaded again. Download errors are
    printed and do not interrupt the remaining downloads.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    references = [a['href'] for a in soup.find_all('a', href=True)]
    references += [tag['src'] for tag in soup.find_all(['iframe', 'embed'], src=True)]

    pdf_urls = set()
    for reference in references:
        try:
            parsed = urlparse(urljoin(base_url, reference.strip()))
        except ValueError:
            continue  # malformed URL
        if parsed.scheme in ('http', 'https') and parsed.path.lower().endswith('.pdf'):
            # The fragment never reaches the server; drop it to avoid duplicates.
            pdf_urls.add(parsed._replace(fragment='').geturl())

    # Create directory if it doesn't exist
    os.makedirs(company_inn_folder, exist_ok=True)

    # Download each unique PDF
    for pdf_url in sorted(pdf_urls):
        pdf_name = os.path.basename(urlparse(pdf_url).path)
        pdf_path = os.path.join(company_inn_folder, pdf_name)
        # Write under a temporary name first so an interrupted write is never
        # mistaken for a finished download on the next run.
        partial_path = pdf_path + '.part'
        try:
            if os.path.exists(pdf_path):
                print(f"Already downloaded: {pdf_name}")
                continue

            print(f"Downloading PDF: {pdf_url}")
            r = requests.get(pdf_url, timeout=10)
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                f.write(r.content)
            os.replace(partial_path, pdf_path)
        except Exception as e:
            print(f"PDF download error ({pdf_url}): {e}")
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass  # best-effort cleanup only


def get_page_text(driver):
    """Return the text of the page currently loaded in ``driver``.

    ``<script>`` and ``<style>`` elements are removed from the parsed
    ``driver.page_source`` before the text is extracted. Text fragments are
    stripped and joined with single spaces.
    """
    document = BeautifulSoup(driver.page_source, 'html.parser')
    non_content_tags = document.find_all(["script", "style"])
    for tag in non_content_tags:
        tag.extract()
    page_text = document.get_text(separator=' ', strip=True)
    return page_text


def scrape_company(driver, company, url, inn, output_dir):
    """Scrape one company's site: landing page, its internal links, then sitemap pages.

    The landing page ``url`` is loaded first and its internal links are
    collected from that page. Each collected link and each URL returned by
    ``get_sitemap_links(url)`` is then visited once. For every visited page
    that does not look like a 404 page, referenced PDFs are downloaded into
    ``<output_dir>/<company>_<inn>`` and the page text is recorded.

    Args:
        driver: Selenium WebDriver used to load pages.
        company: Company name; stored in each record and used in the folder name.
        url: Landing page of the company site.
        inn: Company INN; stored in each record and used in the folder name.
        output_dir: Parent directory for the per-company download folder.

    Returns:
        A list of dicts with keys ``'company'``, ``'inn'``, ``'url'`` and
        ``'text'``, one per successfully scraped page.
    """
    company_data = []
    parsed_links = set()
    company_inn_folder = os.path.join(output_dir, f"{company}_{inn}")
    os.makedirs(company_inn_folder, exist_ok=True)

    def is_sitemap_url(link):
        return link.endswith('.xml') or 'sitemap' in link.lower()

    def looks_like_404(content):
        lowered = content.lower()
        return "404" in lowered and ("not found" in lowered or "страница не найдена" in lowered)

    def parse_link(link, collect_links=False):
        """Visit ``link`` once and record it; optionally return its internal links."""
        # A trailing slash does not make a different page: "site/" == "site".
        visit_key = link.rstrip('/')
        if visit_key in parsed_links or is_sitemap_url(link):
            return []
        parsed_links.add(visit_key)

        discovered = []
        try:
            driver.get(link)
            time.sleep(2)  # give client-side rendering a moment to finish

            if collect_links:
                # Must run after driver.get(): page_source always reflects the
                # page currently loaded in the browser.
                discovered = get_internal_links(driver, link)

            # Check for signs of a 404 error in the page content
            if looks_like_404(driver.page_source):
                print(f"Skipping 404 page: {link}")
                return discovered

            text = get_page_text(driver)
            if looks_like_404(text):
                print(f"Skipping 404 page (text-based): {link}")
                return discovered

            download_pdfs(driver, link, company_inn_folder)
            company_data.append({
                'company': company,
                'inn': inn,
                'url': link,
                'text': text
            })
        except Exception as e:
            print(f"Error visiting {link}: {e}")
        return discovered

    # Scrape the main company page and extract its internal links
    links_to_scrape = parse_link(url, collect_links=True)

    # Parse all links found on the main page
    for link in links_to_scrape:
        parse_link(link)

    # Now check for a sitemap.xml if available and parse it
    for sitemap_link in get_sitemap_links(url):
        parse_link(sitemap_link)

    return company_data



# def scrape_company(driver, company, url, inn, output_dir):
#     company_data = []
#     parsed_links = set()
#     company_inn_folder = os.path.join(output_dir, f"{company}_{inn}")
#     os.makedirs(company_inn_folder, exist_ok=True)

#     sitemap_links = get_sitemap_links(url)
#     if sitemap_links:
#         links = sitemap_links
#     else:
#         driver.get(url)
#         time.sleep(2)
#         links = get_internal_links(driver, url)
#         links.insert(0, url)

#     for link in links:
#         if link in parsed_links:
#             continue
#         parsed_links.add(link)
#         try:
#             driver.get(link)
#             time.sleep(2)
#             text = get_page_text(driver)
#             download_pdfs(driver, link, company_inn_folder)
#             company_data.append({
#                 'company': company,
#                 'inn': inn,
#                 'url': link,
#                 'text': text
#             })
#         except Exception as e:
#             print(f"Error visiting {link}: {e}")
    
#     return company_data

In [25]:
def main():
    """Scrape every company listed in the input workbook and save the results.

    Reads sheet ``'Лист1'`` of ``templates/input_links.xlsx`` (columns
    ``company``, ``url`` and ``INN``), scrapes each row with
    ``scrape_company`` and writes all collected records to
    ``scraped_output.csv``. Downloaded files go under ``downloaded_data``.
    """
    # Read the input before starting the browser so a missing/broken workbook
    # does not leave a Chrome process behind.
    df = pd.read_excel('templates/input_links.xlsx', sheet_name='Лист1')
    output_data = []
    output_dir = 'downloaded_data'
    os.makedirs(output_dir, exist_ok=True)

    driver = init_driver()
    try:
        for _, row in df.iterrows():
            company, url, inn = row['company'], row['url'], row['INN']
            print(f"Scraping {company} ({url})")
            data = scrape_company(driver, company, url, inn, output_dir)
            output_data.extend(data)
    finally:
        # Always shut the browser down, even when a scrape raises.
        driver.quit()

    pd.DataFrame(output_data).to_csv('scraped_output.csv', index=False)

# Entry point: run the full scrape when this code is executed directly.
# NOTE: inside a notebook kernel __name__ is also '__main__', so running this
# cell starts the scrape immediately.
if __name__ == '__main__':
    main()

Scraping Группа «Илим» (https://www.ilimgroup.ru/)
Downloading PDF: https://www.ilimgroup.ru/upload/iblock/d7d/23992_23992-Politika-AO-GI-v-otnoshenii-obrabotki-i-zashchity-personalnykh-dannykh-_-Prikaz-_-GD_0462.20-ot-30.09.2020.pdf
Already downloaded: 23992_23992-Politika-AO-GI-v-otnoshenii-obrabotki-i-zashchity-personalnykh-dannykh-_-Prikaz-_-GD_0462.20-ot-30.09.2020.pdf
Skipping 404 page: https://www.ilimgroup.ru/career/vakansii/vedushchiy-inzhener-biolog/
Already downloaded: 23992_23992-Politika-AO-GI-v-otnoshenii-obrabotki-i-zashchity-personalnykh-dannykh-_-Prikaz-_-GD_0462.20-ot-30.09.2020.pdf
Already downloaded: 23992_23992-Politika-AO-GI-v-otnoshenii-obrabotki-i-zashchity-personalnykh-dannykh-_-Prikaz-_-GD_0462.20-ot-30.09.2020.pdf
Already downloaded: 23992_23992-Politika-AO-GI-v-otnoshenii-obrabotki-i-zashchity-personalnykh-dannykh-_-Prikaz-_-GD_0462.20-ot-30.09.2020.pdf
Skipping 404 page: https://www.ilimgroup.ru/career/vakansii/master-po-remontu-tskri/
Already downloaded: 2