In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
def get_main_sections(soup):
    """Return the hrefs of the main section links found in the page's sub-nav.

    Looks up the ``<div class="sub-nav">`` element of ``soup`` and collects the
    ``href`` of every anchor inside it whose target starts with ``/en/``.

    Args:
        soup: Parsed page; a ``BeautifulSoup`` object (or anything exposing a
            compatible ``find`` method).

    Returns:
        list[str]: The matching hrefs in document order, or an empty list when
        the page has no ``sub-nav`` block.
    """
    sub_nav_block = soup.find('div', {'class': 'sub-nav'})
    if sub_nav_block is None:
        # Report the missing block instead of failing with an AttributeError,
        # mirroring how the other helpers report a missing topic section.
        print("No sub-nav block found on the page")
        return []
    return [link['href'] for link in sub_nav_block.select('a[href^="/en/"]')]

def get_sub_sections(root, main_sections, headers):
    """Fetch each main section page and collect the links in its topic block.

    For every entry of ``main_sections`` the page at ``root + section`` is
    downloaded and parsed; the ``href`` of every anchor inside the page's
    ``<div class="topic">`` is collected. Pages without such a div are reported
    on stdout and contribute nothing.

    Args:
        root: Site prefix that each section href is appended to.
        main_sections: Iterable of section hrefs to visit.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        list[str]: All collected hrefs as one flat list, in visiting order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    sub_sections = []

    for section in main_sections:
        url = f'{root}{section}'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        page_to_scrape = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(page_to_scrape.content, 'html.parser')

        topic_section = soup.find("div", {"class": "topic"})
        if topic_section:
            href_links = [a["href"] for a in topic_section.find_all("a", href=True)]
            # Extend directly so the result is already flat.
            sub_sections.extend(href_links)
            print(href_links)
        else:
            print(f"No topic section found on {url}")

    return sub_sections

def scrape_all_links(root, soup, headers):
    """Crawl outward from the main sections and collect every topic link.

    Starts from the hrefs returned by ``get_main_sections(soup)``. Each round
    fetches every pending page at ``root + link``, gathers the hrefs inside its
    ``<div class="topic">``, and queues the hrefs not collected before for the
    next round. The crawl stops when a round discovers nothing new.

    Each href appears in the result once, and each page is fetched at most once.
    The starting hrefs themselves are only part of the result if some crawled
    page links to them.

    Args:
        root: Site prefix that each href is appended to.
        soup: Parsed landing page, handed to ``get_main_sections``.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        list[str]: Unique collected hrefs in order of first discovery.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    pending = get_main_sections(soup)
    fetched = set()        # hrefs whose page has already been downloaded
    collected = []         # ordered result
    collected_set = set()  # O(1) membership mirror of ``collected``

    while pending:
        discovered = []
        for link in pending:
            if link in fetched:
                # Already crawled (e.g. a page linking back to a start section).
                continue
            fetched.add(link)

            url = f'{root}{link}'
            # A timeout keeps one stalled connection from hanging the crawl.
            page_to_scrape = requests.get(url, headers=headers, timeout=30)
            page_soup = BeautifulSoup(page_to_scrape.content, 'html.parser')
            topic_section = page_soup.find("div", {"class": "topic"})
            if topic_section:
                href_links = [a["href"] for a in topic_section.find_all("a", href=True)]
                discovered.extend(href_links)
                print(href_links)
            else:
                print(f"No topic section found on {url}")

        # Keep only hrefs never collected before, dropping repeats that occur
        # within this round as well as across rounds.
        pending = []
        for href in discovered:
            if href not in collected_set:
                collected_set.add(href)
                collected.append(href)
                pending.append(href)

    return collected

In [3]:
# Site to crawl; the hrefs returned by the helpers are appended to this prefix.
root = 'https://www.citizensinformation.ie'
# Identify the bot and give site operators a plain-text contact address
# (the previous value embedded markdown link markup in the header).
headers = {
    'User-Agent': 'My Scraper Bot (contact: allende.rev@gmail.com)'
}
# Bound the request time, and fail fast if the landing page itself is not
# served: crawling an error page would only yield an empty result.
page_to_scrape = requests.get(root, headers=headers, timeout=30)
page_to_scrape.raise_for_status()
soup = BeautifulSoup(page_to_scrape.content, 'html.parser')

all_links = scrape_all_links(root, soup, headers)

['/en/health/covid19/', '/en/health/health-system/', '/en/health/medical-cards-and-gp-visit-cards/', '/en/health/health-services/', '/en/health/drugs-and-medicines/', '/en/health/legal-matters-and-health/', '/en/health/eu-healthcare/', '/en/health/food-safety/', '/en/health/health-overview/']
['/en/social-welfare/irish-social-welfare-system/', '/en/social-welfare/disability-and-illness/', '/en/social-welfare/carers/', '/en/social-welfare/unemployed-people/', '/en/social-welfare/older-and-retired-people/', '/en/social-welfare/families-and-children/', '/en/social-welfare/death-related-benefits/', '/en/social-welfare/social-welfare-payments-and-work/', '/en/social-welfare/farming-and-fishing/', '/en/social-welfare/back-to-education/', '/en/social-welfare/extra-social-welfare-benefits/', '/en/social-welfare/supplementary-welfare-schemes/', '/en/social-welfare/voluntary-work-and-social-welfare-payments/', '/en/social-welfare/covid19-and-social-welfare/']
['/en/employment/types-of-employment

In [6]:
# Display the links collected by scrape_all_links in the crawl cell.
all_links

['/en/health/covid19/',
 '/en/health/health-system/',
 '/en/health/medical-cards-and-gp-visit-cards/',
 '/en/health/health-services/',
 '/en/health/drugs-and-medicines/',
 '/en/health/legal-matters-and-health/',
 '/en/health/eu-healthcare/',
 '/en/health/food-safety/',
 '/en/health/health-overview/',
 '/en/social-welfare/irish-social-welfare-system/',
 '/en/social-welfare/disability-and-illness/',
 '/en/social-welfare/carers/',
 '/en/social-welfare/unemployed-people/',
 '/en/social-welfare/older-and-retired-people/',
 '/en/social-welfare/families-and-children/',
 '/en/social-welfare/death-related-benefits/',
 '/en/social-welfare/social-welfare-payments-and-work/',
 '/en/social-welfare/farming-and-fishing/',
 '/en/social-welfare/back-to-education/',
 '/en/social-welfare/extra-social-welfare-benefits/',
 '/en/social-welfare/supplementary-welfare-schemes/',
 '/en/social-welfare/voluntary-work-and-social-welfare-payments/',
 '/en/social-welfare/covid19-and-social-welfare/',
 '/en/employmen

In [16]:
import pdfkit
import os

# NOTE(review): the wkhtmltopdf location is hard-coded for this machine;
# adjust it if the binary is installed elsewhere.
config = pdfkit.configuration(wkhtmltopdf='/usr/bin/wkhtmltopdf')

# Create the output directory if it doesn't exist (exist_ok avoids the
# check-then-create race and makes the cell safe to re-run).
output_dir = '../app/pdf_docs'
os.makedirs(output_dir, exist_ok=True)

# Links whose conversion failed, kept so they can be inspected or retried.
failed_links = []

for link in all_links:
    url = f'{root}{link}'
    print(f"Converting {url} to PDF...")
    # Slashes become underscores so each href maps to one flat file name.
    output_file = os.path.join(output_dir, f"{link.replace('/', '_')}.pdf")
    try:
        pdfkit.from_url(url, output_file, configuration=config)
    except OSError as exc:
        # pdfkit reports wkhtmltopdf failures as IOError/OSError; record the
        # link and continue so one bad page does not abort the whole batch.
        failed_links.append(link)
        print(f"Failed to convert {url}: {exc}")
        continue
    print(f"PDF saved as {output_file}")

Converting https://www.citizensinformation.ie/en/health/covid19/ to PDF...
PDF saved as ../app/pdf_docs/_en_health_covid19_.pdf
Converting https://www.citizensinformation.ie/en/health/health-system/ to PDF...
PDF saved as ../app/pdf_docs/_en_health_health-system_.pdf
Converting https://www.citizensinformation.ie/en/health/medical-cards-and-gp-visit-cards/ to PDF...
PDF saved as ../app/pdf_docs/_en_health_medical-cards-and-gp-visit-cards_.pdf
Converting https://www.citizensinformation.ie/en/health/health-services/ to PDF...
PDF saved as ../app/pdf_docs/_en_health_health-services_.pdf
Converting https://www.citizensinformation.ie/en/health/drugs-and-medicines/ to PDF...
PDF saved as ../app/pdf_docs/_en_health_drugs-and-medicines_.pdf
Converting https://www.citizensinformation.ie/en/health/legal-matters-and-health/ to PDF...
PDF saved as ../app/pdf_docs/_en_health_legal-matters-and-health_.pdf
Converting https://www.citizensinformation.ie/en/health/eu-healthcare/ to PDF...
PDF saved as ..

In [14]:
# Show the most recently assigned PDF output path (set inside the conversion loop).
# NOTE(review): the recorded output lacks the '../' prefix that output_dir has in
# the conversion cell, so it presumably comes from an earlier run — re-run to confirm.
output_file

'app/pdf_docs/_en_moving-country_irish-citizenship_.pdf'