In [10]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install requests fpdf

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install xhtml2pdf

Note: you may need to restart the kernel to use updated packages.


# Collect all "disease" links (listing only, nothing is downloaded)

In [8]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

# Root URL of the sjukdomar--besvar section on 1177.se.
# It is the index page requested for the top-level links and also the prefix
# that every collected link must start with to be kept.
base_url = "https://www.1177.se/sjukdomar--besvar/"

# Function to fetch child links within h3.c-teaser__heading
def fetch_child_links(url):
    """Collect the article links found in the teaser headings of ``url``.

    The page is downloaded and parsed; for every ``<h3 class="c-teaser__heading">``
    the first ``<a href=...>`` inside it is resolved against ``url``. Only links
    that start with the module-level ``base_url`` are kept, and each kept link is
    printed as it is found.

    Args:
        url: Address of the page to scan.

    Returns:
        list[str]: Absolute URLs in the order they were found. Empty when the
        request fails, times out, or the server answers with a status other
        than 200 (a message is printed in those cases).
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server,
        # which would stall the whole crawl on a single page.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Treat network-level failures like HTTP failures: report and move on.
        print(f"Failed to fetch child links from {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch child links from {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    child_links = []
    for teaser in soup.find_all('h3', class_="c-teaser__heading"):
        link = teaser.find('a', href=True)
        if link is None:
            # Heading without a usable anchor: nothing to follow.
            continue

        full_child_url = urljoin(url, link['href'])

        # Stay inside the section rooted at base_url.
        if full_child_url.startswith(base_url):
            child_links.append(full_child_url)
            print(f"Found child link: {full_child_url}")

    return child_links

# Function to get all parent links from the base page
def get_all_parent_links():
    """Collect the top-level links listed on the ``base_url`` index page.

    Every ``<a>`` matched by ``class_="c-anchor c-linklist__link"`` is resolved
    against ``base_url``; links that start with ``base_url`` are kept and printed
    together with the text of their first ``<span>`` (or ``'No text'``).

    Returns:
        list[str]: Absolute URLs in the order they were found. Empty when the
        request fails, times out, or the server answers with a status other
        than 200 (a message is printed in those cases).
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(base_url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to fetch the page. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    parent_links = []
    for link in soup.find_all('a', class_="c-anchor c-linklist__link"):
        href = link.get('href')
        if not href:
            # urljoin() returns the base unchanged for a missing/empty href, which
            # would wrongly add the index page itself as a parent link.
            continue

        full_url = urljoin(base_url, href)
        if not full_url.startswith(base_url):
            continue

        # Look the <span> up once and reuse it for the label.
        span = link.find('span')
        span_text = span.text if span is not None else 'No text'

        parent_links.append(full_url)
        print(f"Found parent link: {full_url} - Text: {span_text}")

    return parent_links

# Main function to visit each parent link and fetch child links
def main():
    """Walk every parent link from the index page and list its child links.

    For each parent link the child links are fetched and printed, followed by a
    one-second pause before the next parent is requested.
    """
    for category_url in get_all_parent_links():
        print(f"\nVisiting parent link: {category_url}")

        # Child links are only listed here; they are not visited any further.
        for article_url in fetch_child_links(category_url):
            print(f"\tChild link: {article_url}")

        # Pause between parent pages to keep the request rate low.
        time.sleep(1)

# Entry point: start the crawl only when this code runs as the main module,
# so importing these definitions elsewhere does not trigger any requests.
if __name__ == "__main__":
    main()


Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/allergi-och-overkanslighet/ - Text: Allergi och överkänslighet
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/celiaki/ - Text: Celiaki
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/doftoverkanslighet/ - Text: Doftöverkänslighet
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/insektsallergi/ - Text: Insektsallergi
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/komjolksallergi/ - Text: Komjölksallergi
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/kvalsterallergi/ - Text: Kvalsterallergi
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/laktosintolerans/ - Text: Laktosintolerans
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/matallergi/ -

KeyboardInterrupt: 

# Same crawl, but each child page is also saved as a PDF (limited to 20 pages)

In [28]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import pdfkit  # Import pdfkit for HTML-to-PDF conversion

# Root URL of the sjukdomar--besvar section on 1177.se.
# It is the index page requested for the top-level links and also the prefix
# that every collected link must start with to be kept.
base_url = "https://www.1177.se/sjukdomar--besvar/"

# Function to fetch child links within h3.c-teaser__heading
def fetch_child_links(url, max_links=20):
    """Collect up to ``max_links`` article links from the teaser headings of ``url``.

    The page is downloaded and parsed; for every ``<h3 class="c-teaser__heading">``
    the first ``<a href=...>`` inside it is resolved against ``url``. Only links
    that start with the module-level ``base_url`` are kept, and each kept link is
    printed as it is found.

    Args:
        url: Address of the page to scan.
        max_links: Maximum number of links to return. Zero or a negative value
            yields an empty list without making a request.

    Returns:
        list[str]: Absolute URLs in the order they were found. Empty when the
        request fails, times out, or the server answers with a status other
        than 200 (a message is printed in those cases).
    """
    if max_links <= 0:
        # Nothing may be returned, so do not spend a request on the page.
        return []

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server,
        # which would stall the whole crawl on a single page.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Treat network-level failures like HTTP failures: report and move on.
        print(f"Failed to fetch child links from {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch child links from {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    child_links = []
    for teaser in soup.find_all('h3', class_="c-teaser__heading"):
        if len(child_links) >= max_links:
            break

        link = teaser.find('a', href=True)
        if link is None:
            # Heading without a usable anchor: nothing to follow.
            continue

        full_child_url = urljoin(url, link['href'])

        # Stay inside the section rooted at base_url.
        if full_child_url.startswith(base_url):
            child_links.append(full_child_url)
            print(f"Found child link: {full_child_url}")

    return child_links

# Function to get all parent links from the base page
def get_all_parent_links():
    """Collect the top-level links listed on the ``base_url`` index page.

    Every ``<a>`` matched by ``class_="c-anchor c-linklist__link"`` is resolved
    against ``base_url``; links that start with ``base_url`` are kept and printed
    together with the text of their first ``<span>`` (or ``'No text'``).

    Returns:
        list[str]: Absolute URLs in the order they were found. Empty when the
        request fails, times out, or the server answers with a status other
        than 200 (a message is printed in those cases).
    """
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(base_url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to fetch the page. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    parent_links = []
    for link in soup.find_all('a', class_="c-anchor c-linklist__link"):
        href = link.get('href')
        if not href:
            # urljoin() returns the base unchanged for a missing/empty href, which
            # would wrongly add the index page itself as a parent link.
            continue

        full_url = urljoin(base_url, href)
        if not full_url.startswith(base_url):
            continue

        # Look the <span> up once and reuse it for the label.
        span = link.find('span')
        span_text = span.text if span is not None else 'No text'

        parent_links.append(full_url)
        print(f"Found parent link: {full_url} - Text: {span_text}")

    return parent_links

# Function to convert HTML page to PDF
def convert_html_to_pdf(url, pdf_filename):
    """Render the page at ``url`` to the file ``pdf_filename`` using pdfkit.

    Best-effort: any ``Exception`` raised during the conversion is reported on
    stdout instead of being propagated, so one failing page does not abort the
    caller. Nothing is returned in either case.

    Args:
        url: Address of the page to render.
        pdf_filename: Path of the PDF file to write.
    """
    try:
        pdfkit.from_url(url, pdf_filename)
    except Exception as err:
        print(f"Error converting {url} to PDF: {err}")
    else:
        print(f"Successfully saved PDF: {pdf_filename}")

# Main function to visit each parent link and fetch child links
def main(max_child_links=20):
    """Crawl the parent links and save child pages as PDFs, up to a global limit.

    Parent links are taken from ``get_all_parent_links()``. For each one, at most
    the remaining number of child links is requested from ``fetch_child_links()``
    and every child page is converted with ``convert_html_to_pdf()`` into
    ``child_page_<n>.pdf``, where ``n`` counts processed child links starting at 1.
    The counter advances even when a conversion fails, because the converter
    reports errors itself and does not signal them to the caller.

    Args:
        max_child_links: Total number of child pages to process across all
            parent links. Defaults to 20; zero or a negative value does nothing.
    """
    if max_child_links <= 0:
        # No pages requested: avoid hitting the server at all.
        return

    parent_links = get_all_parent_links()

    # Number of child links processed so far, across all parent links.
    total_child_links_fetched = 0

    for parent_link in parent_links:
        print(f"\nVisiting parent link: {parent_link}")

        # Ask only for as many links as the global budget still allows.
        remaining = max_child_links - total_child_links_fetched
        child_links = fetch_child_links(parent_link, max_links=remaining)

        # Slice defensively so the counter can never run past the limit, even if
        # the fetcher returns more links than requested.
        for child_link in child_links[:remaining]:
            total_child_links_fetched += 1

            pdf_filename = f"child_page_{total_child_links_fetched}.pdf"
            print(f"\tConverting {child_link} to PDF...")

            convert_html_to_pdf(child_link, pdf_filename)

            # Pause between conversions to keep the request rate low.
            time.sleep(1)

        if total_child_links_fetched >= max_child_links:
            print(f"Reached the limit of {max_child_links} child links. Stopping.")
            break

# Entry point: start the crawl and PDF export only when this code runs as the
# main module, so importing these definitions elsewhere has no side effects.
if __name__ == "__main__":
    main()


Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/allergi-och-overkanslighet/ - Text: Allergi och överkänslighet
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/celiaki/ - Text: Celiaki
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/doftoverkanslighet/ - Text: Doftöverkänslighet
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/insektsallergi/ - Text: Insektsallergi
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/komjolksallergi/ - Text: Komjölksallergi
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/kvalsterallergi/ - Text: Kvalsterallergi
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/laktosintolerans/ - Text: Laktosintolerans
Found parent link: https://www.1177.se/sjukdomar--besvar/allergier-och-overkanslighet/matallergi/ -