In [17]:
import subprocess
import sys

# Function to install packages using pip
def install_package(package):
    """Install ``package`` with pip for the interpreter running this notebook.

    pip is invoked as ``<sys.executable> -m pip install <package>`` so the
    package lands in the environment of the current interpreter.
    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits
    with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

# Ensure the required package is installed
try:
    # PyMuPDF is imported under the module name `fitz`, not `PyMuPDF`.
    import fitz  # This is part of PyMuPDF
except ImportError:
    # Not installed yet: install it with pip, then retry the import.
    # NOTE(review): the install is unpinned, so the version pulled in may
    # differ between runs -- consider pinning.
    install_package('PyMuPDF')
    import fitz

import requests
from bs4 import BeautifulSoup
import os

# Function to download a file from a URL
def download_file(url, folder, timeout=30):
    """Stream the resource at ``url`` into ``folder`` and return the saved path.

    The file name is the last segment of the URL's *path*, so a query string
    or fragment (``.../sheet.pdf?download=1``) does not end up in the name.

    Args:
        url: Address of the file to download.
        folder: Existing directory the file is written into.
        timeout: Seconds passed to ``requests.get`` as ``timeout``; without
            one, a stalled server would block the call indefinitely.

    Returns:
        The path of the written file (``folder`` joined with the file name).

    Raises:
        ValueError: If the URL path has no final segment to use as a name
            (for example, it ends with ``/``).
        requests.HTTPError: Raised by ``raise_for_status`` for an error
            response.
    """
    from urllib.parse import urlsplit

    # Look only at the path component; '?query' and '#fragment' are not part
    # of the file name.
    filename = urlsplit(url).path.split('/')[-1]
    if not filename:
        raise ValueError(f'Cannot derive a file name from URL: {url}')

    local_filename = os.path.join(folder, filename)
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # Stream in fixed-size chunks so large files are never held in
            # memory all at once.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

# Root of the site being scraped. Kept as a module-level name for any other
# cell that refers to it; download_pdfs_from_webpages resolves links against
# each page's own URL instead.
BASE_URL = 'https://www.k5learning.com'

# Main function to scrape and download PDFs
def download_pdfs_from_webpages(urls, folder='pdf_downloads', limit=5, timeout=30):
    """Download the PDFs linked from each page in ``urls``.

    Every page is fetched and parsed, and each ``<a>`` whose ``href`` ends
    with ``.pdf`` is downloaded with ``download_file``. A PDF URL that was
    already downloaded (from this page or an earlier one) is skipped and does
    not count towards the limit.

    Args:
        urls: Iterable of page URLs to scrape, processed in order.
        folder: Directory the PDFs are saved into; created if missing.
        limit: Maximum number of PDFs downloaded *per page*. ``None`` means
            no limit.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` when
            fetching each page, so a stalled server cannot hang the run.

    Returns:
        List of local file paths, in download order.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when a page
            returns an error response.
    """
    from urllib.parse import urljoin

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)

    # Set to keep track of downloaded PDFs (shared across all pages)
    downloaded_pdfs = set()
    downloaded_files = []

    for url in urls:
        print(f'Scraping URL: {url}')

        # Get and parse the webpage content
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all links ending with .pdf (anchors without an href are
        # passed to the filter as None and rejected).
        pdf_links = soup.find_all('a', href=lambda href: href and href.endswith('.pdf'))

        # Download each PDF found, limited to the specified number per page
        count = 0
        for link in pdf_links:
            if limit is not None and count >= limit:
                break

            # Resolve against the page URL: handles root-relative
            # ('/x/a.pdf'), document-relative ('a.pdf') and protocol-relative
            # ('//host/a.pdf') links, and leaves absolute URLs untouched.
            pdf_url = urljoin(url, link.get('href'))
            if pdf_url in downloaded_pdfs:
                continue

            print(f'Downloading: {pdf_url}')
            local_file = download_file(pdf_url, folder)
            downloaded_files.append(local_file)
            downloaded_pdfs.add(pdf_url)
            count += 1

    print('Download completed.')
    return downloaded_files

# Function to merge PDFs
def merge_pdfs(pdf_list, output_path):
    """Concatenate the PDFs in ``pdf_list`` into one file at ``output_path``.

    Pages are appended in the order the files are listed.

    Args:
        pdf_list: Iterable of paths to the PDF files to merge.
        output_path: Path the merged PDF is written to.

    Raises:
        ValueError: If ``pdf_list`` is empty; there is nothing to save, and
            PyMuPDF rejects saving a document with zero pages.
    """
    pdf_paths = list(pdf_list)
    if not pdf_paths:
        raise ValueError('No PDF files to merge.')

    # Open the output document in a `with` block so it is closed even if
    # reading one of the inputs or saving fails.
    with fitz.open() as result:
        for pdf in pdf_paths:
            with fitz.open(pdf) as mfile:
                result.insert_pdf(mfile)
        result.save(output_path)
    print(f'Merged PDF saved as {output_path}')

# Pages to scrape for PDF links: k5learning.com subtraction worksheet pages,
# from kindergarten through third grade. The list order is the order in which
# the pages are scraped.
webpage_urls = [
    'https://www.k5learning.com/free-preschool-kindergarten-worksheets/subtraction/subtract-single-digits',
    'https://www.k5learning.com/free-preschool-kindergarten-worksheets/subtraction/subtract-within-20-no-borrow',
    'https://www.k5learning.com/free-preschool-kindergarten-worksheets/subtraction/subtract-1digit-within-20',
    'https://www.k5learning.com/free-preschool-kindergarten-worksheets/subtraction/subtract-vertical-within-20-no-regrouping',
    'https://www.k5learning.com/free-preschool-kindergarten-worksheets/subtraction/subtract-vertical-within-20',
    'https://www.k5learning.com/free-math-worksheets/first-grade-1/subtraction/subtract-1-digit-from-2-digit-no-regrouping',
    'https://www.k5learning.com/free-math-worksheets/first-grade-1/subtraction/subtract-2-digit-numbers-no-regrouping',
    'https://www.k5learning.com/free-math-worksheets/first-grade-1/subtraction/add-and-subtract-3-single-digit-numbers',
    'https://www.k5learning.com/free-math-worksheets/first-grade-1/subtraction/add-and-subtract-4-single-digit-numbers',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtracting-1-digit-from-3-digit-with-regrouping',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtract-whole-tens-from-whole-tens',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtract-whole-tens-from-2-digit-numbers',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtract-whole-tens-from-hundreds',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtract-whole-tens-from-3-digit-numbers',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtract-whole-hundreds-from-3-digit-numbers',
    'https://www.k5learning.com/free-math-worksheets/second-grade-2/subtraction/subtract-2-digit-number-from-whole-hundreds',
    'https://www.k5learning.com/free-math-worksheets/third-grade-3/subtraction/subtract-3-digit-numbers-with-regrouping',
    'https://www.k5learning.com/free-math-worksheets/third-grade-3/subtraction/subtract-4-digit-numbers-with-regrouping',
    'https://www.k5learning.com/free-math-worksheets/third-grade-3/subtraction/subtract-borrow-across-2-zeros',
    'https://www.k5learning.com/free-math-worksheets/third-grade-3/subtraction/subtract-borrow-across-3-zeros'
]

# Download PDFs from the list of URLs
downloaded_files = download_pdfs_from_webpages(webpage_urls)

# Merge the downloaded PDFs. If nothing was downloaded there is nothing to
# merge, so say so instead of letting the merge step fail on an empty list.
if downloaded_files:
    merge_pdfs(downloaded_files, 'result.pdf')
else:
    print('No PDFs were downloaded; nothing to merge.')


Scraping URL: https://www.k5learning.com/free-preschool-kindergarten-worksheets/subtraction/subtract-single-digits
Prepending website root
https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h1.pdf
Downloading: https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h1.pdf
Prepending website root
https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h1.pdf
Prepending website root
https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h2.pdf
Downloading: https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h2.pdf
Prepending website root
https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h3.pdf
Downloading: https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-digit-numbers-h3.pdf
Prepending website root
https://www.k5learning.com/worksheets/kindergarten-subtraction-of-two-1-dig

In [18]:
print("Done.")

Done.
