In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os

In [2]:
# Setup Selenium WebDriver for Edge
def setup_driver():
    driver = driver = webdriver.Edge()
    return driver

In [3]:
# Function to interact with the page and reveal all links
def interact_and_get_links(driver, base_url):
    driver.get(base_url)
    
    # Wait for the page to load and for interactive elements to be clickable
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    
    # Example interactions (adjust based on actual interactions needed):
    # Click on buttons or load more content
    try:
        while True:
            # Click all "Show More" or similar buttons to reveal more content
            buttons = driver.find_elements(By.CSS_SELECTOR, '.show-more-button')  # Example selector
            if buttons:
                for button in buttons:
                    button.click()
                    time.sleep(2)  # Wait for content to load
            else:
                break
    except Exception as e:
        print(f"Exception occurred: {e}")
    
    # Extract all article links
    links = set()
    for element in driver.find_elements(By.TAG_NAME, 'a'):
        href = element.get_attribute('href')
        if href and href.startswith(base_url):
            links.add(href)
    
    return links

In [10]:
import pdfkit

def save_page_as_pdf(url, output_filename):
    # Configure pdfkit options (if needed)
    options = {
        'quiet': ''  # Optional: Run in quiet mode
    }
    configuration = pdfkit.configuration(wkhtmltopdf="C:\Program Files\wkhtmltopdf\\bin\wkhtmltopdf.exe")
    
    # Save the web page as PDF
    pdfkit.from_url(url, output_filename, options=options, configuration=configuration)

In [11]:
base_url = "https://learn.microsoft.com/en-us/azure/machine-learning/"
output_dir = 'azure_ml_docs'
os.makedirs(output_dir, exist_ok=True)

driver = setup_driver()
links = interact_and_get_links(driver, base_url)

for index, link in enumerate(links):
    output_filename = os.path.join(output_dir, f"article_{index+1}.pdf")
    print(f"Saving {link} as PDF...")
    save_page_as_pdf(link, output_filename)

driver.quit()

Saving https://learn.microsoft.com/en-us/azure/machine-learning/how-to-r-train-model?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/concept-model-management-and-deployment?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/concept-enterprise-security?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-train-model?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-explore-data?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-cloud-workstation?view=azureml-api-2 as PDF...
Saving https://learn.microsoft.com/en-us/azure/machine-learning/tuto