In [1]:
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# URL of the first results page of the CISA cybersecurity-advisories listing.
# The percent-encoded query string decodes to two filters:
#   f[0]=advisory_type:93  and  f[1]=release_date_year:2024
# NOTE(review): advisory_type 93 is presumably the "Alert" category -- confirm
# against the site's filter UI.
base_url = "https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024"

# Running total of alert teasers seen across all scraped pages.
# scrape_page() increments this module-level counter via `global`.
total_alerts = 0

# Function to extract the "Next" page URL
def extract_next_page(soup, base_url):
    """Return the absolute URL of the next results page, or None.

    Looks for the pager element ``<span aria-hidden="true">Next</span>`` and
    resolves the ``href`` of its enclosing ``<a>`` tag against ``base_url``.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find`` method.
        base_url: URL the link's ``href`` is resolved against.

    Returns:
        The absolute URL of the next page as a string, or None when there is
        no "Next" span, no enclosing ``<a>``, or the ``<a>`` has no ``href``.
    """
    # Locate the "Next" button
    next_button = soup.find('span', attrs={"aria-hidden": "true"}, string="Next")
    if not next_button:
        return None

    # The link target lives on the enclosing <a> tag, not on the span itself
    parent_a_tag = next_button.find_parent('a')
    if not parent_a_tag:
        return None

    next_href = parent_a_tag.get('href')
    if not next_href:
        return None

    # urljoin handles every href form: a query-only href ("?f...&page=1")
    # keeps the base path and takes the href's query (which carries the
    # filters), while root-relative or absolute hrefs are resolved correctly
    # instead of being glued onto the base path.
    return urljoin(base_url, next_href)

# Function to scrape a single page and count alerts
def scrape_page(url, timeout=30):
    """Fetch one listing page, count its alerts, and return the parsed page.

    The number of ``<article class="c-teaser">`` elements found on the page is
    added to the module-level ``total_alerts`` counter.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The parsed ``BeautifulSoup`` document, or None if the request failed
        (connection error, timeout, or non-2xx status); the failure is
        printed rather than raised.
    """
    global total_alerts

    # Only the network call is guarded, so programming errors in the parsing
    # and counting below are not mistaken for fetch failures.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each alert is rendered as an <article> teaser; adjust the class if the
    # site's markup changes.
    alert_containers = soup.find_all('article', class_='c-teaser')
    count_on_page = len(alert_containers)
    total_alerts += count_on_page

    print(f"Scraping: {url} - Found {count_on_page} alerts on this page.")
    return soup

# Function to scrape all pages and count total alerts
def scrape_all_pages():
    """Walk the paginated listing from ``base_url`` until no page remains.

    Every page is handed to ``scrape_page`` (which maintains the running
    alert total). The crawl ends when a page cannot be scraped or when
    ``extract_next_page`` finds no further link.
    """
    current_url = base_url
    pages_visited = 0

    while current_url:
        pages_visited += 1
        print(f"\nScraping page {pages_visited}...")

        page_soup = scrape_page(current_url)
        if not page_soup:
            # The page could not be scraped; abandon the crawl.
            return

        # Pause between requests so the server is not hammered.
        time.sleep(2)

        current_url = extract_next_page(page_soup, base_url)
        if not current_url:
            # No "Next" link: this was the last page.
            return
        print(f"Next page URL: {current_url}")

# Run the scraper: crawls every results page starting at base_url.
# This performs live HTTP requests and sleeps 2 seconds after each page.
scrape_all_pages()

# Report the accumulated count. If a page fetch failed, the crawl stopped
# early and this total only covers the pages scraped up to that point.
print(f"\nTotal Alerts for 2024: {total_alerts}")


Scraping page 1...
Scraping: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024 - Found 10 alerts on this page.
Next page URL: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=1

Scraping page 2...
Scraping: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=1 - Found 10 alerts on this page.
Next page URL: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=2

Scraping page 3...
Scraping: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=2 - Found 10 alerts on this page.
Next page URL: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=3

Scraping page