In [None]:
!pip install requests beautifulsoup4 pandas openpyxl
!pip install xlsxwriter



Collecting xlsxwriter
  Downloading XlsxWriter-3.2.3-py3-none-any.whl.metadata (2.7 kB)
Downloading XlsxWriter-3.2.3-py3-none-any.whl (169 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.3


URL du rapport :
https://www.dgssi.gov.ma/fr/bulletins-securite

# WEB SCRAPING DGSSI

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
import time
from urllib.parse import urljoin

In [None]:
import re
import time
import pandas as pd
from functools import wraps
import os

In [None]:
# Root URL of the DGSSI website. scrape_report_urls() passes it to urljoin()
# to turn the hrefs found on the listing page into absolute URLs.
BASE_URL = "https://www.dgssi.gov.ma"

In [None]:
# Table-style fields of a bulletin page, extracted the same way:
# (output key, French label on the page, noun used in log lines,
#  row noun used in log lines, placeholder stored when the field is missing)
_TABLE_FIELDS = (
    ("Reference", "Numéro de Référence", "reference number", "reference", "No Reference"),
    ("Publication Date", "Date de publication", "publication date", "date", "No Date"),
    ("Risk Level", "Niveau de Risque", "risk level", "risk level", "No Risk Level"),
    ("Impact Level", "Niveau d'Impact", "impact level", "impact level", "No Impact Level"),
)


def _strip_label(text, label):
    """Return ``text`` stripped, with every occurrence of ``label`` removed."""
    if label in text:
        return text.replace(label, "").strip()
    return text.strip()


def _debug_filename(url):
    """Build a filesystem-safe ``debug_page_<slug>.html`` name from ``url``."""
    import re  # local import keeps this helper self-contained

    # Use the last non-empty path segment so a trailing "/" does not yield
    # an empty slug, then replace anything that is unsafe in a file name.
    slug = url.rstrip("/").split("/")[-1]
    slug = re.sub(r"[^\w.-]", "_", slug) or "index"
    return f"debug_page_{slug}.html"


def _find_label_cell(soup, label):
    """Return the first <td> (else <th>) whose own text equals ``label``.

    Surrounding whitespace in the cell is ignored. ``string=`` is used
    instead of the deprecated ``text=`` keyword of Beautiful Soup.
    """
    def is_label(cell_text):
        # cell_text is None when the cell has no single string child.
        return cell_text is not None and cell_text.strip() == label

    return soup.find("td", string=is_label) or soup.find("th", string=is_label)


def _find_text_node(soup, needle):
    """Return the first text node of the document that contains ``needle``."""
    return soup.find(string=lambda node_text: node_text and needle in node_text)


def _extract_title(soup):
    """Return the title from the first <h1>, else from the 'Titre' table row."""
    print("Extracting title...")
    heading = soup.find("h1")
    if heading is not None:
        title = heading.text.strip()
        print(f"Found title in h1: {title}")
        return title

    print("No h1 title found, looking in table...")
    label_cell = _find_label_cell(soup, "Titre")
    if label_cell is None:
        print("No title row found in table")
        return "No Title"

    value_cell = label_cell.find_next("td")
    if value_cell is None:
        print("No title found in table")
        return "No Title"

    title = _strip_label(value_cell.text, "Titre")
    print(f"Found title in table: {title}")
    return title


def _extract_table_field(soup, label, noun, row_noun, placeholder):
    """Return the text of the <td> that follows the cell labelled ``label``."""
    print(f"Extracting {noun}...")
    label_cell = _find_label_cell(soup, label)
    if label_cell is None:
        print(f"No {row_noun} row found")
        return placeholder

    print(f"Found {row_noun} row")
    value_cell = label_cell.find_next("td")
    if value_cell is None:
        print(f"No {noun} found")
        return placeholder

    value = _strip_label(value_cell.text, label)
    print(f"{noun.capitalize()}: {value}")
    return value


def _extract_affected_systems(soup):
    """Return the affected systems as a comma-separated string."""
    label = "Systèmes affectés"
    print("Extracting affected systems...")
    header = _find_text_node(soup, label)
    if header is None:
        print("No systems header found")
        return "No Affected Systems"

    print("Found systems header")
    # Prefer the next bullet list after the header...
    systems_list = header.find_next("ul")
    if systems_list is not None:
        systems = [_strip_label(item.text, label) for item in systems_list.find_all("li")]
        print(f"Found {len(systems)} affected systems in list")
        return ", ".join(systems)

    # ...and fall back to the next paragraph when there is no list.
    print("No list found, looking for paragraph...")
    paragraph = header.find_next("p")
    if paragraph is None:
        print("No systems paragraph found")
        return "No Affected Systems"

    systems_text = _strip_label(paragraph.text, label)
    print(f"Found systems in paragraph: {systems_text}")
    return systems_text


def _extract_text_section(soup, label, task, noun, placeholder):
    """Return the text of the first <div> (else <p>) following ``label``."""
    print(f"Extracting {task}...")
    header = _find_text_node(soup, label)
    if header is None:
        print(f"No {noun} header found")
        return placeholder

    print(f"Found {noun} header")
    body = header.find_next("div") or header.find_next("p")
    if body is None:
        print(f"No {noun} text found")
        return placeholder

    section_text = _strip_label(body.text, label)
    print(f"Found {noun} text (first 50 chars): {section_text[:50]}...")
    return section_text


def scrape_report_details(url, save_debug_html=True):
    """Download one bulletin page and extract its fields into a dict.

    Args:
        url: Absolute URL of the bulletin page.
        save_debug_html: When True (the default, matching the previous
            behaviour) the raw HTML is written to ``debug_page_<slug>.html``
            in the current working directory.

    Returns:
        A dict with the keys "URL", "Title", "Reference", "Publication Date",
        "Risk Level", "Impact Level", "Affected Systems",
        "Vulnerability Summary" and "Solution". A field that cannot be found
        holds a "No ..." placeholder string. If anything raises (network
        failure, HTTP error status, parsing error) the dict is instead
        ``{"URL": url, "Error": <message>, "Title": "Failed to scrape"}``.
    """
    print(f"\n----- SCRAPING REPORT: {url} -----")
    try:
        # Send GET request to the report page with a timeout
        print("Sending request to report page...")
        response = requests.get(url, timeout=10)
        print(f"Response status code: {response.status_code}")
        # Do not parse an HTTP error page as if it were a bulletin.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")

        # TESTING TIP 2: Save HTML for debugging
        if save_debug_html:
            filename = _debug_filename(url)
            with open(filename, "w", encoding="utf-8") as f:
                f.write(response.text)
            print(f"Saved HTML to {filename}")

        # Keys are inserted in the order the columns should appear.
        report_data = {"URL": url}
        report_data["Title"] = _extract_title(soup)
        for key, label, noun, row_noun, placeholder in _TABLE_FIELDS:
            report_data[key] = _extract_table_field(soup, label, noun, row_noun, placeholder)
        report_data["Affected Systems"] = _extract_affected_systems(soup)
        report_data["Vulnerability Summary"] = _extract_text_section(
            soup, "Bilan de la vulnérabilité", "vulnerability summary", "summary", "No Summary"
        )
        report_data["Solution"] = _extract_text_section(
            soup, "Solution", "solution", "solution", "No Solution"
        )

        print("Extraction completed for this report")
        return report_data

    except Exception as e:
        # Best effort: one failing page must not abort the whole crawl.
        print(f"ERROR scraping {url}: {e}")
        return {"URL": url, "Error": str(e), "Title": "Failed to scrape"}


In [None]:
# Function to scrape report URLs from the main bulletins page
def scrape_report_urls(save_to_file=True):
    """Collect bulletin URLs from the DGSSI security bulletins listing page.

    Args:
        save_to_file: When True, a non-empty result is written to
            ``report_urls.json`` so later runs can reuse it.

    Returns:
        A list of unique absolute URLs in the order they were first seen on
        the page, or an empty list if the request or parsing fails.
    """
    print("\n----- SCRAPING REPORT URLS -----")
    url = "https://www.dgssi.gov.ma/fr/bulletins-securite"  # Updated URL
    print(f"Requesting main page: {url}")

    try:
        response = requests.get(url, timeout=10)
        print(f"Response status code: {response.status_code}")
        # Fail early on an HTTP error instead of harvesting links from an error page.
        response.raise_for_status()
        print(f"Response content preview: {response.text[:500]}...")

        soup = BeautifulSoup(response.content, "html.parser")
        print("Parsing response HTML...")

        # A dict is used as an insertion-ordered set: membership tests are
        # O(1) and the final list keeps the page order (a plain set would
        # shuffle the URLs differently on every run).
        unique_links = {}
        candidate_count = 0

        # Look for cards or containers with links
        print("Looking for bulletin links...")
        for link in soup.find_all("a", href=True):
            href = link["href"]

            # Check if it's a bulletin link
            if "/fr/bulletin" in href or "/bulletins/" in href:
                # Make absolute URL if it's relative
                full_url = urljoin(BASE_URL, href)
                candidate_count += 1
                unique_links.setdefault(full_url, None)
                print(f"Found bulletin link: {full_url}")

        # Also check for "En savoir plus" links which might lead to reports
        print("Looking for 'En savoir plus' links...")
        for link in soup.find_all("a", class_="btn"):
            href = link.get("href")
            if href:
                full_url = urljoin(BASE_URL, href)
                if full_url not in unique_links:
                    candidate_count += 1
                    unique_links[full_url] = None
                    print(f"Found 'En savoir plus' link: {full_url}")

        # Remove duplicates
        report_links = list(unique_links)
        print(f"Removed {candidate_count - len(report_links)} duplicate URLs")
        print(f"Total unique report URLs found: {len(report_links)}")

        # Save URLs to a JSON file to avoid scraping again in the future.
        # An empty result is not cached: load_report_urls() would otherwise
        # keep returning the empty list and never try the website again.
        if save_to_file and report_links:
            with open("report_urls.json", "w", encoding="utf-8") as file:
                json.dump(report_links, file)
            print(f"Saved {len(report_links)} URLs to report_urls.json")

        return report_links

    except Exception as e:
        print(f"ERROR scraping report URLs: {e}")
        return []


In [None]:

# Return the cached report URLs, scraping them first when no cache exists
def load_report_urls():
    """Load report URLs from ``report_urls.json`` or scrape them if it is absent.

    Returns:
        The value stored in ``report_urls.json`` when that file exists in the
        current working directory; otherwise whatever
        ``scrape_report_urls(save_to_file=True)`` returns.
    """
    print("\n----- LOADING REPORT URLS -----")

    cache_missing = not os.path.exists("report_urls.json")
    if cache_missing:
        print("No existing report_urls.json file found")
        print("Scraping URLs from website...")
        # No cache yet: scrape the URLs and let the scraper save them
        return scrape_report_urls(save_to_file=True)

    print("Found existing report_urls.json file")
    with open("report_urls.json", "r") as cache_file:
        cached_urls = json.load(cache_file)
    print(f"Loaded {len(cached_urls)} URLs from file")
    return cached_urls

The corrected code for scraping without errors

In [None]:
import re
import time
import pandas as pd
from functools import wraps

def handle_errors(func):
    """Decorator that reports any exception raised by ``func`` and returns None.

    The wrapped callable forwards all positional and keyword arguments. When
    ``func`` raises an ``Exception`` the error is printed together with the
    function name and ``None`` is returned instead of propagating it.
    """
    @wraps(func)
    def guarded(*call_args, **call_kwargs):
        try:
            outcome = func(*call_args, **call_kwargs)
        except Exception as exc:
            print(f"Error in {func.__name__}: {str(exc)}")
            return None
        else:
            return outcome

    return guarded

def clean_text(text):
    """Normalise a scraped string; non-string values are returned unchanged.

    Control characters and Unicode line/paragraph separators become spaces,
    the dash variants U+2010..U+2015 become a plain hyphen, and every run of
    whitespace collapses to a single space. Accented letters are untouched.
    """
    if not isinstance(text, str):
        return text

    # Applied in order; the whitespace collapse has to come last.
    substitutions = (
        (r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', ' '),  # control characters
        (r'[\u2010-\u2015]', '-'),                   # special hyphens/dashes
        (r'[\u2028\u2029]', ' '),                    # line/paragraph separators
        (r'\s+', ' '),                               # newlines and repeated spaces
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def main():
    """Scrape every bulletin and export the cleaned data to Excel and CSV.

    Steps: load (or scrape) the report URLs, scrape each report with a one
    second pause between requests, build a DataFrame with a readable column
    order, clean every string cell with ``clean_text`` and write the result
    to ``rapports_dgssi_securite_cleaned.xlsx`` and
    ``rapports_dgssi_securite_cleaned.csv``.

    The CSV export lives inside this function because it needs the scraped
    data, which only exists here; as module-level code it failed with
    ``NameError: name 'all_reports' is not defined``.

    Returns:
        None. Any exception is reported on stdout instead of being raised.
    """
    print("\n============================================")
    print("STARTING DGSSI SECURITY BULLETINS SCRAPER")
    print("============================================\n")

    try:
        # Load or scrape the report URLs
        report_links = load_report_urls()
        if not report_links:
            print("ERROR: No report URLs found. Please check the website structure or connection.")
            return

        print(f"\nFinal list contains {len(report_links)} report URLs")

        # List to hold the scraped report data
        all_reports = []

        # Iterate through all the report links and scrape the individual report data
        for i, report_url in enumerate(report_links):
            print(f"\n[PROGRESS] Scraping report {i+1}/{len(report_links)}")
            report_data = scrape_report_details(report_url)
            if report_data:
                all_reports.append(report_data)
                print(f"Added report to dataset: {report_data.get('Title', 'Unknown Title')}")

                # Add a short delay to avoid overwhelming the server
                if i < len(report_links) - 1:
                    print("Waiting 1 second before next request...")
                    time.sleep(1)

        if not all_reports:
            print("ERROR: No reports were successfully scraped.")
            return

        # Convert to DataFrame
        print("\n----- CREATING DATAFRAME -----")
        df = pd.DataFrame(all_reports)
        print(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
        print(f"Columns: {', '.join(df.columns)}")

        # Reorder columns for better readability
        column_order = [
            "Title", "Reference", "Publication Date", "Risk Level",
            "Impact Level", "Affected Systems", "Vulnerability Summary",
            "Solution", "URL"
        ]

        # Only include columns that exist in the DataFrame
        columns_to_use = [col for col in column_order if col in df.columns]
        # Add any additional columns that weren't in the order list
        columns_to_use.extend(col for col in df.columns if col not in columns_to_use)
        df = df[columns_to_use]
        print(f"Reordered columns: {', '.join(columns_to_use)}")

        # Clean all text columns
        print("\n----- CLEANING DATA -----")
        for column in df.columns:
            if df[column].dtype == object:
                df[column] = df[column].apply(lambda x: clean_text(x) if isinstance(x, str) else x)

        # Save the data to Excel
        excel_filename = "rapports_dgssi_securite_cleaned.xlsx"
        print("\n----- SAVING DATA TO EXCEL -----")

        # The context manager closes the workbook even if to_excel() raises.
        with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)

        print("\n============================================")
        print(f"SCRAPING COMPLETE: Data saved to '{excel_filename}'")
        print(f"Total reports scraped: {len(all_reports)}")
        print("============================================")

        # Save the same cleaned data to CSV as well
        csv_filename = "rapports_dgssi_securite_cleaned.csv"
        print("\n----- SAVING DATA TO CSV -----")
        df.to_csv(csv_filename, index=False)

        print("\n============================================")
        print(f"SCRAPING COMPLETE: Data saved to '{csv_filename}'")
        print(f"Total reports scraped: {len(all_reports)}")
        print("============================================")

    except Exception as e:
        print(f"\nCRITICAL ERROR: {str(e)}")
        print("Scraping failed. Please check the error message above.")


if __name__ == "__main__":
    main()


----- SAVING DATA TO CSV -----


NameError: name 'all_reports' is not defined

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive so that the
# exported file can be moved there from the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil
# Define the source and destination paths: the CSV sits in the Colab working
# directory (/content) and is moved into a folder under the Drive mount point.
# NOTE(review): this presumably needs Drive to be mounted and the destination
# folder to exist already - confirm before running.
source_path = '/content/rapports_dgssi_securite_cleaned.csv'
destination_dir = '/content/drive/MyDrive/MY TASKS/Data Driven/'
destination_path = destination_dir + 'rapports_dgssi_securite_cleaned.csv'

# Move the file (it no longer exists at source_path afterwards)
shutil.move(source_path, destination_path)

print(f"File moved to {destination_path}")

File moved to /content/drive/MyDrive/MY TASKS/Data Driven/rapports_dgssi_securite_cleaned.csv


# Fin

In [None]:


def main(max_reports=None):
    """Scrape the bulletins and write them to a CSV file.

    Args:
        max_reports: Optional non-negative cap on how many report URLs are
            scraped, for quick trial runs. ``None`` (the default) scrapes
            every URL, as before.

    Returns:
        None. The data is written to ``rapports_dgssi_securite_cleaned.csv``
        in the current working directory.
    """
    print("\n============================================")
    print("STARTING DGSSI SECURITY BULLETINS SCRAPER")
    print("============================================\n")

    # Load or scrape the report URLs
    report_links = load_report_urls()
    print(f"\nFinal list contains {len(report_links)} report URLs")

    # TESTING TIP 1: call main(max_reports=5) to test with only a few URLs
    if max_reports is not None:
        report_links = report_links[:max_reports]
        print(f"Limiting this run to the first {len(report_links)} report URLs")

    if not report_links:
        print("ERROR: No report URLs found. Please check the website structure or connection.")
        return

    # List to hold the scraped report data
    all_reports = []

    # Iterate through all the report links and scrape the individual report data
    for i, report_url in enumerate(report_links):
        print(f"\n[PROGRESS] Scraping report {i+1}/{len(report_links)}")
        report_data = scrape_report_details(report_url)
        all_reports.append(report_data)
        print(f"Added report to dataset: {report_data.get('Title', 'Unknown Title')}")

        # Add a short delay to avoid overwhelming the server
        if i < len(report_links) - 1:  # Don't sleep after the last request
            print("Waiting 1 second before next request...")
            time.sleep(1)

    # Convert the list of reports into a DataFrame
    print("\n----- CREATING DATAFRAME -----")
    df = pd.DataFrame(all_reports)
    print(f"Created DataFrame with {len(df)} rows and {len(df.columns)} columns")
    print(f"Columns: {', '.join(df.columns)}")

    # Reorder columns for better readability
    column_order = [
        "Title", "Reference", "Publication Date", "Risk Level",
        "Impact Level", "Affected Systems", "Vulnerability Summary",
        "Solution", "URL"
    ]

    # Only include columns that exist in the DataFrame
    columns_to_use = [col for col in column_order if col in df.columns]

    # Add any additional columns that weren't in the order list
    columns_to_use.extend(col for col in df.columns if col not in columns_to_use)

    df = df[columns_to_use]
    print(f"Reordered columns: {', '.join(columns_to_use)}")

    # Save the data to CSV instead of Excel for better handling
    csv_filename = "rapports_dgssi_securite_cleaned.csv"
    print("\n----- SAVING DATA TO CSV -----")
    df.to_csv(csv_filename, index=False)

    print("\n============================================")
    print(f"SCRAPING COMPLETE: Data saved to '{csv_filename}'")
    print(f"Total reports scraped: {len(all_reports)}")
    print("============================================")

if __name__ == "__main__":
    main()


STARTING DGSSI SECURITY BULLETINS SCRAPER


----- LOADING REPORT URLS -----
No existing report_urls.json file found
Scraping URLs from website...

----- SCRAPING REPORT URLS -----
Requesting main page: https://www.dgssi.gov.ma/fr/bulletins-securite
Response status code: 200
Response content preview: <!DOCTYPE html><html lang="fr" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/ dc: http://purl.org/dc/terms/ foaf: http://xmlns.com/foaf/0.1/ og: http://ogp.me/ns# rdfs: http://www.w3.org/2000/01/rdf-schema# schema: http://schema.org/ sioc: http://rdfs.org/sioc/ns# sioct: http://rdfs.org/sioc/types# skos: http://www.w3.org/2004/02/skos/core# xsd: http://www.w3.org/2001/XMLSchema# "><head><meta charset="utf-8" /><link rel="canonical" href="https://www.dgssi.gov.ma/fr/bulletins...
Parsing response HTML...
Looking for bulletin links...
Found bulletin link: https://www.dgssi.gov.ma/index.php/fr/bulletins-securite
Found bulletin link: https://www.dgssi.gov.ma/fr/bulletins-se

  ref_row = soup.find("td", text="Numéro de Référence") or soup.find("th", text="Numéro de Référence")
  date_row = soup.find("td", text="Date de publication") or soup.find("th", text="Date de publication")
  risk_row = soup.find("td", text="Niveau de Risque") or soup.find("th", text="Niveau de Risque")
  impact_row = soup.find("td", text="Niveau d'Impact") or soup.find("th", text="Niveau d'Impact")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Impact level: Modéré
Extracting affected systems...
Found systems header
Found 3 affected systems in list
Extracting vulnerability summary...
Found summary header
Found summary text (first 50 chars): Mozilla Foundation annonce la disponibilité d'une ...
Extracting solution...
Found solution header
Found solution text (first 50 chars): Veuillez se référer au bulletin de sécurité de Moz...
Extraction completed for this report
Added report to dataset: Vulnérabilités affectant le client de messagerie mozilla thunderbird
Waiting 1 second before next request...

[PROGRESS] Scraping report 1369/1519

----- SCRAPING REPORT: https://www.dgssi.gov.ma/fr/bulletins/mise-jour-de-securite-pour-google-chrome-os-0 -----
Sending request to report page...
Response status code: 200
Saved HTML to debug_page_mise-jour-de-securite-pour-google-chrome-os-0.html
Extracting title...
Found title in h1: Mise à jour de sécu