In [2]:
# Install dependencies if needed
import sys
import subprocess

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this kernel.

    The command is ``<sys.executable> -m pip install <package>``.
    ``subprocess.check_call`` raises ``subprocess.CalledProcessError`` when the
    command exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install required packages
import importlib.util

# Importable module name -> pip distribution name. The two differ for
# BeautifulSoup (module "bs4", distribution "beautifulsoup4").
REQUIRED_PACKAGES = {
    "selenium": "selenium",
    "loguru": "loguru",
    "bs4": "beautifulsoup4",
    "lxml": "lxml",
}

# Probe every dependency individually (lxml included) so that a single missing
# package is detected and only the missing ones are installed.
missing_packages = [
    pip_name
    for module_name, pip_name in REQUIRED_PACKAGES.items()
    if importlib.util.find_spec(module_name) is None
]

if missing_packages:
    print("Installing required packages...")
    for pip_name in missing_packages:
        install_package(pip_name)
    # Let the import system notice modules installed while the kernel is running
    importlib.invalidate_caches()
    print("Packages installed successfully!")


Installing required packages...
Packages installed successfully!


# Test DNBScraper

Ce notebook exécute un test rapide du scraper:

1. Importe `DNBScraper`
2. Extrait les liens PDF sur la première page uniquement (`max_pages=1`)
3. Affiche un résumé (total, années, matières)
4. Affiche les 3 premiers liens avec métadonnées


In [3]:
# Imports
import sys
import os

# Put the current working directory at the front of the module search path,
# unless it is already listed there.
# NOTE(review): this assumes the kernel's working directory is the project root — confirm.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:
    sys.path[:0] = [project_root]

from src.scraper import DNBScraper
from src.parser import MetadataParser
from loguru import logger

# Route loguru output through print(): remove the existing handlers, then add
# a sink that writes each message without appending an extra newline.
def _print_log_message(message):
    """Write one log message to stdout, without appending a newline."""
    print(message, end="")


logger.remove()
# Kept as the last expression: the notebook echoes the value returned by add()
logger.add(_print_log_message)



1

In [4]:
# Build the scraper and collect the PDF links, limited to one page (max_pages=1)
scraper = DNBScraper()
pdf_links = scraper.extract_pdf_links(max_pages=1)

first_page_link_count = len(pdf_links)
print(f"Total links found on first page: {first_page_link_count}")


2025-10-16 10:35:38.099 | INFO     | src.scraper:__init__:103 - DNBScraper initialized with URL: https://eduscol.education.fr/711/preparer-le-diplome-national-du-brevet-dnb-avec-les-sujets-des-annales (headless: True)
2025-10-16 10:35:38.100 | INFO     | src.scraper:_init_driver:112 - Initializing Selenium WebDriver
2025-10-16 10:35:38.100 | DEBUG    | src.scraper:_init_driver:118 - Running in headless mode
2025-10-16 10:35:39.999 | SUCCESS  | src.scraper:_init_driver:138 - WebDriver initialized successfully
2025-10-16 10:35:39.999 | INFO     | src.scraper:extract_pdf_links:488 - Navigating to: https://eduscol.education.fr/711/preparer-le-diplome-national-du-brevet-dnb-avec-les-sujets-des-annales
2025-10-16 10:35:40.445 | SUCCESS  | src.scraper:extract_pdf_links:495 - Page loaded successfully
2025-10-16 10:35:40.445 | DEBUG    | src.scraper:extract_pdf_links:499 - Checking for overlays/modals to close
2025-10-16 10:36:40.634 | INFO     | src.scraper:extract_pdf_links:524 - Extracting l

In [5]:
# Print the scraper's summary dictionary: total, years and subjects
summary = scraper.get_summary_dict()

print("\n==== SUMMARY ====")
# (printed label, summary key) pairs, in display order
for label, key in (("Total PDFs", "total"), ("Years", "years"), ("Subjects", "subjects")):
    print(f"{label}: {summary[key]}")


2025-10-16 10:36:56.661 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/63474/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}
2025-10-16 10:36:56.661 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/63471/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}
2025-10-16 10:36:56.662 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/62978/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}
2025-10-16 10:36:56.662 | DEBUG    | src.parse

In [6]:
# Display the first three links together with the metadata parsed from each URL
parser = MetadataParser()

# (printed label, metadata key) pairs, in display order
META_FIELDS = (
    ("filename", "filename"),
    ("year", "year"),
    ("subject", "subject"),
    ("session", "session"),
    ("series", "series"),
    ("type", "document_type"),
)

for i, link in enumerate(pdf_links[:3], start=1):
    # Parse first, so anything the parser logs appears before this link's report
    meta = parser.parse_url(link['url'], link.get('data_atl_name'))
    print(f"\nLink #{i}")
    print(f"URL: {link['url']}")
    print(f"data-atl-name: {link.get('data_atl_name', '')}")
    for label, key in META_FIELDS:
        print(f"{label}: {meta.get(key)}")


2025-10-16 10:37:31.514 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/63474/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}

Link #1
URL: https://eduscol.education.fr/document/63474/download
data-atl-name: 
filename: download
year: None
subject: None
session: None
series: generale
type: sujet
2025-10-16 10:37:31.515 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/63471/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}

Link #2
URL: https://eduscol.education.fr/document/63471/download
data-atl-name: 
filename: download
year: None
subject: None
session: None
series: generale
type: sujet
2025-10-16 10:37:31.515 