In [22]:
# Install dependencies if needed
import sys
import subprocess

def install_package(package):
    """Install ``package`` with pip, run through the current Python interpreter.

    Args:
        package: Requirement string handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install required packages, but only the ones that are actually missing.
# Each module is probed individually so that a missing lxml is detected too,
# and so that one missing package does not trigger reinstalling all of them.
import importlib
import importlib.util

# Importable module name -> pip distribution name (they differ for bs4).
REQUIRED_PACKAGES = {
    "selenium": "selenium",
    "loguru": "loguru",
    "bs4": "beautifulsoup4",
    "lxml": "lxml",
}

missing_packages = [
    pip_name
    for module_name, pip_name in REQUIRED_PACKAGES.items()
    if importlib.util.find_spec(module_name) is None
]

if missing_packages:
    print("Installing required packages...")
    for pip_name in missing_packages:
        install_package(pip_name)
    # Let the import system notice the freshly installed packages.
    importlib.invalidate_caches()
    print("Packages installed successfully!")


# Test DNBScraper

Ce notebook exécute un test rapide du scraper:

1. Importe `DNBScraper`
2. Extrait les liens PDF sur la première page uniquement (`max_pages=1`)
3. Affiche un résumé (total, années, matières)
4. Affiche les 3 premiers liens avec métadonnées


In [23]:
# Imports
import sys
import os

# Put the current working directory (used as project root) at the front of
# sys.path, unless it is already listed there.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.scraper import DNBScraper
from loguru import logger

# Send loguru output to the notebook: remove the handlers registered so far,
# then print every formatted message without adding an extra line break
# (presumably the formatted message already ends with one - confirm).
logger.remove()
# Keep the handler id in a name so the cell does not echo it as its output.
notebook_sink_id = logger.add(lambda msg: print(msg, end=""))



3

In [24]:
# Build the scraper, collect PDF links with the crawl limited to one page
# (max_pages=1), and report how many links came back.
scraper = DNBScraper()
pdf_links = scraper.extract_pdf_links(max_pages=1)
first_page_link_count = len(pdf_links)
print(f"Total links found on first page: {first_page_link_count}")


2025-10-16 12:10:30.302 | INFO     | src.scraper:__init__:103 - DNBScraper initialized with URL: https://eduscol.education.fr/711/preparer-le-diplome-national-du-brevet-dnb-avec-les-sujets-des-annales (headless: True)
2025-10-16 12:10:30.303 | INFO     | src.scraper:_init_driver:112 - Initializing Selenium WebDriver
2025-10-16 12:10:30.303 | DEBUG    | src.scraper:_init_driver:118 - Running in headless mode
2025-10-16 12:10:31.797 | SUCCESS  | src.scraper:_init_driver:138 - WebDriver initialized successfully
2025-10-16 12:10:31.798 | INFO     | src.scraper:extract_pdf_links:488 - Navigating to: https://eduscol.education.fr/711/preparer-le-diplome-national-du-brevet-dnb-avec-les-sujets-des-annales
2025-10-16 12:10:32.245 | SUCCESS  | src.scraper:extract_pdf_links:495 - Page loaded successfully
2025-10-16 12:10:32.245 | DEBUG    | src.scraper:extract_pdf_links:499 - Checking for overlays/modals to close
2025-10-16 12:11:32.457 | INFO     | src.scraper:extract_pdf_links:524 - Extracting l

In [25]:
# Summary: print the total, years and subjects reported by the scraper.
summary = scraper.get_summary_dict()
print("\n==== SUMMARY ====")
for label, key in (("Total PDFs", "total"), ("Years", "years"), ("Subjects", "subjects")):
    print(f"{label}: {summary[key]}")


2025-10-16 12:11:34.879 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/63474/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}
2025-10-16 12:11:34.880 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/63471/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}
2025-10-16 12:11:34.881 | DEBUG    | src.parser:parse_url:86 - Parsed metadata for download: {'url': 'https://eduscol.education.fr/document/62978/download', 'filename': 'download', 'file_id': None, 'year': None, 'subject': None, 'session': None, 'series': 'generale', 'is_correction': False, 'document_type': 'sujet'}
2025-10-16 12:11:34.881 | DEBUG    | src.parse

## SECTION 2: Structured Entries

Affiche le nombre d'entrées structurées et un aperçu des 3 premières (`ExamEntry`).


In [26]:
# Show how many structured entries the scraper holds, then preview the first three.
# Falls back to an empty list when the scraper has no `structured_entries` attribute.
entries = getattr(scraper, 'structured_entries', [])
print(f"Structured entries count: {len(entries)}")

for entry in entries[:3]:
    print(f"\nEntry ID: {entry.id}")
    # These four attributes expose their printable label through `.value`.
    for field_name in ("session", "discipline", "serie", "localisation"):
        print(f"  {field_name}: {getattr(entry, field_name).value}")
    print(f"  files: {len(entry.files)}")
    for exam_file in entry.files:
        print(f"    - filename: {exam_file.filename}")
        for attr_name in ("file_id", "filename_for_save", "download_url"):
            print(f"      {attr_name}: {getattr(exam_file, attr_name)}")


Structured entries count: 10

Entry ID: 1
  session: NORMAL
  discipline: FR_DICTEE
  serie: GENERALE
  localisation: AM_NORTH
  files: 3
    - filename: download
      file_id: 63474
      filename_for_save: 1_NORMAL_FR_DICTEE_GENERALE_AM_NORTH_63474
      download_url: https://eduscol.education.fr/document/63474/download
    - filename: download
      file_id: 63471
      filename_for_save: 1_NORMAL_FR_DICTEE_GENERALE_AM_NORTH_63471
      download_url: https://eduscol.education.fr/document/63471/download
    - filename: download
      file_id: 62978
      filename_for_save: 1_NORMAL_FR_DICTEE_GENERALE_AM_NORTH_62978
      download_url: https://eduscol.education.fr/document/62978/download

Entry ID: 2
  session: NORMAL
  discipline: FR_DICTEE
  serie: GENERALE
  localisation: AM_SOUTH
  files: 2
    - filename: download
      file_id: 63501
      filename_for_save: 2_NORMAL_FR_DICTEE_GENERALE_AM_SOUTH_63501
      download_url: https://eduscol.education.fr/document/63501/download
    -

## SECTION 3: Validation (simulation de run_validation)

Vérifie les champs essentiels et synthétise un mini rapport avec pandas.


In [27]:
import pandas as pd

# Classify each structured entry as OK or MISSING and keep one report row per entry.
rows = []

for entry in entries:
    # The four classification fields must all be truthy.
    has_classification = bool(
        entry.session and entry.discipline and entry.serie and entry.localisation
    )
    # At least one file must be attached to the entry.
    has_files = len(entry.files) >= 1
    # Every attached file needs an id, both filenames and a download URL.
    # The list is built in full so that every file is inspected.
    files_are_complete = all([
        bool(
            exam_file.file_id
            and exam_file.filename
            and exam_file.filename_for_save
            and exam_file.download_url
        )
        for exam_file in entry.files
    ])
    is_complete = has_classification and has_files and files_are_complete
    rows.append({
        'ID': entry.id,
        'Status': 'OK' if is_complete else 'MISSING',
        'Session': entry.session.value if entry.session else '',
        'Discipline': entry.discipline.value if entry.discipline else '',
    })

# Derive both counters from the collected rows.
ok = sum(1 for row in rows if row['Status'] == 'OK')
missing = len(rows) - ok

print(f"OK: {ok} | MISSING: {missing}")
df = pd.DataFrame(rows)
df.head(10)


OK: 10 | MISSING: 0


Unnamed: 0,ID,Status,Session,Discipline
0,1,OK,NORMAL,FR_DICTEE
1,2,OK,NORMAL,FR_DICTEE
2,3,OK,NORMAL,FR_DICTEE
3,4,OK,NORMAL,FR_DICTEE
4,5,OK,NORMAL,FR_DICTEE
5,6,OK,NORMAL,FR_DICTEE
6,7,OK,NORMAL,FR_DICTEE
7,8,OK,NORMAL,FR_DICTEE
8,9,OK,NORMAL,FR_DICTEE
9,10,OK,NORMAL,FR_DICTEE


## SECTION 4: Test de téléchargement

Prépare les URLs et la structure de métadonnées comme dans `main.py` et teste le téléchargement d'un fichier.
**Note**: Cette section utilise les nouvelles `structured_entries` au lieu de l'ancien `MetadataParser`.


In [28]:
# Flatten structured_entries into one metadata record per file.
# Nothing is downloaded in this cell.
metadata_list = []

for entry in entries:
    for exam_file in entry.files:
        download_url = exam_file.download_url
        session_label = entry.session.value
        # 'year' is the text before the first underscore of the session label,
        # or None when the label contains no underscore.
        prefix, separator, _remainder = session_label.partition('_')
        metadata_list.append({
            'url': download_url,
            'filename': exam_file.filename_for_save + '.pdf',
            'file_id': exam_file.file_id,
            'year': prefix if separator else None,
            'subject': entry.discipline.value,
            'session': session_label,
            'series': entry.serie.value,
            'is_correction': False,
            'document_type': 'sujet',
        })

# The URL list mirrors the records, in the same order.
urls = [record['url'] for record in metadata_list]
metadata = {'all': metadata_list}

example_filename = metadata_list[0]['filename'] if metadata_list else '<none>'
first_prepared_url = urls[0] if urls else '<none>'

print(f"Prepared {len(urls)} URLs for download")
print(f"Exemple filename: {example_filename}")
print(f"First URL: {first_prepared_url}")


Prepared 43 URLs for download
Exemple filename: 1_NORMAL_FR_DICTEE_GENERALE_AM_NORTH_63474.pdf
First URL: https://eduscol.education.fr/document/63474/download


In [29]:
# Download the first prepared file with PDFDownloader and report the outcome.
if not urls:
    print("Aucune URL à télécharger.")
else:
    from src.downloader import PDFDownloader  # downloader under test

    output_dir = "./test_downloads"  # scratch folder used only by this test
    downloader = PDFDownloader(output_dir=output_dir)
    try:
        target_url = urls[0]
        print(f"Téléchargement de : {target_url}")
        results = downloader.batch_download(
            urls=[target_url],
            metadata={'all': [metadata_list[0]]},
            max_workers=1,
            skip_existing=False,
            organize=True,
        )
        succeeded = results['successful']
        if not succeeded:
            print("Le téléchargement a échoué.")
            failures = results['failed']
            if failures:
                # A failure entry may be a dict or a plain value; handle both.
                # NOTE(review): in the recorded run the entry was the URL itself,
                # so the line below printed the URL rather than an error message -
                # confirm what batch_download puts in results['failed'].
                first_failure = failures[0]
                if isinstance(first_failure, dict):
                    failure_detail = first_failure.get('error', 'Aucune information d\'erreur.')
                else:
                    failure_detail = first_failure
                print("Erreur :", failure_detail)
        else:
            print("Téléchargement terminé avec succès.")
            for info in succeeded:
                # Show the saved path when the entry is a dict, else the entry itself.
                if isinstance(info, dict):
                    saved_location = info.get('output_path', '<chemin inconnu>')
                else:
                    saved_location = str(info)
                print("Fichier sauvegardé :", saved_location)
    finally:
        # Always release the downloader, whether the download worked or not.
        downloader.close()


2025-10-16 12:11:34.958 | INFO     | src.downloader:__init__:87 - PDFDownloader initialized with output_dir: test_downloads
Téléchargement de : https://eduscol.education.fr/document/63474/download
2025-10-16 12:11:34.958 | INFO     | src.downloader:batch_download:307 - Starting batch download of 1 files with 1 workers
2025-10-16 12:11:34.959 | INFO     | src.downloader:download_pdf:176 - Downloading: https://eduscol.education.fr/document/63474/download


Downloading PDFs: 100%|██████████| 1/1 [00:00<00:00, 660.21file/s]

2025-10-16 12:11:34.962 | ERROR    | src.downloader:batch_download:349 - Failed to download https://eduscol.education.fr/document/63474/download: 'MetadataParser' object has no attribute 'generate_organized_filename'
2025-10-16 12:11:34.963 | INFO     | src.downloader:batch_download:357 - Batch download complete: 0 successful, 1 failed out of 1 total
Le téléchargement a échoué.
Erreur : https://eduscol.education.fr/document/63474/download
2025-10-16 12:11:34.963 | DEBUG    | src.downloader:close:480 - HTTP session closed





In [30]:
# NOTE: Cette cellule utilisait l'ancien MetadataParser
# Maintenant, les métadonnées sont disponibles directement via structured_entries
# Voir les cellules précédentes pour les exemples avec structured_entries
