In [None]:
# Install dependencies if needed
import sys
import subprocess

def install_package(package):
    """Install ``package`` with pip, run through the current interpreter.

    The command is launched as ``<sys.executable> -m pip install <package>``,
    so the package goes to the interpreter that is executing this code.

    Args:
        package: Requirement string handed to ``pip install`` as-is.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install required packages
import importlib

# Importable module name -> pip distribution name. The two differ for
# BeautifulSoup (module ``bs4``, distribution ``beautifulsoup4``).
REQUIRED_PACKAGES = {
    "selenium": "selenium",
    "loguru": "loguru",
    "bs4": "beautifulsoup4",
    "lxml": "lxml",
}

# Probe every dependency individually (lxml included) so that only the ones
# that are actually missing get installed.
missing_packages = []
for module_name, pip_name in REQUIRED_PACKAGES.items():
    try:
        importlib.import_module(module_name)
    except ImportError:
        missing_packages.append(pip_name)

if missing_packages:
    print("Installing required packages...")
    for pip_name in missing_packages:
        install_package(pip_name)
    # Packages installed while the interpreter is running may be hidden by the
    # import system's finder caches; clear them so the following cells can
    # import the new modules without restarting the kernel.
    importlib.invalidate_caches()
    print("Packages installed successfully!")


Installing required packages...


# Test DNBScraper

Ce notebook exécute un test rapide du scraper:

1. Importe `DNBScraper` et `MetadataParser`
2. Extrait les liens PDF sur la première page uniquement (`max_pages=1`)
3. Affiche un résumé (total, années, matières)
4. Affiche les 3 premiers liens avec métadonnées


In [None]:
# Imports
import sys
import os

# Treat the current working directory as the project root and make sure it is
# on sys.path (prepended once) before the `src` imports below run.
# NOTE(review): assumes the kernel's working directory is the project root.
project_root = os.getcwd()
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.scraper import DNBScraper
from src.parser import MetadataParser
from loguru import logger

# Replace loguru's current handlers with a sink that forwards every message to
# print() -- presumably so log lines appear in the notebook cell output.
def _echo_log_message(message):
    """Print one log message as-is, without appending an extra newline."""
    print(message, end="")


logger.remove()
logger.add(_echo_log_message)



ModuleNotFoundError: No module named 'selenium'

In [None]:
# Create scraper and extract one page
# NOTE(review): DNBScraper is defined in src.scraper, outside this notebook;
# whatever resources its constructor acquires are not visible from here.
scraper = DNBScraper()
# max_pages=1 restricts extraction to the first page only, as announced in the
# notebook introduction.
pdf_links = scraper.extract_pdf_links(max_pages=1)
# pdf_links must support len(); later cells also slice it and read each entry
# through ['url'] and .get('data_atl_name').
print(f"Total links found on first page: {len(pdf_links)}")


In [None]:
# Summary: fetch the scraper's summary mapping and print its 'total', 'years'
# and 'subjects' entries under a banner.
summary = scraper.get_summary_dict()
summary_lines = [
    "\n==== SUMMARY ====",
    f"Total PDFs: {summary['total']}",
    f"Years: {summary['years']}",
    f"Subjects: {summary['subjects']}",
]
print("\n".join(summary_lines))


In [None]:
# Show first 3 links with parsed metadata.
# Each link is passed to MetadataParser.parse_url() together with its optional
# 'data_atl_name' value; the returned mapping is read with .get(), so fields
# that are absent print as None.
parser = MetadataParser()
for i, link in enumerate(pdf_links[:3], 1):
    meta = parser.parse_url(link['url'], link.get('data_atl_name'))
    link_report = [
        f"\nLink #{i}",
        f"URL: {link['url']}",
        f"data-atl-name: {link.get('data_atl_name', '')}",
        f"filename: {meta.get('filename')}",
        f"year: {meta.get('year')}",
        f"subject: {meta.get('subject')}",
        f"session: {meta.get('session')}",
        f"series: {meta.get('series')}",
        f"type: {meta.get('document_type')}",
    ]
    print("\n".join(link_report))
