In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

In [None]:
def goTo(driver, query):
    """Click the first element matching a CSS selector, then wait for the form.

    Args:
        driver: Selenium WebDriver whose current page contains the target element.
        query: CSS selector handed to ``document.querySelector``.

    The selector is passed to the script as ``arguments[0]`` instead of being
    spliced into the JavaScript source, so selectors containing quotes or
    backslashes cannot break (or inject into) the executed script.

    After the click, waits up to 10 seconds for an element with id
    ``mainForm`` to be present in the DOM.
    """
    driver.execute_script("document.querySelector(arguments[0]).click()", query)
    # NOTE(review): if the page being left also contains #mainForm, this
    # presence check could be satisfied before the navigation completes —
    # confirm against the site's behaviour.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "mainForm")))

def dump_html(driver, index):
    """Save the driver's current page source to ``epicea_results/<index>.html``.

    Args:
        driver: object exposing a ``page_source`` string attribute
            (a Selenium WebDriver in this notebook).
        index: value interpolated into the output file name.

    The ``epicea_results`` directory is created on first use so the write
    does not fail with ``FileNotFoundError`` on a fresh checkout.
    """
    import os  # local import keeps this helper self-contained

    html_content = driver.page_source

    os.makedirs('epicea_results', exist_ok=True)

    # Write to file with UTF-8 encoding (handles French characters)
    with open(f'epicea_results/{index}.html', 'w', encoding='utf-8') as f:
        f.write(html_content)

def scrape(max_dossiers=100, headless=True):
    """Walk the EPICEA public search results in Chrome and save each page.

    Args:
        max_dossiers: number of times the "Dossier suivant" link is followed
            after the first dossier page. Defaults to 100, the previously
            hard-coded value.
        headless: when True (default) Chrome is started with ``--headless``;
            pass False for a visible browser.

    Steps, as driven below:
      1. Open the public request page and wait for ``#mainForm``.
      2. Set ``calculate`` / ``searchType`` on ``mainForm`` and submit it.
      3. Set ``goTo`` to ``'1'`` and submit again.
      4. Click the first ``public_display`` link and save that page as
         ``00.html``.
      5. Click "Dossier suivant" ``max_dossiers`` times, saving each page as
         ``0.html``, ``1.html``, ...

    The elapsed time printed at the end covers only step 5. The browser is
    always closed, even if a step raises.
    """
    # Setup Chrome options
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Launch browser
    driver = webdriver.Chrome(options=chrome_options)

    def wait_for_main_form():
        # Block (up to 10 s) until an element with id "mainForm" is in the DOM.
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "mainForm")))

    try:
        driver.get('https://epicea.inrs.fr/servlet/public_request')
        wait_for_main_form()

        # Fill in and submit the search form through its JS handle.
        driver.execute_script("window.mainForm.calculate.value = 'true';")
        driver.execute_script("window.mainForm.searchType.value = 'simple';")
        driver.execute_script("window.mainForm.submit();")
        wait_for_main_form()

        # Presumably jumps to the first page of results — TODO confirm.
        driver.execute_script("window.mainForm.goTo.value='1'")
        driver.execute_script("window.mainForm.submit();")
        wait_for_main_form()

        # Open the first dossier; it is saved under "00" so it does not
        # collide with the loop's "0".
        goTo(driver, 'a.lien[href*="public_display"]')
        dump_html(driver, "00")

        start = time.time()
        for i in range(max_dossiers):
            goTo(driver, 'a.lien[title="Dossier suivant"]')
            dump_html(driver, str(i))
        print(f"Elapsed time : {time.time() - start}s")

    finally:
        driver.quit()

# Run the scraper when this cell executes.
scrape()

In [None]:
import os
import glob
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
def process_html_directory(directory_path="./epicea_results", extension="*.html"):
    """Parse every matching file in a directory into a BeautifulSoup tree.

    Args:
        directory_path: directory to scan (not recursive).
        extension: glob pattern appended to ``directory_path``.

    Returns:
        dict mapping each file's base name to its parsed ``BeautifulSoup``
        object, inserted in sorted path order.

    ``glob.glob`` returns paths in arbitrary order, so the list is sorted to
    make the result (and anything built from it) reproducible across runs.
    The sort is lexicographic: "10.html" comes before "2.html".
    """
    soups = {}
    file_paths = sorted(glob.glob(os.path.join(directory_path, extension)))
    for file_path in tqdm(file_paths, desc="Processing HTML files"):
        filename = os.path.basename(file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        soups[filename] = BeautifulSoup(content, 'html.parser')
    return soups

# Parse the saved pages using the function's defaults (./epicea_results, *.html).
soups = process_html_directory()

In [None]:
# Turn every parsed page into one {label: value} record.
data = []
currentDossier = 0  # not read or updated anywhere in this cell

for page in tqdm(soups.values()):
    # The third "table.tablein" on the page is the one read here; its first
    # row is skipped (presumably a header row — TODO confirm).
    detail_rows = page.select("table.tablein")[2].select("tr")[1:]
    cells_per_row = (row.select("td") for row in detail_rows)
    # First cell is the label (non-breaking spaces normalised, " :" removed),
    # second cell is the value. A repeated label keeps its last value.
    data.append({
        cells[0].getText().strip().replace('\xa0', ' ').replace(' :', ''): cells[1].getText().strip()
        for cells in cells_per_row
    })


In [None]:
import pandas as pd

In [None]:
# One row per record in `data`; columns are the union of the records' keys.
x = pd.DataFrame(data)
# Bare last expression so the notebook renders the frame.
x