In [1]:
pip install pandas requests beautifulsoup4 tqdm google-cloud-storage google-cloud-bigquery

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def fetch_formation_details(formation_id, timeout=10):
    """Download and parse the record page of a single formation.

    Args:
        formation_id: Value substituted into the ``f`` query parameter of the
            ``ouvrir-fiche`` URL.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block the calling thread forever.

    Returns:
        dict: Empty when the HTTP status is not 200. Otherwise a dict holding
        ``'Université'`` (when the styled header cell is found), one entry per
        two-cell row of the first ``<table>`` (first cell text -> second cell
        text), and ``'ID'`` set to ``formation_id``.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection errors or when ``timeout`` expires.
    """
    url = f"https://www.campusfaso.bf/formations/ouvrir-fiche?f={formation_id}"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')
    details = {}

    # The university name is located through the inline style of its <th>.
    university_header = soup.find('th', style=lambda value: value and 'font-size: 18px' in value)
    if university_header:
        details['Université'] = university_header.text.strip()

    # Only the first table of the page is read; rows that do not have exactly
    # two <td> cells (e.g. header rows) are skipped.
    table = soup.find('table')
    if table:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2:
                details[cells[0].text.strip()] = cells[1].text.strip()

    # NOTE(review): 'ID' is always set, so any 200 response yields a non-empty
    # (truthy) dict even when no field was parsed. Callers that use truthiness
    # as "this ID exists" should confirm how the site answers unknown IDs.
    details['ID'] = formation_id
    return details

def process_formation(formation_id, delay=0.5):
    """Fetch one formation, then pause to throttle the calling worker.

    Args:
        formation_id: Identifier forwarded to ``fetch_formation_details``.
        delay: Seconds to sleep after the request (default 0.5).

    Returns:
        dict: Whatever ``fetch_formation_details`` returns for this ID.

    Raises:
        Exception: Anything raised by ``fetch_formation_details`` propagates
            to the caller once the pause has elapsed.
    """
    try:
        return fetch_formation_details(formation_id)
    finally:
        # Sleep in ``finally`` so the pause between requests also applies when
        # the fetch fails; otherwise failing requests would be retried by the
        # pool at full speed.
        time.sleep(delay)

def find_last_valid_id(start_id, step=100, max_attempts=10, probe=None):
    """Search upward from ``start_id`` for the last ID that ``probe`` accepts.

    The search strides forward by ``step`` while probes succeed. On a miss it
    remembers the failing ID as an upper bound, halves the stride and continues
    from the last known-valid ID. Every ID is probed at most once.

    The search assumes valid IDs form one contiguous run starting at
    ``start_id``; a gap in the sequence is indistinguishable from the end.

    Args:
        start_id: First ID to probe.
        step: Initial stride; values below 1 are treated as 1.
        max_attempts: Maximum number of probes to perform.
        probe: Callable taking an ID and returning a truthy value when the ID
            is valid. Defaults to ``fetch_formation_details``.

    Returns:
        int: The highest ID that was probed successfully. ``start_id`` is
        returned when it is itself rejected or when ``max_attempts`` is not
        positive, so callers cannot tell those cases from "only ``start_id``
        is valid".

    Warns:
        RuntimeWarning: When the probe budget runs out before the boundary is
            pinned down, i.e. the returned ID is only a lower bound.
    """
    import warnings

    if probe is None:
        probe = fetch_formation_details

    last_valid_id = start_id
    if max_attempts <= 0:
        return last_valid_id

    attempts = 1
    if not probe(start_id):
        return last_valid_id

    step = max(1, step)
    first_invalid_id = None  # smallest ID known to be rejected, once one is seen

    while attempts < max_attempts:
        candidate = last_valid_id + step
        if first_invalid_id is not None and candidate >= first_invalid_id:
            # Probing at or beyond a known-invalid ID teaches nothing: shrink
            # the stride without spending a request.
            if step == 1:
                break
            step = max(1, step // 2)
            continue

        attempts += 1
        if probe(candidate):
            last_valid_id = candidate
        else:
            first_invalid_id = candidate
            if step == 1:
                break
            step = max(1, step // 2)

    # Converged only when the ID right after the result is known to be invalid.
    if first_invalid_id != last_valid_id + 1:
        warnings.warn(
            f"find_last_valid_id stopped after {attempts} probes without "
            f"locating the boundary; {last_valid_id} is only a lower bound. "
            "Increase max_attempts or step.",
            RuntimeWarning,
            stacklevel=2,
        )
    return last_valid_id

def process_formations_parallel(start_id, end_id, max_workers=20):
    """Fetch every ID in ``[start_id, end_id]`` concurrently on a thread pool.

    Args:
        start_id: First ID to fetch (inclusive).
        end_id: Last ID to fetch (inclusive).
        max_workers: Size of the thread pool.

    Returns:
        tuple: ``(df, valid_ids)`` where ``df`` is a DataFrame with one row per
        ID whose fetch returned a non-empty dict, ordered by ID, and
        ``valid_ids`` is the number of such rows. IDs whose fetch raised are
        reported on stdout and left out.
    """
    details_by_id = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which ID each future belongs to, so a failure can be
        # attributed to the right ID instead of relying on a result that does
        # not exist when the future raised.
        future_to_id = {
            executor.submit(process_formation, formation_id): formation_id
            for formation_id in range(start_id, end_id + 1)
        }

        for future in tqdm(as_completed(future_to_id), total=len(future_to_id), desc="Progression"):
            formation_id = future_to_id[future]
            try:
                details = future.result()
            except Exception as exc:
                print(f"La formation ID {formation_id} a généré une exception: {exc}")
                continue
            if details:
                details_by_id[formation_id] = details

    # Futures complete in arbitrary order; emit rows sorted by ID so repeated
    # runs produce the same frame.
    formations = [details_by_id[formation_id] for formation_id in sorted(details_by_id)]
    df = pd.DataFrame(formations)
    return df, len(formations)

# Automatically detect the last valid ID.
# NOTE(review): with the default step=100 and max_attempts=10 the search cannot
# probe past start_id + 900, so a result of exactly start_id + 900 may mean the
# attempt budget ran out rather than the real end was found. Confirm, and pass
# a larger max_attempts if needed.
start_id = 1
print("Recherche du dernier ID valide...")
end_id = find_last_valid_id(start_id)
print(f"Dernier ID valide trouvé : {end_id}")

# Extract the data.
result, valid_count = process_formations_parallel(start_id, end_id)

total_checked = end_id - start_id + 1
print(f"Nombre total d'IDs vérifiés : {total_checked}")
print(f"Nombre d'IDs valides trouvés : {valid_count}")
print(f"Pourcentage d'IDs valides : {(valid_count / total_checked) * 100:.2f}%")

# Reorder the columns so that 'ID' and 'Université' come first. reindex() is
# used instead of plain selection so that a column no page provided is created
# (filled with NaN) rather than raising KeyError.
leading_columns = ['ID', 'Université']
columns_order = leading_columns + [col for col in result.columns if col not in leading_columns]
result = result.reindex(columns=columns_order)

# Keep only the rows that have a university name. The filtered frame must be
# assigned back: boolean indexing returns a new object, so an in-place
# reset_index on that temporary would be discarded.
result = result[result['Université'].notna()].reset_index(drop=True)


Recherche du dernier ID valide...
Dernier ID valide trouvé : 901


Progression: 100%|██████████| 901/901 [00:29<00:00, 30.95it/s]


Nombre total d'IDs vérifiés : 901
Nombre d'IDs valides trouvés : 901
Pourcentage d'IDs valides : 100.00%


In [3]:
# Shell escape: list the contents of the notebook's current working directory.
!ls

collect_bf_schools_data.ipynb  src  tutorials
