In [1]:
import json
from pathlib import Path

# Chargement
with open("output/unified_profiles.json", "r", encoding="utf-8") as f:
    unified_pairs = json.load(f)  # r√©sultat du matching par paires

with open("output/unified_triplets.json", "r", encoding="utf-8") as f:
    unified_triplets = json.load(f)  # r√©sultat du matching triplet

In [1]:
# evaluate_matching_strategies.py
import json
from pathlib import Path

MOROCCAN_CITIES = {"casablanca", "rabat", "mohammedia", "marrakech", "fes", "agadir", "tanger", "meknes", "oujda", "kenitra", "safi", "tetouan"}

def is_moroccan(profile):
    loc = str(profile.get("location", "")).lower()
    return ("morocco" in loc or "maroc" in loc or
            any(city in loc for city in MOROCCAN_CITIES))

def count_platforms(profiles):
    return {p["platform"] for p in profiles}

def evaluate_unified(unified_list, name):
    total_identities = len(unified_list)
    total_profiles = sum(len(u["profiles"]) for u in unified_list)
    pairs_count = 0
    triplets_count = 0
    moroccan_identities = 0
    high_confidence = 0  # ‚â•2 profils avec email commun ou score √©lev√©

    all_pairs = []

    for u in unified_list:
        profiles = u["profiles"]
        platforms = count_platforms(profiles)
        n = len(profiles)

        # Compter les paires (combinaisons 2 √† 2)
        pairs_in_cluster = n * (n - 1) // 2
        pairs_count += pairs_in_cluster

        if n >= 3 and len(platforms) == 3:
            triplets_count += 1
        elif n == 2 and len(platforms) == 2:
            pass  # paire classique

        # V√©rifier si marocain
        if any(is_moroccan(p) for p in profiles):
            moroccan_identities += 1

        # Confiance : email partag√© ou score ‚â• 0.7
        emails = {p.get("email_norm") or p.get("email") for p in profiles}
        emails = {e for e in emails if e and "@" in str(e)}
        if len(emails) == 1 or u.get("score", 0) >= 0.7:
            high_confidence += 1

        # Stocker les paires pour analyse fine (optionnel)
        for i in range(len(profiles)):
            for j in range(i+1, len(profiles)):
                p1, p2 = profiles[i], profiles[j]
                all_pairs.append((p1, p2))

    return {
        "m√©thode": name,
        "identit√©s_unifi√©es": total_identities,
        "profils_couverts": total_profiles,
        "paires_li√©es": pairs_count,
        "triplets_complets": triplets_count,
        "identit√©s_marocaines": moroccan_identities,
        "confiance_√©lev√©e": high_confidence,
        "couverture_%": round(total_profiles / 11399 * 100, 1) if name == "Paires" else round(total_profiles / 11399 * 100, 1)
    }

def main():
    # Total de profils : 3770 + 4276 + 3353 = 11399
    TOTAL_PROFILES = 3770 + 4276 + 3353

    # Charger les deux r√©sultats
    try:
        with open("output/unified_profiles.json", "r", encoding="utf-8") as f:
            unified_pairs = json.load(f)
    except FileNotFoundError:
        print("‚ö†Ô∏è unified_profiles.json manquant")
        unified_pairs = []

    try:
        with open("output/unified_triplets.json", "r", encoding="utf-8") as f:
            unified_triplets = json.load(f)
    except FileNotFoundError:
        print("‚ö†Ô∏è unified_triplets.json manquant")
        unified_triplets = []

    # √âvaluer
    eval_pairs = evaluate_unified(unified_pairs, "Paires (transitivit√©)")
    eval_triplets = evaluate_unified(unified_triplets, "Triplets (simultan√©s)")

    # Afficher comparaison
    print("\n" + "="*80)
    print("üìä COMPARAISON DES STRAT√âGIES DE MATCHING")
    print("="*80)
    print(f"{'M√©trique':<25} {'Paires (transitivit√©)':<25} {'Triplets (simultan√©s)':<25}")
    print("-"*80)

    keys = [
        "identit√©s_unifi√©es",
        "profils_couverts",
        "paires_li√©es",
        "triplets_complets",
        "identit√©s_marocaines",
        "confiance_√©lev√©e",
    ]

    for key in keys:
        val1 = eval_pairs[key]
        val2 = eval_triplets[key]
        print(f"{key.replace('_', ' ').title():<25} {val1:<25} {val2:<25}")

    print(f"\nüí° Interpr√©tation :")
    print(f" ‚Üí La m√©thode par paires maximise la couverture (rappel).")
    print(f" ‚Üí La m√©thode triplet maximise la pr√©cision (qualit√©).")
    print(f" ‚Üí Pour une analyse fine (ex: localisation), les triplets sont plus fiables.")
    print(f" ‚Üí Pour une base exhaustive (ex: recrutement), les paires sont pr√©f√©rables.")

if __name__ == "__main__":
    main()


üìä COMPARAISON DES STRAT√âGIES DE MATCHING
M√©trique                  Paires (transitivit√©)     Triplets (simultan√©s)    
--------------------------------------------------------------------------------
Identit√©s Unifi√©es        564                       388                      
Profils Couverts          4685                      1164                     
Paires Li√©es              196227                    1164                     
Triplets Complets         138                       388                      
Identit√©s Marocaines      536                       385                      
Confiance √âlev√©e          152                       386                      

üí° Interpr√©tation :
 ‚Üí La m√©thode par paires maximise la couverture (rappel).
 ‚Üí La m√©thode triplet maximise la pr√©cision (qualit√©).
 ‚Üí Pour une analyse fine (ex: localisation), les triplets sont plus fiables.
 ‚Üí Pour une base exhaustive (ex: recrutement), les paires sont pr√©f√©rables.
