In [1]:
"""
ÉTAPE 4 – FUSION DES PROFILS (Profil Unique Consolidé)
Objectif : créer un profil global à partir des identités unifiées (triplets + paires résiduelles)

Conformément au cahier des charges :
- Priorité par plateforme pour chaque attribut
- Agrégation cohérente (texte / numérique)
- Gestion des conflits
- Génération de profil structuré
- Sauvegarde en JSON
"""

import json
from pathlib import Path

# Per-attribute platform priority. For each field the platforms are listed
# from highest to lowest priority; get_best_text_value() walks each list in
# order and returns the first non-empty value it finds.
# NOTE(review): in the code visible here only fullName, jobTitle, company,
# email and location are resolved through this table. "bio" is merged with
# aggregate_text_fields() and the numeric entries with
# aggregate_numeric_fields(), so their rankings below are not consulted.
PLATFORM_PRIORITY = {
    "fullName":        ["linkedin", "github", "twitter"],
    "jobTitle":        ["linkedin"],
    "company":         ["linkedin"],
    "bio":             ["linkedin", "twitter", "github"],
    "location":        ["linkedin", "twitter", "github"],
    "email":           ["github", "linkedin", "twitter"],
    "publicRepos":     ["github"],
    "followersCount":  ["twitter", "linkedin", "github"],  # the maximum is taken later on
}

# Numeric fields: consolidate_one_identity() merges each of these with
# aggregate_numeric_fields(), i.e. the maximum across platforms.
NUMERIC_FIELDS = {"followersCount", "publicRepos", "followingCount"}

def get_best_text_value(profiles, field):
    """Return the preferred non-empty value of ``field`` among ``profiles``.

    Platforms ranked for ``field`` in ``PLATFORM_PRIORITY`` are tried in
    order; the first profile of the highest-ranked platform holding a truthy
    value wins. If the field has no ranking, or no ranked platform supplies a
    value, the first truthy value found in ``profiles`` order is returned.

    Args:
        profiles: iterable of profile dicts. Each one is expected to carry a
            string under ``"platform"`` whenever ``field`` has a ranking.
        field: name of the attribute to resolve.

    Returns:
        The selected value, or ``None`` when no profile has a truthy value.

    Raises:
        KeyError: if a profile lacks ``"platform"`` while ranked platforms
            are being scanned.
    """
    # An unranked field gets an empty ranking, so the priority scan is
    # skipped and only the "first non-empty value" fallback applies.
    ranking = PLATFORM_PRIORITY.get(field, ())
    for wanted in ranking:
        for entry in profiles:
            if entry["platform"].lower() != wanted:
                continue
            candidate = entry.get(field)
            if candidate:
                return candidate

    # Nothing came from a ranked platform: accept any platform.
    return next((entry[field] for entry in profiles if entry.get(field)), None)

def aggregate_text_fields(profiles, field):
    """Join the distinct, non-blank string values of ``field`` into one string.

    Values are stripped of surrounding whitespace, de-duplicated, sorted and
    joined with ``" | "``. Non-string and blank values are ignored.

    Args:
        profiles: iterable of profile dicts.
        field: name of the textual attribute to merge.

    Returns:
        The joined string, or ``None`` when no usable value exists.
    """
    raw_values = (entry.get(field) for entry in profiles)
    distinct = {text.strip() for text in raw_values if isinstance(text, str)}
    # Empty and whitespace-only strings all collapse to "" after stripping.
    distinct.discard("")
    if not distinct:
        return None
    return " | ".join(sorted(distinct))

def aggregate_numeric_fields(profiles, field):
    """Return the largest numeric value of ``field`` across ``profiles``.

    Only real ``int``/``float`` values take part. Booleans, numeric strings,
    ``None`` and non-finite floats (NaN, +/-inf) are ignored.

    Args:
        profiles: iterable of profile dicts.
        field: name of the numeric attribute to merge.

    Returns:
        The maximum value found, or ``None`` when no profile holds a usable
        number.
    """
    import math

    values = []
    for profile in profiles:
        value = profile.get(field)
        # bool is a subclass of int: a stray True/False flag is not a count.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue
        # NaN makes max() depend on input order, and NaN/inf are emitted by
        # json.dump as tokens that are not valid JSON. Only floats are
        # checked so that arbitrarily large ints never hit float conversion.
        if isinstance(value, float) and not math.isfinite(value):
            continue
        values.append(value)
    return max(values) if values else None

def consolidate_one_identity(cluster):
    """Build one consolidated profile from a cluster of matched profiles.

    Args:
        cluster: mapping with a ``"unified_id"`` and a ``"profiles"`` list of
            per-platform profile dicts, each carrying a ``"platform"`` key.

    Returns:
        A dict with ``unified_id``, the priority-resolved text fields
        (``fullName``, ``jobTitle``, ``company``, ``email``, ``location``),
        the merged ``bio``, one entry per field in ``NUMERIC_FIELDS`` and the
        sorted list ``platforms_sources``. Keys whose value is ``None`` are
        omitted.

    Raises:
        KeyError: if ``cluster`` lacks ``"profiles"``/``"unified_id"`` or a
            profile lacks ``"platform"``.
    """
    profiles = cluster["profiles"]

    consolidated = {"unified_id": cluster["unified_id"]}

    # Text fields resolved by platform priority.
    for field in ("fullName", "jobTitle", "company", "email", "location"):
        consolidated[field] = get_best_text_value(profiles, field)

    # Free-text field: keep every distinct value.
    consolidated["bio"] = aggregate_text_fields(profiles, "bio")

    # NUMERIC_FIELDS is a set, whose iteration order for strings changes
    # between interpreter runs (hash randomization). Sorting keeps the key
    # order of the result, and of the JSON written from it, reproducible.
    for field in sorted(NUMERIC_FIELDS):
        consolidated[field] = aggregate_numeric_fields(profiles, field)

    # Platforms that contributed to this identity.
    consolidated["platforms_sources"] = sorted({p["platform"] for p in profiles})

    # Final clean-up: drop attributes for which no value was found.
    return {key: value for key, value in consolidated.items() if value is not None}

def main():
    """Consolidate every unified identity and save the result as JSON.

    Reads ``output/unified_hybrid.json`` (expected to hold a list of cluster
    dicts), runs ``consolidate_one_identity`` on each cluster, writes the
    consolidated profiles to ``output/final_unified_profiles.json`` and
    prints summary counts. If the input file does not exist, an error
    message is printed and the function returns without writing anything.
    """
    output_dir = Path("output")
    input_path = output_dir / "unified_hybrid.json"
    output_path = output_dir / "final_unified_profiles.json"

    if not input_path.exists():
        print(f"❌ Fichier d'entrée introuvable : {input_path}")
        return

    with open(input_path, "r", encoding="utf-8") as f:
        hybrid_clusters = json.load(f)

    print(f"🔄 Consolidation de {len(hybrid_clusters)} identités unifiées...")

    consolidated_profiles = [
        consolidate_one_identity(cluster) for cluster in hybrid_clusters
    ]

    # ensure_ascii=False keeps accented characters readable in the file.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(consolidated_profiles, f, indent=2, ensure_ascii=False)

    # Summary statistics: a key is present only when a value was found,
    # because consolidate_one_identity drops None values.
    total = len(consolidated_profiles)
    with_email = sum(1 for p in consolidated_profiles if "email" in p)
    with_job = sum(1 for p in consolidated_profiles if "jobTitle" in p)

    print("\n✅ Fusion terminée !")
    print(f"📁 Fichier sauvegardé : {output_path}")
    print(f"📊 {total} profils unifiés")
    print(f"📧 {with_email}/{total} avec email")
    print(f"💼 {with_job}/{total} avec poste")

# Run the consolidation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🔄 Consolidation de 605 identités unifiées...

✅ Fusion terminée !
📁 Fichier sauvegardé : output/final_unified_profiles.json
📊 605 profils unifiés
📧 427/605 avec email
💼 0/605 avec poste
