In [3]:
# step3_triplet_matching.py
"""
√âTAPE 3 : Matching Triplet Simultan√© (GitHub + LinkedIn + Twitter)

Objectif : Trouver des groupes de 3 profils (un par plateforme) appartenant
√† la m√™me personne, en √©valuant les trois ensemble.

Fonctionnalit√©s :
- Score global pond√©r√© : email (0.30), fullName (0.25), repo/about (0.30), username (0.10), bio (0.05)
- Support des liens externes (ex: GitHub bio mentionne Twitter/LinkedIn)
- Matching 1:1:1 strict (pas de chevauchement)
- Sortie JSON valide (conversion des types NumPy)
"""

import json
import numpy as np
from pathlib import Path
from collections import defaultdict

def convert_numpy_types(obj):
    """Convertit r√©cursivement les types NumPy en types natifs Python pour JSON."""
    if isinstance(obj, dict):
        return {k: convert_numpy_types(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [convert_numpy_types(x) for x in obj]
    elif isinstance(obj, (np.integer, np.floating)):
        return obj.item()
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj

def normalize_email(email):
    if not email or "@" not in str(email):
        return None
    return str(email).strip().lower()

def cosine_sim(u, v):
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    if norm_u == 0 or norm_v == 0:
        return 0.0
    return np.dot(u, v) / (norm_u * norm_v + 1e-8)

def extract_first_name(name):
    if not name or not isinstance(name, str):
        return ""
    parts = name.strip().split()
    return parts[0].lower() if parts else ""

def main():
    print("üöÄ √âTAPE 3 : Matching Triplet Simultan√© (GitHub + LinkedIn + Twitter)")
    output_dir = Path("output")

    # Charger les profils
    with open(output_dir / "profiles_metadata.json", "r", encoding="utf-8") as f:
        profiles = json.load(f)

    # Charger les embeddings
    embeddings = {}
    for field in ["fullName", "username", "bio", "repo_descriptions", "headline"]:
        embeddings[field] = np.load(output_dir / f"{field}_embeddings.npy")

    # Indexer par plateforme + ajouter identifiant global
    github_profiles = []
    linkedin_profiles = []
    twitter_profiles = []
    profile_to_index = {}

    for idx, p in enumerate(profiles):
        p["email_norm"] = normalize_email(p.get("email"))
        p["first_name"] = extract_first_name(p.get("fullName", ""))
        p["_global_index"] = idx
        profile_to_index[idx] = p

        plat = p["platform"].lower()
        if plat == "github":
            github_profiles.append(p)
        elif plat == "linkedin":
            linkedin_profiles.append(p)
        elif plat == "twitter":
            twitter_profiles.append(p)

    print(f"üìä Profils charg√©s : {len(github_profiles)} GitHub, "
          f"{len(linkedin_profiles)} LinkedIn, {len(twitter_profiles)} Twitter")

    # Indexation par attribut (liste de dictionnaires)
    email_to_profiles = defaultdict(list)
    username_to_profiles = defaultdict(list)
    first_name_to_profiles = defaultdict(list)

    for p in profiles:
        if p["email_norm"]:
            email_to_profiles[p["email_norm"]].append(p)
        if p.get("username"):
            username_to_profiles[p["username"].lower()].append(p)
        if p["first_name"]:
            first_name_to_profiles[p["first_name"]].append(p)

    # G√©n√©rer les triplets candidats
    triplet_candidates = []

    for gh in github_profiles:
        gh_idx = gh["_global_index"]
        candidates_linkedin = set()
        candidates_twitter = set()

        # --- LinkedIn candidates ---
        if gh["email_norm"]:
            for p in email_to_profiles[gh["email_norm"]]:
                if p["platform"] == "linkedin":
                    candidates_linkedin.add(p["_global_index"])
        if gh["first_name"]:
            for p in first_name_to_profiles[gh["first_name"]]:
                if p["platform"] == "linkedin":
                    candidates_linkedin.add(p["_global_index"])
        gh_text = (gh.get("bio", "") + " " + gh.get("repo_descriptions", "")).lower()
        for p in linkedin_profiles:
            if p.get("username", "").lower() in gh_text:
                candidates_linkedin.add(p["_global_index"])

        # --- Twitter candidates ---
        if gh["email_norm"]:
            for p in email_to_profiles[gh["email_norm"]]:
                if p["platform"] == "twitter":
                    candidates_twitter.add(p["_global_index"])
        if gh["first_name"]:
            for p in first_name_to_profiles[gh["first_name"]]:
                if p["platform"] == "twitter":
                    candidates_twitter.add(p["_global_index"])
        for p in twitter_profiles:
            if p.get("username", "").lower() in gh_text:
                candidates_twitter.add(p["_global_index"])

        # --- √âvaluer tous les triplets candidats ---
        for li_idx in candidates_linkedin:
            for tw_idx in candidates_twitter:
                if len({gh_idx, li_idx, tw_idx}) < 3:
                    continue

                li = profile_to_index[li_idx]
                tw = profile_to_index[tw_idx]

                score = 0.0

                # Email exact (0.30)
                emails = {p["email_norm"] for p in [gh, li, tw] if p["email_norm"]}
                if len(emails) == 1:
                    score += 0.30

                # Full name (0.25)
                name_scores = [
                    cosine_sim(embeddings["fullName"][gh_idx], embeddings["fullName"][li_idx]),
                    cosine_sim(embeddings["fullName"][gh_idx], embeddings["fullName"][tw_idx]),
                    cosine_sim(embeddings["fullName"][li_idx], embeddings["fullName"][tw_idx])
                ]
                score += 0.25 * max(name_scores)

                # GitHub repo ‚Üî LinkedIn headline (0.30)
                score += 0.30 * cosine_sim(
                    embeddings["repo_descriptions"][gh_idx],
                    embeddings["headline"][li_idx]
                )

                # Username (0.10)
                user_scores = [
                    cosine_sim(embeddings["username"][gh_idx], embeddings["username"][li_idx]),
                    cosine_sim(embeddings["username"][gh_idx], embeddings["username"][tw_idx]),
                    cosine_sim(embeddings["username"][li_idx], embeddings["username"][tw_idx])
                ]
                score += 0.10 * max(user_scores)

                # Bio (0.05)
                bio_vecs = []
                for idx in [gh_idx, li_idx, tw_idx]:
                    vec = embeddings["bio"][idx]
                    if np.linalg.norm(vec) > 1e-8:
                        bio_vecs.append(vec)
                if len(bio_vecs) >= 2:
                    total = 0.0
                    count = 0
                    for i in range(len(bio_vecs)):
                        for j in range(i + 1, len(bio_vecs)):
                            total += cosine_sim(bio_vecs[i], bio_vecs[j])
                            count += 1
                    score += 0.05 * (total / count)

                score = min(1.0, score)

                if score >= 0.60:
                    triplet_candidates.append((gh_idx, li_idx, tw_idx, score))

    # Trier et s√©lectionner sans chevauchement
    triplet_candidates.sort(key=lambda x: x[3], reverse=True)
    used = set()
    final_triplets = []

    for gh_idx, li_idx, tw_idx, score in triplet_candidates:
        ids = {gh_idx, li_idx, tw_idx}
        if ids & used:
            continue
        used |= ids
        final_triplets.append({
            "unified_id": f"person_{len(final_triplets):05d}",
            "score": score,  # sera converti en float natif
            "profiles": [
                profile_to_index[gh_idx],
                profile_to_index[li_idx],
                profile_to_index[tw_idx]
            ]
        })

    # Sauvegarde s√©curis√©e (conversion NumPy ‚Üí JSON)
    output_path = output_dir / "unified_triplets.json"
    final_triplets_clean = convert_numpy_types(final_triplets)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_triplets_clean, f, indent=2, ensure_ascii=False)

    # Stats
    print(f"‚úÖ {len(final_triplets)} triplets complets identifi√©s")
    print(f"‚úÖ R√©sultats sauvegard√©s dans '{output_path}'")
    print("‚û°Ô∏è Pr√™t pour analyse fine (ex: localisation Marocaine)")

if __name__ == "__main__":
    main()

üöÄ √âTAPE 3 : Matching Triplet Simultan√© (GitHub + LinkedIn + Twitter)
üìä Profils charg√©s : 3770 GitHub, 4276 LinkedIn, 3353 Twitter
‚úÖ 388 triplets complets identifi√©s
‚úÖ R√©sultats sauvegard√©s dans 'output/unified_triplets.json'
‚û°Ô∏è Pr√™t pour analyse fine (ex: localisation Marocaine)
