In [2]:
import pandas as pd
import re
import os

def clean_location(loc):
    """Normalise les localisations pour d√©tecter le Maroc"""
    if pd.isna(loc):
        return "", False
    loc = str(loc).strip()
    loc_lower = loc.lower()
    if "morocco" in loc_lower or "maroc" in loc_lower:
        # Utiliser maxsplit comme mot-cl√© pour √©viter le DeprecationWarning
        city_match = re.split(r'[,;]', loc, maxsplit=1)[0].strip()
        return f"{city_match}, Morocco", True
    return loc, False

def extract_skills_from_text(text):
    """Extraction simple de comp√©tences cl√©s"""
    if pd.isna(text):
        return ""
    text = str(text).lower()
    all_skills = []
    skill_keywords = [
        "nlp", "natural language", "bert", "llm", "transformer",
        "rust", "python", "java", "swift", "javascript", "typescript",
        "docker", "kubernetes", "jenkins", "terraform", "ci/cd",
        "coreml", "pytorch", "tensorflow", "machine learning",
        "computer vision", "opencv", "object detection",
        "arabic", "french", "multilingual",
        "graph", "knowledge graph", "semantic search", "vector search",
        "ios", "android", "flutter", "react", "angular", "spring",
        "mysql", "mongodb", "postgresql"
    ]
    for kw in skill_keywords:
        if kw in text:
            all_skills.append(kw.title())
    return "; ".join(sorted(set(all_skills)))

def normalize_github(df):
    df = df.copy()
    df["full_name"] = df.get("name", "")
    df["username"] = df.get("username", "")
    df["bio"] = df.get("bio", "")
    df["current_role"] = df["bio"].str.split("|").str[0] if "bio" in df.columns else ""
    df["company"] = df.get("company", "")
    df["email"] = df.get("email", "")
    df["location"], df["is_based_in_morocco"] = zip(*df.get("location", pd.Series([None]*len(df))).apply(clean_location))
    df["github_url"] = df.get("profile_url", "").str.strip()
    df["avatar_url"] = df.get("avatar_url", "")
    df["projects_summary"] = df.get("repo_descriptions", "").astype(str)
    df["skills"] = df["projects_summary"].apply(extract_skills_from_text)
    df["linkedin_url"] = ""
    df["twitter_url"] = ""
    df["primary_platform"] = "github"
    df["source_files"] = "github.csv"
    return df

def normalize_linkedin(df):
    df = df.copy()
    df["full_name"] = df.get("full_name", "")
    df["username"] = df.get("username", "")
    df["bio"] = df.get("about", "")
    df["current_role"] = df.get("headline", "")
    df["company"] = ""
    df["email"] = ""
    df["location"], df["is_based_in_morocco"] = zip(*df.get("location", pd.Series([None]*len(df))).apply(clean_location))
    df["avatar_url"] = df.get("profile_photo", "")
    df["github_url"] = ""
    df["linkedin_url"] = "https://www.linkedin.com/in/" + df["username"].astype(str)
    df["twitter_url"] = ""
    df["projects_summary"] = df.get("projects", "").astype(str)
    df["skills"] = (df["bio"].fillna("") + " " + df["projects_summary"]).apply(extract_skills_from_text)
    df["primary_platform"] = "linkedin"
    df["source_files"] = "linkedin.csv"
    return df

def normalize_twitter(df):
    df = df.copy()
    # Utiliser 'Twitter Username' comme username si colonne diff√©rente
    if "Twitter Username" in df.columns:
        df["username"] = df["Twitter Username"]
    else:
        df["username"] = df.get("username", "")
    df["full_name"] = df.get("full_name", df.get("bio", ""))  # fallback
    df["bio"] = df.get("bio", "")
    df["current_role"] = df["bio"]  # souvent dans la bio
    df["company"] = ""
    df["email"] = ""
    df["location"] = ""
    df["is_based_in_morocco"] = False
    df["avatar_url"] = df.get("image_url", "")
    df["github_url"] = ""
    df["linkedin_url"] = ""
    df["twitter_url"] = "https://twitter.com/" + df["username"].astype(str)
    df["projects_summary"] = ""
    df["skills"] = df["bio"].apply(extract_skills_from_text)
    df["primary_platform"] = "twitter"
    df["source_files"] = "twitter.csv"
    return df

# Colonnes du sch√©ma unifi√©
UNIFIED_COLS = [
    "full_name", "username", "primary_platform", "email", "location",
    "is_based_in_morocco", "bio", "current_role", "company",
    "linkedin_url", "github_url", "twitter_url", "avatar_url",
    "skills", "projects_summary", "source_files"
]

def main():
    # Charger les fichiers (√† adapter selon ton chemin)
    github_df = pd.read_csv("github.csv", on_bad_lines='skip')
    linkedin_df = pd.read_csv("linkedin.csv", on_bad_lines='skip')
    twitter_df = pd.read_csv("twitter.csv", on_bad_lines='skip')

    # Normaliser chaque source
    gh_norm = normalize_github(github_df)[UNIFIED_COLS]
    li_norm = normalize_linkedin(linkedin_df)[UNIFIED_COLS]
    tw_norm = normalize_twitter(twitter_df)[UNIFIED_COLS]

    # Combiner
    unified = pd.concat([gh_norm, li_norm, tw_norm], ignore_index=True)

    # --- D√©doublonnage simple (optionnel mais recommand√©) ---
    # Regrouper par full_name + is_based_in_morocco (√† am√©liorer si besoin)
    # On garde la ligne avec le plus d'infos (ex: celle avec email ou plus de skills)
    unified["info_score"] = (
        unified["email"].notna().astype(int) * 2 +
        unified["skills"].str.len() / 100 +
        unified["projects_summary"].str.len() / 1000
    )
    unified = unified.sort_values("info_score", ascending=False)
    unified = unified.drop_duplicates(subset=["full_name"], keep="first")
    unified = unified.drop(columns=["info_score"])

    # Sauvegarder
    output_file = "global_tech_talents_morocco.csv"
    unified.to_csv(output_file, index=False, encoding='utf-8')
    print(f"‚úÖ Fichier unifi√© sauvegard√© : {output_file}")
    print(f"üìä {len(unified)} profils uniques, dont {unified['is_based_in_morocco'].sum()} bas√©s au Maroc.")

if __name__ == "__main__":
    main()

‚úÖ Fichier unifi√© sauvegard√© : global_tech_talents_morocco.csv
üìä 9101 profils uniques, dont 2584 bas√©s au Maroc.
