In [2]:
import pandas as pd
import numpy as np
import re
from collections import Counter

# ----------------------------
# UTILITAIRE : FORMAT LISIBLE
# ----------------------------
def human_format(num):
    """Format a number with spaces as thousands separators (2584 -> '2 584').

    The value is converted with ``int()`` first, so any fractional part is
    truncated before formatting.
    """
    grouped = format(int(num), ",")
    return " ".join(grouped.split(","))

# ----------------------------
# LOADING AND PREPARATION
# ----------------------------
df = pd.read_csv("global_tech_talents_morocco.csv")


def _to_flag(value):
    """Return True only when *value* explicitly spells out a truthy flag.

    ``Series.astype(bool)`` follows Python truthiness, so a missing value (NaN)
    and every non-empty string -- including "False" -- would become True.
    Comparing the textual form against known truthy tokens avoids that.
    """
    return str(value).strip().lower() in {"true", "1", "1.0", "yes", "y"}


# Make sure is_based_in_morocco is a real boolean column; missing or
# unrecognised values are treated as "not based in Morocco".
df['is_based_in_morocco'] = df['is_based_in_morocco'].map(_to_flag).astype(bool)

# Normalise locations so that Moroccan profiles are reduced to a city name
def normalize_location(loc):
    """Return a cleaned-up version of a free-text location.

    Missing values give "". A location mentioning "morocco" or "maroc"
    (case-insensitive) is reduced to the text before its first comma or
    semicolon, stripped of any "prefecture of " wording and of everything
    from the first hyphen on (e.g. "Casablanca-Settat" -> "Casablanca"), then
    title-cased. If only the country name (or nothing) remains, "Morocco" is
    returned. Any other location is returned stripped but otherwise unchanged.
    """
    if pd.isna(loc):
        return ""

    text = str(loc).strip()
    lowered = text.lower()
    mentions_morocco = "morocco" in lowered or "maroc" in lowered
    if not mentions_morocco:
        return text

    # Keep only what precedes the first comma or semicolon.
    head = re.split(r'[,;]', text, maxsplit=1)[0].strip()
    # Drop administrative wording, then any hyphenated suffix.
    head = re.sub(r'prefecture of ', '', head, flags=re.IGNORECASE)
    head = re.sub(r'-.*$', '', head).strip()

    country_only = head.lower() in ["morocco", "maroc", ""]
    return "Morocco" if country_only else head.title()

# Cleaned-up location for every profile.
df['location_clean'] = df['location'].map(normalize_location)

# Keep only the Morocco-based profiles; the explicit copy avoids the pandas
# chained-assignment warning when the 'city' column is added.
morocco_df = df.loc[df['is_based_in_morocco']].copy()
morocco_df['city'] = morocco_df['location_clean']

# ----------------------------
# GLOBAL STATISTICS
# ----------------------------
total = len(df)
morocco_count = morocco_df.shape[0]


def _share(count):
    """Return count / total, or 0.0 when the dataset has no rows."""
    return count / total if total else 0.0


print("🌍 STATISTIQUES GLOBALES")
print("=" * 50)
print(f"Total de profils unifiés       : {human_format(total)}")
print(f"Profils basés au Maroc         : {human_format(morocco_count)} ({_share(morocco_count):.1%})")

# Breakdown by primary platform
platform_counts = df['primary_platform'].value_counts()
print("\n📱 Répartition par plateforme principale :")
for plat, cnt in platform_counts.items():
    # str() keeps the loop working when a platform label is not a string.
    print(f"   - {str(plat).capitalize():<10} : {human_format(cnt)} ({_share(cnt):.1%})")

# ----------------------------
# TEXT ANALYSIS
# ----------------------------
def analyze_text(col, name):
    """Compute emptiness and length statistics for a text column.

    Missing values are treated as empty strings. Empty strings are included
    in every statistic, so they also count as "very short" and pull the
    mean/median lengths down.

    Args:
        col: pandas Series holding the text values.
        name: Label of the field. Not used by the computation; kept so
            existing callers keep working.

    Returns:
        dict with the keys ``non_empty`` (count of strings with at least one
        character), ``empty_pct`` and ``very_short_pct`` (fractions of rows
        that are empty / shorter than 10 characters), ``mean_chars``,
        ``median_chars``, ``mean_words`` (whitespace-separated tokens) and
        ``max_chars``. For a column with no rows, ``empty_pct`` is 1.0 and
        every other value is 0 instead of NaN.
    """
    text = col.fillna("").astype(str)
    n_rows = len(text)
    if n_rows == 0:
        # Nothing to measure: avoid the 0/0 divisions and NaN aggregates.
        return {
            'non_empty': 0,
            'empty_pct': 1.0,
            'very_short_pct': 0.0,
            'mean_chars': 0.0,
            'median_chars': 0.0,
            'mean_words': 0.0,
            'max_chars': 0,
        }

    char_len = text.str.len()
    word_len = text.str.split().str.len()
    non_empty = (char_len > 0).sum()
    very_short = (char_len < 10).sum()
    return {
        'non_empty': non_empty,
        'empty_pct': 1 - non_empty / n_rows,
        'very_short_pct': very_short / n_rows,
        'mean_chars': char_len.mean(),
        'median_chars': char_len.median(),
        'mean_words': word_len.mean(),
        'max_chars': char_len.max(),
    }

# Text fields to profile, keyed by the label shown in the report.
fields = {'Bio': df['bio'], 'Projets': df['projects_summary'], 'Compétences': df['skills']}
print("\n📝 Qualité des données textuelles")
print("-" * 50)
for name, col in fields.items():
    stats = analyze_text(col, name)
    print(f"\n📄 {name}:")
    # The non-empty share is the complement of the empty share.
    print(f"   Non vide            : {human_format(stats['non_empty'])} ({1 - stats['empty_pct']:.1%})")
    print(f"   Très court (<10 car): {stats['very_short_pct']:.1%}")
    print(f"   Moy. caractères     : {stats['mean_chars']:.0f}")
    print(f"   Médiane             : {stats['median_chars']:.0f}")
    print(f"   Moy. mots           : {stats['mean_words']:.1f}")

# ----------------------------
# ROLES AND SKILLS
# ----------------------------
def clean_role(role):
    """Normalise a free-text job title so equal roles group together.

    The title is lower-cased, any employer suffix introduced by the word
    "at" or by "@" is removed, every run of non-letter characters becomes a
    single space, and the result is title-cased.

    Args:
        role: Raw role value; may be missing (None / NaN) or blank.

    Returns:
        The cleaned title, or "" when the input is missing, blank, or
        reduces to nothing.
    """
    if pd.isna(role) or str(role).strip() == "":
        return ""
    text = str(role).lower()
    # Cut the employer suffix. The word boundary keeps words that merely end
    # in "at" intact ("format engineer" must not become "form").
    text = re.sub(r'\s*(?:\bat\s|@).*', '', text)
    # Keep letters of any alphabet (accented ones included); digits,
    # underscores, punctuation and repeated whitespace collapse to one space.
    text = re.sub(r'[\W\d_]+', ' ', text)
    return text.strip().title()

df['clean_role'] = df['current_role'].apply(clean_role)
# Roles that cleaned down to an empty string carry no information; skip them.
valid_roles = df.loc[df['clean_role'] != '', 'clean_role']
top_roles = valid_roles.value_counts().head(10)

print("\n💼 Top 10 des rôles :")
for i, (role, cnt) in enumerate(top_roles.items(), 1):
    print(f"   {i}. {role:<30} ({human_format(cnt)})")

# Skills: each profile holds one ';'-separated string; flatten them into a
# single list, skipping blanks and the "N/A" placeholder.
all_skills = [
    skill
    for raw in df['skills'].dropna()
    for skill in (part.strip() for part in str(raw).split(";"))
    if skill and skill != "N/A"
]

skill_counts = Counter(all_skills)
top_skills = skill_counts.most_common(15)
print("\n🛠️  Top 15 des compétences :")
for i, (skill, cnt) in enumerate(top_skills, 1):
    print(f"   {i}. {skill:<25} ({human_format(cnt)})")

# ----------------------------
# CITIES IN MOROCCO
# ----------------------------
# normalize_location returns "" for a missing location and "Morocco" when only
# the country is known; neither is a city, so both are left out of the ranking.
known_cities = morocco_df.loc[~morocco_df['city'].isin(["", "Morocco"]), 'city']
if not known_cities.empty:
    city_counts = known_cities.value_counts().head(10)
    print("\n🏙️  Top villes au Maroc :")
    for city, cnt in city_counts.items():
        print(f"   - {city:<25} : {human_format(cnt)}")

# ----------------------------
# ONLINE PRESENCE
# ----------------------------
def _url_matches(column, pattern):
    """Return a boolean Series telling which values of df[column] match *pattern*.

    Missing values are replaced by "" first, so the .str accessor also works
    on a column that contains no strings at all. Matching is case-insensitive.
    """
    urls = df[column].fillna("").astype(str)
    return urls.str.contains(pattern, case=False, regex=True)


has_email = df['email'].notna() & (df['email'] != "")
has_gh = _url_matches('github_url', r'github')
has_li = _url_matches('linkedin_url', r'linkedin')
# Match the twitter.com / x.com host names only: a bare "x" alternative would
# flag every URL that merely contains that letter (e.g. "example.com").
has_tw = _url_matches('twitter_url', r'(?:^|[/.])(?:twitter|x)\.com\b')

platform_sum = has_gh.astype(int) + has_li.astype(int) + has_tw.astype(int)
multichannel = (platform_sum >= 2).sum()
# Guard the ratio so an empty dataset does not raise ZeroDivisionError.
multichannel_share = multichannel / total if total else 0.0

print("\n🔗 Présence en ligne :")
print(f"   - Email      : {human_format(has_email.sum())} ({has_email.mean():.1%})")
print(f"   - GitHub     : {human_format(has_gh.sum())} ({has_gh.mean():.1%})")
print(f"   - LinkedIn   : {human_format(has_li.sum())} ({has_li.mean():.1%})")
print(f"   - Twitter/X  : {human_format(has_tw.sum())} ({has_tw.mean():.1%})")
print(f"   - ≥2 plateformes : {human_format(multichannel)} ({multichannel_share:.1%})")

print("\n✅ Analyse terminée.")

🌍 STATISTIQUES GLOBALES
Total de profils unifiés       : 9 101
Profils basés au Maroc         : 2 584 (28.4%)

📱 Répartition par plateforme principale :
   - Linkedin   : 4 120 (45.3%)
   - Github     : 3 448 (37.9%)
   - Twitter    : 1 533 (16.8%)

📝 Qualité des données textuelles
--------------------------------------------------

📄 Bio:
   Non vide            : 7 207 (79.2%)
   Très court (<10 car): 21.3%
   Moy. caractères     : 193
   Médiane             : 74
   Moy. mots           : 27.8

📄 Projets:
   Non vide            : 7 568 (83.2%)
   Très court (<10 car): 58.2%
   Moy. caractères     : 444
   Médiane             : 2
   Moy. mots           : 63.7

📄 Compétences:
   Non vide            : 3 531 (38.8%)
   Très court (<10 car): 70.8%
   Moy. caractères     : 11
   Médiane             : 0
   Moy. mots           : 1.4

💼 Top 10 des rôles :
   1. Software Engineer              (253)
   2. Full Stack Developer           (120)
   3. Software Eng