In [6]:
import ipaddress
import json
import os
import re
import time
from collections import defaultdict
from datetime import datetime

import pandas as pd
import requests
import sseclient

✓ Librairies importées avec succès
Date de démarrage : 2025-12-13 11:25:03



In [9]:
# Catalogue of the Wikipedia pages to watch, keyed by an internal slug.
# Each record holds a display name, a category and the exact page title
# (underscored form) that is_entity_page() compares against.
ENTITIES = {
    "bob_vs_society": dict(
        name="Bob vs. Society",
        type="film",
        wiki_title="Bob_vs._Society",
    ),
    "fred_astaire": dict(
        name="Fred Astaire",
        type="actor",
        wiki_title="Fred_Astaire",
    ),
    "shawshank_redemption": dict(
        name="The Shawshank Redemption",
        type="film",
        wiki_title="The_Shawshank_Redemption",
    ),
    "christopher_nolan": dict(
        name="Christopher Nolan",
        type="director",
        wiki_title="Christopher_Nolan",
    ),
    "the_godfather": dict(
        name="The Godfather",
        type="film",
        wiki_title="The_Godfather",
    ),
}

# Echo the catalogue so the reader can see what is being tracked.
print("Entities :")
for key, entity in ENTITIES.items():
    print(f"  ‚Ä¢ {entity['name']} ({entity['type']})")
print()

Entities :
  • Bob vs. Society (film)
  • Fred Astaire (actor)
  • The Shawshank Redemption (film)
  • Christopher Nolan (director)
  • The Godfather (film)



In [11]:
# ===========================
# 3. OUTPUT FILE CONFIGURATION
# ===========================

# Directory that receives every CSV file; created on first run.
OUTPUT_DIR = "wikimedia_stream_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One CSV per data set, all located directly under OUTPUT_DIR.
METRICS_FILE, ALERTS_FILE, SUMMARY_FILE = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("metrics.csv", "alerts.csv", "summary.csv")
)

# Show the resolved locations.
print(f"Metrics : {METRICS_FILE}")
print(f"Alerts : {ALERTS_FILE}")
print(f"Summary : {SUMMARY_FILE}\n")

Metrics : wikimedia_stream_data\metrics.csv
Alerts : wikimedia_stream_data\alerts.csv
Summary : wikimedia_stream_data\summary.csv



In [13]:
# In-memory buffers: one dict per relevant edit, one dict per raised alert.
metrics_data = list()
alerts_data = list()


def _new_entity_stats():
    """Return a zeroed statistics record for an entity seen for the first time."""
    return dict(
        total_edits=0,
        total_bytes_changed=0,
        unique_users=set(),
        anonymous_edits=0,
        major_changes=0,
    )


# Running totals per entity key; a fresh record is created on first access.
summary_stats = defaultdict(_new_entity_stats)


In [15]:

def is_entity_page(title):
    """
    Find the tracked entity whose `wiki_title` is exactly `title`.

    Returns a `(entity_key, entity_name)` pair for the first match in
    ENTITIES, or `(None, None)` when no entity has that title.
    """
    matches = (
        (slug, info['name'])
        for slug, info in ENTITIES.items()
        if info['wiki_title'] == title
    )
    return next(matches, (None, None))

def is_anonymous_user(user):
    """
    Tell whether the `user` value of a recentchange event denotes a logged-out editor.

    Returns True when `user` is empty, parses as an IPv4/IPv6 address, or
    looks like a temporary-account name; False otherwise.
    """
    if not user:
        # Preserves the historical rule: a present-but-empty user counts as anonymous.
        return True
    if not isinstance(user, str):
        # ipaddress.ip_address() also accepts integers; never treat those as IPs here.
        return False
    try:
        ipaddress.ip_address(user)
        return True
    except ValueError:
        pass
    # NOTE(review): assumes temporary-account names look like "~2025-12345-6"
    # (tilde, four-digit year, dash, digits) — confirm against the wiki's
    # temporary-account configuration.
    return re.match(r'~2\d{3}-\d', user) is not None


def process_change_event(change):
    """
    Turn one recentchange event into a metric record for a tracked entity.

    Parameters
    ----------
    change : dict
        Decoded JSON payload of one event from the Wikimedia stream.

    Returns
    -------
    dict or None
        A metric with the keys `timestamp`, `entity_key`, `entity_name`,
        `user`, `is_bot`, `is_anonymous`, `bytes_changed` and `comment`;
        None when the event is not an edit, is not from `enwiki`, does not
        concern a tracked entity, or cannot be processed.
    """
    try:
        # Only plain edits are of interest.
        if change.get('type') != 'edit':
            return None

        # Only the English Wikipedia is tracked.
        if change.get('wiki') != 'enwiki':
            return None

        # Event titles use spaces; ENTITIES stores the underscored form.
        title = change.get('title', '').replace(' ', '_')
        entity_key, entity_name = is_entity_page(title)

        if entity_key is None:
            return None

        # Naive local-time datetime built from the event's epoch timestamp.
        timestamp = datetime.fromtimestamp(change.get('timestamp', 0))
        user = change.get('user', 'Unknown')
        is_bot = change.get('bot', False)
        # A missing `user` key keeps its old meaning (not anonymous); otherwise
        # the value itself decides, since logged-out edits carry an IP address
        # rather than an empty user name.
        is_anonymous = 'user' in change and is_anonymous_user(change.get('user'))
        # `length` or its members may be absent or null; treat those as 0
        # instead of letting the event fall into the except clause below.
        length = change.get('length') or {}
        bytes_changed = (length.get('new') or 0) - (length.get('old') or 0)
        comment = (change.get('comment') or '')[:100]  # Keep at most 100 characters

        metric = {
            'timestamp': timestamp,
            'entity_key': entity_key,
            'entity_name': entity_name,
            'user': user,
            'is_bot': is_bot,
            'is_anonymous': is_anonymous,
            'bytes_changed': bytes_changed,
            'comment': comment
        }

        return metric

    except Exception as e:
        # Best effort: one malformed event must not stop the collection.
        print(f"Erreur lors du traitement : {e}")
        return None

def check_for_alerts(metric):
    """
    Derive the alerts raised by a single metric record.

    Two independent rules are evaluated, in this order:
    - MAJOR_CHANGE: the absolute byte delta is strictly greater than 1000
    - ANONYMOUS_EDIT: the metric is flagged as anonymous

    Returns a list (possibly empty) of alert dicts.
    """
    def _alert(alert_type, description):
        # Every alert copies its context fields from the metric.
        return {
            'timestamp': metric['timestamp'],
            'entity_name': metric['entity_name'],
            'alert_type': alert_type,
            'description': description,
            'user': metric['user']
        }

    raised = []
    delta = metric['bytes_changed']

    if abs(delta) > 1000:
        raised.append(_alert('MAJOR_CHANGE', f"Changement de {delta} bytes"))

    if metric['is_anonymous']:
        raised.append(_alert('ANONYMOUS_EDIT', "Modification par utilisateur anonyme"))

    return raised

def update_summary_stats(metric):
    """Fold one metric record into the running totals of its entity in `summary_stats`."""
    stats = summary_stats[metric['entity_key']]

    stats['total_edits'] += 1
    delta = metric['bytes_changed']
    stats['total_bytes_changed'] += delta
    stats['unique_users'].add(metric['user'])

    # Conditional counters: add 1 only when the rule applies.
    stats['anonymous_edits'] += 1 if metric['is_anonymous'] else 0
    stats['major_changes'] += 1 if abs(delta) > 1000 else 0

def save_data():
    """
    Write the collected metrics, alerts and per-entity summary to their CSV files.

    All three files (METRICS_FILE, ALERTS_FILE, SUMMARY_FILE) are rewritten on
    every call, with a fixed header row even when there is nothing to report.
    This keeps a previous run's CSV from surviving a run that collected
    nothing, and guarantees the files exist and are parseable afterwards.
    """
    # Column order matches the dicts appended to metrics_data / alerts_data.
    metric_columns = [
        'timestamp', 'entity_key', 'entity_name', 'user',
        'is_bot', 'is_anonymous', 'bytes_changed', 'comment',
    ]
    alert_columns = ['timestamp', 'entity_name', 'alert_type', 'description', 'user']
    summary_columns = [
        'entity_key', 'entity_name', 'total_edits', 'total_bytes_changed',
        'unique_users', 'anonymous_edits', 'major_changes',
    ]

    # Metrics
    df_metrics = pd.DataFrame(metrics_data, columns=metric_columns)
    df_metrics.to_csv(METRICS_FILE, index=False)
    print(f"‚úì {len(metrics_data)} m√©triques sauvegard√©es")

    # Alerts
    df_alerts = pd.DataFrame(alerts_data, columns=alert_columns)
    df_alerts.to_csv(ALERTS_FILE, index=False)
    print(f"‚úì {len(alerts_data)} alertes sauvegard√©es")

    # Summary: the set of users is reduced to its size so it fits a CSV cell.
    summary_list = [
        {
            'entity_key': entity_key,
            'entity_name': ENTITIES[entity_key]['name'],
            'total_edits': stats['total_edits'],
            'total_bytes_changed': stats['total_bytes_changed'],
            'unique_users': len(stats['unique_users']),
            'anonymous_edits': stats['anonymous_edits'],
            'major_changes': stats['major_changes']
        }
        for entity_key, stats in summary_stats.items()
    ]

    df_summary = pd.DataFrame(summary_list, columns=summary_columns)
    df_summary.to_csv(SUMMARY_FILE, index=False)
    print(f"‚úì R√©sum√© sauvegard√© pour {len(summary_list)} entit√©s")

# Confirmation printed once the helper functions of this cell are defined.
print("Fonctions utilitaires d√©finies ‚úì\n")

Fonctions utilitaires définies ✓



In [26]:
def connect_to_stream(duration_seconds=300):
    """
    Collect edits from the Wikimedia EventStreams `recentchange` feed.

    Reads the server-sent-event stream for at most `duration_seconds`, passes
    every decoded event to `process_change_event`, stores the resulting
    metrics and alerts in the module-level buffers, then calls `save_data()`
    and prints final statistics.

    Parameters
    ----------
    duration_seconds : int or float, default 300
        Time budget for the collection. It is checked each time a chunk is
        received, so the actual duration can exceed it slightly.

    Notes
    -----
    - At most 5 failures (HTTP error, disconnection, timeout, unexpected
      error) are tolerated; a 2-second pause precedes each reconnection.
    - Saving and the final statistics run on every exit path: duration
      reached, stream closed by the server, too many failures, or Ctrl-C.
    - The HTTP response is closed when the connection attempt ends.
    """
    url = 'https://stream.wikimedia.org/v2/stream/recentchange'

    headers = {
        'User-Agent': 'IMDb-Stream-Project/1.0 (Educational; Contact: votre.email@example.com)',
        'Accept': 'text/event-stream'
    }

    print("="*60)
    print("D√âMARRAGE DE LA COLLECTE")
    print("="*60)
    print(f"URL : {url}")
    print(f"Dur√©e : {duration_seconds} secondes ({duration_seconds/60:.1f} minutes)")
    print(f"Heure de d√©but : {datetime.now().strftime('%H:%M:%S')}")
    print("="*60 + "\n")

    start_time = time.time()
    event_count = 0
    reconnect_count = 0
    max_reconnects = 5

    while time.time() - start_time < duration_seconds and reconnect_count < max_reconnects:
        try:
            if reconnect_count > 0:
                print(f"\nüîÑ Reconnexion ({reconnect_count}/{max_reconnects})...")
                time.sleep(2)  # Short pause before reconnecting
            else:
                print("üîå Connexion au stream EventStreams...")

            # The context manager closes the connection on every exit path.
            with requests.get(
                url,
                stream=True,
                headers=headers,
                timeout=60  # Connect/read timeout, so a stalled stream is detected
            ) as response:

                if response.status_code != 200:
                    print(f"‚ùå Erreur HTTP {response.status_code}")
                    reconnect_count += 1
                    continue

                if reconnect_count == 0:
                    print("‚úì Connexion √©tablie au stream EventStreams")
                    print("‚è≥ En attente d'√©v√©nements...\n")
                else:
                    print("‚úì Reconnect√© !")

                # Server-sent events are UTF-8; do not depend on the charset
                # requests infers from the Content-Type header.
                response.encoding = 'utf-8'

                buffer = ""

                for chunk in response.iter_content(chunk_size=1024, decode_unicode=True):
                    # Enforce the time budget.
                    elapsed = time.time() - start_time
                    if elapsed > duration_seconds:
                        print(f"\n‚è∞ Dur√©e atteinte ({elapsed:.1f}s)")
                        # Leave the read loop only: the save and statistics
                        # steps at the end of the function must still run.
                        break

                    if not chunk:
                        continue

                    buffer += chunk

                    # Events are separated by a blank line; the incomplete
                    # tail stays in the buffer until the next chunk arrives.
                    while '\n\n' in buffer:
                        event_text, buffer = buffer.split('\n\n', 1)

                        if not event_text.strip():
                            continue

                        # Keep the first "data: " line of the event.
                        data_line = None
                        for line in event_text.split('\n'):
                            if line.startswith('data: '):
                                data_line = line[6:]
                                break

                        if not data_line:
                            continue

                        event_count += 1

                        if event_count % 100 == 0:
                            print(".", end="", flush=True)

                        if event_count == 1:
                            print(f"‚úì Premier √©v√©nement re√ßu apr√®s {elapsed:.1f}s\n")

                        try:
                            change = json.loads(data_line)

                            if event_count % 500 == 0:
                                print(f"\nüìä {event_count} √©v√©nements en {elapsed:.1f}s")
                                if 'wiki' in change:
                                    print(f"   Exemple : {change.get('wiki')} - {change.get('title', 'N/A')}")

                            metric = process_change_event(change)

                            if metric:
                                print(f"\n\nüéØ √âV√âNEMENT D√âTECT√â !")
                                print(f"   Entit√© : {metric['entity_name']}")
                                print(f"   User : {metric['user']}")
                                print(f"   Changement : {metric['bytes_changed']:+d} bytes\n")

                                metrics_data.append(metric)

                                alerts = check_for_alerts(metric)
                                if alerts:
                                    print(f"   ‚ö†Ô∏è  {len(alerts)} ALERTE(S) !")
                                    for alert in alerts:
                                        print(f"      ‚Üí {alert['alert_type']}")
                                    alerts_data.extend(alerts)

                                update_summary_stats(metric)

                        except json.JSONDecodeError:
                            # Malformed payload: skip this event.
                            continue
                        except Exception as exc:
                            # Keep the stream alive, but report the failure
                            # instead of hiding it.
                            print(f"\nErreur de traitement : {type(exc).__name__}: {exc}")
                            continue

            # Reached when the duration elapsed or the server closed the
            # stream: either way the collection is over.
            break

        except (requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            reconnect_count += 1
            if reconnect_count < max_reconnects:
                print(f"\n‚ö†Ô∏è  D√©connexion d√©tect√©e (tentative {reconnect_count}/{max_reconnects})")
            else:
                print(f"\n‚ùå Trop de d√©connexions ({max_reconnects}). Arr√™t.")

        except KeyboardInterrupt:
            print("\n‚è∏Ô∏è  Interrompu par l'utilisateur")
            break

        except Exception as e:
            print(f"\n‚ùå Erreur inattendue : {type(e).__name__}")
            reconnect_count += 1
            if reconnect_count >= max_reconnects:
                break

    # Persist whatever was collected.
    print("\n" + "="*60)
    print("SAUVEGARDE")
    print("="*60)
    save_data()

    print("\n" + "="*60)
    print("STATISTIQUES FINALES")
    print("="*60)
    print(f"√âv√©nements trait√©s : {event_count}")
    print(f"√âv√©nements pertinents : {len(metrics_data)}")
    print(f"Alertes : {len(alerts_data)}")
    print(f"Dur√©e : {time.time() - start_time:.1f}s")
    print(f"Reconnexions : {reconnect_count}")
    print("="*60 + "\n")

In [27]:
# ===========================
# 7. RUNNING THE COLLECTION
# ===========================

# NOTE: the call below is active (not commented out), so running this cell
# starts a live collection. Comment it out to skip the collection.
# NOTE(review): the banner printed afterwards still tells the reader to
# uncomment this call — the two are out of sync.
connect_to_stream(duration_seconds=300)  # Collect for 5 minutes

print("""
INSTRUCTIONS POUR EX√âCUTER LA COLLECTE
======================================

1. Installer la librairie sseclient-py :
   pip install sseclient-py

2. D√©commenter la ligne :
   connect_to_stream(duration_seconds=300)

3. Ex√©cuter cette cellule

4. Attendre la fin de la collecte (5 minutes par d√©faut)

5. Les donn√©es seront sauvegard√©es dans le dossier 'wikimedia_stream_data/'

AJUSTER LA DUR√âE
================
Pour collecter pendant plus ou moins longtemps, modifier le param√®tre :
- 60 secondes = 1 minute
- 300 secondes = 5 minutes
- 3600 secondes = 1 heure
""")


DÉMARRAGE DE LA COLLECTE
URL : https://stream.wikimedia.org/v2/stream/recentchange
Durée : 300 secondes (5.0 minutes)
Heure de début : 11:49:58

🔌 Connexion au stream EventStreams...
✓ Connexion établie au stream EventStreams
⏳ En attente d'événements...

✓ Premier événement reçu après 2.0s

.....
📊 500 événements en 15.1s
   Exemple : commonswiki - Category:Files uploaded with OpenRefine by PantheraLeo1359531
.....
📊 1000 événements en 28.4s
   Exemple : commonswiki - Category:Files uploaded with OpenRefine by PantheraLeo1359531
.....
📊 1500 événements en 41.6s
   Exemple : wikidatawiki - Q73942392
.....
📊 2000 événements en 54.5s
   Exemple : commonswiki - File:STP 20160201 955 (39555198894).jpg
.....
📊 2500 événements en 69.1s
   Exemple : commonswiki - Category:Digitales Oberflächenmodell 1m (IT.NRW)
.....
📊 3000 événements en 81.5s
   Exemple : arwiktionary - unis
.....
📊 3500 événements en 95.4s
   Exemple : commonswiki - Cat

In [29]:

# ===========================
# 8. ANALYSE DES RÉSULTATS
# ===========================

def analyze_results():
    """
    Load the saved CSV files and print a summary of the collected data.

    Each of METRICS_FILE, ALERTS_FILE and SUMMARY_FILE is loaded on its own:
    a file that is missing or has no parseable content is treated as an empty
    table, so one absent file no longer hides the data found in the others.
    The "no data" warning is printed only when all three are empty.
    """
    def _load(path):
        # A missing file and a file without any column both mean "nothing recorded".
        try:
            return pd.read_csv(path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return pd.DataFrame()

    print("\n" + "="*60)
    print("ANALYSE DES R√âSULTATS")
    print("="*60 + "\n")

    df_metrics = _load(METRICS_FILE)
    df_alerts = _load(ALERTS_FILE)
    df_summary = _load(SUMMARY_FILE)

    if df_metrics.empty and df_alerts.empty and df_summary.empty:
        print("‚ö†Ô∏è  Aucune donn√©e trouv√©e. Ex√©cutez d'abord la collecte !")
        return

    print("üìä R√âSUM√â PAR ENTIT√â")
    print("-" * 60)
    if len(df_summary) > 0:
        print(df_summary.to_string(index=False))
    else:
        print("Aucune statistique disponible")

    print("\n\n‚ö†Ô∏è  ALERTES R√âCENTES (5 derni√®res)")
    print("-" * 60)
    if len(df_alerts) > 0:
        print(df_alerts.tail().to_string(index=False))
    else:
        print("Aucune alerte enregistr√©e")

    print("\n\nüìù MODIFICATIONS R√âCENTES (5 derni√®res)")
    print("-" * 60)
    if len(df_metrics) > 0:
        print(df_metrics[['timestamp', 'entity_name', 'user', 'bytes_changed']].tail().to_string(index=False))
    else:
        print("Aucune modification enregistr√©e")

    print("\n" + "="*60 + "\n")

# Print the analysis of the CSV files; meant to be run after the collection.
analyze_results()


ANALYSE DES R√âSULTATS

⚠️  Aucune donnée trouvée. Exécutez d'abord la collecte !
