## Import

In [26]:
import ipaddress
import json
import os
import time
from collections import defaultdict
from datetime import datetime

import pandas as pd
import requests
import sseclient

In [27]:
# Entities to monitor. Each entry is keyed by an internal identifier and holds
# the display name, the entity type and the Wikipedia page title (with
# underscores) that is compared against incoming page titles.
ENTITIES = {
    "bob_vs_society": dict(
        name="Bob vs. Society", type="film", wiki_title="Bob_vs._Society"
    ),
    "fred_astaire": dict(
        name="Fred Astaire", type="actor", wiki_title="Fred_Astaire"
    ),
    "shawshank_redemption": dict(
        name="The Shawshank Redemption", type="film", wiki_title="The_Shawshank_Redemption"
    ),
    "christopher_nolan": dict(
        name="Christopher Nolan", type="director", wiki_title="Christopher_Nolan"
    ),
    "the_godfather": dict(
        name="The Godfather", type="film", wiki_title="The_Godfather"
    ),
}

# List the monitored entities as "<name> (<type>)", one per line.
print("Entities :")
for info in ENTITIES.values():
    print(f"{info['name']} ({info['type']})")
print()

Entities :
Bob vs. Society (film)
Fred Astaire (actor)
The Shawshank Redemption (film)
Christopher Nolan (director)
The Godfather (film)



In [28]:
# Directory that receives the CSV exports; created on first run if missing.
OUTPUT_DIR = "wikimedia_stream_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One CSV per output: raw edit metrics, raised alerts, per-entity summary.
METRICS_FILE, ALERTS_FILE, SUMMARY_FILE = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("metrics.csv", "alerts.csv", "summary.csv")
)

for label, path in (("Metrics", METRICS_FILE), ("Alerts", ALERTS_FILE)):
    print(f"{label} : {path}")
print(f"Summary : {SUMMARY_FILE}\n")

Metrics : wikimedia_stream_data\metrics.csv
Alerts : wikimedia_stream_data\alerts.csv
Summary : wikimedia_stream_data\summary.csv



In [29]:
# Accumulators filled while the stream is being read.
metrics_data = []  # one dict per matched edit
alerts_data = []   # one dict per raised alert


def _new_entity_stats():
    """Return a fresh, zeroed statistics record for one entity."""
    return {
        "total_edits": 0,
        "total_bytes_changed": 0,
        "unique_users": set(),
        "anonymous_edits": 0,
        "major_changes": 0,
    }


# Per-entity running totals; a record is created on first access to a key.
summary_stats = defaultdict(_new_entity_stats)


In [30]:
def is_entity_page(title):
    """Look up the monitored entity whose Wikipedia page title equals ``title``.

    Args:
        title: Page title, compared for exact equality with each entity's
            ``wiki_title``.

    Returns:
        ``(entity_key, entity_name)`` for the first matching entity, or
        ``(None, None)`` when no entity matches.
    """
    candidates = (
        (key, info['name'])
        for key, info in ENTITIES.items()
        if info['wiki_title'] == title
    )
    return next(candidates, (None, None))

def _is_ip_address(user):
    """Return True when ``user`` is a string that parses as an IPv4/IPv6 address."""
    if not isinstance(user, str):
        return False
    try:
        ipaddress.ip_address(user)
    except ValueError:
        return False
    return True


def process_change_event(change):
    """Turn one recent-change event into a metric record for a monitored entity.

    Only events with ``type == 'edit'`` on ``wiki == 'enwiki'`` whose title
    (spaces replaced by underscores) matches a monitored entity are kept.

    Args:
        change: Decoded JSON payload of one recent-change event (a dict).

    Returns:
        A dict with the keys ``timestamp``, ``entity_key``, ``entity_name``,
        ``user``, ``is_bot``, ``is_anonymous``, ``bytes_changed`` and
        ``comment`` (truncated to 100 characters), or ``None`` when the event
        is filtered out or cannot be processed.
    """
    try:
        # Keep only edit events.
        if change.get('type') != 'edit':
            return None

        # Keep only English Wikipedia.
        if change.get('wiki') != 'enwiki':
            return None

        title = change.get('title', '').replace(' ', '_')
        entity_key, entity_name = is_entity_page(title)

        if entity_key is None:
            return None

        # Note: fromtimestamp() yields a naive datetime in the local timezone.
        timestamp = datetime.fromtimestamp(change.get('timestamp', 0))
        user = change.get('user', 'Unknown')
        is_bot = change.get('bot', False)

        # The stream has no explicit "anonymous" flag: logged-out editors are
        # reported with their IP address as the user name. An empty user value
        # is still treated as anonymous, as before.
        # NOTE(review): wikis using temporary accounts report names starting
        # with '~' instead of IPs - confirm whether those should count too.
        is_anonymous = ('user' in change and not change.get('user')) or _is_ip_address(user)

        # Tolerate a missing or null 'length' object and null sizes instead of
        # raising and dropping the whole edit.
        length = change.get('length') or {}
        bytes_changed = (length.get('new') or 0) - (length.get('old') or 0)
        comment = (change.get('comment') or '')[:100]

        return {
            'timestamp': timestamp,
            'entity_key': entity_key,
            'entity_name': entity_name,
            'user': user,
            'is_bot': is_bot,
            'is_anonymous': is_anonymous,
            'bytes_changed': bytes_changed,
            'comment': comment
        }

    except Exception as e:
        # Best effort: a malformed event must not stop the stream consumer.
        print(f"Error {e}")
        return None

def check_for_alerts(metric):
    """Build the alerts triggered by one edit metric.

    Two independent rules are evaluated, in this order:
      * ``MAJOR_CHANGE`` when the absolute byte delta exceeds 1000;
      * ``ANONYMOUS_EDIT`` when the metric is flagged as anonymous.

    Args:
        metric: Metric dict providing ``timestamp``, ``entity_name``, ``user``,
            ``bytes_changed`` and ``is_anonymous``.

    Returns:
        A list (possibly empty) of alert dicts with the keys ``timestamp``,
        ``entity_name``, ``alert_type``, ``description`` and ``user``.
    """
    def build_alert(alert_type, description):
        # Every alert shares the same envelope taken from the metric.
        return {
            'timestamp': metric['timestamp'],
            'entity_name': metric['entity_name'],
            'alert_type': alert_type,
            'description': description,
            'user': metric['user']
        }

    triggered = []

    size_delta = metric['bytes_changed']
    if abs(size_delta) > 1000:
        triggered.append(
            build_alert('MAJOR_CHANGE', f"Changement de {size_delta} bytes")
        )

    if metric['is_anonymous']:
        triggered.append(
            build_alert('ANONYMOUS_EDIT', "Modification par utilisateur anonyme")
        )

    return triggered

def update_summary_stats(metric):
    """Fold one edit metric into the running totals of its entity.

    Increments the edit count, adds the byte delta, records the user in the
    set of unique users, and bumps the anonymous / major-change counters when
    the metric is anonymous or its absolute byte delta exceeds 1000.

    Args:
        metric: Metric dict providing ``entity_key``, ``bytes_changed``,
            ``user`` and ``is_anonymous``.
    """
    entity_stats = summary_stats[metric['entity_key']]

    entity_stats['total_edits'] += 1
    entity_stats['total_bytes_changed'] += metric['bytes_changed']
    entity_stats['unique_users'].add(metric['user'])

    # Booleans are folded in as 0/1 so each counter is updated in one statement.
    entity_stats['anonymous_edits'] += int(bool(metric['is_anonymous']))
    entity_stats['major_changes'] += int(abs(metric['bytes_changed']) > 1000)



def save_data():
    """Write the collected metrics, alerts and per-entity summary to CSV.

    Each file is always written with its full header, even when nothing was
    collected. The summary contains one row per monitored entity: entities
    without any recorded edit get zeros, so the set of rows does not depend on
    which entities happened to be edited during the run.
    """
    metric_columns = [
        'timestamp', 'entity_key', 'entity_name', 'user',
        'is_bot', 'is_anonymous', 'bytes_changed', 'comment'
    ]
    alert_columns = ['timestamp', 'entity_name', 'alert_type', 'description', 'user']
    summary_columns = [
        'entity_key', 'entity_name', 'total_edits', 'total_bytes_changed',
        'unique_users', 'anonymous_edits', 'major_changes'
    ]

    # Passing `columns` keeps the header stable for an empty list as well.
    pd.DataFrame(metrics_data, columns=metric_columns).to_csv(METRICS_FILE, index=False)
    print(f"✓ {len(metrics_data)} métriques sauvegardées")

    pd.DataFrame(alerts_data, columns=alert_columns).to_csv(ALERTS_FILE, index=False)
    print(f"✓ {len(alerts_data)} alertes sauvegardées")

    # Placeholder used for entities that received no edit during the run.
    no_activity = {
        'total_edits': 0,
        'total_bytes_changed': 0,
        'unique_users': (),
        'anonymous_edits': 0,
        'major_changes': 0
    }

    summary_rows = []
    for entity_key, entity_info in ENTITIES.items():
        # .get() (not indexing) so the defaultdict does not create new entries.
        stats = summary_stats.get(entity_key, no_activity)
        summary_rows.append({
            'entity_key': entity_key,
            'entity_name': entity_info['name'],
            'total_edits': stats['total_edits'],
            'total_bytes_changed': stats['total_bytes_changed'],
            'unique_users': len(stats['unique_users']),
            'anonymous_edits': stats['anonymous_edits'],
            'major_changes': stats['major_changes']
        })

    pd.DataFrame(summary_rows, columns=summary_columns).to_csv(SUMMARY_FILE, index=False)

In [31]:
def _extract_sse_data(event_text):
    """Return the payload of the first ``data: `` line of one SSE event, or None."""
    for line in event_text.split('\n'):
        if line.startswith('data: '):
            return line[6:]
    return None


def _record_metric(metric):
    """Print one matched edit, store it, and update alerts and summary statistics."""
    print(f"   Entity : {metric['entity_name']}")
    print(f"   User : {metric['user']}")
    print(f"   Change : {metric['bytes_changed']:+d} bytes\n")

    metrics_data.append(metric)

    alerts = check_for_alerts(metric)
    if alerts:
        print(f"   ⚠️  {len(alerts)} ALERTE(S) !")
        for alert in alerts:
            print(f"      → {alert['alert_type']}")
        alerts_data.extend(alerts)

    update_summary_stats(metric)


def connect_to_stream(duration_seconds=300):
    """Consume the Wikimedia recent-change stream and record monitored edits.

    Events are read from the server-sent-events endpoint, decoded as JSON and
    passed to ``process_change_event``; matching edits are appended to
    ``metrics_data``, checked for alerts and folded into ``summary_stats``.

    The collected data is always written with ``save_data()`` when the
    function exits: on normal expiry of the time budget, on a keyboard
    interrupt, when the server closes the stream, or after too many
    reconnection attempts.

    Args:
        duration_seconds: Time budget in seconds. The budget is checked each
            time a chunk is received and before each (re)connection.
    """
    url = 'https://stream.wikimedia.org/v2/stream/recentchange'
    headers = {
        'User-Agent': 'IMDb-Stream-Project/1.0 (Educational; Contact: votre.email@example.com)',
        'Accept': 'text/event-stream'
    }

    print(f"URL : {url}")
    print(f"Duration : {duration_seconds} seconds")

    start_time = time.time()
    event_count = 0
    reconnect_count = 0
    max_reconnects = 5

    try:
        while time.time() - start_time < duration_seconds and reconnect_count < max_reconnects:
            try:
                # The context manager releases the connection on every exit path.
                with requests.get(url, stream=True, headers=headers, timeout=60) as response:
                    if response.status_code != 200:
                        print(f"Erreur HTTP {response.status_code}")
                        reconnect_count += 1
                        continue

                    # Server-sent events are UTF-8; do not rely on header guessing.
                    response.encoding = 'utf-8'
                    buffer = ""

                    for chunk in response.iter_content(chunk_size=1024, decode_unicode=True):
                        elapsed = time.time() - start_time
                        if elapsed > duration_seconds:
                            # Time budget used up; the outer `finally` saves the data.
                            return

                        if not chunk:
                            continue

                        buffer += chunk

                        # A blank line terminates one event; keep the remainder buffered.
                        while '\n\n' in buffer:
                            event_text, buffer = buffer.split('\n\n', 1)

                            data_line = _extract_sse_data(event_text)
                            if not data_line:
                                continue

                            event_count += 1

                            if event_count % 100 == 0:
                                print(".", end="", flush=True)

                            if event_count == 1:
                                print(f"First event after {elapsed:.1f}s\n")

                            try:
                                change = json.loads(data_line)

                                if event_count % 500 == 0:
                                    print(f"{event_count} events")
                                    if 'wiki' in change:
                                        print(f"   Exemple : {change.get('wiki')} - {change.get('title', 'N/A')}")

                                metric = process_change_event(change)
                                if metric:
                                    _record_metric(metric)

                            except json.JSONDecodeError:
                                # Payload that is not valid JSON: skip this event.
                                continue
                            except Exception as e:
                                # Keep consuming, but report instead of hiding the failure.
                                print(f"Error {type(e).__name__}: {e}")
                                continue

                # The server closed the stream without an error: stop here.
                break

            except (requests.exceptions.ChunkedEncodingError,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                # Transient network failure: retry until max_reconnects is reached.
                reconnect_count += 1

            except KeyboardInterrupt:
                break

            except Exception as e:
                print(f"Error {type(e).__name__}")
                reconnect_count += 1
    finally:
        save_data()


In [32]:
# Listen to the recent-change stream with a 300-second time budget.
connect_to_stream(duration_seconds=300)

URL : https://stream.wikimedia.org/v2/stream/recentchange
Duration : 30 seconds
First event after 1.7s

.....500 events
   Exemple : wikidatawiki - Q105417433
....

In [33]:
def analyze_results():
    """Print a report built from the three CSV files on disk.

    Shows the full per-entity summary, then the last rows of the alerts file
    and the last rows of the metrics file (timestamp, entity, user and byte
    delta). Prints ``No data`` when any of the files does not exist.
    """
    try:
        df_metrics, df_alerts, df_summary = (
            pd.read_csv(path)
            for path in (METRICS_FILE, ALERTS_FILE, SUMMARY_FILE)
        )
    except FileNotFoundError:
        print("No data")
        return

    print(df_summary.to_string(index=False))

    print("\nLast Alerts :")
    if len(df_alerts):
        recent_alerts = df_alerts.tail()
        print(recent_alerts.to_string(index=False))
    else:
        print("     No alerts")

    print("\nLast change :")
    if len(df_metrics):
        shown_columns = ['timestamp', 'entity_name', 'user', 'bytes_changed']
        recent_changes = df_metrics[shown_columns].tail()
        print(recent_changes.to_string(index=False))
    else:
        print("     No changes")
# Print the summary, the latest alerts and the latest changes read back from the CSV files.
analyze_results()

          entity_key              entity_name  total_edits  total_bytes_changed  unique_users  anonymous_edits  major_changes
      bob_vs_society          Bob vs. Society            0                    0             0                0              0
        fred_astaire             Fred Astaire            0                    0             0                0              0
shawshank_redemption The Shawshank Redemption            0                    0             0                0              0
   christopher_nolan        Christopher Nolan            0                    0             0                0              0
       the_godfather            The Godfather            0                    0             0                0              0

Last Alerts :
     No alerts

Last change :
     No changes
