# Week 6: Sankey Visualization Data Preprocessing

**Goal**: Process ACLED data into Actor → Country → Event Type flows for a Sankey visualization.

**Output**: `viz10_actor_sankey.json` with:
- Top 6 actors per actor type
- Aggregated flows: Actor → Country → Event Type
- Event counts and fatality statistics
- Timeline data hooks for future implementation

In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Configure how pandas renders DataFrames in cell output:
# no limit on displayed columns, at most 20 displayed rows.
display_settings = {
    'display.max_columns': None,
    'display.max_rows': 20,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

print("Libraries imported successfully")

## 1. Load and Explore ACLED Data

In [None]:
# Read the ACLED export into `df` and report its basic dimensions.
print("Loading ACLED dataset...")
acled_csv_path = '../raw-data/ACLED/ACLED_2025-10-29.csv'
df = pd.read_csv(acled_csv_path)

print(f"Dataset shape: {df.shape}")

# deep=True also measures the contents of object (string) columns.
total_bytes = df.memory_usage(deep=True).sum()
print(f"Memory usage: {total_bytes / 1024**2:.1f} MB")

# NOTE(review): min()/max() run on the column exactly as loaded; if
# event_date was read as text these are lexicographic bounds, not
# chronological ones — confirm the date format in the CSV.
event_dates = df['event_date']
print(f"Date range: {event_dates.min()} to {event_dates.max()}")

# Column dtypes and non-null counts
df.info()

In [None]:
# Columns the Sankey preprocessing reads in the cells below
key_columns = ['actor1', 'inter1', 'country', 'event_type', 'sub_event_type',
               'fatalities', 'year', 'event_date', 'event_id_cnty']

# Report dtype and null count for each expected column, flagging absent ones.
print("Key columns for analysis:")
for col in key_columns:
    if col not in df.columns:
        print(f"✗ {col}: NOT FOUND")
        continue
    column = df[col]
    print(f"✓ {col}: {column.dtype}, {column.isnull().sum()} nulls")

# Preview of the first rows (rendered as the cell's output).
# Raises KeyError if any of the key columns is missing.
print("\nSample data:")
df[key_columns].head()

## 2. Analyze Actor Types and Distribution

In [None]:
# Number of events per actor type (the inter1 column), most frequent first
print("Actor Type Distribution (inter1):")
actor_type_counts = df['inter1'].value_counts()
print(actor_type_counts)

# Bar chart of the same counts, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(12, 6))
actor_type_counts.plot(kind='bar', ax=ax)
ax.set_title('Event Distribution by Actor Type')
ax.set_xlabel('Actor Type')
ax.set_ylabel('Number of Events')
plt.xticks(rotation=45)  # tilt the category labels
plt.tight_layout()
plt.show()

In [None]:
# Number of events per event type, most frequent first
print("\nEvent Type Distribution:")
event_type_counts = df['event_type'].value_counts()
print(event_type_counts)

# Bar chart of the same counts, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(10, 6))
event_type_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution by Event Type')
ax.set_xlabel('Event Type')
ax.set_ylabel('Number of Events')
plt.xticks(rotation=45)  # tilt the category labels
plt.tight_layout()
plt.show()

## 3. Identify Top 6 Actors per Actor Type

In [None]:
# One row per (actor type, actor) pair with:
#   total_events     - count of non-null event_id_cnty values
#   fatalities       - sum of fatalities
#   countries_active - number of distinct countries
actor_stats = (
    df.groupby(['inter1', 'actor1'])
    .agg(
        total_events=('event_id_cnty', 'count'),
        fatalities=('fatalities', 'sum'),
        countries_active=('country', 'nunique'),
    )
    .reset_index()
)

print(f"Total unique actors: {len(actor_stats)}")
print("\nTop actors overall:")
summary_columns = ['actor1', 'inter1', 'total_events', 'fatalities']
print(actor_stats.nlargest(10, 'total_events')[summary_columns])

In [None]:
# For every actor type, keep the six actors with the most recorded events.
# Result: {actor type -> list of actor names, ordered by total_events desc}.
top_actors_by_type = {}

for actor_type in actor_stats['inter1'].unique():
    if pd.isna(actor_type):
        continue

    same_type = actor_stats['inter1'] == actor_type
    type_actors = actor_stats.loc[same_type].nlargest(6, 'total_events')
    top_actors_by_type[actor_type] = type_actors['actor1'].tolist()

    # Ranked listing; actor names are cut to 60 characters for alignment.
    print(f"\n{actor_type} - Top 6 Actors:")
    rank = 0
    for _, actor_row in type_actors.iterrows():
        rank += 1
        name = actor_row['actor1'][:60]
        print(f"{rank}. {name:<60} | {actor_row['total_events']:>6} events | {actor_row['fatalities']:>7} fatalities")

# Summary
total_selected_actors = sum(map(len, top_actors_by_type.values()))
print(f"\nTotal selected actors: {total_selected_actors}")
print(f"Actor types: {list(top_actors_by_type)}")

## 4. Filter Data and Create Flows

In [None]:
# Flatten the per-type selections into one list of actor names
# (later cells report its length as the number of selected actors).
all_selected_actors = [
    actor
    for actor_list in top_actors_by_type.values()
    for actor in actor_list
]

print(f"Filtering data for {len(all_selected_actors)} selected actors...")

# Keep only events whose (actor type, actor) pair was selected.
# Matching on the actor name alone would also keep events in which a selected
# actor is recorded under a different actor type for which it is NOT in the
# top 6, producing flows whose actor_type disagrees with top_actors_by_type.
selected_pairs = {
    (actor_type, actor)
    for actor_type, actor_list in top_actors_by_type.items()
    for actor in actor_list
}
is_selected = pd.Series(
    [pair in selected_pairs for pair in zip(df['inter1'], df['actor1'])],
    index=df.index,
    dtype=bool,
)
# .copy() so that later column assignments do not touch `df`.
df_filtered = df[is_selected].copy()

print(f"Original dataset: {len(df):,} events")
print(f"Filtered dataset: {len(df_filtered):,} events ({len(df_filtered)/len(df)*100:.1f}% of original)")

# Check data quality: null counts in the columns used to build flows
print("\nData quality check:")
print(f"Missing actor1: {df_filtered['actor1'].isnull().sum()}")
print(f"Missing country: {df_filtered['country'].isnull().sum()}")
print(f"Missing event_type: {df_filtered['event_type'].isnull().sum()}")
print(f"Missing fatalities: {df_filtered['fatalities'].isnull().sum()}")

In [None]:
# Clean data - treat a missing fatality count as 0
df_filtered['fatalities'] = df_filtered['fatalities'].fillna(0)

# Remove rows with missing essential data.
# The explicit .copy() makes df_clean an independent frame, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning.
essential_columns = ['actor1', 'inter1', 'country', 'event_type']
df_clean = df_filtered.dropna(subset=essential_columns).copy()

print(f"After cleaning: {len(df_clean):,} events ({len(df_clean)/len(df_filtered)*100:.1f}% retained)")

# Convert year to int (raises if any year is missing or non-numeric)
df_clean['year'] = df_clean['year'].astype(int)

print(f"Year range: {df_clean['year'].min()} - {df_clean['year'].max()}")

## 5. Aggregate Flows by Actor-Country-Event Type

In [None]:
# Create flows: Actor → Country → Event Type
print("Aggregating flows...")


def _most_common_sub_event(values):
    """Return the most frequent non-null value of *values*, or 'Unknown'.

    Series.mode() ignores nulls, so a group whose sub_event_type values are
    all missing yields an empty result; checking the mode result (rather than
    the group itself, which is never empty inside a groupby) avoids an
    IndexError on `.iloc[0]`. Ties resolve to the first entry mode() returns.
    """
    modes = values.mode()
    return 'Unknown' if modes.empty else modes.iloc[0]


# One row per (actor, actor type, country, event type) with:
#   events         - count of non-null event_id_cnty values
#   fatalities     - sum of fatalities
#   year           - sorted list of the distinct years in the group
#   sub_event_type - most common sub-event type in the group
flows = df_clean.groupby(['actor1', 'inter1', 'country', 'event_type']).agg({
    'event_id_cnty': 'count',
    'fatalities': 'sum',
    'year': lambda x: sorted(x.unique()),
    'sub_event_type': _most_common_sub_event,
}).rename(columns={
    'event_id_cnty': 'events'
}).reset_index()

print(f"Total flows created: {len(flows)}")
print(f"Unique actors: {flows['actor1'].nunique()}")
print(f"Unique countries: {flows['country'].nunique()}")
print(f"Unique event types: {flows['event_type'].nunique()}")

# Display sample flows (rendered as the cell's output)
print("\nSample flows:")
flows.head(10)

In [None]:
# Per-year event counts and fatality sums for every flow; used later to
# attach a year_breakdown to each exported flow record.
print("Creating year-by-year breakdown...")

per_year_keys = ['actor1', 'inter1', 'country', 'event_type', 'year']
year_breakdown = (
    df_clean.groupby(per_year_keys)
    .agg(
        events=('event_id_cnty', 'count'),
        fatalities=('fatalities', 'sum'),
    )
    .reset_index()
)

print(f"Year-breakdown records: {len(year_breakdown)}")
print("Sample year breakdown:")
year_breakdown.head()

## 6. Calculate Statistics for Color Coding

In [None]:
# Summary statistics over per-flow fatality totals (for color scaling)
fatalities_per_flow = flows['fatalities']
fatality_stats = {
    'min': int(fatalities_per_flow.min()),
    'max': int(fatalities_per_flow.max()),
    'mean': float(fatalities_per_flow.mean()),
    'median': float(fatalities_per_flow.median()),
    'q25': float(fatalities_per_flow.quantile(0.25)),
    'q75': float(fatalities_per_flow.quantile(0.75)),
}

print("Fatality Statistics:")
for stat_name, stat_value in fatality_stats.items():
    print(f"{stat_name}: {stat_value:,.1f}")

# Summary statistics over per-flow event counts
events_per_flow = flows['events']
event_stats = {
    'min': int(events_per_flow.min()),
    'max': int(events_per_flow.max()),
    'mean': float(events_per_flow.mean()),
    'median': float(events_per_flow.median()),
}

print("\nEvent Count Statistics:")
for stat_name, stat_value in event_stats.items():
    print(f"{stat_name}: {stat_value:,.1f}")

In [None]:
# Side-by-side histograms of per-flow fatalities and per-flow event counts.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Zero-fatality flows are left out of the fatalities histogram.
flows_nonzero_fatalities = flows[flows['fatalities'] > 0]

panels = (
    (ax1, flows_nonzero_fatalities['fatalities'], 'Fatalities',
     'Distribution of Fatalities (Non-zero only)'),
    (ax2, flows['events'], 'Events',
     'Distribution of Event Counts'),
)
for axis, values, x_label, title in panels:
    axis.hist(values, bins=50, alpha=0.7)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Number of Flows')
    axis.set_title(title)
    axis.set_yscale('log')  # the count axis is logarithmic on both panels

plt.tight_layout()
plt.show()

# Share of flows with no recorded fatalities
zero_fatality = flows['fatalities'] == 0
print(f"Flows with zero fatalities: {zero_fatality.sum()} / {len(flows)} ({zero_fatality.mean()*100:.1f}%)")

## 7. Prepare Final Dataset Structure

In [None]:
# Index the per-year records by flow:
#   (actor, actor_type, country, event_type) -> {year as str -> {'events', 'fatalities'}}
print("Building year breakdown lookup...")

year_breakdown_lookup = {}
for _, record in year_breakdown.iterrows():
    flow_key = (record['actor1'], record['inter1'], record['country'], record['event_type'])
    # setdefault creates the per-flow dict the first time a flow is seen.
    per_year = year_breakdown_lookup.setdefault(flow_key, {})
    per_year[str(int(record['year']))] = {
        'events': int(record['events']),
        'fatalities': int(record['fatalities']),
    }

print(f"Created lookup with {len(year_breakdown_lookup)} unique flow keys")

# Show the first entry as a quick check
sample_key = list(year_breakdown_lookup)[0]
print(f"\nSample lookup for {sample_key}:")
print(year_breakdown_lookup[sample_key])

In [None]:
# Assemble the exported structure: metadata, selected actors per type, flows.
print("Building final dataset structure...")

# Summary statistics over per-flow totals (stored in the metadata)
flow_fatalities = flows['fatalities']
flow_events = flows['events']

fatality_stats = {
    'min': int(flow_fatalities.min()),
    'max': int(flow_fatalities.max()),
    'mean': float(flow_fatalities.mean()),
    'median': float(flow_fatalities.median()),
    'q25': float(flow_fatalities.quantile(0.25)),
    'q75': float(flow_fatalities.quantile(0.75)),
}

event_stats = {
    'min': int(flow_events.min()),
    'max': int(flow_events.max()),
    'mean': float(flow_events.mean()),
    'median': float(flow_events.median()),
}

# One plain-Python record per flow; the per-year breakdown comes from the
# lookup and is an empty dict when the flow has no entry there.
flows_list = []
for _, flow in flows.iterrows():
    lookup_key = (flow['actor1'], flow['inter1'], flow['country'], flow['event_type'])
    flows_list.append({
        'actor': flow['actor1'],
        'actor_type': flow['inter1'],
        'country': flow['country'],
        'event_type': flow['event_type'],
        'sub_event_type': flow['sub_event_type'],
        'events': int(flow['events']),
        'fatalities': int(flow['fatalities']),
        'years': [int(y) for y in flow['year']],
        'year_breakdown': year_breakdown_lookup.get(lookup_key, {}),
    })

# Every record either has a non-empty year_breakdown or counts as missing.
populated = sum(1 for f in flows_list if f['year_breakdown'])
missing_year_breakdown = len(flows_list) - populated

print(f"Created {len(flows_list)} flow records")
print(f"Flows missing year_breakdown: {missing_year_breakdown}")
print(f"Flows with populated year_breakdown: {populated}/{len(flows_list)}")

# Create final dataset
clean_years = df_clean['year']
metadata = {
    'created_date': datetime.now().isoformat(),
    'source': 'ACLED_2025-10-29.csv',
    'total_actors': len(all_selected_actors),
    'total_flows': len(flows_list),
    'year_range': [int(clean_years.min()), int(clean_years.max())],
    'fatality_stats': fatality_stats,
    'event_stats': event_stats,
}
final_dataset = {
    'metadata': metadata,
    'actor_types': top_actors_by_type,
    'flows': flows_list,
}

print("\nFinal dataset structure:")
print(f"- Metadata: {len(final_dataset['metadata'])} fields")
print(f"- Actor types: {len(final_dataset['actor_types'])} categories")
print(f"- Flows: {len(final_dataset['flows'])} records")

# Show the first flow that carries a non-empty year_breakdown, if any
print("\nSample flow with year_breakdown:")
sample_with_data = next(filter(lambda f: f['year_breakdown'], flows_list), None)
if sample_with_data:
    print(json.dumps(sample_with_data, indent=2))
## 8. Export to JSON

In [None]:
# Export the assembled dataset to JSON, then reload it as a sanity check.
import os

output_file = '../viz-datasets/viz10_actor_sankey.json'

print(f"Exporting to {output_file}...")

# Create the target directory if it does not exist yet, so the write below
# does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# ensure_ascii=False writes non-ASCII characters as-is (file is UTF-8).
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(final_dataset, f, indent=2, ensure_ascii=False)

# Check file size (in MB)
file_size = os.path.getsize(output_file) / 1024**2
print("File exported successfully!")
print(f"File size: {file_size:.1f} MB")

# Test loading: the file must parse back as JSON
with open(output_file, 'r', encoding='utf-8') as f:
    test_load = json.load(f)

print(f"\nVerification - loaded {len(test_load['flows'])} flows")
# Only show sample keys when there is at least one flow to inspect.
if test_load['flows']:
    print(f"Sample flow keys: {list(test_load['flows'][0].keys())}")

## 9. Summary and Next Steps

In [None]:
# Recap of what the notebook produced, followed by the follow-up work.
print("=== DATA PREPROCESSING COMPLETE ===")
headline_lines = (
    f"\nOriginal dataset: {len(df):,} events",
    f"Processed flows: {len(flows_list):,}",
    f"Selected actors: {len(all_selected_actors)}",
    f"Actor types: {len(top_actors_by_type)}",
)
print(*headline_lines, sep="\n")

# Number of selected actors in each actor type
print("\nActor type breakdown:")
for actor_type, actors in top_actors_by_type.items():
    print(f"- {actor_type}: {len(actors)} actors")

print(f"\nOutput file: {output_file}")
print(f"File size: {file_size:.1f} MB")

print("\n=== READY FOR PHASE 2: SANKEY IMPLEMENTATION ===")
print("Next steps:")
next_steps = (
    "1. Copy JSON to public/src/assets/data/",
    "2. Update SankeyChart.vue with actor dropdown",
    "3. Implement 3-level sankey visualization",
    "4. Add fatality-based color coding",
)
for step in next_steps:
    print(step)