# Wikipedia Bias Analysis: Pageview-Filtered Networks

This notebook analyzes retention bias in Wikipedia link networks filtered by page popularity (pageviews).

## Filtering Modes

1. **Language-Specific**: Different pageview thresholds per language edition
2. **Global/Multilayer**: Single threshold based on aggregated pageviews across all languages

## Workflow

1. **Exploration**: Compare statistics for multiple quantile thresholds
2. **Analysis**: Select one threshold and run complete bias analysis
3. **Results**: View and save retention curves and AUC matrices

## Dataset

- **Edges**: Weighted links from DBpedia Spotlight
- **Metadata**: Demographic attributes (gender, birth year, region)
- **Pageviews**: Daily average views from Wikipedia API

## Setup and Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# Add project root to Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Project root: {project_root}")

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s: %(message)s'
)

# Import analysis modules
from src import analysis, filters

print("✓ Modules imported successfully")

# Set visualization defaults
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Helper function
def extract_dataset_name(csv_path):
    """Return the file name of ``csv_path`` with its directory and final extension removed."""
    _directory, filename = os.path.split(csv_path)
    stem, _extension = os.path.splitext(filename)
    return stem

# Reaching this line means every statement in the setup cell ran without raising.
print("✓ Setup complete")

## Configuration

**Modify these variables** to change the analysis parameters.

In [None]:
# --- Input / output locations -------------------------------------------------
# Every path is built from `project_root`.

# Weighted edge list (point this at your newest SpotlightWeightSource file).
EDGE_CSV = os.path.join(
    project_root, "data/out/SpotlightWeightSource_0102_0505.csv"
)

# Entity metadata table.
META_CSV = os.path.join(
    project_root, "data/entities_filtered_by_languages.csv"
)

# Pageview table; produced by: python src/main.py pageviews
PAGEVIEW_CSV = os.path.join(
    project_root, "data/out/pageviews_2023-01-01_to_2025-12-14.csv"
)

# Folder that receives the outputs; created here if it does not exist yet.
BASE_OUTPUT_DIR = os.path.join(project_root, "data/out/plots")
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

# --- Analysis parameters ------------------------------------------------------

# Language editions included in the analysis.
SELECTED_LANGUAGES = [
    'en',
    'de',
    'fr',
    'it',
    'es',
]

# Metadata attributes to analyze.
ATTRIBUTES = [
    'gender',
    'un_subregion',
    'bigperiod_birth',
]

# Statistical parameters.
MIN_EDGES = 500   # minimum number of edges for statistical reliability
RESOLUTION = 50   # number of threshold points (more = finer but slower)

# --- Filtering mode -----------------------------------------------------------
# False -> language-specific filtering (a different threshold per language)
# True  -> global/multilayer filtering (same node set across all languages)
USE_GLOBAL_PAGEVIEWS = False

# --- Exploration --------------------------------------------------------------
# Quantiles compared in the exploration section
# (0.8 keeps the top 20%, 0.9 the top 10%, and so on).
EXPLORE_QUANTILES = [0.5, 0.7, 0.8, 0.9]

# --- Analysis -----------------------------------------------------------------
# Quantile used for the full analysis; adjust after reviewing the exploration.
ANALYSIS_QUANTILE = 0.8

# Alternative: an absolute threshold instead of a quantile.
# ANALYSIS_ABSOLUTE = 1000  # Uncomment and modify analysis code to use this

# --- Derived values and summary -----------------------------------------------
DATASET_NAME = extract_dataset_name(EDGE_CSV)

print(
    f"Configuration loaded: {len(SELECTED_LANGUAGES)} languages, {len(ATTRIBUTES)} attributes",
    f"Dataset: {DATASET_NAME}",
    f"Filtering mode: {'Global/Multilayer' if USE_GLOBAL_PAGEVIEWS else 'Language-Specific'}",
    sep="\n",
)

## Load Data

In [3]:
def _read_input_csv(label, path, hint=""):
    """Read one input CSV, turning a missing file into an actionable error.

    Args:
        label: Human-readable name of the input, used in the error message.
        path: Location handed unchanged to ``pd.read_csv``.
        hint: Optional extra sentence appended to the error message.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``pd.read_csv`` cannot find ``path``. The message
            names the input, the path that was tried, and where to change it.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError as exc:
        # Same exception type as before, but with enough context to fix the
        # configuration without reading a pandas traceback.
        raise FileNotFoundError(
            f"{label} file not found: {path}\n"
            f"Update the corresponding path in the Configuration cell.{hint}"
        ) from exc


print("Loading data...")
edges_df = _read_input_csv("Edges", EDGE_CSV)
meta_df = _read_input_csv("Metadata", META_CSV)
pageviews_df = _read_input_csv(
    "Pageviews",
    PAGEVIEW_CSV,
    hint=" Generate the file with: python src/main.py pageviews",
)

print(f"✓ Loaded {len(edges_df):,} edges")
print(f"✓ Loaded {len(meta_df):,} metadata records")
print(f"✓ Loaded {len(pageviews_df):,} pageview records")

print(f"\nEdge columns: {list(edges_df.columns)}")
print(f"Meta columns: {list(meta_df.columns)}")
print(f"Pageview columns: {list(pageviews_df.columns)}")

# Quick sanity check of the pageview distribution before any filtering.
pageview_values = pageviews_df['pageviews']
print("\nPageview statistics:")
for stat_label, stat_value in (
    ("Min", pageview_values.min()),
    ("Median", pageview_values.median()),
    ("Mean", pageview_values.mean()),
    ("Max", pageview_values.max()),
):
    print(f"  {stat_label}: {stat_value:.2f}")

FileNotFoundError: [Errno 2] No such file or directory: '/home/vlr/Workspaces/WikipediaBiasProject/PyProject/data/out/pageviews_2023-01-01_to_2025-12-14.csv'

---
# Exploration: Compare Thresholds

This section compares different quantile thresholds **without running the full analysis**.

Review the statistics below, then:
1. Set `ANALYSIS_QUANTILE` in the configuration cell above and re-run that cell so the new value takes effect
2. Run the cells from the Analysis section onward

In [None]:
print("=" * 80)
print("EXPLORATION: Comparing Quantile Thresholds")
print("=" * 80)
print(f"Filtering mode: {'Global (multilayer)' if USE_GLOBAL_PAGEVIEWS else 'Language-specific'}\n")

# Decide once which table and column the thresholds are computed on, so the
# loop below does not have to branch on the filtering mode.
if USE_GLOBAL_PAGEVIEWS:
    print("Aggregating pageviews across all languages...")
    pageviews_agg = pageviews_df.groupby('wikidata_id')['pageviews'].sum().reset_index()
    pageviews_agg.columns = ['wikidata_id', 'pageviews_total']
    pageviews_col = 'pageviews_total'
    print(f"✓ Aggregated pageviews for {len(pageviews_agg):,} unique entities\n")

    # Show aggregated pageview statistics
    print("Aggregated pageview statistics:")
    print(f"  Min: {pageviews_agg['pageviews_total'].min():.2f}")
    print(f"  Median: {pageviews_agg['pageviews_total'].median():.2f}")
    print(f"  Mean: {pageviews_agg['pageviews_total'].mean():.2f}")
    print(f"  Max: {pageviews_agg['pageviews_total'].max():.2f}")
    print()
else:
    # NOTE(review): in this mode all rows of pageviews_df are pooled into one
    # quantile, an entity is kept if any of its rows passes, and edges are
    # filtered without regard to language. The notebook describes
    # language-specific mode as one threshold per language edition, so these
    # counts may differ from what filters.restrict_by_pageviews_quantile
    # retains — confirm before relying on them.
    pageviews_agg = pageviews_df
    pageviews_col = 'pageviews'

# Loop-invariant quantities, computed once.
pageview_values = pageviews_agg[pageviews_col]
total_entities = pageviews_agg['wikidata_id'].nunique()
total_edges = len(edges_df)


def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is zero."""
    return (part / whole) * 100 if whole else 0.0


# Compare quantiles
results_summary = []

for quantile in EXPLORE_QUANTILES:
    print(f"\nQuantile {quantile:.2f} (top {(1-quantile)*100:.0f}%)")
    print("-" * 50)

    # Pageview value at this quantile; entities at or above it are retained.
    threshold = pageview_values.quantile(quantile)
    print(f"  Threshold value: {threshold:.2f} daily views")

    retained_entities = pageviews_agg.loc[pageview_values >= threshold, 'wikidata_id'].unique()
    n_retained = len(retained_entities)
    pct_retained = _percentage(n_retained, total_entities)

    print(f"  Entities retained: {n_retained:,} / {total_entities:,} ({pct_retained:.1f}%)")

    # An edge survives only if BOTH its source and its target are retained.
    retained_set = set(retained_entities)
    edges_retained = edges_df[
        edges_df['source_wikidata_id'].isin(retained_set)
        & edges_df['target_wikidata_id'].isin(retained_set)
    ]

    n_edges_retained = len(edges_retained)
    edge_retention_rate = _percentage(n_edges_retained, total_edges)

    print(f"  Edges retained: {n_edges_retained:,} / {total_edges:,} ({edge_retention_rate:.1f}%)")

    results_summary.append({
        'quantile': quantile,
        'threshold': threshold,
        'entities_retained': n_retained,
        'entities_pct': pct_retained,
        'edges_retained': n_edges_retained,
        'edges_pct': edge_retention_rate
    })

# Summary table
print("\n" + "=" * 80)
print("SUMMARY TABLE")
print("=" * 80)
# Passing the columns explicitly keeps the frame well-formed (six columns, no
# rows) even when EXPLORE_QUANTILES is empty.
summary_df = pd.DataFrame(
    results_summary,
    columns=['quantile', 'threshold', 'entities_retained', 'entities_pct', 'edges_retained', 'edges_pct'],
).rename(columns={
    'quantile': 'Quantile',
    'threshold': 'Threshold',
    'entities_retained': 'Entities',
    'entities_pct': 'Entities %',
    'edges_retained': 'Edges',
    'edges_pct': 'Edges %',
})
print(summary_df.to_string(index=False))

print("\n" + "=" * 80)
print("Based on these statistics, set ANALYSIS_QUANTILE in the configuration cell above.")
print("Then rerun from the Analysis section below.")
print("=" * 80)

---
# Analysis: Run Bias Analysis with Selected Threshold

This section runs the complete bias analysis with the selected quantile threshold.

In [None]:
banner_rule = "=" * 80

# Header plus the label used for this run's outputs.
print(banner_rule)
if USE_GLOBAL_PAGEVIEWS:
    print(f"ANALYSIS: Global pageview filtering (quantile={ANALYSIS_QUANTILE})")
    filter_desc = f"pageview_q{ANALYSIS_QUANTILE}_global"
else:
    print(f"ANALYSIS: Language-specific pageview filtering (quantile={ANALYSIS_QUANTILE})")
    filter_desc = f"pageview_q{ANALYSIS_QUANTILE}_lang"
print(banner_rule)

# Pick the filter factory that matches the filtering mode, then build the filter.
filter_factory, filter_kind = (
    (filters.restrict_by_pageviews_quantile_global, "global")
    if USE_GLOBAL_PAGEVIEWS
    else (filters.restrict_by_pageviews_quantile, "language-specific")
)
filter_func = filter_factory(pageviews_df, ANALYSIS_QUANTILE)
print(f"✓ Created {filter_kind} pageview filter (top {(1-ANALYSIS_QUANTILE)*100:.0f}%)")

# Output directories are derived from the base directory, dataset name and filter label.
output_dirs = analysis.build_analysis_output_dirs(BASE_OUTPUT_DIR, DATASET_NAME, filter_desc)
print(f"✓ Output directory: {output_dirs['base']}")

# Try the cache first.
# NOTE(review): the cache location is built from BASE_OUTPUT_DIR, DATASET_NAME
# and filter_desc only; MIN_EDGES and RESOLUTION are not part of it — confirm
# whether results cached with other values could be reused here.
print("\nChecking for cached results...")
retention_results = analysis.load_cached_analysis_results(
    output_dirs, SELECTED_LANGUAGES, ATTRIBUTES
)

if retention_results is not None:
    print("✓ Using cached results (skipping computation)")
else:
    print("No cache found. Running analysis...\n")

    # Maps the edge CSV's column names to the names passed on to the analysis.
    edge_column_map = {
        'source_wikidata_id': 'src',
        'target_wikidata_id': 'trg',
        'weight': 'nij',
    }
    retention_results = analysis.run_bias_analysis(
        edges_df=edges_df,
        meta_df=meta_df,
        edge_id_map=edge_column_map,
        meta_id_col='wikidata_id',
        selected_languages=SELECTED_LANGUAGES,
        min_edges=MIN_EDGES,
        resolution=RESOLUTION,
        filter_zero_weights=True,
        language_col='language_code',
        add_aggregated_all=True,
        pre_transform_filters=[filter_func],
        post_transform_filters=None,
        dataset_label=filter_desc,
    )

    # Persist the fresh results.
    print("\nSaving results...")
    analysis.save_analysis_results(
        retention_results, output_dirs, SELECTED_LANGUAGES, ATTRIBUTES
    )
    print("✓ Results saved")

print("\n" + banner_rule)
print("ANALYSIS COMPLETE")
print(banner_rule)

## Display Results

In [None]:
heavy_rule = '━' * 80
light_rule = f"  {'-'*76}"

print("\n" + "=" * 80)
print("ANALYSIS RESULTS")
print("=" * 80)

# One section per language, one sub-section per attribute.
for lang, results_by_attr in retention_results.items():
    print(f"\n{heavy_rule}")
    print(f" Language: {lang.upper()}")
    print(heavy_rule)

    for attr, result_tuple in results_by_attr.items():
        # Stored results have either three or four elements; only the edge
        # results and the AUC matrix are displayed here.
        if len(result_tuple) == 4:
            edge_results, _node_results, auc_matrix, _dataset_label = result_tuple
        else:
            edge_results, _node_results, auc_matrix = result_tuple

        attr_title = attr.upper()

        print(f"\n  {attr_title} - AUC Matrix:")
        print(light_rule)
        print(auc_matrix.to_string(index=True))

        print(f"\n  {attr_title} - Edge Pairs Summary:")
        print(light_rule)
        for pair_result in edge_results:
            src, trg = pair_result['pair']
            print(f"    {src:15} → {trg:15}  |  AUC: {pair_result['auc']:.4f}  |  Edges: {pair_result['n_edges']:,}")

        # Say so explicitly when there were no pairs to list.
        if not edge_results:
            print(f"    (No pairs met the minimum threshold of {MIN_EDGES} edges)")

## Results Location

In [None]:
# Print where each kind of output lives, one line per directory.
for line_prefix, dir_key in (
    ("Results saved to: ", 'base'),
    ("  - AUC matrices: ", 'auc'),
    ("  - Retention data: ", 'retention_data'),
):
    print(f"{line_prefix}{output_dirs[dir_key]}")