## Purpose
This notebook processes IEA World Energy Balance data to extract electricity consumption by sector for all countries/regions. The primary purpose is to provide **seed values** for electricity demand that will be used with AR6 scenario growth rates in VerveStacks country models.

## Methodology
1. **Data Loading**: Reads large IEA balance files (WORLDBIG1.TXT, WORLDBIG2.TXT ~2.4GB total)
2. **Caching**: Creates optimized cache files for fast subsequent processing
3. **Sectoral Aggregation**: Groups electricity flows into 4 key demand sectors:
   - **Industry**: Includes TOTIND + losses + energy sector + power plant own use + pipelines
   - **Buildings + Agriculture**: Residential + commercial + agriculture
   - **Non-road Transport**: Rail + domestic aviation + domestic navigation
   - **Road Transport**: Road vehicles
4. **Trade Flows**: Captures electricity imports and exports
5. **ISO Mapping**: Maps IEA region names to ISO country codes using KiNESYS region mapping

## Integration with AR6 Scenarios
The electricity consumption values from this analysis serve as **baseline seed values** for VerveStacks models. Each ISO inherits growth rates from its parent AR6 R10 region for different climate categories (e.g., Current Policies, NDCs, 1.5°C pathways).

## Output
- **Time Period**: 2018-2022 (5 years of historical data)
- **Geographic Coverage**: All KiNESYS-mapped ISOs (excludes "other*" regions)
- **Format**: Long table CSV with 6 numbers per ISO-year:
  - industry_twh, buildings_agri_twh, nonroad_transport_twh, road_transport_twh, imports_twh, exports_twh
- **File**: `iea_electricity_summary_2018_2022.csv`

## Key Features
- **Vectorized Processing**: Optimized pandas operations for fast execution (~6 seconds)
- **Comprehensive Coverage**: Includes all electricity flows from IEA balance methodology
- **Quality Assurance**: Absolute values for consumption, proper sign handling for trade
- **VerveStacks Ready**: Direct integration with country model generation workflows

In [None]:
# IEA Electricity Balance Data Processing for VerveStacks

import pandas as pd
import pickle
import os
from pathlib import Path
import time

# IEA Data Reader and Cacher
def load_iea_balance_files():
    """
    Read and cache the two large IEA balance TXT files
    WORLDBIG1.TXT (~1.1GB) and WORLDBIG2.TXT (~1.3GB)

    Both files are read from ``../data/IEA/Balances 2024`` (relative to the
    current working directory) as header-less CSV with the columns
    COUNTRY, FUEL, YEAR, FLOW, UNIT, VALUE. Each parsed table is pickled into
    ``../cache`` and that pickle is reused on later calls unless the source
    file has a newer modification time than the cache.

    Returns:
    --------
    tuple : (worldbig1_df, worldbig2_df), one pandas DataFrame per source file

    Raises:
    -------
    FileNotFoundError : if a source file is missing and no cache exists for it
    """
    column_names = ['COUNTRY', 'FUEL', 'YEAR', 'FLOW', 'UNIT', 'VALUE']
    chunk_size = 100000  # Process 100k rows at a time

    # File paths
    data_dir = Path("../data/IEA/Balances 2024")
    cache_dir = Path("../cache")
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Check if cached versions exist and are newer than source files
    def needs_refresh(source_file, cache_file):
        if not cache_file.exists():
            return True
        if not source_file.exists():
            # The raw file is gone but a cache survives: keep using the cache
            # instead of failing on stat() of the missing source.
            return False
        return source_file.stat().st_mtime > cache_file.stat().st_mtime

    # Load one balance file, from its cache when possible, otherwise from the TXT
    def load_or_cache(label):
        source_path = data_dir / f"{label}.TXT"
        cache_path = cache_dir / f"iea_{label.lower()}_cache.pkl"

        if not needs_refresh(source_path, cache_path):
            print(f"Loading {label} from cache...")
            # NOTE: unpickling is only appropriate because this cache file is
            # produced locally by this same function.
            with open(cache_path, 'rb') as f:
                frame = pickle.load(f)
            print(f"{label} loaded from cache: {len(frame):,} rows")
            return frame

        print(f"Reading {label}.TXT ({source_path.stat().st_size / (1024**3):.2f} GB)...")
        start_time = time.time()

        # Read in chunks, then stitch the pieces into a single table
        chunks = list(pd.read_csv(source_path, names=column_names, chunksize=chunk_size))
        frame = pd.concat(chunks, ignore_index=True)

        # Cache the result
        with open(cache_path, 'wb') as f:
            pickle.dump(frame, f)

        elapsed = time.time() - start_time
        print(f"{label} loaded and cached: {len(frame):,} rows in {elapsed:.1f}s")
        return frame

    worldbig1_df = load_or_cache("WORLDBIG1")
    worldbig2_df = load_or_cache("WORLDBIG2")

    return worldbig1_df, worldbig2_df

# Load the data: each table comes from its pickle cache when that cache is
# up to date, otherwise the raw TXT file is parsed and the cache rewritten.
print("Loading IEA Balance Data...")
worldbig1, worldbig2 = load_iea_balance_files()


In [None]:
# Explore the data structure
# The same structural overview is printed for each raw table in turn.
for _position, (_label, _frame) in enumerate((("WORLDBIG1", worldbig1), ("WORLDBIG2", worldbig2))):
    if _position:
        # Visual separator between the two table reports
        print("\n" + "="*50)
    print(f"=== {_label} Data Structure ===")
    print(f"Shape: {_frame.shape}")
    print(f"Columns: {list(_frame.columns)}")
    _megabytes = _frame.memory_usage(deep=True).sum() / (1024**2)
    print(f"Memory usage: {_megabytes:.1f} MB")
    print("\nFirst few rows:")
    print(_frame.head())
    print(f"\nUnique countries: {_frame['COUNTRY'].nunique()}")
    print(f"Unique fuels: {_frame['FUEL'].nunique()}")
    print(f"Year range: {_frame['YEAR'].min()} - {_frame['YEAR'].max()}")
    print(f"Unique flows: {_frame['FLOW'].nunique()}")
    print(f"Unique units: {_frame['UNIT'].nunique()}")

# Check for overlap between the two files
print("\n" + "="*50)
print("=== Data Overlap Analysis ===")
countries1 = set(worldbig1['COUNTRY'].unique())
countries2 = set(worldbig2['COUNTRY'].unique())
_only_first = countries1 - countries2
_only_second = countries2 - countries1
_shared = countries1 & countries2
print(f"Countries only in WORLDBIG1: {len(_only_first)}")
print(f"Countries only in WORLDBIG2: {len(_only_second)}")
print(f"Countries in both: {len(_shared)}")

# Samples of countries (per file), then fuels, flows and units from WORLDBIG1
for _caption, _values in (
    ("\nSample countries in WORLDBIG1", worldbig1['COUNTRY'].unique()[:10]),
    ("Sample countries in WORLDBIG2", worldbig2['COUNTRY'].unique()[:10]),
    ("\nSample fuels", worldbig1['FUEL'].unique()[:10]),
    ("Sample flows", worldbig1['FLOW'].unique()[:10]),
    ("Sample units", worldbig1['UNIT'].unique()),
):
    print(f"{_caption}: {list(_values)}")


In [None]:
# Load KiNESYS region mapping from VS_mappings
import pandas as pd

# Excel workbook that contains the kinesys_region_map sheet
vs_mappings_path = "../assumptions/VS_mappings.xlsx"

print("Loading kinesys_region_map sheet...")
kinesys_mapping = pd.read_excel(vs_mappings_path, sheet_name='kinesys_region_map')

# Quick look at what was loaded
print(f"KiNESYS region mapping loaded:")
print(f"Shape: {kinesys_mapping.shape}")
print(f"Columns: {list(kinesys_mapping.columns)}")
print("\nFirst few rows:")
print(kinesys_mapping.head())

# Distinct IEAReg entries, skipping blank cells and any name starting with 'OTHER'
iea_mapped_regions = [
    reg
    for reg in kinesys_mapping['IEAReg'].dropna().unique()
    if not str(reg).startswith('OTHER')
]
print(f"\nFound {len(iea_mapped_regions)} regions with IEAReg mappings:")
print(f"IEA regions: {list(iea_mapped_regions)}")


In [None]:
# Filter IEA data based on KiNESYS region mapping and exclude KTOE
def filter_and_combine_iea_data(worldbig1, worldbig2, iea_mapped_regions, region_mapping=None):
    """
    Filter IEA balance data to include only regions from IEAReg column and exclude KTOE units

    Parameters:
    -----------
    worldbig1, worldbig2 : pd.DataFrame
        The two IEA balance tables; both need COUNTRY and UNIT columns.
        Neither input is modified.
    iea_mapped_regions : list-like
        COUNTRY values to keep.
    region_mapping : pd.DataFrame, optional
        Table with 'IEAReg' and 'iso' columns used to build the
        COUNTRY -> iso lookup. If None, the notebook-level
        ``kinesys_mapping`` DataFrame is used.

    Returns:
    --------
    pd.DataFrame : stacked rows of both inputs, without KTOE rows, restricted
        to ``iea_mapped_regions``, with an added 'iso' column (NaN where the
        COUNTRY has no entry in the mapping).
    """
    if region_mapping is None:
        # Fall back to the mapping loaded by an earlier notebook cell
        region_mapping = kinesys_mapping

    # Combine both dataframes
    print("\nCombining WORLDBIG1 and WORLDBIG2...")
    combined_df = pd.concat([worldbig1, worldbig2], ignore_index=True)
    print(f"Combined shape: {combined_df.shape}")

    # Filter out KTOE units (keep only TJ)
    print("\nFiltering out KTOE units...")
    before_unit_filter = len(combined_df)
    combined_df = combined_df[combined_df['UNIT'] != 'KTOE']
    after_unit_filter = len(combined_df)
    print(f"Removed {before_unit_filter - after_unit_filter:,} KTOE rows")
    print(f"Remaining units: {combined_df['UNIT'].unique()}")

    # Filter by IEAReg mapped regions
    print(f"\nFiltering by IEAReg mapped regions...")
    print(f"Target regions: {list(iea_mapped_regions)}")

    before_region_filter = len(combined_df)
    # Take an explicit copy so the column assignment below writes to an
    # independent frame rather than to a slice of the intermediate result
    # (avoids pandas' SettingWithCopyWarning).
    combined_df = combined_df[combined_df['COUNTRY'].isin(iea_mapped_regions)].copy()
    after_region_filter = len(combined_df)

    print(f"Removed {before_region_filter - after_region_filter:,} rows from non-IEAReg regions")
    print(f"Remaining countries in dataset: {sorted(combined_df['COUNTRY'].unique())}")

    # Add ISO column: map COUNTRY to the 'iso' value of the matching IEAReg row
    country_to_iso = dict(zip(region_mapping['IEAReg'], region_mapping['iso']))
    combined_df['iso'] = combined_df['COUNTRY'].map(country_to_iso)

    return combined_df

# Apply filtering (but keep WORLD data separate for global totals)
filtered_iea_data = filter_and_combine_iea_data(worldbig1, worldbig2, iea_mapped_regions)

# Second dataset: every country kept, only the KTOE rows removed (for WORLD totals)
print("\n=== Creating Unfiltered Dataset for WORLD Totals ===")
combined_df_all = pd.concat([worldbig1, worldbig2], ignore_index=True)

print("Filtering out KTOE units from complete dataset...")
before_unit_filter = len(combined_df_all)
_not_ktoe = combined_df_all['UNIT'].ne('KTOE')
unfiltered_iea_data = combined_df_all.loc[_not_ktoe].copy()
after_unit_filter = len(unfiltered_iea_data)
_removed_rows = before_unit_filter - after_unit_filter
print(f"Removed {_removed_rows:,} KTOE rows from complete dataset")

# Side-by-side size report for the two datasets
for _title, _tag, _dataset in (
    ("Unfiltered", "unfiltered", unfiltered_iea_data),
    ("Filtered", "filtered", filtered_iea_data),
):
    print(f"{_title} dataset shape: {_dataset.shape}")
    print(f"Countries in {_tag}: {_dataset['COUNTRY'].nunique()}")

# Overview of the filtered dataset used for country-level work
print(f"\n=== Final Filtered Dataset ===")
print(f"Shape: {filtered_iea_data.shape}")
for _caption, _column in (("Countries", 'COUNTRY'), ("Fuels", 'FUEL'), ("Flows", 'FLOW')):
    print(f"{_caption}: {filtered_iea_data[_column].nunique()}")
_first_year = filtered_iea_data['YEAR'].min()
_last_year = filtered_iea_data['YEAR'].max()
print(f"Year range: {_first_year} - {_last_year}")
_filtered_mb = filtered_iea_data.memory_usage(deep=True).sum() / (1024**2)
print(f"Memory usage: {_filtered_mb:.1f} MB")


In [None]:
# Cache the filtered and combined dataset
cache_file = Path("../cache") / "iea_balance_2024.pkl"

print(f"\nCaching filtered IEA balance data to: {cache_file}")
with cache_file.open('wb') as f:
    pickle.dump(filtered_iea_data, f)

print(f"Successfully cached {len(filtered_iea_data):,} rows of IEA balance data")
_cache_megabytes = cache_file.stat().st_size / (1024**2)
print(f"Cache file size: {_cache_megabytes:.1f} MB")

# Build a small reference summary; keys are inserted in the order they are printed below
_summary_columns = (('COUNTRY', 'countries'), ('FUEL', 'fuels'), ('FLOW', 'flows'))
summary = {'total_rows': len(filtered_iea_data)}
summary.update({
    _plural: filtered_iea_data[_column].nunique()
    for _column, _plural in _summary_columns
})
summary['year_range'] = (
    int(filtered_iea_data['YEAR'].min()),
    int(filtered_iea_data['YEAR'].max()),
)
summary['units'] = list(filtered_iea_data['UNIT'].unique())
summary.update({
    f'sample_{_plural}': list(filtered_iea_data[_column].unique()[:20])
    for _column, _plural in _summary_columns
})

# Save summary as well
summary_file = Path("../cache") / "iea_balance_2024_summary.pkl"
with summary_file.open('wb') as f:
    pickle.dump(summary, f)

print(f"\nDataset Summary:")
for key, value in summary.items():
    # Long lists are abbreviated to their first ten entries
    _is_long_list = isinstance(value, list) and len(value) > 10
    _rendered = f"{len(value)} items (showing first 10: {value[:10]})" if _is_long_list else value
    print(f"  {key}: {_rendered}")

print(f"\n✅ IEA Balance 2024 data successfully processed and cached!")
print(f"   - Main data: {cache_file}")
print(f"   - Summary: {summary_file}")
print(f"   - Ready for integration into VerveStacks workflows")


In [None]:
def get_electricity_consumption_by_sector(iso_code, year, iea_data=None, use_unfiltered_for_world=True):
    """
    Extract electricity consumption by major sectors for a given region and year.

    Parameters:
    -----------
    iso_code : str
        Region identifier, matched against the COUNTRY column of the IEA data
        (e.g. 'USA', 'CHINA', 'GERMANY'), or 'WORLD' for global totals.
        Despite the parameter name, no ISO-code lookup is performed here.
    year : int
        Year for data extraction
    iea_data : pd.DataFrame, optional
        IEA balance data with COUNTRY, FUEL, YEAR, FLOW, UNIT and VALUE columns.
        If None, uses appropriate dataset based on iso_code: the notebook-level
        ``unfiltered_iea_data`` for 'WORLD' (when use_unfiltered_for_world is
        True), otherwise ``filtered_iea_data`` or the pickle cache on disk.
    use_unfiltered_for_world : bool, optional
        If True, uses complete dataset for WORLD totals (not just KiNESYS regions)

    Returns:
    --------
    dict : TWh consumption per sector ('industry_twh', 'buildings_agri_twh',
        'nonroad_transport_twh', 'road_transport_twh', 'energy_sector_twh'),
        their sum ('total_twh'), the flows found per sector ('flows_used') and
        a 'data_available' flag. When no electricity rows match, every TWh
        value is 0.0, 'flows_used' is empty and 'data_available' is False.

    Raises:
    -------
    FileNotFoundError : if iea_data is None and neither the in-memory filtered
        dataset nor its cache file is available.

    Flow Mapping:
    -------------
    INDUSTRY: TOTIND (Total Industry aggregate)
        - Includes: CHEMICAL, IRONSTL, NONFERR, NONMET, MACHINE, FOODPRO,
                   PAPERPRO, TEXTILES, WOODPRO, MINING, CONSTRUC

    BUILDINGS+AGRI: RESIDENT + COMMPUB + AGRICULT
        - RESIDENT: Residential buildings
        - COMMPUB: Commercial and public services
        - AGRICULT: Agriculture, forestry, fishing

    NON-ROAD TRANSPORT: RAIL + DOMESAIR + DOMESNAV
        - RAIL: Rail transport
        - DOMESAIR: Domestic aviation
        - DOMESNAV: Domestic navigation

    ROAD TRANSPORT: ROAD
        - ROAD: Road transport

    ENERGY SECTOR: EREFINER + EBLASTFUR + ECOKEOVS + EGASWKS + EGTL + EMINES + EOILGASEX + PIPELINE
        - EREFINER: Oil refineries
        - EBLASTFUR: Blast furnaces (iron & steel)
        - ECOKEOVS: Coke ovens
        - EGASWKS: Gas works
        - EGTL: Gas-to-liquids plants
        - EMINES: Coal mines
        - EOILGASEX: Oil and gas extraction
        - PIPELINE: Pipeline transport
    """

    # Sector -> IEA flow codes. Insertion order fixes the order of the sums below.
    sector_flows = {
        'industry': ['TOTIND'],
        'buildings_agri': ['RESIDENT', 'COMMPUB', 'AGRICULT'],
        'nonroad_transport': ['RAIL', 'DOMESAIR', 'DOMESNAV'],  # PIPELINE is counted under energy sector
        'road_transport': ['ROAD'],
        # Energy sector consumption (often missing from final consumption totals)
        'energy_sector': [
            'EREFINER',   # Oil refineries
            'EBLASTFUR',  # Blast furnaces (iron & steel)
            'ECOKEOVS',   # Coke ovens
            'EGASWKS',    # Gas works
            'EGTL',       # Gas-to-liquids plants
            'EMINES',     # Coal mines
            'EOILGASEX',  # Oil and gas extraction
            'PIPELINE',   # Pipeline transport (energy sector, not final consumption)
        ],
    }

    # Load appropriate data if not provided
    if iea_data is None:
        # Use unfiltered data for WORLD totals, filtered for specific countries
        if iso_code == 'WORLD' and use_unfiltered_for_world:
            try:
                iea_data = unfiltered_iea_data
                print(f"Using unfiltered dataset for WORLD totals")
            except NameError:
                # Fallback: create unfiltered data on the fly
                print("Creating unfiltered dataset for WORLD totals...")
                combined_df_all = pd.concat([worldbig1, worldbig2], ignore_index=True)
                iea_data = combined_df_all[combined_df_all['UNIT'] != 'KTOE'].copy()
                print(f"Created unfiltered dataset with {len(iea_data):,} rows")
        else:
            try:
                iea_data = filtered_iea_data  # Use in-memory filtered data
                print(f"Using filtered dataset for {iso_code}")
            except NameError:
                # Fallback: load from cache
                cache_file = Path("../cache/iea_balance_2024.pkl")
                if cache_file.exists():
                    with open(cache_file, 'rb') as f:
                        iea_data = pickle.load(f)
                    print(f"Using cached filtered dataset for {iso_code}")
                else:
                    raise FileNotFoundError("No IEA data available. Run the data loading cells first.")

    # Filter for electricity product, specific country and year
    elec_data = iea_data[
        (iea_data['FUEL'] == 'ELECTR') &
        (iea_data['COUNTRY'] == iso_code) &
        (iea_data['YEAR'] == year) &
        (iea_data['UNIT'] == 'TJ')  # Use TJ units
    ]

    if elec_data.empty:
        print(f"No electricity data found for {iso_code} in {year}")
        # Same keys as the regular result so callers can read any of them safely
        return {
            'industry_twh': 0.0,
            'buildings_agri_twh': 0.0,
            'nonroad_transport_twh': 0.0,
            'road_transport_twh': 0.0,
            'energy_sector_twh': 0.0,
            'flows_used': {},
            'data_available': False,
            'total_twh': 0.0
        }

    # Parse VALUE once. Missing-value markers such as '..' become NaN, which
    # sum() skips. Absolute values are used for consumption.
    values_tj = pd.to_numeric(elec_data['VALUE'], errors='coerce').abs()

    sector_twh = {}
    flows_found = {}
    for sector, flows in sector_flows.items():
        in_sector = elec_data['FLOW'].isin(flows)
        # Convert TJ to TWh: 1 TWh = 3,600 TJ
        sector_twh[sector] = float(values_tj[in_sector].sum()) / 3600.0
        # Flows present in the data for this sector, even when their value is missing
        flows_found[sector] = elec_data.loc[in_sector, 'FLOW'].unique().tolist()

    # Total is rounded from the unrounded sector values, not from the rounded ones
    total_twh = sum(sector_twh.values())

    return {
        'industry_twh': round(sector_twh['industry'], 2),
        'buildings_agri_twh': round(sector_twh['buildings_agri'], 2),
        'nonroad_transport_twh': round(sector_twh['nonroad_transport'], 2),
        'road_transport_twh': round(sector_twh['road_transport'], 2),
        'energy_sector_twh': round(sector_twh['energy_sector'], 2),
        'flows_used': flows_found,
        'data_available': True,
        'total_twh': round(total_twh, 2)
    }

# Test the function with a few examples
print("=== Testing Electricity Consumption Function ===")

# Major economies, named the way the IEA data names them (assumed to be present)
test_countries = ['WORLD', 'USA', 'CHINA', 'GERMANY', 'JAPAN']  # Using IEA region names
test_year = 2022

# (printed caption, result key) pairs, in display order
_sector_report = (
    ("Industry", 'industry_twh'),
    ("Buildings+Agri", 'buildings_agri_twh'),
    ("Non-road Transport", 'nonroad_transport_twh'),
    ("Road Transport", 'road_transport_twh'),
    ("Energy Sector", 'energy_sector_twh'),
    ("TOTAL", 'total_twh'),
)

for country in test_countries:
    try:
        # WORLD passes no dataset so the function picks the unfiltered one itself;
        # every other region is looked up in the filtered dataset.
        _dataset = None if country == 'WORLD' else filtered_iea_data
        result = get_electricity_consumption_by_sector(country, test_year, _dataset)

        if not result['data_available']:
            print(f"\n{country} - {test_year}: No data available")
            continue

        print(f"\n{country} - {test_year} Electricity Consumption:")
        for _caption, _field in _sector_report:
            print(f"  {_caption}: {result[_field]} TWh")
        print(f"  Flows found: {result['flows_used']}")

    except Exception as e:
        print(f"\n{country} - {test_year}: Error - {e}")

# Show available countries in the dataset
print(f"\n=== Available Countries in Dataset ===")
available_countries = sorted(filtered_iea_data['COUNTRY'].unique())
print(f"Total countries: {len(available_countries)}")
print(f"Sample countries: {available_countries[:20]}")

# Show available years
print(f"\n=== Available Years ===")
available_years = sorted(filtered_iea_data['YEAR'].unique())
_earliest, _latest = available_years[0], available_years[-1]
print(f"Year range: {_earliest} - {_latest}")
print(f"Recent years: {available_years[-10:]}")


In [None]:
def get_complete_electricity_balance(iso_code, year, iea_data=None, use_unfiltered_for_world=True):
    """
    Get the complete electricity balance showing all major components.

    Parameters:
    -----------
    iso_code : str
        Region identifier, matched against the COUNTRY column of the IEA data,
        or 'WORLD' for global totals. No ISO-code lookup is performed here.
    year : int
        Year for data extraction
    iea_data : pd.DataFrame, optional
        IEA balance data. If None, the notebook-level ``unfiltered_iea_data``
        is used for 'WORLD' (when use_unfiltered_for_world is True), otherwise
        ``filtered_iea_data`` or the pickle cache on disk.
    use_unfiltered_for_world : bool, optional
        If True, uses the complete dataset for WORLD totals

    Returns:
    --------
    dict : all balance components in TWh plus 'flows_available', or
        ``{'error': message}`` when no electricity rows match.

    Raises:
    -------
    FileNotFoundError : if iea_data is None and neither the in-memory filtered
        dataset nor its cache file is available.

    Flows read from the electricity (ELECTR) rows:

    SUPPLY SIDE:
    - TOTTRANF: reported as 'indigenous_production_twh'
      NOTE(review): the flow read is TOTTRANF, not INDPROD - confirm this is
      the intended production measure.
    - IMPORTS: Electricity imports
    - EXPORTS: Electricity exports (sign kept as in the data)

    NON-FINAL CONSUMPTION (absolute values):
    - EPOWERPLT: Own use by power plants
    - DISTLOSS: Distribution losses
    - Energy sector flows, taken from get_electricity_consumption_by_sector

    TRANSFORMATION OUTPUTS (reported, not part of the balance check):
    - ELMAINE: Main activity electricity plants
    - ELMAINC: Main activity CHP plants
    - ELAUTOE: Autoproducer electricity plants
    - ELAUTOC: Autoproducer CHP plants

    FINAL CONSUMPTION:
    - Industry, Buildings+Agri, Transport (as calculated in previous function)

    STATISTICAL DIFFERENCE:
    - total supply minus total (final + non-final) consumption
    """

    # Load appropriate data if not provided
    if iea_data is None:
        # Use unfiltered data for WORLD totals, filtered for specific countries
        if iso_code == 'WORLD' and use_unfiltered_for_world:
            try:
                iea_data = unfiltered_iea_data
            except NameError:
                # Fallback: create unfiltered data on the fly
                combined_df_all = pd.concat([worldbig1, worldbig2], ignore_index=True)
                iea_data = combined_df_all[combined_df_all['UNIT'] != 'KTOE'].copy()
        else:
            try:
                iea_data = filtered_iea_data  # Use in-memory filtered data
            except NameError:
                # Fallback: load from cache
                cache_file = Path("../cache/iea_balance_2024.pkl")
                if cache_file.exists():
                    with open(cache_file, 'rb') as f:
                        iea_data = pickle.load(f)
                else:
                    raise FileNotFoundError("No IEA data available. Run the data loading cells first.")

    # Filter for electricity product, specific country and year
    elec_data = iea_data[
        (iea_data['FUEL'] == 'ELECTR') &
        (iea_data['COUNTRY'] == iso_code) &
        (iea_data['YEAR'] == year) &
        (iea_data['UNIT'] == 'TJ')
    ]

    if elec_data.empty:
        return {'error': f'No electricity data found for {iso_code} in {year}'}

    # Helper function to get flow value in TWh (first matching row only)
    def get_flow_twh(flow_name, data, use_abs=False):
        flow_rows = data[data['FLOW'] == flow_name]
        if flow_rows.empty:
            return 0.0
        value = pd.to_numeric(flow_rows['VALUE'].iloc[0], errors='coerce')
        if pd.isna(value):
            # '..' and any other non-numeric marker count as zero instead of
            # turning every derived total into NaN
            return 0.0
        value = float(value) / 3600.0  # 1 TWh = 3,600 TJ
        return abs(value) if use_abs else value

    # SUPPLY SIDE
    indigenous_production = get_flow_twh('TOTTRANF', elec_data)
    imports = get_flow_twh('IMPORTS', elec_data)
    exports = get_flow_twh('EXPORTS', elec_data)  # Keep negative for exports

    # NON-FINAL CONSUMPTION (use absolute values)
    power_plant_own_use = get_flow_twh('EPOWERPLT', elec_data, use_abs=True)  # Power plant own use
    distribution_losses = get_flow_twh('DISTLOSS', elec_data, use_abs=True)  # Distribution losses

    # TRANSFORMATION SECTOR (Electricity generation outputs - typically not needed for balance)
    main_activity_elec = get_flow_twh('ELMAINE', elec_data)  # Main electricity plants output
    main_activity_chp = get_flow_twh('ELMAINC', elec_data)   # Main CHP electricity output
    autoproducer_elec = get_flow_twh('ELAUTOE', elec_data)   # Autoproducer electricity output
    autoproducer_chp = get_flow_twh('ELAUTOC', elec_data)    # Autoproducer CHP electricity output

    # FINAL CONSUMPTION (using our previous function)
    consumption = get_electricity_consumption_by_sector(iso_code, year, iea_data, use_unfiltered_for_world)

    # Read sector values defensively: a result without data may not carry every key
    industry = consumption.get('industry_twh', 0.0)
    buildings_agri = consumption.get('buildings_agri_twh', 0.0)
    nonroad_transport = consumption.get('nonroad_transport_twh', 0.0)
    road_transport = consumption.get('road_transport_twh', 0.0)
    energy_sector = consumption.get('energy_sector_twh', 0.0)

    # Calculate totals
    total_supply = indigenous_production + imports + exports  # exports is negative (net supply)
    total_transformation_output = main_activity_elec + main_activity_chp + autoproducer_elec + autoproducer_chp
    total_non_final_consumption = power_plant_own_use + distribution_losses + energy_sector  # All positive
    if consumption.get('data_available'):
        total_final_consumption = industry + buildings_agri + nonroad_transport + road_transport
    else:
        total_final_consumption = 0.0
    total_all_consumption = total_final_consumption + total_non_final_consumption

    # Statistical difference (should be close to zero in a balanced system)
    # Supply should equal total consumption
    statistical_difference = total_supply - total_all_consumption

    return {
        'country': iso_code,
        'year': year,

        # SUPPLY SIDE (TWh)
        'indigenous_production_twh': round(indigenous_production, 2),
        'imports_twh': round(imports, 2),
        'exports_twh': round(exports, 2),  # negative
        'net_imports_twh': round(imports + exports, 2),
        'total_supply_twh': round(total_supply, 2),

        # NON-FINAL CONSUMPTION (TWh) - all positive values
        'power_plant_own_use_twh': round(power_plant_own_use, 2),
        'distribution_losses_twh': round(distribution_losses, 2),
        'total_non_final_consumption_twh': round(total_non_final_consumption, 2),

        # TRANSFORMATION SECTOR (TWh) - generation outputs
        'main_activity_elec_twh': round(main_activity_elec, 2),
        'main_activity_chp_twh': round(main_activity_chp, 2),
        'autoproducer_elec_twh': round(autoproducer_elec, 2),
        'autoproducer_chp_twh': round(autoproducer_chp, 2),
        'total_transformation_output_twh': round(total_transformation_output, 2),

        # FINAL CONSUMPTION (TWh)
        'industry_twh': industry,
        'buildings_agri_twh': buildings_agri,
        'nonroad_transport_twh': nonroad_transport,
        'road_transport_twh': road_transport,
        'energy_sector_twh': energy_sector,
        'total_final_consumption_twh': round(total_final_consumption, 2),
        'total_all_consumption_twh': round(total_all_consumption, 2),

        # BALANCE CHECK
        'statistical_difference_twh': round(statistical_difference, 2),

        # FLOWS FOUND
        'flows_available': sorted(elec_data['FLOW'].unique().tolist())
    }

# Test the complete balance function
print("=== Complete Electricity Balance Analysis ===")

# World totals are the first thing to check
test_country = 'WORLD'  # Start with world totals
test_year = 2022

# No dataset is passed, so the function selects the unfiltered data for WORLD
balance = get_complete_electricity_balance(test_country, test_year)

if 'error' in balance:
    print(f"Error: {balance['error']}")
else:
    print(f"\n{balance['country']} - {balance['year']} Complete Electricity Balance:")

    # Supply
    print(f"\n--- SUPPLY SIDE ---")
    print(f"Indigenous Production: {balance['indigenous_production_twh']:>10.1f} TWh")
    print(f"Imports:              {balance['imports_twh']:>10.1f} TWh")
    print(f"Exports:              {balance['exports_twh']:>10.1f} TWh")
    print(f"Net Imports:          {balance['net_imports_twh']:>10.1f} TWh")
    print(f"TOTAL SUPPLY:         {balance['total_supply_twh']:>10.1f} TWh")

    # Own use, losses and energy sector
    print(f"\n--- NON-FINAL CONSUMPTION ---")
    print(f"Power Plant Own Use:  {balance['power_plant_own_use_twh']:>10.1f} TWh")
    print(f"Distribution Losses:  {balance['distribution_losses_twh']:>10.1f} TWh")
    print(f"Energy Sector:        {balance['energy_sector_twh']:>10.1f} TWh")
    print(f"TOTAL NON-FINAL:      {balance['total_non_final_consumption_twh']:>10.1f} TWh")

    # End-use sectors
    print(f"\n--- FINAL CONSUMPTION ---")
    print(f"Industry:             {balance['industry_twh']:>10.1f} TWh")
    print(f"Buildings + Agri:     {balance['buildings_agri_twh']:>10.1f} TWh")
    print(f"Non-road Transport:   {balance['nonroad_transport_twh']:>10.1f} TWh")
    print(f"Road Transport:       {balance['road_transport_twh']:>10.1f} TWh")
    print(f"TOTAL FINAL CONS:     {balance['total_final_consumption_twh']:>10.1f} TWh")
    print(f"TOTAL ALL CONS:       {balance['total_all_consumption_twh']:>10.1f} TWh")

    # Supply minus consumption
    print(f"\n--- BALANCE CHECK ---")
    print(f"Statistical Difference: {balance['statistical_difference_twh']:>8.1f} TWh")

    print(f"\n--- FLOWS AVAILABLE ---")
    print(f"Available flows: {balance['flows_available']}")

# Closing notes, one printed line per entry
print(f"\n=== What We Were Missing ===")
for _note in (
    "From our original function, we were missing:",
    "1. SUPPLY SIDE: Indigenous production, imports, exports",
    "2. TRANSFORMATION: Power plant outputs and own use",
    "3. LOSSES: Distribution losses",
    "4. BALANCE CHECK: Statistical difference to verify data consistency",
    "\nThese components are crucial for:",
    "- Understanding electricity trade flows",
    "- Calculating system efficiency",
    "- Validating data quality",
    "- Energy security analysis",
):
    print(_note)


In [None]:
# STOP THE SLOW LOOP! Use this VECTORIZED approach instead:
#
# Builds the per-ISO, per-year electricity summary (sector consumption, trade and
# total production, all in TWh) from the cached IEA balance using groupby/merge
# operations over all countries and years at once, then writes it to CSV.

# Years kept in the summary. Every progress message derives its range from this
# list so the log cannot drift from the filter that is actually applied.
years = list(range(2000, 2023))

TJ_PER_TWH = 3600.0  # 1 TWh = 3600 TJ; only rows with UNIT == 'TJ' are kept below


def sum_elec_flows(elec_data, flows, value_col, out_col):
    """Sum ``value_col`` per (COUNTRY, YEAR) over the rows whose FLOW is in ``flows``.

    Args:
        elec_data: DataFrame with COUNTRY, YEAR, FLOW and ``value_col`` columns.
        flows: Iterable of FLOW codes that make up the group.
        value_col: Column to sum ('TWH_ABS' for magnitudes, 'TWH' to keep the sign).
        out_col: Name given to the summed column in the result.

    Returns:
        A new DataFrame with columns COUNTRY, YEAR and ``out_col``.
    """
    selected = elec_data[elec_data['FLOW'].isin(flows)]
    totals = selected.groupby(['COUNTRY', 'YEAR'])[value_col].sum().reset_index()
    return totals.rename(columns={value_col: out_col})


def build_iea_region_to_iso(kinesys_mapping, excluded_isos):
    """Build the IEA-region -> ISO lookup from the KiNESYS mapping sheet.

    Rows with a missing 'IEAReg' or 'iso' are ignored. Rows whose ISO or IEA
    region starts with "other" (case-insensitive), or whose ISO is in
    ``excluded_isos``, are counted as excluded and left out of the lookup.
    If an IEA region appears on several kept rows, the last row wins.

    Args:
        kinesys_mapping: DataFrame with 'IEAReg' and 'iso' columns.
        excluded_isos: Upper-case ISO codes to leave out.

    Returns:
        Tuple ``(region_to_iso, excluded_count)``.

    Raises:
        KeyError: If the sheet lacks an 'IEAReg' or 'iso' column (a wrong
            column name would otherwise silently yield an empty mapping).
    """
    region_to_iso = {}
    excluded_count = 0
    for iea_reg, iso_code in zip(kinesys_mapping['IEAReg'], kinesys_mapping['iso']):
        if pd.isna(iea_reg) or pd.isna(iso_code):
            continue
        if (
            str(iso_code).lower().startswith('other')
            or str(iea_reg).upper().startswith('OTHER')
            or str(iso_code).upper() in excluded_isos
        ):
            excluded_count += 1
        else:
            region_to_iso[iea_reg] = iso_code
    return region_to_iso, excluded_count


def merge_aggs_by_iso(base, agg_frames):
    """Left-merge aggregate frames onto the (iso, year) rows of ``base``.

    Args:
        base: DataFrame with 'iso' and 'year' columns; its row order is kept.
        agg_frames: Iterable of DataFrames with COUNTRY, YEAR, 'iso' and one
            value column each. Rows with a missing 'iso' are dropped. The
            frames themselves are not modified.

    Returns:
        A new DataFrame with 'iso', 'year' and one column per aggregate frame
        (NaN where an aggregate has no row for that ISO-year).
    """
    merged = base[['iso', 'year']].copy()
    for agg_df in agg_frames:
        # Non-mutating chain: avoids renaming in place on a filtered slice.
        keyed = (
            agg_df[agg_df['iso'].notna()]
            .rename(columns={'YEAR': 'year'})
            .drop(columns='COUNTRY')
        )
        merged = merged.merge(keyed, on=['iso', 'year'], how='left')
    return merged


print(f"=== VECTORIZED Electricity Dataset Generation ({years[0]}-{years[-1]}) ===")

# Load the cleaned cached data directly - NO LOOPS!
cache_file = Path("../cache/iea_balance_2024.pkl")
if cache_file.exists():
    print("Loading cleaned IEA data from cache...")
    # NOTE: pickle.load can execute arbitrary code from the file; only load a
    # cache that this project produced itself.
    with open(cache_file, 'rb') as f:
        iea_data = pickle.load(f)
    print(f"Loaded {len(iea_data):,} rows from cache")

    # Filter ALL data at once - much faster than looping
    elec_data = iea_data[
        (iea_data['FUEL'] == 'ELECTR')
        & (iea_data['YEAR'].isin(years))
        & (iea_data['UNIT'] == 'TJ')
        & (iea_data['VALUE'] != '..')  # '..' rows are dropped before numeric conversion
    ].copy()

    print(f"Filtered to {len(elec_data):,} electricity records for {years[0]}-{years[-1]}")

    # Convert to numeric and calculate TWh (vectorized - all at once).
    # Values that still fail to parse become NaN and are skipped by the sums.
    elec_data['VALUE'] = pd.to_numeric(elec_data['VALUE'], errors='coerce')
    elec_data['TWH_ABS'] = elec_data['VALUE'].abs() / TJ_PER_TWH  # Absolute TWh
    elec_data['TWH'] = elec_data['VALUE'] / TJ_PER_TWH  # Keep sign for exports

    # Define flow groups
    industry_flows = ['TOTIND', 'EPOWERPLT', 'DISTLOSS', 'EREFINER', 'EBLASTFUR', 'ECOKEOVS',
                      'EGASWKS', 'EGTL', 'EMINES', 'EOILGASEX', 'PIPELINE']
    buildings_agri_flows = ['RESIDENT', 'COMMPUB', 'AGRICULT']
    nonroad_transport_flows = ['RAIL', 'DOMESAIR', 'DOMESNAV']
    road_transport_flows = ['ROAD']
    total_production_flow = ['TOTTRANF']

    # Aggregate ALL countries and years at once using groupby
    print("Aggregating all data at once...")

    # Consumption sectors and total production use absolute values;
    # imports and exports keep their sign.
    industry_agg = sum_elec_flows(elec_data, industry_flows, 'TWH_ABS', 'industry_twh')
    buildings_agg = sum_elec_flows(elec_data, buildings_agri_flows, 'TWH_ABS', 'buildings_agri_twh')
    nonroad_agg = sum_elec_flows(elec_data, nonroad_transport_flows, 'TWH_ABS', 'nonroad_transport_twh')
    road_agg = sum_elec_flows(elec_data, road_transport_flows, 'TWH_ABS', 'road_transport_twh')
    imports_agg = sum_elec_flows(elec_data, ['IMPORTS'], 'TWH', 'imports_twh')
    exports_agg = sum_elec_flows(elec_data, ['EXPORTS'], 'TWH', 'exports_twh')
    total_production_agg = sum_elec_flows(elec_data, total_production_flow, 'TWH_ABS', 'total_production_twh')

    # Load KiNESYS region mapping to get actual ISO codes
    print("Loading KiNESYS region mapping...")
    vs_mappings_path = "../assumptions/VS_mappings.xlsx"
    kinesys_mapping = pd.read_excel(vs_mappings_path, sheet_name='kinesys_region_map')

    # Exclude ISOs SMR, LIE, and AND from mapping (in addition to "other*" regions)
    excluded_isos = {'SMR', 'LIE', 'AND'}
    region_to_iso, excluded_count = build_iea_region_to_iso(kinesys_mapping, excluded_isos)

    print(f"Found {len(region_to_iso)} IEA regions mapped to valid ISOs")
    print(f"Excluded {excluded_count} 'other*' regions")
    print(f"Sample mappings: {dict(list(region_to_iso.items())[:5])}")

    # Get all unique country-year combinations and map to ISOs
    all_combinations = elec_data[['COUNTRY', 'YEAR']].drop_duplicates().reset_index(drop=True)

    # Map IEA regions to ISO codes
    all_combinations['iso'] = all_combinations['COUNTRY'].map(region_to_iso)

    # Filter out regions that don't have valid ISO mappings (including "other*")
    valid_combinations = all_combinations[all_combinations['iso'].notna()].rename(columns={'YEAR': 'year'})

    print(f"Filtered from {len(all_combinations)} to {len(valid_combinations)} records with valid ISO codes")
    print(f"Valid ISOs: {sorted(valid_combinations['iso'].unique())[:10]}...")

    # Several IEA regions mapping to one ISO would make the left-merges below
    # multiply rows; surface that instead of letting it pass silently.
    dup_keys = valid_combinations.duplicated(subset=['iso', 'year'])
    if dup_keys.any():
        print(f"WARNING: {int(dup_keys.sum())} duplicate (iso, year) rows - "
              f"more than one IEA region maps to the same ISO")

    # Merge all aggregations efficiently, but first map IEA regions to ISOs in each agg.
    # The 'iso' column is added to the aggregate frames themselves, as before.
    agg_frames = [industry_agg, buildings_agg, nonroad_agg, road_agg,
                  imports_agg, exports_agg, total_production_agg]
    for agg_df in agg_frames:
        agg_df['iso'] = agg_df['COUNTRY'].map(region_to_iso)
    result_df = merge_aggs_by_iso(valid_combinations, agg_frames)

    # Fill missing values and round
    numeric_cols = ['industry_twh', 'buildings_agri_twh', 'nonroad_transport_twh',
                    'road_transport_twh', 'imports_twh', 'exports_twh', 'total_production_twh']
    result_df[numeric_cols] = result_df[numeric_cols].fillna(0).round(2)

    print(f"\n=== FAST Results ===")
    print(f"Total records: {len(result_df)}")
    print(f"Countries: {result_df['iso'].nunique()}")
    print(f"Years: {sorted(result_df['year'].unique())}")

    # Save to CSV. The file name is kept unchanged even though `years` covers a
    # wider range - presumably other workflows look the file up by this name;
    # confirm before renaming.
    output_file = Path("iea_electricity_summary_2018_2022.csv")
    result_df.to_csv(output_file, index=False)
    print(f"\n✅ FAST data saved to: {output_file}")

    # Show sample and WORLD totals
    print(f"\nSample data:")
    print(result_df.head())

    # Only prints if the mapping yields an ISO literally named 'WORLD'.
    world_2022 = result_df[(result_df['iso'] == 'WORLD') & (result_df['year'] == 2022)]
    if not world_2022.empty:
        print(f"\nWORLD 2022 totals:")
        for col in numeric_cols:
            print(f"  {col}: {world_2022[col].iloc[0]:,.1f} TWh")

else:
    print("ERROR: No cached data found!")
