In [1]:
# World Inequality Database Analysis
# CS 328 Writing Assignment - 2025

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Set visualization styles
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("colorblind")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Create output directories if they don't exist
os.makedirs('figures', exist_ok=True)
os.makedirs('output', exist_ok=True)

# Function to read WID CSV files with proper parameters
def read_wid_csv(file_path):
    """
    Read WID CSV files using the semicolon separator as specified in documentation.
    """
    try:
        return pd.read_csv(file_path, sep=';', encoding='utf-8')
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Function to list available files in the WID data directory
def list_wid_files(directory='wid_all_data'):
    """List and categorize files in the WID data directory."""
    if not os.path.exists(directory):
        print(f"Directory {directory} not found")
        return None
    
    all_files = os.listdir(directory)
    
    # Categorize files
    country_file = [f for f in all_files if f == 'WID_countries.csv']
    data_files = sorted([f for f in all_files if f.startswith('WID_data_')])
    metadata_files = sorted([f for f in all_files if f.startswith('WID_metadata_')])
    other_files = [f for f in all_files if f not in country_file + data_files + metadata_files]
    
    # Create a summary dictionary
    file_summary = {
        'country_file': country_file,
        'data_files': data_files,
        'metadata_files': metadata_files,
        'other_files': other_files,
        'total_files': len(all_files),
        'total_countries': len(data_files)
    }
    
    return file_summary

# Explore available countries and their metadata
def explore_countries(directory='wid_all_data'):
    """Load and explore country data from WID_countries.csv."""
    countries_path = os.path.join(directory, 'WID_countries.csv')
    
    if not os.path.exists(countries_path):
        print(f"Country file not found at {countries_path}")
        return None
    
    countries_df = read_wid_csv(countries_path)
    
    if countries_df is not None:
        # Create a summary of regions
        region_counts = countries_df['region'].value_counts()
        region2_counts = countries_df['region2'].value_counts()
        
        # Filter actual countries (2-letter codes) from regions/aggregates
        countries_only = countries_df[countries_df['alpha2'].str.len() == 2]
        
        # Create a country summary
        country_summary = {
            'total_entries': len(countries_df),
            'country_count': len(countries_only),
            'regions': region_counts.to_dict(),
            'subregions': region2_counts.to_dict()
        }
        
        return {
            'countries_df': countries_df,
            'summary': country_summary
        }
    
    return None

# Function to explore the structure of a single country's data file
def explore_country_data(country_code, directory='wid_all_data'):
    """
    Explore the data structure for a single country.
    
    Args:
        country_code (str): Two-letter country code (e.g., 'US', 'FR')
        directory (str): Path to the WID data directory
    
    Returns:
        dict: Summary information about the country's data
    """
    data_path = os.path.join(directory, f'WID_data_{country_code}.csv')
    metadata_path = os.path.join(directory, f'WID_metadata_{country_code}.csv')
    
    if not os.path.exists(data_path) or not os.path.exists(metadata_path):
        print(f"Data or metadata file for {country_code} not found")
        return None
    
    # Load data and metadata
    data_df = read_wid_csv(data_path)
    metadata_df = read_wid_csv(metadata_path)
    
    if data_df is None or metadata_df is None:
        return None
    
    # Create data summary
    data_summary = {
        'rows': len(data_df),
        'variables': data_df['variable'].nunique(),
        'variable_list': sorted(data_df['variable'].unique()),
        'percentiles': data_df['percentile'].nunique(),
        'percentile_list': sorted(data_df['percentile'].unique()),
        'years': {
            'min': data_df['year'].min(),
            'max': data_df['year'].max(),
            'count': data_df['year'].nunique()
        }
    }
    
    # Create metadata summary
    metadata_summary = {
        'rows': len(metadata_df),
        'unique_variables': metadata_df['variable'].nunique(),
        'variable_list': sorted(metadata_df['variable'].unique())
    }
    
    return {
        'data_df': data_df,
        'metadata_df': metadata_df,
        'data_summary': data_summary,
        'metadata_summary': metadata_summary
    }

# Function to extract variable descriptions from metadata
def get_variable_descriptions(metadata_df):
    """
    Extract unique variable descriptions from metadata.
    
    Args:
        metadata_df (pd.DataFrame): Metadata dataframe
    
    Returns:
        pd.DataFrame: Dataframe with variable codes and descriptions
    """
    if metadata_df is None:
        return None
    
    # Check if required columns exist
    required_columns = ['variable', 'simpledes', 'technicaldes', 'unit']
    if not all(col in metadata_df.columns for col in required_columns):
        print(f"Metadata is missing required columns. Available columns: {metadata_df.columns.tolist()}")
        return None
    
    # Extract unique variable descriptions
    var_descriptions = metadata_df[required_columns].drop_duplicates()
    
    return var_descriptions.sort_values('variable').reset_index(drop=True)

# Function to examine variable availability across countries
def compare_variable_availability(country_list, directory='wid_all_data'):
    """
    Compare which variables are available across multiple countries.
    
    Args:
        country_list (list): List of country codes to compare
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: Data frame showing variable availability by country
    """
    availability_data = []
    
    for country in country_list:
        data_path = os.path.join(directory, f'WID_data_{country}.csv')
        
        if os.path.exists(data_path):
            data_df = read_wid_csv(data_path)
            
            if data_df is not None:
                variables = data_df['variable'].unique()
                
                for var in variables:
                    # Check year range for this variable
                    var_data = data_df[data_df['variable'] == var]
                    year_min = var_data['year'].min()
                    year_max = var_data['year'].max()
                    
                    availability_data.append({
                        'country': country,
                        'variable': var,
                        'available': True,
                        'year_min': year_min,
                        'year_max': year_max,
                        'year_count': var_data['year'].nunique()
                    })
    
    # Convert to DataFrame
    availability_df = pd.DataFrame(availability_data)
    
    # Create a pivot table of availability
    if not availability_df.empty:
        pivot_df = pd.pivot_table(
            availability_df, 
            values='available',
            index='variable',
            columns='country',
            aggfunc=lambda x: True if len(x) > 0 else False,
            fill_value=False
        )
        
        # Add a total count column
        pivot_df['total_countries'] = pivot_df.sum(axis=1)
        
        # Sort by availability
        pivot_df = pivot_df.sort_values('total_countries', ascending=False)
        
        return pivot_df
    
    return None

# Main execution to explore the dataset
def explore_dataset(directory='wid_all_data'):
    """Main function to explore the WID dataset structure."""
    print("Exploring WID dataset structure...")
    
    # List available files
    files = list_wid_files(directory)
    if files:
        print(f"Total files: {files['total_files']}")
        print(f"Country files: {len(files['data_files'])}")
        
        # Show some example countries
        if files['data_files']:
            print("Example countries:", [f.replace('WID_data_', '').replace('.csv', '') 
                                         for f in files['data_files'][:10]])
    
    # Explore countries metadata
    countries_info = explore_countries(directory)
    if countries_info:
        countries_df = countries_info['countries_df']
        print(f"\nTotal countries/regions: {len(countries_df)}")
        
        # Display regions
        print("\nWorld regions:")
        for region, count in countries_info['summary']['regions'].items():
            print(f"  {region}: {count} entries")
    
    # Explore a sample country
    sample_country = 'US'  # United States as example
    country_info = explore_country_data(sample_country, directory)
    
    if country_info:
        print(f"\nSample data for {sample_country}:")
        print(f"  Rows: {country_info['data_summary']['rows']}")
        print(f"  Unique variables: {country_info['data_summary']['variables']}")
        print(f"  Year range: {country_info['data_summary']['years']['min']} - {country_info['data_summary']['years']['max']}")
        
        # Show some variable descriptions
        var_desc = get_variable_descriptions(country_info['metadata_df'])
        if var_desc is not None and len(var_desc) > 0:
            print("\nSample variable descriptions:")
            for _, row in var_desc.head(5).iterrows():
                print(f"  {row['variable']}: {row['simpledes']} ({row['unit']})")
    
    return {
        'files': files,
        'countries_info': countries_info,
        'sample_country_info': country_info
    }

# Run the exploration if executed as a script
if __name__ == "__main__":
    explore_result = explore_dataset()
    print("\nExploration complete!")

Exploring WID dataset structure...
Total files: 802
Country files: 400
Example countries: ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AN', 'AO', 'AR']

Total countries/regions: 400

World regions:
  Africa: 58 entries
  Europe: 54 entries
  Asia: 54 entries
  Americas: 53 entries
  Oceania: 23 entries

Sample data for US:
  Rows: 633484
  Unique variables: 1560
  Year range: 1800 - 2023

Sample variable descriptions:
  accmhni992: nan (USD)
  accmhni999: nan (USD)
  accmhoi992: nan (USD)
  accmhoi999: nan (USD)
  accshni992: nan (USD)

Exploration complete!


In [2]:
# World Inequality Database - Data Processing and Feature Engineering

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

# Ignore warnings for cleaner output
warnings.filterwarnings('ignore')

# Import functions from our exploration module
from data_exploration import read_wid_csv, explore_countries, explore_country_data

# Key variables we'll be using based on the WID documentation
# sptinc992j - Share (s) of pre-tax national income (ptinc) for adults (992) with equal-split (j)
# ahweal992j - Average (a) household wealth (hweal) for adults (992) with equal-split (j)
# anninc992i - Average (a) national income (nninc) for adults (992) for individuals (i)

# Define percentiles of interest
TOP_PERCENTILES = ['p99p100', 'p90p100', 'p99.9p100']  # Top 1%, Top 10%, Top 0.1%
BOTTOM_PERCENTILES = ['p0p50', 'p0p90']  # Bottom 50%, Bottom 90%
MIDDLE_PERCENTILES = ['p50p90']  # Middle 40%

# Countries to include in our analysis
# We'll select a diverse set of countries from different regions and development levels
COUNTRIES_TO_ANALYZE = [
    # High-income countries
    'US',   # United States
    'FR',   # France
    'DE',   # Germany
    'GB',   # United Kingdom
    'JP',   # Japan
    
    # Upper-middle income countries
    'BR',   # Brazil
    'CN',   # China
    'RU',   # Russia
    'ZA',   # South Africa
    
    # Lower-middle and low-income countries
    'IN',   # India
    'ID',   # Indonesia
    'NG',   # Nigeria
    'EG'    # Egypt
]

# Function to load country data with selected variables
def load_country_data(country_code, directory='wid_all_data'):
    """
    Load specific inequality variables for a given country.
    
    Args:
        country_code (str): Two-letter country code
        directory (str): Path to WID data directory
    
    Returns:
        tuple: (data_df, metadata_df) for the country
    """
    data_path = os.path.join(directory, f'WID_data_{country_code}.csv')
    metadata_path = os.path.join(directory, f'WID_metadata_{country_code}.csv')
    
    if not os.path.exists(data_path) or not os.path.exists(metadata_path):
        print(f"Data or metadata not found for {country_code}")
        return None, None
    
    data_df = read_wid_csv(data_path)
    metadata_df = read_wid_csv(metadata_path)
    
    return data_df, metadata_df

# Function to create a dataset for a specific inequality metric
def create_inequality_dataset(countries, variable_code, percentiles, directory='wid_all_data'):
    """
    Create a dataset comparing a specific inequality variable across countries.
    
    Args:
        countries (list): List of country codes
        variable_code (str): WID variable code (e.g., 'sptinc992j' for pre-tax income share)
        percentiles (list): List of percentile codes (e.g., ['p99p100', 'p0p50'])
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: Combined dataset with inequality data
    """
    # Load country information for names
    countries_info = explore_countries(directory)
    if countries_info is None:
        print("Could not load country information")
        return None
    
    countries_df = countries_info['countries_df']
    country_name_map = dict(zip(countries_df['alpha2'], countries_df['shortname']))
    
    combined_df = pd.DataFrame()
    
    for country in countries:
        data_df, metadata_df = load_country_data(country, directory)
        
        if data_df is None:
            print(f"Skipping {country} - could not load data")
            continue
        
        # Filter for the requested variable and percentiles
        filtered_df = data_df[(data_df['variable'] == variable_code) & 
                             (data_df['percentile'].isin(percentiles))]
        
        if filtered_df.empty:
            print(f"No data for {variable_code} with percentiles {percentiles} in {country}")
            continue
        
        # Add country name
        filtered_df['country_code'] = country
        filtered_df['country_name'] = country_name_map.get(country, country)
        
        # Append to combined dataset
        combined_df = pd.concat([combined_df, filtered_df])
    
    if combined_df.empty:
        print(f"No data found for {variable_code} across specified countries and percentiles")
        return None
    
    return combined_df

# Function to create a comparative dataset of income/wealth distribution over time
def create_time_series_dataset(variable_code, percentile, countries=COUNTRIES_TO_ANALYZE, directory='wid_all_data'):
    """
    Create a dataset of inequality metrics over time for multiple countries.
    
    Args:
        variable_code (str): WID variable code
        percentile (str): Percentile code
        countries (list): List of country codes
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: Time series data for the specified variable and percentile
    """
    # Get variable description
    sample_country = countries[0]
    _, metadata_df = load_country_data(sample_country, directory)
    
    variable_desc = None
    if metadata_df is not None:
        var_info = metadata_df[metadata_df['variable'] == variable_code]
        if not var_info.empty:
            variable_desc = var_info.iloc[0]['simpledes']
    
    dataset = create_inequality_dataset(countries, variable_code, [percentile], directory)
    
    if dataset is not None:
        # Pivot to have years as columns and countries as rows for easier plotting
        dataset = dataset.sort_values(['country_name', 'year'])
        
        # Add metadata
        dataset.attrs['variable_code'] = variable_code
        dataset.attrs['variable_desc'] = variable_desc
        dataset.attrs['percentile'] = percentile
    
    return dataset

# Function to create a dataset for GDP per capita
def create_gdp_dataset(countries=COUNTRIES_TO_ANALYZE, directory='wid_all_data'):
    """
    Create a dataset of GDP per capita for comparison with inequality metrics.
    Using national income per adult as a proxy.
    
    Args:
        countries (list): List of country codes
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: GDP per capita data
    """
    # National income per adult in constant local currency
    variable_code = 'anninc992i'
    
    # We don't need a percentile for this aggregate measure, but WID still requires one
    # p0p100 represents the entire population
    gdp_data = create_inequality_dataset(countries, variable_code, ['p0p100'], directory)
    
    if gdp_data is not None:
        # Add variable description
        gdp_data.attrs['variable_desc'] = 'National Income per Adult'
        
        # Convert to common currency (USD) using most recent PPP rates
        # This would require additional implementation to get PPP conversion rates
        # For simplicity, we'll leave the values in local currency
    
    return gdp_data

# Function to combine multiple inequality metrics for cross-sectional analysis
def create_cross_sectional_dataset(countries=COUNTRIES_TO_ANALYZE, year=2020, directory='wid_all_data'):
    """
    Create a cross-sectional dataset combining multiple inequality metrics for a specific year.
    
    Args:
        countries (list): List of country codes
        year (int): Reference year for the cross-section
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: Combined dataset with multiple inequality metrics
    """
    # Define the variables and percentiles we want to include
    metrics = [
        {'variable': 'sptinc992j', 'percentile': 'p99p100', 'name': 'top1_income_share'},
        {'variable': 'sptinc992j', 'percentile': 'p90p100', 'name': 'top10_income_share'},
        {'variable': 'sptinc992j', 'percentile': 'p0p50', 'name': 'bottom50_income_share'},
        {'variable': 'sptinc992j', 'percentile': 'p50p90', 'name': 'middle40_income_share'},
        {'variable': 'shweal992j', 'percentile': 'p99p100', 'name': 'top1_wealth_share'},
        {'variable': 'shweal992j', 'percentile': 'p90p100', 'name': 'top10_wealth_share'},
        {'variable': 'shweal992j', 'percentile': 'p0p50', 'name': 'bottom50_wealth_share'},
        {'variable': 'anninc992i', 'percentile': 'p0p100', 'name': 'gdp_per_adult'},
        # Add the Gini coefficient if available
        {'variable': 'gptinc992i', 'percentile': 'p0p100', 'name': 'income_gini'}
    ]
    
    # Initialize results dataframe
    result_data = []
    
    # Load country info for names
    countries_info = explore_countries(directory)
    countries_df = countries_info['countries_df'] if countries_info else None
    country_name_map = dict(zip(countries_df['alpha2'], countries_df['shortname'])) if countries_df is not None else {}
    
    # Process each country
    for country in countries:
        country_data = {'country_code': country, 'country_name': country_name_map.get(country, country)}
        
        data_df, _ = load_country_data(country, directory)
        
        if data_df is None:
            continue
        
        # Extract values for each metric
        for metric in metrics:
            var = metric['variable']
            perc = metric['percentile']
            name = metric['name']
            
            # Filter data for this variable, percentile, and closest year
            filtered = data_df[(data_df['variable'] == var) & (data_df['percentile'] == perc)]
            
            if filtered.empty:
                country_data[name] = np.nan
                continue
            
            # Find closest year to the reference year
            available_years = filtered['year'].unique()
            if year in available_years:
                closest_year = year
            else:
                closest_year = available_years[np.abs(available_years - year).argmin()]
            
            # Get the value for the closest year
            year_value = filtered[filtered['year'] == closest_year]['value'].iloc[0]
            country_data[name] = year_value
            country_data[f'{name}_year'] = closest_year
        
        # Add region information if available
        if countries_df is not None:
            country_region = countries_df[countries_df['alpha2'] == country]
            if not country_region.empty:
                country_data['region'] = country_region['region'].iloc[0]
                country_data['region2'] = country_region['region2'].iloc[0]
        
        result_data.append(country_data)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(result_data)
    
    return result_df

# Function to create a dataset comparing changes in inequality over time
def create_inequality_change_dataset(countries=COUNTRIES_TO_ANALYZE, 
                                     variable_code='sptinc992j',
                                     percentile='p99p100', 
                                     start_year=1980, 
                                     end_year=2020,
                                     directory='wid_all_data'):
    """
    Create a dataset showing changes in inequality metrics over time.
    
    Args:
        countries (list): List of country codes
        variable_code (str): WID variable code
        percentile (str): Percentile code
        start_year (int): Starting year for change calculation
        end_year (int): Ending year for change calculation
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: Dataset with inequality changes
    """
    # Get the time series data
    time_series = create_time_series_dataset(variable_code, percentile, countries, directory)
    
    if time_series is None:
        return None
    
    # Calculate changes
    change_data = []
    
    # Group by country
    for country, group in time_series.groupby('country_code'):
        group = group.sort_values('year')
        country_name = group['country_name'].iloc[0]
        
        # Try to get values for exact years
        start_data = group[group['year'] == start_year]
        end_data = group[group['year'] == end_year]
        
        # If exact years not available, find closest years
        if start_data.empty:
            available_years = group['year'].unique()
            closest_start = available_years[np.abs(available_years - start_year).argmin()]
            start_data = group[group['year'] == closest_start]
        
        if end_data.empty:
            available_years = group['year'].unique()
            closest_end = available_years[np.abs(available_years - end_year).argmin()]
            end_data = group[group['year'] == closest_end]
        
        # Skip if we don't have data for both periods
        if start_data.empty or end_data.empty:
            print(f"Insufficient data for {country} to calculate changes")
            continue
        
        # Calculate changes
        start_value = start_data['value'].iloc[0]
        end_value = end_data['value'].iloc[0]
        actual_start_year = start_data['year'].iloc[0]
        actual_end_year = end_data['year'].iloc[0]
        
        absolute_change = end_value - start_value
        percent_change = (absolute_change / start_value) * 100 if start_value != 0 else np.nan
        
        change_data.append({
            'country_code': country,
            'country_name': country_name,
            'start_year': actual_start_year,
            'end_year': actual_end_year,
            'start_value': start_value,
            'end_value': end_value,
            'absolute_change': absolute_change,
            'percent_change': percent_change
        })
    
    # Convert to DataFrame
    change_df = pd.DataFrame(change_data)
    
    # Add metadata
    change_df.attrs['variable_code'] = variable_code
    change_df.attrs['variable_desc'] = time_series.attrs.get('variable_desc', '')
    change_df.attrs['percentile'] = percentile
    
    return change_df

# Function to combine income and wealth inequality data for correlation analysis
def create_correlation_dataset(countries=COUNTRIES_TO_ANALYZE, reference_year=2020, directory='wid_all_data'):
    """
    Create a dataset to analyze correlations between income and wealth inequality.
    
    Args:
        countries (list): List of country codes
        reference_year (int): Reference year for the cross-section
        directory (str): Path to WID data directory
    
    Returns:
        pd.DataFrame: Dataset with income and wealth inequality metrics
    """
    # Get cross-sectional data
    cross_section = create_cross_sectional_dataset(countries, reference_year, directory)
    
    if cross_section is None or cross_section.empty:
        print("Could not create cross-sectional dataset for correlation analysis")
        return None
    
    # Create metrics for correlation analysis
    corr_metrics = [
        ('top1_income_share', 'top1_wealth_share'),
        ('top10_income_share', 'top10_wealth_share'),
        ('bottom50_income_share', 'bottom50_wealth_share'),
        ('gdp_per_adult', 'top1_income_share'),
        ('gdp_per_adult', 'top1_wealth_share')
    ]
    
    # Calculate correlations
    correlations = {}
    
    for x_var, y_var in corr_metrics:
        if x_var in cross_section.columns and y_var in cross_section.columns:
            # Filter out NaN values
            valid_data = cross_section[[x_var, y_var]].dropna()
            
            if len(valid_data) >= 5:  # Require at least 5 countries for meaningful correlation
                corr, p_value = stats.pearsonr(valid_data[x_var], valid_data[y_var])
                correlations[f'{x_var}_vs_{y_var}'] = {
                    'correlation': corr,
                    'p_value': p_value,
                    'n': len(valid_data)
                }
    
    # Add correlations to dataset attributes
    cross_section.attrs['correlations'] = correlations
    
    return cross_section

# Main function to prepare all datasets
def prepare_all_datasets(directory='wid_all_data'):
    """
    Prepare all datasets needed for our inequality analysis.
    
    Args:
        directory (str): Path to WID data directory
    
    Returns:
        dict: Dictionary of prepared datasets
    """
    print("Preparing inequality datasets...")
    
    datasets = {}
    
    # 1. Time series of top 1% income share
    print("Creating top 1% income share time series...")
    datasets['top1_income_time'] = create_time_series_dataset(
        'sptinc992j', 'p99p100', COUNTRIES_TO_ANALYZE, directory)
    
    # 2. Time series of top 10% income share 
    print("Creating top 10% income share time series...")
    datasets['top10_income_time'] = create_time_series_dataset(
        'sptinc992j', 'p90p100', COUNTRIES_TO_ANALYZE, directory)
    
    # 3. Time series of bottom 50% income share
    print("Creating bottom 50% income share time series...")
    datasets['bottom50_income_time'] = create_time_series_dataset(
        'sptinc992j', 'p0p50', COUNTRIES_TO_ANALYZE, directory)
    
    # 4. Time series of top 1% wealth share
    print("Creating top 1% wealth share time series...")
    datasets['top1_wealth_time'] = create_time_series_dataset(
        'shweal992j', 'p99p100', COUNTRIES_TO_ANALYZE, directory)
    
    # 5. Cross-sectional data for most recent year
    print("Creating cross-sectional dataset...")
    datasets['cross_section'] = create_cross_sectional_dataset(
        COUNTRIES_TO_ANALYZE, 2020, directory)
    
    # 6. Changes in income inequality (1980-2020)
    print("Creating income inequality change dataset...")
    datasets['income_change'] = create_inequality_change_dataset(
        COUNTRIES_TO_ANALYZE, 'sptinc992j', 'p99p100', 1980, 2020, directory)
    
    # 7. Changes in wealth inequality (1980-2020)
    print("Creating wealth inequality change dataset...")
    datasets['wealth_change'] = create_inequality_change_dataset(
        COUNTRIES_TO_ANALYZE, 'shweal992j', 'p99p100', 1980, 2020, directory)
    
    # 8. Correlation dataset
    print("Creating correlation dataset...")
    datasets['correlation'] = create_correlation_dataset(
        COUNTRIES_TO_ANALYZE, 2020, directory)
    
    # Save datasets to CSV files
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)
    
    for name, df in datasets.items():
        if df is not None:
            df.to_csv(os.path.join(output_dir, f'{name}.csv'), index=False)
            print(f"Saved {name}.csv")
    
    print("Dataset preparation complete!")
    return datasets

# Run the data preparation if executed as a script
if __name__ == "__main__":
    prepared_data = prepare_all_datasets()

ModuleNotFoundError: No module named 'data_exploration'