# Task 2: Exploratory Data Analysis

## Objective
Analyze the data to understand patterns and factors influencing financial inclusion in Ethiopia.

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path('../src').resolve()))

# Import EDA module
from task2_eda import ExploratoryDataAnalyzer

# Set up paths
data_dir = Path('../data/raw')
processed_dir = Path('../data/processed')
figure_dir = Path('../reports/figures')

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Initialize EDA analyzer
eda = ExploratoryDataAnalyzer(
    data_file=data_dir / 'ethiopia_fi_unified_data.xlsx',
    reference_codes_file=data_dir / 'reference_codes.xlsx',
    logger=logger,
    figure_dir=figure_dir
)

logger.info("EDA notebook initialized successfully")

## 1. Dataset Overview

In [None]:
# Build the dataset overview once; later cells read from `overview` as well
overview = eda.dataset_overview()

print("=== DATASET OVERVIEW ===")

# Each entry pairs the heading to print with the overview key holding its counts
overview_sections = (
    ("\nBy Record Type:", 'by_record_type'),
    ("\nBy Pillar:", 'by_pillar'),
    ("\nBy Source Type:", 'by_source_type'),
    ("\nConfidence Distribution:", 'confidence_distribution'),
)

for heading, section_key in overview_sections:
    print(heading)
    for label, count in overview.get(section_key, {}).items():
        print(f"  {label}: {count}")
    # The pillar breakdown is followed by the count of records without a pillar
    if section_key == 'by_pillar':
        print(f"  Null pillars: {overview.get('null_pillars', 0)}")

### 1.1 Temporal Coverage Visualization

In [None]:
# Draw the temporal coverage figure through the analyzer, then show it inline
fig = eda.visualize_temporal_coverage(save=True)
plt.show()

# Summarise the temporal coverage section of the overview
temporal = overview.get('temporal_coverage', {})
print("\n=== TEMPORAL COVERAGE ===")

overall_min = temporal.get('overall_min', 'N/A')
overall_max = temporal.get('overall_max', 'N/A')
print(f"Overall range: {overall_min} - {overall_max}")

print("\nCoverage by indicator:")
# Only the first 10 indicator entries are listed
for entry in temporal.get('by_indicator', [])[:10]:
    code, first_year, last_year, n_obs = (
        entry[field] for field in ('indicator_code', 'min_year', 'max_year', 'count')
    )
    print(f"  {code}: {first_year}-{last_year} ({n_obs} observations)")

### 1.2 Data Gaps Assessment

In [None]:
# Report how many indicators are sparsely observed, using the overview's gap section
gaps = overview.get('data_gaps', {})

sparse_count = gaps.get('sparse_count', 0)
total_indicators = gaps.get('total_indicators', 0)
sparse_percentage = gaps.get('sparse_percentage', 0)

print("=== DATA GAPS ===")
print(f"Sparse indicators (<5 observations): {sparse_count}")
print(f"Total indicators: {total_indicators}")
print(f"Sparse percentage: {sparse_percentage:.1f}%")

sparse_indicators = gaps.get('sparse_indicators')
if sparse_indicators:
    print("\nSparse indicators:")
    # Only the first 10 entries are listed
    first_entries = list(sparse_indicators.items())[:10]
    for indicator, count in first_entries:
        print(f"  {indicator}: {count} observations")

## 2. Access Analysis

In [None]:
# Run the Access pillar analysis; later cells read from `access_results`
access_results = eda.analyze_access()

print("=== ACCESS ANALYSIS ===")

# Compared against None explicitly rather than by truthiness
ownership = access_results.get('account_ownership')
if ownership is not None:
    print(f"\nAccount Ownership Data Points: {len(ownership)}")
    print("\nAccount Ownership Trajectory:")
    print(ownership)

period_growth = access_results.get('growth_rates')
if period_growth:
    print("\nGrowth Rates:")
    for period, rate in period_growth.items():
        print(f"  {period}: {rate:.2f}%")

### 2.1 Account Ownership Trajectory (2011-2024)

In [None]:
# Plot the account ownership trajectory using the analyzer's plotting helper.
# save=True is forwarded to the helper; presumably it also writes the figure
# to disk — confirm in task2_eda.
fig = eda.plot_account_ownership_trajectory(save=True)
# Render the current figure in the notebook output.
plt.show()

### 2.2 Gender Gap Analysis

In [None]:
# Report the gender gap in account ownership and chart the male and female series
if not access_results.get('gender_gap'):
    print("Gender gap data not available in current dataset")
else:
    gender_gap = access_results['gender_gap']
    print("=== GENDER GAP ANALYSIS ===")

    gap_by_year = gender_gap.get('gap')
    if gap_by_year:
        print("\nGender Gap by Year:")
        for year, gap_value in gap_by_year.items():
            print(f"  {year}: {gap_value:.2f}pp")

    # The chart is drawn only when both series are present
    if gender_gap.get('male_data') and gender_gap.get('female_data'):
        fig, ax = plt.subplots(figsize=(12, 6))

        male_df = pd.DataFrame(gender_gap['male_data'])
        female_df = pd.DataFrame(gender_gap['female_data'])

        plot_columns = ('year', 'value_numeric')
        series_specs = ((male_df, 'o', 'Male'), (female_df, 's', 'Female'))
        for frame, marker, label in series_specs:
            # A series lacking either plotting column is skipped
            if all(column in frame.columns for column in plot_columns):
                ax.plot(frame['year'], frame['value_numeric'],
                        marker=marker, label=label, linewidth=2)

        ax.set_title('Gender Gap in Account Ownership', fontsize=14, fontweight='bold')
        ax.set_xlabel('Year', fontsize=12)
        ax.set_ylabel('Account Ownership (%)', fontsize=12)
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(figure_dir / 'gender_gap.png', dpi=300, bbox_inches='tight')
        plt.show()

### 2.3 Urban vs Rural Analysis

In [None]:
# Chart urban and rural account ownership when the access results include them
if not access_results.get('urban_rural'):
    print("Urban/Rural data not available in current dataset")
else:
    urban_rural = access_results['urban_rural']
    print("=== URBAN VS RURAL ANALYSIS ===")

    # The chart is drawn only when both series are present
    if urban_rural.get('urban_data') and urban_rural.get('rural_data'):
        fig, ax = plt.subplots(figsize=(12, 6))

        urban_df = pd.DataFrame(urban_rural['urban_data'])
        rural_df = pd.DataFrame(urban_rural['rural_data'])

        plot_columns = ('year', 'value_numeric')
        series_specs = ((urban_df, 'o', 'Urban'), (rural_df, 's', 'Rural'))
        for frame, marker, label in series_specs:
            # A series lacking either plotting column is skipped
            if all(column in frame.columns for column in plot_columns):
                ax.plot(frame['year'], frame['value_numeric'],
                        marker=marker, label=label, linewidth=2)

        ax.set_title('Urban vs Rural Account Ownership', fontsize=14, fontweight='bold')
        ax.set_xlabel('Year', fontsize=12)
        ax.set_ylabel('Account Ownership (%)', fontsize=12)
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(figure_dir / 'urban_rural.png', dpi=300, bbox_inches='tight')
        plt.show()

### 2.4 2021-2024 Slowdown Analysis

In [None]:
# Compare growth in the recent period with the previous one, when available
def _print_period(heading, period):
    """Print the heading, then the years and total growth stored for one period."""
    print(heading)
    print(f"  Years: {period.get('years', [])}")
    print(f"  Total Growth: {period.get('total_growth', 0):.2f}pp")


if not access_results.get('slowdown_analysis'):
    print("Slowdown analysis data not available")
else:
    slowdown = access_results['slowdown_analysis']
    print("=== 2021-2024 SLOWDOWN ANALYSIS ===")

    recent = slowdown.get('recent_period', {})
    _print_period("\nRecent Period (2021-2024):", recent)

    if slowdown.get('previous_period'):
        previous = slowdown['previous_period']
        _print_period("\nPrevious Period (2017-2020):", previous)

        # The comparison and factor list are shown only alongside a previous period
        if slowdown.get('comparison'):
            comp = slowdown['comparison']
            print("\nComparison:")
            print(f"  Recent Growth: {comp.get('recent_growth', 0):.2f}pp")
            print(f"  Previous Growth: {comp.get('previous_growth', 0):.2f}pp")
            print(f"  Growth Difference: {comp.get('growth_difference', 0):.2f}pp")

            slowdown_factor_lines = (
                "\n=== POTENTIAL FACTORS FOR SLOWDOWN ===",
                "1. Mobile money accounts may be registered but not actively used",
                "2. Limited financial literacy and trust barriers",
                "3. Infrastructure gaps in rural areas",
                "4. Economic factors affecting disposable income",
                "5. Regulatory or policy changes",
            )
            for factor_line in slowdown_factor_lines:
                print(factor_line)

## 3. Usage (Digital Payments) Analysis

In [None]:
# Run the Usage pillar analysis; later cells read from `usage_results`
usage_results = eda.analyze_usage()

print("=== USAGE ANALYSIS ===")

# Compared against None explicitly rather than by truthiness
mobile_money = usage_results.get('mobile_money')
if mobile_money is not None:
    print(f"\nMobile Money Data Points: {len(mobile_money)}")
    print(mobile_money.head(10))

digital_payments = usage_results.get('digital_payments')
if digital_payments is not None:
    print(f"\nDigital Payment Data Points: {len(digital_payments)}")

if usage_results.get('registered_active_gap'):
    print("\nRegistered vs Active Gap Analysis Available")

use_cases = usage_results.get('payment_use_cases')
if use_cases:
    print(f"\nPayment Use Cases: {list(use_cases.keys())}")

### 3.1 Mobile Money Account Penetration Trend (2014-2024)

In [None]:
# Plot the mobile money penetration series over time, if it is available
if usage_results.get('mobile_money') is not None:
    mobile_data = usage_results['mobile_money']

    # Both columns are read below. Checking them together avoids a KeyError on
    # 'value_numeric' and a silent no-op when 'year' is absent.
    required_columns = ('year', 'value_numeric')
    missing_columns = [col for col in required_columns if col not in mobile_data.columns]

    if missing_columns:
        print(f"Mobile money data is missing required columns: {missing_columns}")
    elif mobile_data.empty:
        # Nothing to draw; avoid saving a blank chart
        print("Mobile money data not available")
    else:
        fig, ax = plt.subplots(figsize=(12, 6))

        # Sort chronologically so the line is drawn left to right
        mobile_data = mobile_data.sort_values('year')
        ax.plot(mobile_data['year'], mobile_data['value_numeric'],
                marker='o', linewidth=2, markersize=8, label='Mobile Money Penetration')

        ax.set_xlabel('Year', fontsize=12)
        ax.set_ylabel('Penetration (%)', fontsize=12)
        ax.set_title('Mobile Money Account Penetration Trend (2014-2024)',
                     fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)
        ax.legend()
        plt.tight_layout()
        plt.savefig(figure_dir / 'mobile_money_trend.png', dpi=300, bbox_inches='tight')
        plt.show()
else:
    print("Mobile money data not available")

## 4. Infrastructure and Enablers

In [None]:
# Run the infrastructure analysis and report how much data each series holds
infra_results = eda.analyze_infrastructure()

print("=== INFRASTRUCTURE ANALYSIS ===")

# Each series is compared against None explicitly rather than by truthiness
coverage_4g = infra_results.get('4G_coverage')
if coverage_4g is not None:
    print(f"\n4G Coverage Data Points: {len(coverage_4g)}")

mobile_penetration = infra_results.get('mobile_penetration')
if mobile_penetration is not None:
    print(f"Mobile Penetration Data Points: {len(mobile_penetration)}")

atm_density = infra_results.get('ATM_density')
if atm_density is not None:
    print(f"ATM Density Data Points: {len(atm_density)}")

leading_indicators = infra_results.get('leading_indicators')
if leading_indicators:
    print(f"\nLeading Indicators Identified: {len(leading_indicators)}")
    # Only the first 10 entries are shown
    print(leading_indicators[:10])

## 5. Event Timeline and Visual Analysis

In [None]:
# Build the event timeline and preview its first 20 entries
timeline = eda.create_event_timeline()

print("=== EVENT TIMELINE ===")
event_count = len(timeline)
print(f"Total Events: {event_count}")

print("\nEvents:")
timeline_preview = timeline.head(20)
print(timeline_preview)

In [None]:
# Visualize the event timeline using the analyzer's plotting helper.
# save=True is forwarded to the helper; presumably it also writes the figure
# to disk — confirm in task2_eda.
fig = eda.visualize_event_timeline(save=True)
# Render the current figure in the notebook output.
plt.show()

### 5.1 Overlay Events on Indicator Trends

In [None]:
# Overlay the events on the account ownership trend, when that series exists
if access_results.get('account_ownership') is not None:
    account_data = access_results['account_ownership']
    fig = eda.overlay_events_on_trends(account_data, 'Account Ownership', save=True)
    plt.show()

    # Fixed checklist of event relationships to inspect on the chart
    relationship_notes = (
        "\n=== KEY EVENT RELATIONSHIPS ===",
        "Analyzing relationships between events and account ownership:",
        "- Telebirr launch (May 2021): Check for acceleration in account ownership",
        "- M-Pesa entry (Aug 2023): Check for impact on mobile money accounts",
        "- Safaricom market entry (Aug 2022): Check for market dynamics changes",
    )
    for note in relationship_notes:
        print(note)

## 6. Correlation Analysis

In [None]:
# Perform correlation analysis and list the strongest indicator pairs
correlations = eda.correlation_analysis()

print("=== CORRELATION ANALYSIS ===")
print(f"Correlation matrix shape: {correlations.shape}")
print("\nTop Correlations:")

# Walk the upper triangle only (j > i), so self-correlations on the diagonal
# and mirrored duplicates are never collected.
corr_pairs = []
matrix_size = min(correlations.shape) if len(correlations) > 0 else 0
for i in range(matrix_size):
    for j in range(i + 1, matrix_size):
        value = correlations.iloc[i, j]
        # Pairs without a defined correlation are skipped
        if pd.notna(value):
            corr_pairs.append({
                'indicator1': correlations.columns[i],
                'indicator2': correlations.columns[j],
                'correlation': value
            })

if corr_pairs:
    # Rank by absolute strength so strong negative correlations are kept too
    corr_df = pd.DataFrame(corr_pairs).sort_values('correlation', key=abs, ascending=False)
    print(corr_df.head(10))
else:
    # Sorting an empty frame by 'correlation' would raise KeyError, so report
    # the empty case instead (single indicator, or every pair undefined).
    print("  No indicator pairs with a defined correlation")

In [None]:
# Visualize the correlation matrix using the analyzer's plotting helper.
# save=True is forwarded to the helper; presumably it also writes the figure
# to disk — confirm in task2_eda.
fig = eda.visualize_correlations(save=True)
# Render the current figure in the notebook output.
plt.show()

## 7. Key Insights and Summary

In [None]:
# Print the analyzer's insights, numbered from 1
insights = eda.generate_insights()

print("=== KEY INSIGHTS ===")
for position, insight in enumerate(insights, start=1):
    print(f"\n{position}. {insight}")

# Fixed narrative that accompanies the generated insights
narrative_lines = (
    "\n=== ADDITIONAL INSIGHTS ===",
    "\n1. Factors Driving Financial Inclusion:",
    "   - Mobile money infrastructure expansion",
    "   - Policy interventions and regulatory changes",
    "   - Product launches and market entries",
    "   - Infrastructure development (4G, mobile penetration)",
    "\n2. Account Ownership Stagnation (2021-2024):",
    "   - Despite 65M+ mobile money accounts, only +3pp growth in account ownership",
    "   - Potential factors:",
    "     * Registered vs. active account gap",
    "     * Limited usage beyond registration",
    "     * Financial literacy barriers",
    "     * Trust and security concerns",
    "\n3. Data Gaps Limiting Analysis:",
)
for narrative_line in narrative_lines:
    print(narrative_line)

# The sparse-indicator count comes from the dataset overview built earlier
gaps = overview.get('data_gaps', {})
print(f"   - {gaps.get('sparse_count', 0)} indicators have sparse coverage")
for narrative_line in (
    "   - Limited disaggregated data (gender, urban/rural)",
    "   - Missing infrastructure time series data",
):
    print(narrative_line)

## 8. Data Quality Assessment

In [None]:
# Print the analyzer's data quality report section by section
quality = eda.assess_data_quality()

print("=== DATA QUALITY ASSESSMENT ===")
total_records = quality.get('total_records', 0)
print(f"\nTotal Records: {total_records}")

print("\nMissing Values:")
missing = quality.get('missing_values', {})
if len(missing) == 0:
    print("  No significant missing values")
else:
    # Only the first 10 columns are listed
    for col, count in list(missing.items())[:10]:
        print(f"  {col}: {count}")

print("\nConfidence Distribution:")
confidence_counts = quality.get('confidence_distribution', {})
for conf, count in confidence_counts.items():
    print(f"  {conf}: {count}")

print("\nTemporal Coverage:")
temporal = quality.get('temporal_coverage', {})
overall_range = temporal.get('overall_range', 'N/A')
indicators_with_data = temporal.get('indicators_with_data', 0)
print(f"  Overall Range: {overall_range}")
print(f"  Indicators with Data: {indicators_with_data}")

print("\nData Gaps:")
gaps = quality.get('data_gaps', {})
sparse_count = gaps.get('sparse_count', 0)
sparse_percentage = gaps.get('sparse_percentage', 0)
print(f"  Sparse Indicators: {sparse_count}")
print(f"  Sparse Percentage: {sparse_percentage:.1f}%")

print("\nLimitations:")
for limitation in quality.get('limitations', []):
    print(f"  - {limitation}")

## 9. Run Full EDA Pipeline

Run the complete EDA pipeline in one go:

In [None]:
# Run full EDA pipeline (uncomment to run)
# results = eda.run_full_eda()
# print("Full EDA pipeline complete!")
# print(f"Generated {len(results.insights)} insights")
# print(f"Created figures in {eda.figure_dir}")