# Task 1: Data Exploration and Enrichment

## Objective
Understand the starter dataset and enrich it with additional data useful for forecasting financial inclusion in Ethiopia.

## 1. Load and Explore Datasets

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path('../src').resolve()))

# Import Task1 classes
from task1_data_exploration import (
    Task1DataProcessor,
    DataExplorer,
    DataEnricher,
    ObservationRecord,
    EventRecord,
    ImpactLinkRecord,
    ConfidenceLevel,
    RecordType,
    ImpactDirection
)

# Input and output locations, relative to the notebook's working directory.
data_dir = Path('../data/raw')
processed_dir = Path('../data/processed')

# Configure root logging for the notebook.
# basicConfig() is a no-op when the root logger already has handlers (which
# can happen under a notebook kernel or when this cell is re-run), so
# force=True (Python 3.8+) is passed to replace them and make sure the INFO
# level and the format below actually take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# Pandas display options: show every column, up to 100 rows, let pandas
# auto-detect the output width, and truncate cell text at 100 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)

# Build the Task 1 processor from the two Excel files in the raw data
# directory, handing it this notebook's logger.
unified_data_path = data_dir / 'ethiopia_fi_unified_data.xlsx'
reference_codes_path = data_dir / 'reference_codes.xlsx'
processor = Task1DataProcessor(
    data_file=unified_data_path,
    reference_codes_file=reference_codes_path,
    logger=logger,
)

logger.info("Notebook initialized successfully")

### 1.1 Load Main Dataset (ethiopia_fi_unified_data.xlsx)

In [None]:
# Load the three tables in one call through the processor's explorer.
data_df, impact_links_df, reference_codes_df = processor.explorer.load_data()

# Report the dimensions of each table, then the main sheet's columns.
for table_label, table in (
    ("Data sheet", data_df),
    ("Impact links", impact_links_df),
    ("Reference codes", reference_codes_df),
):
    print(f"{table_label} shape: {table.shape}")

data_column_names = list(data_df.columns)
print(f"\nData columns: {data_column_names}")
print("\nFirst few rows of data:")
# Keep this as the final expression so the notebook renders the preview.
data_df.head()

In [None]:
# Inspect the impact-links table: dimensions, column names, then a preview.
impact_link_columns = list(impact_links_df.columns)
print(f"Impact links shape: {impact_links_df.shape}")
print(f"\nColumns: {impact_link_columns}")
print("\nFirst few rows:")
# Final expression, so the notebook renders it as the cell output.
impact_links_df.head()

### 1.2 Load Reference Codes

In [None]:
# Inspect the reference-codes table: dimensions, column names, then the
# first 20 rows.
reference_code_columns = list(reference_codes_df.columns)
print(f"Reference codes shape: {reference_codes_df.shape}")
print(f"\nColumns: {reference_code_columns}")
print("\nFirst few rows:")
# Final expression, so the notebook renders it as the cell output.
reference_codes_df.head(20)

## 2. Understand the Schema

In [None]:
# Fetch the schema description from the explorer and print the parts of it
# that describe the data sheet and the impact-links sheet.
schema_info = processor.explorer.get_schema_info()

print("=== DATA SHEET INFO ===")
print(f"Shape: {schema_info['data_shape']}")
print(f"Columns: {schema_info['data_columns']}")
print("\nData types:")
dtype_by_column = schema_info['data_dtypes']
for column_name, column_dtype in dtype_by_column.items():
    print(f"  {column_name}: {column_dtype}")

print("\n=== IMPACT LINKS INFO ===")
print(f"Shape: {schema_info['impact_links_shape']}")
print(f"Columns: {schema_info['impact_links_columns']}")

In [None]:
# Fetch the missing-value counts and print only the entries that are
# greater than zero, first for the data sheet and then for impact links.
missing = processor.explorer.get_missing_values()

print("=== MISSING VALUES IN DATA SHEET ===")
missing_data = missing['data']
has_missing_data = missing_data > 0
print(missing_data[has_missing_data])

print("\n=== MISSING VALUES IN IMPACT LINKS ===")
missing_links = missing['impact_links']
has_missing_links = missing_links > 0
print(missing_links[has_missing_links])

## 3. Explore the Data

### 3.1 Count Records by Type

In [None]:
# Record-type breakdown from the explorer, followed by the total row count
# of the explorer's data sheet.
print("=== RECORD TYPE DISTRIBUTION ===")
record_type_counts = processor.explorer.count_by_record_type()
print(record_type_counts)
total_records = len(processor.explorer.data_df)
print(f"\nTotal records: {total_records}")

In [None]:
# Pillar breakdown from the explorer, followed by the number of rows whose
# pillar is null.
print("=== PILLAR DISTRIBUTION ===")
pillar_counts = processor.explorer.count_by_pillar()
print(pillar_counts)

# Only count nulls when the sheet has rows AND actually carries a 'pillar'
# column; indexing a missing column would raise KeyError. This mirrors the
# column guard used for 'indicator_code' in the unique-indicators cell.
explored_df = processor.explorer.data_df
if not explored_df.empty and 'pillar' in explored_df.columns:
    null_count = explored_df['pillar'].isna().sum()
    print(f"\nRecords with null pillar: {null_count}")

In [None]:
# Two breakdowns from the explorer, printed in order: source type first,
# then confidence.
source_type_counts = None
confidence_counts = None

print("=== SOURCE TYPE DISTRIBUTION ===")
source_type_counts = processor.explorer.count_by_source_type()
print(source_type_counts)

print("\n=== CONFIDENCE DISTRIBUTION ===")
confidence_counts = processor.explorer.count_by_confidence()
print(confidence_counts)

### 3.2 Temporal Range Analysis

In [None]:
# Fetch the temporal range per column from the explorer and print, for each
# column, its min, max, and non-null / null counts.
temporal_info = processor.explorer.get_temporal_range()

for column_name, range_stats in temporal_info.items():
    section_title = column_name.upper()
    print(f"\n=== {section_title} ===")
    print(f"Min: {range_stats['min']}")
    print(f"Max: {range_stats['max']}")
    print(f"Non-null count: {range_stats['non_null_count']}")
    print(f"Null count: {range_stats['null_count']}")

### 3.3 Unique Indicators Analysis

In [None]:
# Summarise the indicators: how many there are and the first 20 entries of
# the explorer's result.
print("=== UNIQUE INDICATOR CODES ===")
unique_indicators = processor.explorer.get_unique_indicators()
indicator_total = len(unique_indicators)
print(f"Total unique indicators: {indicator_total}")
print("\nIndicator coverage (top 20):")
print(unique_indicators.head(20))

# List every distinct non-null indicator code in sorted order, provided the
# sheet has rows and carries an 'indicator_code' column.
sheet_has_rows = len(processor.explorer.data_df) > 0
if sheet_has_rows and 'indicator_code' in processor.explorer.data_df.columns:
    distinct_codes = processor.explorer.data_df['indicator_code'].dropna().unique()
    print("\nAll unique indicator codes:")
    print(sorted(distinct_codes))

In [None]:
# Print the explorer's indicator-coverage-by-pillar result under a heading.
coverage_heading = "=== INDICATOR COVERAGE BY PILLAR ==="
print(coverage_heading)
coverage = processor.explorer.get_indicator_coverage_by_pillar()
print(coverage)

### 3.4 Events Catalog Analysis

In [None]:
# Fetch the events catalog from the explorer and report how many events it
# holds.
events = processor.explorer.get_events_catalog()
print("=== EVENTS CATALOG ===")
print(f"Total events: {len(events)}")

if len(events) > 0:
    events_by_category = processor.explorer.get_events_by_category()
    print("\nEvent categories:")
    print(events_by_category)

    # Pick the first column whose label mentions "date" (case-insensitive),
    # or None when there is no such column. The columns are scanned once,
    # and str() keeps a non-string column label from raising on .lower().
    date_col = next(
        (col for col in events.columns if 'date' in str(col).lower()),
        None,
    )
    if date_col is not None:
        print("\nEvents by date (first 20):")
        events_sorted = events.sort_values(date_col)
        # Show only those of the three summary columns that exist.
        display_cols = [col for col in ['category', date_col, 'pillar'] if col in events.columns]
        print(events_sorted[display_cols].head(20))

### 3.5 Impact Links Analysis

In [None]:
# Fetch the impact-links summary from the explorer. The total is always
# printed; every other section is printed only when its key is present.
print("=== IMPACT LINKS OVERVIEW ===")
summary = processor.explorer.get_impact_links_summary()
print(f"Total impact links: {summary['total_links']}")

optional_sections = (
    ('by_pillar', "\nImpact links by pillar:"),
    ('by_direction', "\nImpact direction distribution:"),
)
for section_key, section_heading in optional_sections:
    if section_key in summary:
        print(section_heading)
        print(summary[section_key])

# The top-linked-events table is only shown alongside the unique-parent
# count, never on its own.
has_parent_count = 'unique_parent_events' in summary
if has_parent_count:
    print(f"\nUnique parent events linked: {summary['unique_parent_events']}")
if has_parent_count and 'top_linked_events' in summary:
    print("\nTop events by number of links:")
    print(summary['top_linked_events'])

## 4. Data Quality Checks

In [None]:
# Run the explorer's data-quality validation and print the three headline
# results, then the per-value breakdown of invalid pillars when present.
print("=== DATA QUALITY CHECKS ===")
quality_issues = processor.explorer.validate_data_quality()

headline_checks = (
    ("\nDuplicate records in data sheet", 'duplicates_data'),
    ("Duplicate records in impact links", 'duplicates_impact_links'),
    ("Invalid pillar values", 'invalid_pillars'),
)
for check_label, check_key in headline_checks:
    print(f"{check_label}: {quality_issues[check_key]}")

if 'invalid_pillar_values' in quality_issues:
    print("\nInvalid pillar value counts:")
    print(quality_issues['invalid_pillar_values'])

## 5. Summary Statistics

In [None]:
# Print every entry of the explorer's summary statistics as "key: value".
print("=== DATASET SUMMARY ===")
summary = processor.explorer.get_summary_statistics()
for stat_name, stat_value in summary.items():
    print(f"{stat_name}: {stat_value}")

# Finish by running the processor's full exploration and keeping its result
# in `full_results`.
print("\n=== RUNNING FULL EXPLORATION ===")
full_results = processor.run_full_exploration()
print("Full exploration complete! All results stored in 'full_results' variable.")