# Task 1: Data Enrichment

This notebook contains the code for enriching the dataset with additional observations, events, and impact links.

In [None]:
import sys
from pathlib import Path
import pandas as pd
from datetime import datetime
import logging

# Add src to path so the project modules imported below can be found.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run in the same kernel.
src_path = str(Path('../src').resolve())
if src_path not in sys.path:
    sys.path.append(src_path)

# Import Task1 classes
from task1_data_exploration import (
    Task1DataProcessor,
    DataEnricher,
    ObservationRecord,
    EventRecord,
    ImpactLinkRecord,
    ConfidenceLevel,
    ImpactDirection
)

# Logging: timestamped INFO-level output; the module logger created here is
# the one handed to the processor below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Input/output locations, all relative to the notebook's working directory.
data_dir = Path('../data/raw')
processed_dir = Path('../data/processed')
data_file = data_dir.joinpath('ethiopia_fi_unified_data.xlsx')
reference_codes_file = data_dir.joinpath('reference_codes.xlsx')

# Build the processor from the two input workbooks and the shared logger.
processor = Task1DataProcessor(
    logger=logger,
    reference_codes_file=reference_codes_file,
    data_file=data_file,
)

logger.info("Enrichment notebook initialized successfully")

## Load Existing Data

In [None]:
# Fetch the current records and impact links through the processor's enricher.
existing_data, existing_links = processor.enricher.load_existing_data()

# Report how much data is already present.
print(
    f"Existing records: {len(existing_data)}",
    f"Existing impact links: {len(existing_links)}",
    sep="\n",
)

# Highest existing 'id' value, reported for reference when creating new
# records; falls back to 0 when the frame has no 'id' column.
if 'id' in existing_data.columns:
    max_data_id = existing_data['id'].max()
else:
    max_data_id = 0
print(f"Max data ID: {max_data_id}")

# Preview the first rows of the existing data.
print("\nSample of existing data:", existing_data.head(), sep="\n")

## Add New Observations

Add observations following the schema:
- pillar, indicator, indicator_code, value_numeric, observation_date
- source_name, source_url, confidence
- collected_by, original_text, notes

In [None]:
# Register new observations with the enricher, one ObservationRecord each.
#
# Template (left commented out on purpose - copy it, substitute real values,
# then uncomment):
# observation = ObservationRecord(
#     pillar='access',
#     indicator='Account ownership',
#     indicator_code='ACC_001',
#     value_numeric=45.2,
#     observation_date='2023-01-01',
#     source_name='World Bank Findex',
#     source_url='https://example.com',
#     confidence=ConfidenceLevel.HIGH.value,
#     collected_by='Your Name',
#     original_text='45.2% of adults in Ethiopia have an account',
#     notes='Important baseline for access pillar'
# )
# processor.enricher.add_observation(observation)

# Real observations go here:
# processor.enricher.add_observation(ObservationRecord(...))

# Report how many observations the enricher currently holds as new.
queued_observations = processor.enricher.new_observations
print(f"New observations to add: {len(queued_observations)}")

## Add New Events

Add events following the schema:
- category (e.g., policy, product_launch, infrastructure)
- event_date
- source_name, source_url, confidence
- collected_by, original_text, notes
- **Note**: pillar should be left empty for events

In [None]:
# Register new events with the enricher, one EventRecord each.
#
# Template (left commented out on purpose - copy it, substitute real values,
# then uncomment):
# event = EventRecord(
#     category='policy',
#     event_date='2023-06-15',
#     source_name='National Bank of Ethiopia',
#     source_url='https://example.com',
#     confidence=ConfidenceLevel.HIGH.value,
#     collected_by='Your Name',
#     original_text='New digital payment policy announced',
#     notes='Could significantly impact access and usage'
# )
# processor.enricher.add_event(event)

# Real events go here:
# processor.enricher.add_event(EventRecord(...))

# Report how many events the enricher currently holds as new.
queued_events = processor.enricher.new_events
print(f"New events to add: {len(queued_events)}")

## Add New Impact Links

Add impact links following the schema:
- parent_id (the ID of the event this link refers to)
- pillar, related_indicator
- impact_direction, impact_magnitude, lag_months
- evidence_basis
- source_name, source_url, confidence
- collected_by, notes

In [None]:
# Register new impact links with the enricher, one ImpactLinkRecord each.
#
# Template (left commented out on purpose - copy it, substitute real values,
# then uncomment):
# impact_link = ImpactLinkRecord(
#     parent_id='EVENT_123',  # ID of the event this links to
#     pillar='access',
#     related_indicator='ACC_001',
#     impact_direction=ImpactDirection.POSITIVE.value,
#     impact_magnitude=5.0,  # Optional
#     lag_months=6,  # Optional
#     evidence_basis='Historical analysis of similar policies',
#     source_name='Research Paper',
#     source_url='https://example.com',
#     confidence=ConfidenceLevel.MEDIUM.value,
#     collected_by='Your Name',
#     notes='Policy expected to increase account ownership'
# )
# processor.enricher.add_impact_link(impact_link)

# Real impact links go here:
# processor.enricher.add_impact_link(ImpactLinkRecord(...))

# Report how many impact links the enricher currently holds as new.
queued_impact_links = processor.enricher.new_impact_links
print(f"New impact links to add: {len(queued_impact_links)}")

## Merge and Save Enriched Dataset

In [None]:
# Merge all enrichments and save using the processor
output_file = processed_dir / 'ethiopia_fi_unified_data_enriched.xlsx'

# Make sure the destination folder exists before saving; on a fresh checkout
# the processed-data directory may not have been created yet.
output_file.parent.mkdir(parents=True, exist_ok=True)

enriched_data, enriched_links = processor.enrich_and_save(output_file)

# Get enrichment summary (counts of newly added records, keyed by name)
summary = processor.enricher.get_enrichment_summary()

print("\n=== ENRICHMENT SUMMARY ===")
print(f"Total records: {len(enriched_data)}")
print(f"Total impact links: {len(enriched_links)}")
print("\nNew records added:")
print(f"  - Observations: {summary['new_observations']}")
print(f"  - Events: {summary['new_events']}")
print(f"  - Impact Links: {summary['new_impact_links']}")
print(f"  - Total new records: {summary['total_new_records']}")
print(f"\nEnriched dataset saved to: {output_file}")

## Verify Enrichments

In [None]:
# Sanity-check the merged outputs: overall sizes plus per-category breakdowns
# when the relevant column is present.
print("=== ENRICHED DATA SUMMARY ===")
print(f"Total records: {len(enriched_data)}")
has_record_type = 'record_type' in enriched_data.columns
if has_record_type:
    print(f"\nBy record type:")
    record_type_counts = enriched_data['record_type'].value_counts()
    print(record_type_counts)

print(f"\n=== ENRICHED IMPACT LINKS SUMMARY ===")
print(f"Total impact links: {len(enriched_links)}")
has_pillar = 'pillar' in enriched_links.columns
if has_pillar:
    print(f"\nBy pillar:")
    pillar_counts = enriched_links['pillar'].value_counts()
    print(pillar_counts)

# Show the last ten rows of the enriched data.
print("\n=== SAMPLE OF ENRICHED DATA ===")
enriched_tail = enriched_data.tail(10)
print(enriched_tail)