# Task 1: Data Exploration and Enrichment

## Objective
Understand the starter dataset and enrich it with additional data useful for the forecasting task.

## Dataset Overview
- `ethiopia_fi_unified_data.xlsx` - Main dataset with observations, events, and targets
- `reference_codes.xlsx` - Valid values for categorical fields
- `Additional Data Points Guide.xlsx` - Guidance for data enrichment

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including pandas parsing and deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

# Display options: lift the column limit (None), cap printed rows at 100,
# and leave the line width unset (None)
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

## 1. Load and Examine the Datasets

In [None]:
# Location of the main workbook (relative to the notebook's directory)
file_path = '../data/raw/ethiopia_fi_unified_data.xlsx'

# Check available sheets.
# The workbook is opened in a `with` block so its file handle is closed as
# soon as the sheet names have been printed; the handle is only needed here.
with pd.ExcelFile(file_path) as excel_file:
    print("Available sheets:", excel_file.sheet_names)

In [None]:
# Read the 'data' sheet of the workbook into the main DataFrame,
# report its (rows, columns) shape and preview the first rows
df_main = pd.read_excel(file_path, sheet_name='data')
print(f"Main dataset shape: {df_main.shape}", "\nFirst few rows:", sep="\n")
df_main.head()

In [None]:
# Read the 'impact_links' sheet from the same workbook and preview it
df_impact = pd.read_excel(file_path, sheet_name='impact_links')
impact_shape = df_impact.shape
print(f"Impact links dataset shape: {impact_shape}")
print("\nFirst few rows:")
df_impact.head()

In [None]:
# Read the reference-code workbook (default sheet) and preview it
reference_path = '../data/raw/reference_codes.xlsx'
ref_codes = pd.read_excel(reference_path)
print(f"Reference codes shape: {ref_codes.shape}", "\nReference codes:", sep="\n")
ref_codes.head()

## 2. Understand the Schema Structure

In [None]:
# Print every column of the main dataset next to its 1-based position
print("Main dataset columns:")
for position, column_name in enumerate(df_main.columns, start=1):
    print(f"{position:2d}. {column_name}")

In [None]:
# Count rows per record_type, then express each count as a percentage of
# all rows in df_main (the denominator is len(df_main), not the counted total)
print("Record type distribution:")
record_counts = df_main['record_type'].value_counts()
print(record_counts)

print("\nPercentage distribution:")
total_records = len(df_main)
print(record_counts.div(total_records).mul(100))

In [None]:
# Keep only the rows whose record_type is 'observation' and preview
# the columns that describe what was measured, when, and by whom
is_observation = df_main['record_type'] == 'observation'
observations = df_main[is_observation]
print(f"Observations: {len(observations)} records")
print("\nObservation sample:")
observation_columns = [
    'indicator_code',
    'indicator',
    'value_numeric',
    'observation_date',
    'source_name',
]
observations[observation_columns].head()

In [None]:
# Keep only the rows whose record_type is 'event' and preview their
# date, category, description and source columns
is_event = df_main['record_type'] == 'event'
events = df_main[is_event]
print(f"Events: {len(events)} records", "\nEvent sample:", sep="\n")
event_columns = ['event_date', 'category', 'description', 'source_name']
events[event_columns].head()

In [None]:
# Keep only the rows whose record_type is 'target' and preview the
# indicator, its numeric value and the target date
is_target = df_main['record_type'] == 'target'
targets = df_main[is_target]
print(f"Targets: {len(targets)} records")
print("\nTarget sample:")
target_columns = ['indicator_code', 'indicator', 'value_numeric', 'target_date']
targets[target_columns].head()

## 3. Temporal Analysis

In [None]:
# Convert date columns to datetime.
# errors='coerce' turns anything unparseable into NaT. To keep that from
# silently discarding data, count the non-null values before and after the
# conversion and report any that were lost.
for date_column in ('observation_date', 'event_date', 'target_date'):
    non_null_before = df_main[date_column].notna().sum()
    df_main[date_column] = pd.to_datetime(df_main[date_column], errors='coerce')
    lost_values = non_null_before - df_main[date_column].notna().sum()
    if lost_values:
        print(f"Warning: {lost_values} value(s) in '{date_column}' could not be parsed as dates")

# Temporal range of observations (rows without a valid date are dropped)
obs_dates = df_main['observation_date'].dropna()
if obs_dates.empty:
    print("No valid observation dates found")
else:
    first_obs, last_obs = obs_dates.min(), obs_dates.max()
    print(f"Observation date range: {first_obs.date()} to {last_obs.date()}")
    # 365.25 days per year averages out leap years
    print(f"Observation span: {(last_obs - first_obs).days / 365.25:.1f} years")

In [None]:
# Earliest and latest event dates, ignoring rows without an event date
event_dates = df_main['event_date'].dropna()
if event_dates.empty:
    print("No valid event dates found")
else:
    earliest_event = event_dates.min().date()
    latest_event = event_dates.max().date()
    print(f"Event date range: {earliest_event} to {latest_event}")

## 4. Indicator Analysis

In [None]:
# Number of observation rows per indicator_code (most frequent first,
# as returned by value_counts)
observation_rows = df_main[df_main['record_type'] == 'observation']
indicators = observation_rows['indicator_code'].value_counts()
print(f"Total unique indicators: {len(indicators)}")
print("\nIndicator coverage:")
for code, n_observations in indicators.items():
    print(f"{code}: {n_observations} observations")

In [None]:
# Number of rows per value of the 'pillar' column
pillar_counts = df_main['pillar'].value_counts()
print("Pillar distribution:", pillar_counts, sep="\n")

## 5. Impact Links Analysis

In [None]:
# Print every column of the impact-links sheet next to its 1-based position
print("Impact links columns:")
for position, column_name in enumerate(df_impact.columns, start=1):
    print(f"{position:2d}. {column_name}")

In [None]:
# Preview the first five impact-link rows
print("Impact links sample:")
df_impact.head(n=5)

In [None]:
# Size of the impact-links table, frequency of each impact_direction value,
# and describe() statistics for impact_magnitude
print(f"Total impact links: {len(df_impact)}")
print("\nImpact direction distribution:")
direction_counts = df_impact['impact_direction'].value_counts()
print(direction_counts)

print("\nImpact magnitude summary:")
magnitude_stats = df_impact['impact_magnitude'].describe()
print(magnitude_stats)

## 6. Data Quality Assessment

In [None]:
# Per-column count and percentage of missing values in df_main;
# only columns with at least one missing value are printed
print("Missing values in main dataset:")
missing_data = df_main.isna().sum()
missing_percentage = missing_data.div(len(df_main)).mul(100)
missing_summary = pd.DataFrame(
    {'Missing Count': missing_data, 'Missing %': missing_percentage}
)
has_missing = missing_summary['Missing Count'] > 0
print(missing_summary[has_missing])

In [None]:
# Frequency of each confidence value, overall and cross-tabulated
# against record_type
confidence_dist = df_main['confidence'].value_counts()
print("Confidence level distribution:", confidence_dist, sep="\n")

print("\nConfidence by record type:")
confidence_by_type = pd.crosstab(
    index=df_main['record_type'],
    columns=df_main['confidence'],
)
print(confidence_by_type)

## 7. Reference Codes Analysis

In [None]:
# Preview the first ten rows of the reference-code table
print("Reference codes structure:")
ref_codes.head(n=10)

In [None]:
# If the table has a 'field_name' column, count the reference rows per
# field; otherwise fall back to listing the columns that are available
if 'field_name' not in ref_codes.columns:
    print("Reference codes columns:", ref_codes.columns.tolist())
else:
    print("Fields with reference codes:")
    field_counts = ref_codes['field_name'].value_counts()
    print(field_counts)

## 8. Summary Statistics

In [None]:
# Summary of key findings: record counts gathered by the earlier cells,
# followed by the list of analysis areas covered in this notebook
print("=== DATASET SUMMARY ===")
print(f"Total records: {len(df_main)}")
print(f"Observations: {len(observations)}")
print(f"Events: {len(events)}")
print(f"Targets: {len(targets)}")
print(f"Impact links: {len(df_impact)}")
print(f"Unique indicators: {len(indicators)}")
# Same guard as the temporal-analysis cell: with no valid observation dates,
# min()/max() are NaT, which has no usable .date(), and the summary would
# abort half-way through.
if len(obs_dates) > 0:
    print(f"Date range: {obs_dates.min().date()} to {obs_dates.max().date()}")
else:
    print("Date range: no valid observation dates found")

print("\n=== KEY INSIGHTS ===")
for insight in (
    "1. Dataset structure and completeness assessment",
    "2. Temporal coverage analysis",
    "3. Indicator diversity and coverage",
    "4. Event catalog and impact relationships",
    "5. Data quality and confidence levels",
):
    print(insight)