# Data Acquisition for Food Insecurity Early Warning

**Author**: Victor Collins Oppon

**MSc Data Science Dissertation, Middlesex University 2025**

---

## Overview

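This notebook acquires and verifies all external data sources (geographic boundaries are downloaded automatically; IPC and GDELT data must be obtained manually if they are not already on disk):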
- **IPC data**: FEWSNET IPC assessments (55,129 records, 2021-2024)
- **GDELT data**: Global news event database (7.6M articles, 5.2M locations)
- **Geographic boundaries**: GADM, Natural Earth, IPC custom boundaries

**Runtime**: ~30 minutes (depending on network speed)

**Outputs**:
- `data/external/ipc/ipcFic_Africa_Current_Only.csv`
- `data/external/gdelt/african_gkg_locations_aligned.parquet`
- `data/external/shapefiles/`

## 1. Setup

In [None]:
# Import libraries
import os
import sys
from pathlib import Path
import requests
import zipfile
from io import BytesIO
import pandas as pd
import geopandas as gpd
from tqdm.notebook import tqdm
import time

# Make the parent of the current working directory importable so that the
# `config` module can be resolved (presumably config.py sits in that
# directory — verify if the notebook is launched from somewhere else).
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Shared path constants used throughout this notebook
from config import (
    BASE_DIR,
    EXTERNAL_DATA_DIR,
    IPC_DIR,
    GDELT_DIR,
    SHAPEFILES_DIR,
    IPC_FILE,
    GDELT_LOCATIONS,
)

# Echo the resolved locations so a misconfigured path is obvious immediately
for label, location in (("Project root", BASE_DIR), ("External data", EXTERNAL_DATA_DIR)):
    print(f"{label}: {location}")

## 2. IPC Data Acquisition

The Integrated Food Security Phase Classification (IPC) provides ground truth labels for food insecurity.

In [None]:
# Summarise the IPC file when it is on disk; otherwise print the manual
# download instructions (this cell never downloads anything itself).
if not IPC_FILE.exists():
    ipc_instructions = [
        "IPC data not found.",
        "",
        "MANUAL DOWNLOAD REQUIRED:",
        "1. Visit: https://fews.net/fews-data/335",
        "2. Download: Current IPC classifications for Africa",
        f"3. Save to: {IPC_FILE}",
        "",
        "OR download the pre-processed data archive from Zenodo:",
        "https://doi.org/10.5281/zenodo.XXXXXXX",
    ]
    print("\n".join(ipc_instructions))
else:
    print(f"✓ IPC data already exists: {IPC_FILE}")
    ipc_df = pd.read_csv(IPC_FILE)

    # Headline figures: row count, distinct countries, covered period
    print(f"  Records: {len(ipc_df):,}")
    print(f"  Countries: {ipc_df['country'].nunique()}")
    analysis_dates = ipc_df['analysis_date']
    print(f"  Date range: {analysis_dates.min()} to {analysis_dates.max()}")

### IPC Data Preview

In [None]:
if IPC_FILE.exists():
    # Re-read the CSV so this cell does not depend on the previous one
    ipc_df = pd.read_csv(IPC_FILE)

    # First rows, rendered as a rich table
    print("IPC Data Sample:")
    display(ipc_df.head())

    # Number of records per IPC phase, ordered by phase value
    print("\nIPC Phase Distribution:")
    phase_counts = ipc_df['ipc_phase'].value_counts()
    print(phase_counts.sort_index())

    # Ten countries with the most records
    print("\nCountry Coverage:")
    records_per_country = ipc_df.groupby('country').size()
    country_counts = records_per_country.sort_values(ascending=False)
    display(country_counts.head(10))

## 3. GDELT Data Acquisition

GDELT (Global Database of Events, Language, and Tone) provides news event data.

**Note**: The full GDELT dataset is 47GB. We use a pre-processed version (613MB) that contains only African location mentions.

In [None]:
# Summarise the pre-processed GDELT file when it is on disk; otherwise print
# instructions for obtaining or rebuilding it (nothing is downloaded here).
if not GDELT_LOCATIONS.exists():
    gdelt_instructions = [
        "GDELT data not found.",
        "",
        "The pre-processed GDELT data (613MB) is available in the data archive:",
        "https://doi.org/10.5281/zenodo.XXXXXXX",
        "",
        "To process from raw GDELT:",
        "1. Download GDELT 2.0 GKG files from https://www.gdeltproject.org/",
        "2. Filter for African locations",
        "3. Extract location coordinates",
        "4. Save as parquet for efficiency",
        "",
        "WARNING: Processing raw GDELT takes ~12 hours and requires 100GB+ disk space",
    ]
    print("\n".join(gdelt_instructions))
else:
    print(f"✓ GDELT data already exists: {GDELT_LOCATIONS}")

    # Note: this reads the whole parquet file into memory, not a sample
    gdelt_df = pd.read_parquet(GDELT_LOCATIONS)

    print(f"  Total location mentions: {len(gdelt_df):,}")
    print(f"  Unique articles: {gdelt_df['gkg_id'].nunique():,}")
    mention_dates = gdelt_df['date']
    print(f"  Date range: {mention_dates.min()} to {mention_dates.max()}")

### GDELT Data Preview

In [None]:
if GDELT_LOCATIONS.exists():
    # pandas.read_parquet has no `nrows` option: unknown keywords are forwarded
    # to the parquet engine, which rejects them with a TypeError. Read the file
    # once and slice the preview from it; this also avoids a second full read
    # for the country ranking below.
    full_df = pd.read_parquet(GDELT_LOCATIONS)
    gdelt_sample = full_df.head(1000)

    print("GDELT Data Sample:")
    display(gdelt_sample.head())

    print("\nColumn Info:")
    print(gdelt_sample.dtypes)

    # Ranking is computed over the full table, not the 1,000-row preview
    print("\nTop Countries by Location Mentions:")
    country_mentions = full_df['country'].value_counts().head(10)
    display(country_mentions)

## 4. Download Geographic Boundaries

Download administrative boundaries from GADM and Natural Earth.

### 4.1 Download Natural Earth (Country Boundaries)

In [None]:
# Natural Earth 1:50m Admin 0 (countries)
# Downloads and unpacks the country-boundary archive into
# SHAPEFILES_DIR/natural_earth unless the shapefile is already present.
ne_dir = SHAPEFILES_DIR / "natural_earth"
ne_dir.mkdir(parents=True, exist_ok=True)

ne_file = ne_dir / "ne_50m_admin_0_countries.shp"

if ne_file.exists():
    print(f"✓ Natural Earth data already exists: {ne_file}")
else:
    print("Downloading Natural Earth countries (1:50m scale)...")

    url = "https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip"

    try:
        # A timeout prevents the cell from hanging indefinitely on a stalled
        # connection. The body is read fully into memory below, so streaming
        # the response would bring no benefit.
        response = requests.get(url, timeout=60)
        # Fail early on HTTP errors instead of trying to unzip an error page
        response.raise_for_status()

        with zipfile.ZipFile(BytesIO(response.content)) as z:
            z.extractall(ne_dir)

        # Only report success if the expected shapefile actually materialised
        if ne_file.exists():
            print(f"✓ Downloaded and extracted to {ne_dir}")
        else:
            print(f"✗ Archive extracted to {ne_dir}, but {ne_file.name} was not found")
    except Exception as e:
        # Deliberately broad: this is a best-effort download and the notebook
        # should continue with manual-download instructions on any failure.
        print(f"Error downloading Natural Earth: {e}")
        print("Please download manually from:")
        print("https://www.naturalearthdata.com/downloads/50m-cultural-vectors/")

### 4.2 Download GADM (District Boundaries)

In [None]:
# GADM administrative boundaries
# Downloads the level 2 shapefile components for each study country into
# SHAPEFILES_DIR/gadm, skipping countries whose .shp file already exists.
gadm_dir = SHAPEFILES_DIR / "gadm"
gadm_dir.mkdir(parents=True, exist_ok=True)

# List of 18 countries in final analysis
countries = ['ZWE', 'SDN', 'COD', 'NGA', 'MOZ', 'MLI', 'KEN', 'ETH', 
             'MWI', 'SOM', 'TCD', 'NER', 'CMR', 'UGA', 'BDI', 'MDG', 'SSD', 'BFA']

print(f"Downloading GADM data for {len(countries)} countries...")
print("This may take 10-15 minutes depending on network speed.")
print("")

# ISO code -> reason, so failures are reported instead of silently skipped
failed_downloads = {}

for country_iso in tqdm(countries, desc="Countries"):
    country_file = gadm_dir / f"gadm41_{country_iso}_2.shp"

    if country_file.exists():
        continue

    # Download GADM level 2 (districts)
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/shp/gadm41_{country_iso}_shp.zip"

    try:
        response = requests.get(url, timeout=60)
        # Turn non-200 responses into a reported error rather than a silent skip
        response.raise_for_status()

        with zipfile.ZipFile(BytesIO(response.content)) as z:
            # Extract only level 2 files
            for member in z.namelist():
                if '_2.' in member:
                    z.extract(member, gadm_dir)

        # The archive may not contain level 2 data for this country
        if not country_file.exists():
            failed_downloads[country_iso] = "level 2 shapefile not found after extraction"

    except Exception as e:
        # Best-effort: log the problem and carry on with the next country
        print(f"\nError downloading {country_iso}: {e}")
        failed_downloads[country_iso] = str(e)

    finally:
        time.sleep(1)  # Be nice to server

if failed_downloads:
    print(f"\n✗ GADM download finished with {len(failed_downloads)} failure(s):")
    for failed_iso, reason in failed_downloads.items():
        print(f"  {failed_iso}: {reason}")
else:
    print("\n✓ GADM download complete")

### 4.3 Verify Shapefiles

In [None]:
# Collect the shapefiles found on disk for each boundary source
natural_earth_dir = SHAPEFILES_DIR / "natural_earth"
gadm_level2_dir = SHAPEFILES_DIR / "gadm"
ne_shapefiles = [*natural_earth_dir.glob("*.shp")]
gadm_shapefiles = [*gadm_level2_dir.glob("*_2.shp")]

print(f"Natural Earth shapefiles: {len(ne_shapefiles)}")
print(f"GADM shapefiles (level 2): {len(gadm_shapefiles)}")

# Smoke test: open the first file from each source to confirm it is readable
if len(ne_shapefiles) > 0:
    ne_gdf = gpd.read_file(ne_shapefiles[0])
    print(f"\nNatural Earth sample: {len(ne_gdf)} countries")

if len(gadm_shapefiles) > 0:
    test_file = gadm_shapefiles[0]
    test_gdf = gpd.read_file(test_file)
    print(f"\nGADM sample ({test_file.name}): {len(test_gdf)} districts")

## 5. Data Summary

In [None]:
# Final report: one line per data source with its on-disk size, or a
# NOT FOUND marker when the file is absent.
banner = "=" * 70
print(banner)
print("DATA ACQUISITION SUMMARY")
print(banner)

# One representative file per source (for shapefiles only the .shp component
# is checked, so the reported size excludes sidecar files)
data_sources = {
    'IPC data': IPC_FILE,
    'GDELT locations': GDELT_LOCATIONS,
    'Natural Earth': SHAPEFILES_DIR / "natural_earth" / "ne_50m_admin_0_countries.shp",
    'GADM (sample)': SHAPEFILES_DIR / "gadm" / "gadm41_ZWE_2.shp",
}

for name, path in data_sources.items():
    if not path.exists():
        print(f"✗ {name:20s}: NOT FOUND")
        continue
    size_mb = path.stat().st_size / (1024**2)
    print(f"✓ {name:20s}: {size_mb:>8.1f} MB")

print(banner)
print("")
print("Next step: 02_Data_Processing.ipynb")
print("This notebook aggregates GDELT articles and locations to IPC districts.")