# Fetch KPI Metadata from Kolada API

This notebook fetches all KPI (Key Performance Indicator) metadata from the Kolada API and stores it in the Lakehouse.

**API Endpoint:** `http://api.kolada.se/v2/kpi`

**Output:** Delta table `kpi_metadata` in the Lakehouse (overwritten on every run)

In [None]:
import requests
import json
import pandas as pd
from datetime import datetime
import time

In [None]:
# Configuration
# Base URL and resource name are joined as f"{API_BASE_URL}/{ENDPOINT}" to
# build the request URL used by fetch_all_kpi_metadata().
API_BASE_URL = "http://api.kolada.se/v2"
ENDPOINT = "kpi"
# Sent as the `per_page` query parameter on the first request.
PER_PAGE = 5000

# Lakehouse table name
# Passed to saveAsTable() when the fetched metadata is written out.
TABLE_NAME = "kpi_metadata"

In [None]:
def fetch_all_kpi_metadata():
    """
    Fetch all KPI metadata from Kolada API with pagination support.

    The first request goes to ``{API_BASE_URL}/{ENDPOINT}?per_page={PER_PAGE}``;
    each following request uses the ``next_page`` URL from the previous
    response, until a response carries no ``next_page``.

    Fetching is best-effort. The loop stops early, without raising, when:
      * the request fails or returns an HTTP error status,
      * the response body is not valid JSON,
      * the decoded body is not a JSON object,
      * the API hands back a ``next_page`` URL that was already fetched.
    In all of these cases the KPIs collected so far are returned and a
    warning is printed so that an incomplete result is visible to the caller.

    Returns:
        list: List of all KPI metadata objects
    """
    all_data = []
    url = f"{API_BASE_URL}/{ENDPOINT}?per_page={PER_PAGE}"

    page_count = 0
    seen_urls = set()   # guards against a next_page cycle
    complete = True     # flipped to False whenever the loop stops early

    # One session for the whole crawl so the connection is reused across pages.
    with requests.Session() as session:
        while url:
            if url in seen_urls:
                print(f"Error fetching data: page URL repeated, stopping ({url})")
                complete = False
                break
            seen_urls.add(url)

            try:
                print(f"Fetching page {page_count + 1}...")
                response = session.get(url, timeout=30)
                response.raise_for_status()
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers JSON decode failures on requests versions
                # whose JSONDecodeError is not a RequestException subclass.
                print(f"Error fetching data: {e}")
                complete = False
                break

            if not isinstance(data, dict):
                print(f"Error fetching data: unexpected response type {type(data).__name__}")
                complete = False
                break

            if 'values' in data:
                # Treat an explicit null like an empty page.
                page_values = data['values'] or []
                all_data.extend(page_values)
                print(f"  Retrieved {len(page_values)} KPIs (Total: {len(all_data)})")

            # Check for next page
            url = data.get('next_page')
            page_count += 1

            # Be nice to the API
            if url:
                time.sleep(0.5)

    if not complete:
        print("Warning: fetch stopped early; the returned KPI list may be incomplete")

    print(f"\nTotal KPIs fetched: {len(all_data)}")
    return all_data

In [None]:
# Log where and when the download starts, then pull every KPI record.
# The three banner lines are emitted by a single print, one per line.
print(
    "Starting KPI metadata fetch...",
    f"API URL: {API_BASE_URL}/{ENDPOINT}",
    f"Timestamp: {datetime.now()}\n",
    sep="\n",
)

kpi_data = fetch_all_kpi_metadata()

In [None]:
# Turn the fetched KPI records into a pandas DataFrame and preview it.
if not kpi_data:
    print("No data retrieved")
else:
    # Tag every row with when it was ingested and which system it came from.
    df_kpi = pd.DataFrame(kpi_data).assign(
        ingestion_timestamp=datetime.now(),
        source_system='Kolada API',
    )

    print(f"DataFrame shape: {df_kpi.shape}")
    print(f"\nColumn names: {list(df_kpi.columns)}")
    print("\nFirst few rows:")
    display(df_kpi.head())

In [None]:
# Persist the KPI DataFrame as a Delta table, replacing any existing contents.
if kpi_data:
    spark_df = spark.createDataFrame(df_kpi)
    (
        spark_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(TABLE_NAME)
    )

    rows_written = len(df_kpi)
    print(f"\n✓ Successfully wrote {rows_written} KPIs to {TABLE_NAME}")

In [None]:
# Print the row count and a value-count breakdown for selected columns.
if kpi_data:
    print("\n=== Summary Statistics ===")
    print(f"Total KPIs: {len(df_kpi)}")

    # (heading, column) pairs, reported in this order.
    breakdowns = (
        ("\nKPIs by municipality type:", 'municipality_type'),
        ("\nKPIs divided by gender:", 'is_divided_by_gender'),
        ("\nKPIs with OU data:", 'has_ou_data'),
    )
    for heading, column in breakdowns:
        print(heading)
        print(df_kpi[column].value_counts())