# Fetch Kolada Data

This notebook fetches actual data values from the Kolada API for specified KPIs, municipalities, and years.

**API Endpoints:**
- `http://api.kolada.se/v2/data/kpi/{kpi}/municipality/{municipality}/year/{year}`
- `http://api.kolada.se/v2/oudata/kpi/{kpi}/ou/{ou}/year/{year}`

**Parameters to customize:**
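- KPI IDs (`KPI_IDS`, a Python list of strings; the IDs are joined with commas in the request URL)
- Municipality IDs (`MUNICIPALITY_IDS`, a Python list of strings; leave empty to fetch all municipalities)
- Years (`YEARS`, a Python list of year strings; joined with commas in the request URL)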

**Output:** Kolada data tables in Lakehouse

In [None]:
import requests
import json
import pandas as pd
from datetime import datetime
import time
from typing import List, Optional

In [None]:
# Configuration
# Root URL of the Kolada v2 API; the endpoint paths are appended to it.
API_BASE_URL = "http://api.kolada.se/v2"
# Page size sent as the `per_page` query parameter on each request URL.
PER_PAGE = 5000

# Data fetch parameters - CUSTOMIZE THESE
# Example KPIs (you can add more)
KPI_IDS = ["N00945", "N00946"]  # Add your KPI IDs here

# Example municipalities (empty means all)
MUNICIPALITY_IDS = []  # e.g., ["1860", "0180"] or leave empty for all

# Years to fetch (strings, because they are joined into the URL path)
YEARS = ["2020", "2021", "2022", "2023"]  # Customize years

# Batch size for API requests (to avoid URL length limits)
BATCH_SIZE = 10

In [None]:
def fetch_data_paginated(url: str, *, raise_on_error: bool = False) -> List[dict]:
    """
    Fetch data from Kolada API with pagination support.

    Follows the ``next_page`` link of each response until no further page is
    advertised, collecting the entries of every page's ``values`` list.

    By default the function is best-effort: on a failed request it prints the
    error and returns whatever was collected so far, so the result may be
    incomplete. Pass ``raise_on_error=True`` to propagate the failure instead.

    Args:
        url: Initial API URL (a falsy value results in an empty list)
        raise_on_error: Re-raise request/parse errors instead of returning
            partial data

    Returns:
        List of data objects

    Raises:
        requests.exceptions.RequestException: Only when ``raise_on_error`` is
            true and a request fails.
        ValueError: Only when ``raise_on_error`` is true and the response is
            not a JSON object or the pagination links form a loop.
    """
    all_data = []
    page_count = 0
    visited_urls = set()

    while url:
        try:
            # A next_page link pointing at a page we already fetched would
            # otherwise keep this loop running forever.
            if url in visited_urls:
                raise ValueError(f"pagination loop detected at {url}")
            visited_urls.add(url)

            response = requests.get(url, timeout=30)
            response.raise_for_status()

            data = response.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected response payload of type {type(data).__name__}"
                )
        # ValueError also covers JSON decoding failures on requests versions
        # whose JSONDecodeError is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"  Error fetching data: {e}")
            if raise_on_error:
                raise
            if all_data:
                print(f"  Warning: returning partial result ({len(all_data)} items from {page_count} page(s))")
            break

        values = data.get('values')
        if values is not None:
            all_data.extend(values)
            page_count += 1
            print(f"  Page {page_count}: Retrieved {len(values)} items (Total: {len(all_data)})")

        # Check for next page
        url = data.get('next_page', None)

        # Be nice to the API
        if url:
            time.sleep(0.5)

    return all_data

In [None]:
def fetch_kolada_data(kpi_ids: List[str], municipality_ids: Optional[List[str]], years: List[str]) -> pd.DataFrame:
    """
    Fetch Kolada data for specified parameters.

    KPI IDs are requested in batches of ``BATCH_SIZE`` so that a long KPI list
    does not produce an over-long URL. Each response item is flattened into
    one row per entry of its ``values`` list.

    Args:
        kpi_ids: List of KPI IDs
        municipality_ids: List of municipality IDs (None or empty for all)
        years: List of years

    Returns:
        DataFrame with all fetched data. The columns are always present, even
        when no rows were retrieved (the DataFrame is then empty).
    """
    columns = [
        'kpi', 'municipality', 'period', 'gender', 'value',
        'count', 'status', 'ingestion_timestamp', 'source_system',
    ]

    # Without a KPI or a year the URL path would contain an empty segment;
    # skip the request and return an empty result instead.
    if not kpi_ids or not years:
        print("No KPI IDs or years supplied - nothing to fetch")
        return pd.DataFrame(columns=columns)

    # str() lets callers pass ints (e.g. years) as well as strings.
    kpi_list = [str(kpi_id) for kpi_id in kpi_ids]
    year_param = ','.join(str(year) for year in years)

    municipality_segment = ''
    if municipality_ids:
        municipality_param = ','.join(str(m) for m in municipality_ids)
        municipality_segment = f"/municipality/{municipality_param}"

    batch_size = max(1, BATCH_SIZE)
    # One timestamp for the whole call so every row of a load shares it.
    ingested_at = datetime.now()
    all_records = []

    for start in range(0, len(kpi_list), batch_size):
        kpi_param = ','.join(kpi_list[start:start + batch_size])
        url = f"{API_BASE_URL}/data/kpi/{kpi_param}{municipality_segment}/year/{year_param}?per_page={PER_PAGE}"

        print(f"Fetching data from: {url}")

        # Flatten the data structure
        for item in fetch_data_paginated(url):
            kpi = item.get('kpi')
            municipality = item.get('municipality')
            period = item.get('period')

            # Each item can have multiple values (by gender); a missing or
            # null 'values' entry simply contributes no rows.
            for value_item in item.get('values') or []:
                all_records.append({
                    'kpi': kpi,
                    'municipality': municipality,
                    'period': period,
                    'gender': value_item.get('gender'),
                    'value': value_item.get('value'),
                    'count': value_item.get('count'),
                    'status': value_item.get('status'),
                    'ingestion_timestamp': ingested_at,
                    'source_system': 'Kolada API'
                })

    return pd.DataFrame(all_records, columns=columns)

In [None]:
def fetch_ou_data(kpi_ids: List[str], ou_ids: Optional[List[str]], years: List[str]) -> pd.DataFrame:
    """
    Fetch Kolada organizational unit data for specified parameters.

    KPI IDs are requested in batches of ``BATCH_SIZE`` so that a long KPI list
    does not produce an over-long URL. Each response item is flattened into
    one row per entry of its ``values`` list.

    Args:
        kpi_ids: List of KPI IDs
        ou_ids: List of OU IDs (None or empty for all)
        years: List of years

    Returns:
        DataFrame with all fetched data. The columns are always present, even
        when no rows were retrieved (the DataFrame is then empty).
    """
    columns = [
        'kpi', 'ou', 'period', 'gender', 'value',
        'count', 'status', 'ingestion_timestamp', 'source_system',
    ]

    # Without a KPI or a year the URL path would contain an empty segment;
    # skip the request and return an empty result instead.
    if not kpi_ids or not years:
        print("No KPI IDs or years supplied - nothing to fetch")
        return pd.DataFrame(columns=columns)

    # str() lets callers pass ints (e.g. years) as well as strings.
    kpi_list = [str(kpi_id) for kpi_id in kpi_ids]
    year_param = ','.join(str(year) for year in years)

    ou_segment = ''
    if ou_ids:
        ou_param = ','.join(str(ou_id) for ou_id in ou_ids)
        ou_segment = f"/ou/{ou_param}"

    batch_size = max(1, BATCH_SIZE)
    # One timestamp for the whole call so every row of a load shares it.
    ingested_at = datetime.now()
    all_records = []

    for start in range(0, len(kpi_list), batch_size):
        kpi_param = ','.join(kpi_list[start:start + batch_size])
        url = f"{API_BASE_URL}/oudata/kpi/{kpi_param}{ou_segment}/year/{year_param}?per_page={PER_PAGE}"

        print(f"Fetching OU data from: {url}")

        # Flatten the data structure
        for item in fetch_data_paginated(url):
            kpi = item.get('kpi')
            ou = item.get('ou')
            period = item.get('period')

            # Each item can have multiple values (by gender); a missing or
            # null 'values' entry simply contributes no rows.
            for value_item in item.get('values') or []:
                all_records.append({
                    'kpi': kpi,
                    'ou': ou,
                    'period': period,
                    'gender': value_item.get('gender'),
                    'value': value_item.get('value'),
                    'count': value_item.get('count'),
                    'status': value_item.get('status'),
                    'ingestion_timestamp': ingested_at,
                    'source_system': 'Kolada API'
                })

    return pd.DataFrame(all_records, columns=columns)

In [None]:
# Fetch municipality data: print the request parameters, run the fetch,
# then preview the result.
banner = "=" * 60
print(banner)
print("Fetching Municipality Data")
print(banner)
print(f"KPIs: {KPI_IDS}")
municipality_label = MUNICIPALITY_IDS if MUNICIPALITY_IDS else 'All'
print(f"Municipalities: {municipality_label}")
print(f"Years: {YEARS}")
print()

# An empty municipality list means "all", which is passed on as None.
df_data = fetch_kolada_data(KPI_IDS, MUNICIPALITY_IDS or None, YEARS)

if df_data.empty:
    print("No data retrieved")
else:
    row_count = len(df_data)
    column_names = list(df_data.columns)
    print(f"\nFetched {row_count} data points")
    print(f"DataFrame shape: {df_data.shape}")
    print(f"Columns: {column_names}")
    display(df_data.head(10))

In [None]:
# Save municipality data to Lakehouse.
# The Delta table write is tried first; if it fails for any reason the
# DataFrame is written as a Parquet file instead.
if not df_data.empty:
    try:
        table_path = "Tables/kolada_data"

        # Use overwrite mode (change to 'append' for incremental loads)
        spark_frame = spark.createDataFrame(df_data)
        delta_writer = spark_frame.write.format("delta").mode("overwrite")
        delta_writer.save(table_path)

        print(f"\n✓ Successfully wrote {len(df_data)} rows to kolada_data")
        print(f"  Table path: {table_path}")
    except Exception as write_error:
        print(f"\n✗ Error writing to Lakehouse: {write_error}")
        print(f"  Attempting to save as Parquet file instead...")

        try:
            file_path = "Files/kolada_data.parquet"
            df_data.to_parquet(file_path, index=False)
            print(f"  ✓ Saved to {file_path}")
        except Exception as parquet_error:
            print(f"  ✗ Error saving Parquet: {parquet_error}")

In [None]:
# Data quality summary: row counts per KPI, period and gender, plus the
# number of missing values. Skipped entirely when nothing was fetched.
if not df_data.empty:
    total_points = len(df_data)

    print("\n" + "="*60)
    print("DATA SUMMARY")
    print("="*60)

    print(f"\nTotal data points: {total_points}")

    kpi_counts = df_data['kpi'].value_counts()
    print(f"\nData points by KPI:")
    print(kpi_counts)

    # Periods are shown in chronological (index) order rather than by frequency.
    period_counts = df_data['period'].value_counts().sort_index()
    print(f"\nData points by period:")
    print(period_counts)

    gender_counts = df_data['gender'].value_counts()
    print(f"\nData points by gender:")
    print(gender_counts)

    missing_values = df_data['value'].isnull().sum()
    print(f"\nNull values:")
    print(missing_values, "out of", total_points)

    # The per-municipality breakdown is only shown when specific
    # municipalities were requested.
    if MUNICIPALITY_IDS:
        municipality_counts = df_data['municipality'].value_counts()
        print(f"\nData points by municipality:")
        print(municipality_counts)

## Optional: Fetch Organizational Unit Data

Uncomment and customize the cell below to fetch organizational-unit-level data.

In [None]:
# # Example OU data fetch (uncomment to use)
# OU_KPI_IDS = ["N15033", "N15030"]  # KPIs with OU data
# OU_IDS = []  # Specific OUs or leave empty for all
# OU_YEARS = ["2020", "2021", "2022"]

# print("="*60)
# print("Fetching Organizational Unit Data")
# print("="*60)
# print(f"KPIs: {OU_KPI_IDS}")
# print(f"OUs: {OU_IDS if OU_IDS else 'All'}")
# print(f"Years: {OU_YEARS}")
# print()

# df_ou_data = fetch_ou_data(OU_KPI_IDS, OU_IDS if OU_IDS else None, OU_YEARS)

# if not df_ou_data.empty:
#     print(f"\nFetched {len(df_ou_data)} OU data points")
#     display(df_ou_data.head(10))
    
#     # Save to Lakehouse
#     try:
#         table_path = "Tables/kolada_ou_data"
#         spark.createDataFrame(df_ou_data).write.format("delta").mode("overwrite").save(table_path)
#         print(f"\n✓ Successfully wrote {len(df_ou_data)} rows to kolada_ou_data")
#     except Exception as e:
#         print(f"\n✗ Error writing to Lakehouse: {e}")
# else:
#     print("No OU data retrieved")