# Fetch Municipality and Groups Metadata from Kolada API

This notebook fetches municipality, municipality group, KPI group, and organizational unit metadata from the Kolada API and stores it in the Lakehouse.

**API Endpoints:**
- `http://api.kolada.se/v2/municipality`
- `http://api.kolada.se/v2/municipality_groups`
- `http://api.kolada.se/v2/ou`
- `http://api.kolada.se/v2/kpi_groups`

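**Output:** Metadata tables in the Lakehouse: `municipality_metadata`, `municipality_groups_metadata`, `municipality_group_members`, `kpi_groups_metadata`, `kpi_group_members`, and `organizational_units_metadata`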

In [None]:
import requests
import json
import pandas as pd
from datetime import datetime
import time

In [None]:
# Configuration
# Base URL that every endpoint path in this notebook is appended to.
API_BASE_URL = "http://api.kolada.se/v2"
# Value sent as the `per_page` query parameter on the first request of each fetch.
PER_PAGE = 5000

In [None]:
def fetch_metadata(endpoint, table_name):
    """
    Fetch every page of a Kolada metadata endpoint into a DataFrame.

    Requests ``{API_BASE_URL}/{endpoint}?per_page={PER_PAGE}`` and then keeps
    following the ``next_page`` URL found in each response until none is
    returned. Items under the ``values`` key of each page are accumulated.

    Fetching is best-effort: if a request fails, or a response body is not a
    JSON object, the loop stops and whatever was collected so far is returned.
    A warning is printed in that case so a partial result is not mistaken for
    a complete load.

    Args:
        endpoint: API endpoint (e.g., 'municipality', 'kpi_groups')
        table_name: Name for the output table. Not used by this function;
            kept so existing callers keep working.

    Returns:
        DataFrame: Pandas DataFrame with the fetched items plus the
        ``ingestion_timestamp`` and ``source_system`` columns, or ``None``
        when no items were fetched.
    """
    print(f"\n{'='*60}")
    print(f"Fetching {endpoint} metadata")
    print(f"{'='*60}")

    all_data = []
    url = f"{API_BASE_URL}/{endpoint}?per_page={PER_PAGE}"

    page_count = 0
    stopped_early = False

    while url:
        print(f"Fetching page {page_count + 1}...")
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError is caught as well because a non-JSON body is not
            # reported as a RequestException by every requests release.
            print(f"Error fetching data: {e}")
            stopped_early = True
            break

        if not isinstance(data, dict):
            # The pagination logic below needs a JSON object; anything else
            # (e.g. a bare list) is treated like a failed request.
            print(f"Error fetching data: unexpected response type {type(data).__name__}")
            stopped_early = True
            break

        if 'values' in data:
            all_data.extend(data['values'])
            print(f"  Retrieved {len(data['values'])} items (Total: {len(all_data)})")

        # Check for next page
        url = data.get('next_page', None)
        page_count += 1

        # Be nice to the API
        if url:
            time.sleep(0.5)

    print(f"Total items fetched: {len(all_data)}")

    if stopped_early and all_data:
        print(f"Warning: fetch of {endpoint} stopped early; returning partial data "
              f"from {page_count} page(s)")

    if not all_data:
        return None

    df = pd.DataFrame(all_data)
    df['ingestion_timestamp'] = datetime.now()
    df['source_system'] = 'Kolada API'
    return df

In [None]:
def save_to_lakehouse(df, table_name):
    """
    Persist a pandas DataFrame to the Lakehouse, replacing any earlier copy.

    The frame is converted with ``spark.createDataFrame`` and written in Delta
    format (overwrite mode) to ``Tables/<table_name>``. If that raises, the
    frame is written with pandas to ``Files/<table_name>.parquet`` instead.
    Exceptions from either attempt are caught and reported with ``print``.

    Args:
        df: Pandas DataFrame to save; ``None`` or an empty frame is skipped
        table_name: Name of the table
    """
    nothing_to_write = df is None or df.empty
    if nothing_to_write:
        print(f"  ✗ No data to save for {table_name}")
        return

    try:
        # Primary target: a Delta table, fully refreshed on every run
        table_path = f"Tables/{table_name}"

        spark_frame = spark.createDataFrame(df)
        delta_writer = spark_frame.write.format("delta").mode("overwrite")
        delta_writer.save(table_path)

        print(f"  ✓ Successfully wrote {len(df)} rows to {table_name}")
        print(f"    Table path: {table_path}")
    except Exception as delta_error:
        print(f"  ✗ Error writing to Lakehouse: {delta_error}")
        print(f"    Attempting to save as Parquet file instead...")

        try:
            # Secondary target: a plain Parquet file under Files/
            file_path = f"Files/{table_name}.parquet"
            df.to_parquet(file_path, index=False)
            print(f"    ✓ Saved to {file_path}")
        except Exception as parquet_error:
            print(f"    ✗ Error saving Parquet: {parquet_error}")

In [None]:
# Municipalities: fetch, preview, then persist
df_municipality = fetch_metadata('municipality', 'municipality_metadata')
if df_municipality is not None:
    _municipality_shape = df_municipality.shape
    print(f"\nDataFrame shape: {_municipality_shape}")
    _municipality_columns = list(df_municipality.columns)
    print(f"Columns: {_municipality_columns}")
    display(df_municipality.head())
    save_to_lakehouse(df_municipality, 'municipality_metadata')

In [None]:
# Fetch Municipality Groups metadata
#
# The raw response carries a nested `members` list per group. When present it
# is split into two tables: one row per group (`municipality_groups_metadata`)
# and one row per group/member pair (`municipality_group_members`).
df_municipality_groups = fetch_metadata('municipality_groups', 'municipality_groups_metadata')
if df_municipality_groups is not None:
    print(f"\nDataFrame shape: {df_municipality_groups.shape}")
    print(f"Columns: {list(df_municipality_groups.columns)}")

    # Flatten the members array if it exists
    if 'members' in df_municipality_groups.columns:
        # Store the main groups table (without the nested members column)
        df_groups_main = df_municipality_groups[['id', 'title', 'ingestion_timestamp', 'source_system']].copy()
        save_to_lakehouse(df_groups_main, 'municipality_groups_metadata')

        # Create a separate table for group members
        members_list = []
        for idx, row in df_municipality_groups.iterrows():
            members = row['members']
            # Rows without a member list (e.g. missing values) contribute nothing
            if not isinstance(members, list):
                continue
            for member in members:
                members_list.append({
                    'group_id': row['id'],
                    'group_title': row['title'],
                    # NOTE(review): the Kolada v2 docs show member entries keyed
                    # as `member_id` / `member_title` - confirm against a live
                    # response. The `id` / `title` fallback preserves the
                    # previous lookup if the payload uses those keys instead.
                    'member_id': member.get('member_id', member.get('id')),
                    'member_title': member.get('member_title', member.get('title')),
                    'ingestion_timestamp': row['ingestion_timestamp'],
                    'source_system': row['source_system']
                })

        if members_list:
            df_members = pd.DataFrame(members_list)
            save_to_lakehouse(df_members, 'municipality_group_members')
    else:
        save_to_lakehouse(df_municipality_groups, 'municipality_groups_metadata')

    display(df_municipality_groups.head())

In [None]:
# Fetch KPI Groups metadata
#
# The raw response carries a nested `members` list per group. When present it
# is split into two tables: one row per group (`kpi_groups_metadata`) and one
# row per group/KPI pair (`kpi_group_members`).
df_kpi_groups = fetch_metadata('kpi_groups', 'kpi_groups_metadata')
if df_kpi_groups is not None:
    print(f"\nDataFrame shape: {df_kpi_groups.shape}")
    print(f"Columns: {list(df_kpi_groups.columns)}")

    # Flatten the members array if it exists
    if 'members' in df_kpi_groups.columns:
        # Store the main groups table (without the nested members column)
        df_kpi_groups_main = df_kpi_groups[['id', 'title', 'ingestion_timestamp', 'source_system']].copy()
        save_to_lakehouse(df_kpi_groups_main, 'kpi_groups_metadata')

        # Create a separate table for group members
        kpi_members_list = []
        for idx, row in df_kpi_groups.iterrows():
            members = row['members']
            # Rows without a member list (e.g. missing values) contribute nothing
            if not isinstance(members, list):
                continue
            for member in members:
                kpi_members_list.append({
                    'group_id': row['id'],
                    'group_title': row['title'],
                    # NOTE(review): the Kolada v2 docs show member entries keyed
                    # as `member_id` / `member_title` - confirm against a live
                    # response. The `id` / `title` fallback preserves the
                    # previous lookup if the payload uses those keys instead.
                    'kpi_id': member.get('member_id', member.get('id')),
                    'kpi_title': member.get('member_title', member.get('title')),
                    'ingestion_timestamp': row['ingestion_timestamp'],
                    'source_system': row['source_system']
                })

        if kpi_members_list:
            df_kpi_members = pd.DataFrame(kpi_members_list)
            save_to_lakehouse(df_kpi_members, 'kpi_group_members')
    else:
        save_to_lakehouse(df_kpi_groups, 'kpi_groups_metadata')

    display(df_kpi_groups.head())

In [None]:
# Organizational units: fetch, preview, then persist
df_ou = fetch_metadata('ou', 'organizational_units_metadata')
if df_ou is not None:
    _ou_shape = df_ou.shape
    print(f"\nDataFrame shape: {_ou_shape}")
    _ou_columns = list(df_ou.columns)
    print(f"Columns: {_ou_columns}")
    display(df_ou.head())
    save_to_lakehouse(df_ou, 'organizational_units_metadata')

In [None]:
# Summary: report the row count of every dataset that was fetched
_summary_rule = "="*60
print("\n" + _summary_rule)
print("SUMMARY")
print(_summary_rule)

if df_municipality is not None:
    print(f"\nMunicipalities: {len(df_municipality)} total")
    # Break the municipalities down by the values in their `type` column
    _counts_by_type = df_municipality['type'].value_counts().to_dict()
    print(f"  By type: {_counts_by_type}")

if df_municipality_groups is not None:
    _n_municipality_groups = len(df_municipality_groups)
    print(f"\nMunicipality Groups: {_n_municipality_groups} total")

if df_kpi_groups is not None:
    _n_kpi_groups = len(df_kpi_groups)
    print(f"\nKPI Groups: {_n_kpi_groups} total")

if df_ou is not None:
    _n_ou = len(df_ou)
    print(f"\nOrganizational Units: {_n_ou} total")