In [1]:
%pip install httpx

StatementMeta(, 5d7d05d0-b2a0-4815-9eb7-ebb4a40f9912, 7, Finished, Available, Finished)

Collecting httpx
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore==1.* (from httpx)
  Downloading httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx)
  Downloading h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.16.0-py3-none-any.whl (37 kB)
Installing collected packages: h11, httpcore, httpx
Successfully installed h11-0.16.0 httpcore-1.0.9 httpx-0.28.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To upda

In [5]:
import sys

# Directory holding the helper modules imported by the next cell.
UTILS_DIR = "/lakehouse/default/Files/utils/"

# Re-running this cell must not keep appending the same entry to sys.path.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)
    print(f"Added {UTILS_DIR} to Python path.")
else:
    print(f"{UTILS_DIR} is already on the Python path.")

StatementMeta(, 5d7d05d0-b2a0-4815-9eb7-ebb4a40f9912, 12, Finished, Available, Finished)

Added /lakehouse/default/Files/utils/ to Python path.


In [7]:
# =============================================================================
#
# Microsoft Fabric Notebook for Dataflow Datasource Extraction (Final Version)
#
# This version includes batching and the 'overwriteSchema' option to handle
# schema evolution during full data refreshes.
#
# =============================================================================

import pandas as pd
import logging
import asyncio
import httpx
from pyspark.sql import SparkSession

# --- 1. Load Helper Scripts & Functions ---

# Pull in the project helper functions used below. If either module cannot be
# imported, report the reason and re-raise so the cell stops here.
try:
    from powerbi_api_utils import (
        load_config,
        get_api_constants,
        get_access_token,
        get_paginated_data_async,
    )
    from fabric_utils import cast_dataframe_to_fabric_compatible_types
except ImportError as import_error:
    print(f"Error importing helper scripts: {import_error}")
    raise
else:
    print("Helper scripts loaded successfully.")

# Timestamped, level-tagged log lines at INFO and above.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# --- Helper functions for data transformation ---

def _clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    if not df.empty:
        df.columns = [str(col).replace('.', '_').replace(' ', '_') for col in df.columns]
    return df

def _normalize_and_merge_json_column(df: pd.DataFrame, col_name: str, prefix: str) -> pd.DataFrame:
    if df.empty or col_name not in df.columns: return df
    df[col_name] = df[col_name].apply(lambda x: x if isinstance(x, dict) and x else None)
    to_normalize = df.dropna(subset=[col_name]).copy()
    if to_normalize.empty: return df.drop(columns=[col_name], errors='ignore')
    normalized_df = pd.json_normalize(to_normalize[col_name]).add_prefix(prefix)
    df = df.drop(columns=[col_name])
    df = df.merge(normalized_df, left_index=True, right_index=True, how='left')
    return _clean_columns(df)

# --- Main Orchestration Logic ---

async def main(spark: SparkSession):
    """Fetch the datasources of every known dataflow and overwrite the Lakehouse table.

    Steps:
      1. Load configuration and obtain a bearer token.
      2. Read the ``objectId`` column of ``powerbi_metadata.dataflows``.
      3. Request ``{ADMIN_BASE_URL}/dataflows/{id}/datasources`` for each ID,
         in concurrent batches of 100 with a one-second pause between batches.
         A failed request is logged and skipped; it does not abort the run.
      4. Flatten ``connectionDetails``, clean the column names and add an
         ``extraction_timestamp`` column.
      5. Overwrite ``powerbi_metadata.dataflow_sources`` as a Delta table with
         ``overwriteSchema`` enabled.

    Args:
        spark: Spark session used to read the ID table and write the result.

    Returns:
        None. Returns early, without writing, when there are no dataflow IDs
        or no datasource was retrieved.

    Raises:
        Exception: Any error other than a per-dataflow fetch failure is logged
            with its traceback and re-raised.
    """
    try:
        # --- 2. Configuration and Authentication ---
        logging.info("Loading configuration and authenticating...")
        config = load_config()
        # The third constant (the non-admin base URL) is not needed here.
        AUTHORITY, SCOPE, _, ADMIN_BASE_URL = get_api_constants(config["TENANT_ID"])
        access_token = get_access_token(config["CLIENT_ID"], config["CLIENT_SECRET"], AUTHORITY, SCOPE)
        headers = {'Authorization': f'Bearer {access_token}'}

        # --- 3. Read Existing Dataflow IDs from Lakehouse ---
        logging.info("Reading dataflow IDs from the Lakehouse 'dataflows' table...")
        warehouse_schema = "powerbi_metadata"
        dataflows_table_path = f"{warehouse_schema}.dataflows"

        dataflow_ids_df = spark.read.table(dataflows_table_path).select("objectId")
        raw_ids = [row.objectId for row in dataflow_ids_df.collect()]
        # A null ID would be formatted into the URL as "None", and a repeated
        # ID would be fetched twice and stored twice. dict.fromkeys drops
        # repeats while keeping first-seen order.
        dataflow_ids = list(dict.fromkeys(object_id for object_id in raw_ids if object_id))
        if len(dataflow_ids) != len(raw_ids):
            logging.info(f"Ignoring {len(raw_ids) - len(dataflow_ids)} null or duplicate dataflow IDs.")

        if not dataflow_ids:
            logging.warning("No dataflow IDs found in the 'dataflows' table. Exiting.")
            return

        logging.info(f"Found {len(dataflow_ids)} dataflows to process.")

        # --- 4. Fetch Datasources in Batches ---
        # NOTE(review): each batch fires batch_size requests at once; confirm
        # this stays within the admin API's throttling limits.
        logging.info("Fetching datasources in batches...")
        all_datasources = []
        failed_ids = []
        batch_size = 100
        total_batches = (len(dataflow_ids) + batch_size - 1) // batch_size

        async with httpx.AsyncClient(verify=config["VERIFY_SSL"]) as client:
            batch_starts = range(0, len(dataflow_ids), batch_size)
            for batch_number, start in enumerate(batch_starts, start=1):
                batch_ids = dataflow_ids[start:start + batch_size]
                logging.info(f"--- Processing Batch {batch_number}/{total_batches} ---")

                tasks = [
                    get_paginated_data_async(client, f"{ADMIN_BASE_URL}/dataflows/{dataflow_id}/datasources", headers)
                    for dataflow_id in batch_ids
                ]
                # return_exceptions=True hands failures back as values, in the
                # same order as the tasks, so one bad dataflow cannot sink the batch.
                batch_results = await asyncio.gather(*tasks, return_exceptions=True)

                for dataflow_id, result in zip(batch_ids, batch_results):
                    # BaseException rather than Exception: a cancelled task is
                    # reported as CancelledError, which is not an Exception subclass.
                    if isinstance(result, BaseException):
                        logging.error(f"❌ FAILED to fetch datasources for dataflow ID: {dataflow_id} | Error: {result}")
                        failed_ids.append(dataflow_id)
                        continue

                    # `or []` tolerates a helper that returns None for "no data"
                    # (the helper's contract is not visible from this notebook).
                    for datasource in result or []:
                        datasource['dataflowId'] = dataflow_id
                        all_datasources.append(datasource)

                if batch_number < total_batches:
                    logging.info(f"Batch {batch_number} complete. Pausing for 1 second...")
                    await asyncio.sleep(1)
                else:
                    # Nothing follows the final batch, so there is nothing to pause for.
                    logging.info(f"Batch {batch_number} complete.")

        if failed_ids:
            logging.warning(
                f"{len(failed_ids)} of {len(dataflow_ids)} dataflows could not be fetched; "
                "their datasources will be absent from the overwritten table."
            )

        # --- 5. Transform and Normalize Final Data ---
        if not all_datasources:
            logging.warning("No datasources were successfully found after processing all batches.")
            return

        logging.info(f"All batches complete. Total datasources found: {len(all_datasources)}. Now creating DataFrame.")
        df_dataflow_sources = pd.DataFrame(all_datasources)

        logging.info("Flattening 'connectionDetails' column...")
        df_dataflow_sources = _normalize_and_merge_json_column(df_dataflow_sources, 'connectionDetails', 'connectionDetails_')

        # Still required when 'connectionDetails' was absent, because the
        # flattening helper then returns the frame without cleaning its columns.
        df_dataflow_sources = _clean_columns(df_dataflow_sources)
        df_dataflow_sources['extraction_timestamp'] = pd.Timestamp.now()

        # --- 6. Save Data to Lakehouse (Overwrite Mode) ---
        new_table_name = "dataflow_sources"
        full_table_name = f"{warehouse_schema}.{new_table_name}"
        logging.info(f"Saving {len(df_dataflow_sources)} records to '{full_table_name}' with overwrite mode...")

        df_casted = cast_dataframe_to_fabric_compatible_types(df_dataflow_sources)
        spark_df = spark.createDataFrame(df_casted)

        # overwriteSchema lets this full refresh replace the table even when
        # the set of flattened columns differs from the previous run.
        spark_df.write.format("delta") \
            .mode("overwrite") \
            .option("overwriteSchema", "true") \
            .saveAsTable(full_table_name)

        logging.info(f"✅ Successfully saved data to {full_table_name}.")

    except Exception as e:
        logging.error(f"❌ An error occurred during the main process: {e}", exc_info=True)
        raise

StatementMeta(, 5d7d05d0-b2a0-4815-9eb7-ebb4a40f9912, 14, Finished, Available, Finished)

Helper scripts loaded successfully.


In [8]:
# Run the extraction. Top-level `await` works here because the notebook kernel
# executes cells inside a running event loop. `spark` is not defined in any
# cell of this notebook; presumably it is the session the Fabric runtime
# provides (TODO confirm).
await main(spark)

StatementMeta(, 5d7d05d0-b2a0-4815-9eb7-ebb4a40f9912, 15, Finished, Available, Finished)

2025-07-31 21:32:21,042 - INFO - Loading configuration and authenticating...
2025-07-31 21:32:21,079 - INFO - Configuration loaded successfully.
2025-07-31 21:32:21,080 - INFO - Requesting access token from https://login.microsoftonline.com/7f16c8b3-f0ef-45a1-aa81-9d5c90cb8ba5/oauth2/v2.0/token...
2025-07-31 21:32:21,325 - INFO - Access token obtained successfully.
2025-07-31 21:32:21,328 - INFO - Reading dataflow IDs from the Lakehouse 'dataflows' table...
2025-07-31 21:32:30,275 - INFO - Found 2912 dataflows to process.
2025-07-31 21:32:30,276 - INFO - Fetching datasources in batches...
2025-07-31 21:32:30,331 - INFO - --- Processing Batch 1/30 ---
2025-07-31 21:32:30,861 - INFO - HTTP Request: GET https://api.powerbi.com/v1.0/myorg/admin/dataflows/5a51436f-6d11-41f6-af74-47bc1750a383/datasources "HTTP/1.1 200 OK"
2025-07-31 21:32:30,883 - INFO - HTTP Request: GET https://api.powerbi.com/v1.0/myorg/admin/dataflows/aff210bb-d035-4447-872d-0883ec6a32ea/datasources "HTTP/1.1 200 OK"
202