In [None]:
pip install httpx

In [None]:
import sys

# Make the helper modules stored in the default Lakehouse importable.
# Example: if your files are in Files/utils/ within the default Lakehouse
utils_dir = "/lakehouse/default/Files/utils/"

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
    print(f"Added {sys.path[-1]} to Python path.")
else:
    print(f"{utils_dir} is already on the Python path.")

In [None]:
import pandas as pd
import json
import logging
import requests
from pyspark.sql import SparkSession

try:
    from powerbi_api_utils import load_config, get_access_token
    from fabric_utils import cast_dataframe_to_fabric_compatible_types
    print("Helper scripts loaded successfully.")
except ImportError as e:
    print(f"Error importing helper scripts: {e}")
    print("Please ensure powerbi_api_utils.py and fabric_utils.py are in a 'utils' directory accessible by this notebook.")
    raise

# --- 1. Configuration & Setup ---

# Root logger: INFO and above, each record prefixed with timestamp and level.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

def get_fabric_api_constants(tenant_id):
    """Build the endpoints and scope needed to call the Fabric admin API.

    Args:
        tenant_id: Tenant identifier interpolated into the login authority URL.

    Returns:
        A 3-tuple ``(authority, scope, admin_base_url)`` where ``authority``
        is the tenant-specific login URL, ``scope`` is a single-element list
        holding the Fabric ``.default`` scope, and ``admin_base_url`` is the
        root of the v1 admin endpoints.
    """
    # NOTE: The scope for Fabric APIs is different from Power BI APIs
    fabric_scope = ["https://api.fabric.microsoft.com/.default"]
    return (
        f"https://login.microsoftonline.com/{tenant_id}",
        fabric_scope,
        "https://api.fabric.microsoft.com/v1/admin",
    )

# (connect, read) timeouts in seconds for the tenant-settings request.
# Without a timeout, requests waits forever on a stalled connection.
_REQUEST_TIMEOUT = (10, 120)


def _collect_child_records(df_parent, child_column):
    """Flatten one nested list column into a list of child records.

    Args:
        df_parent: DataFrame of tenant settings; must contain 'settingName'.
        child_column: Name of the column whose cells hold lists of dicts.

    Returns:
        A list of dicts, one per nested item, each carrying the parent's
        'settingName' as a foreign key. Empty if the column is absent or
        no row holds a non-empty list.
    """
    if child_column not in df_parent.columns:
        return []

    records = []
    for parent_name, children in zip(df_parent['settingName'], df_parent[child_column]):
        # Rows without this attribute hold NaN/None rather than a list.
        if isinstance(children, list):
            # Copy each child instead of mutating the API response in place.
            records.extend({**child, 'settingName': parent_name} for child in children)
    return records


def main(spark: SparkSession):
    """Extract Fabric tenant settings and load them into Lakehouse tables.

    Steps: load configuration, acquire an access token, GET the admin
    ``tenantsettings`` endpoint, split the response into one parent table
    and two child tables (enabled security groups, properties) linked by
    'settingName', then overwrite the three tables in the
    ``powerbi_metadata`` schema.

    Args:
        spark: Active SparkSession used to create the schema and write tables.

    Returns:
        None. Returns early (writing nothing) when the response contains no
        tenant settings.

    Raises:
        Exception: Any error raised along the way is logged with its
            traceback and then re-raised unchanged.
    """
    try:
        # --- 2. Authentication ---
        logging.info("Loading configuration...")
        config = load_config()
        tenant_id = config["TENANT_ID"]

        logging.info("Setting up Fabric API constants...")
        authority, scope, admin_base_url = get_fabric_api_constants(tenant_id)

        logging.info("Acquiring access token for Fabric API...")
        access_token = get_access_token(
            config["CLIENT_ID"], config["CLIENT_SECRET"], authority, scope
        )
        headers = {'Authorization': f'Bearer {access_token}'}

        # --- 3. Data Fetching ---
        tenant_settings_url = f"{admin_base_url}/tenantsettings"
        logging.info(f"Fetching data from endpoint: {tenant_settings_url}")

        response = requests.get(
            tenant_settings_url,
            headers=headers,
            verify=config["VERIFY_SSL"],
            timeout=_REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        raw_data = response.json()
        logging.info("Successfully fetched tenant settings data.")

        # --- 4. Data Normalization & Transformation ---
        logging.info("Normalizing and transforming data...")

        tenant_settings_list = raw_data.get('tenantSettings', [])

        if not tenant_settings_list:
            logging.warning("No tenant settings found in the API response. Exiting.")
            return

        # Create the main dataframe
        df_tenant_settings = pd.DataFrame(tenant_settings_list)

        # Use settingName as the foreign key to link child tables back to the parent
        df_security_groups = pd.DataFrame(
            _collect_child_records(df_tenant_settings, 'enabledSecurityGroups')
        )
        df_properties = pd.DataFrame(
            _collect_child_records(df_tenant_settings, 'properties')
        )

        # Drop the original nested columns from the main dataframe
        df_tenant_settings = df_tenant_settings.drop(
            columns=['enabledSecurityGroups', 'properties'], errors='ignore'
        )

        logging.info(f"Created main table 'tenant_settings' with {len(df_tenant_settings)} rows.")
        logging.info(f"Created child table 'enabled_security_groups' with {len(df_security_groups)} rows.")
        logging.info(f"Created child table 'properties' with {len(df_properties)} rows.")

        # --- 5. Saving Data to Lakehouse (Overwrite Mode) ---

        dfs_to_save = {
            "tenant_settings": df_tenant_settings,
            "enabled_security_groups": df_security_groups,
            "properties": df_properties,
        }

        warehouse_schema = "powerbi_metadata"
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {warehouse_schema}")
        logging.info(f"Ensured schema '{warehouse_schema}' exists.")

        # One timestamp for the whole run so rows from the three tables
        # written together share the same extraction_timestamp.
        extraction_timestamp = pd.Timestamp.now()

        for table_name, df_pandas in dfs_to_save.items():
            full_table_name = f"{warehouse_schema}.{table_name}"
            logging.info(f"Processing and saving table '{full_table_name}'...")

            if df_pandas.empty:
                logging.warning(f"DataFrame for '{table_name}' is empty. Creating/overwriting with an empty table.")
                # Create an empty Spark DF with a dummy column to ensure the table is created/overwritten
                # NOTE(review): after drop("dummy") the DataFrame has zero columns;
                # some table formats may reject such a write - confirm this branch
                # succeeds against the target Lakehouse.
                spark.createDataFrame([], schema="dummy STRING").drop("dummy").write.mode("overwrite").saveAsTable(full_table_name)
                continue

            # Cast types to be compatible with Fabric Warehouse
            df_casted = cast_dataframe_to_fabric_compatible_types(df_pandas)

            # Add extraction timestamp
            df_casted['extraction_timestamp'] = extraction_timestamp

            # Create Spark DataFrame and save with overwrite mode
            spark_df = spark.createDataFrame(df_casted)
            spark_df.write.format("delta").mode("overwrite").saveAsTable(full_table_name)

            logging.info(f"✅ Successfully saved data to {full_table_name}.")

    except Exception as e:
        # Top-level boundary: record the traceback, then let the caller see the failure.
        logging.error(f"❌ An error occurred during the process: {e}", exc_info=True)
        raise

# --- Entry Point ---
if __name__ == "__main__":
    # Fabric notebooks expose the active session as a global named `spark`;
    # look it up by name so the script fails softly outside that environment.
    spark_session = globals().get('spark')
    if not spark_session:
        print("SparkSession not found. Please run this in a Fabric notebook environment.")
    else:
        main(spark_session)