In [1]:
# Install semantic-link-labs for extended Fabric analytics
# NOTE(review): the install is unpinned (-U always pulls the newest release), so the
# environment can differ between runs; consider pinning a known-good version.
# The recorded output of this cell reports a PyJWT version conflict with fsspec-wrapper
# and pip notes that a kernel restart may be needed to pick up the updated packages.
%pip install -q -U semantic-link-labs

StatementMeta(, c6750e83-47d2-4299-9124-248033c7d2f1, 8, Finished, Available, Finished)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fsspec-wrapper 0.1.15 requires PyJWT>=2.6.0, but you have pyjwt 2.4.0 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [2]:
import pandas as pd
import sempy_labs
import sempy.fabric as fabric
from sempy_labs.report import ReportWrapper
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col
from sempy.fabric import FabricRestClient
import re
import json
from typing import Dict, List, Set, Any, Optional
import os
from dataclasses import dataclass
from datetime import datetime
import time

# Initialize Spark session (getOrCreate reuses an already-active session when there is one).
# Only Spark is set up in this cell; no AI client is created here, so the status
# message must not claim one.
spark = SparkSession.builder.getOrCreate()
print("✅ All imports successful. Spark session initialized")

StatementMeta(, c6750e83-47d2-4299-9124-248033c7d2f1, 10, Finished, Available, Finished)

✅ All imports successful. Spark session and Claude AI client initialized


In [3]:
# ============================================================
# UTILITY FUNCTIONS AND DATA STRUCTURES
# ============================================================
@dataclass
class DatasetInfo:
    """Container for everything collected about a single dataset.

    The four identifier fields are required. Every DataFrame field is optional
    and stays ``None`` until it is populated.
    """
    # Identifiers of the dataset and of the workspace it belongs to.
    ds_id: str
    ds_name: str
    ws_id: str
    ws_name: str
    # Dependency rows. The dependency check in this notebook reads the
    # 'full_object_name', 'referenced_object_type',
    # 'referenced_full_object_name' and 'referenced_table' columns.
    dependencies_df: Optional[pd.DataFrame] = None
    tables_df: Optional[pd.DataFrame] = None
    # Relationship rows; the dependency check reads 'qualified_from' and
    # 'qualified_to' when those columns are present.
    relationships_df: Optional[pd.DataFrame] = None
    measures_df: Optional[pd.DataFrame] = None
    columns_df: Optional[pd.DataFrame] = None

@dataclass
class ReportMetadata:
    """Result of analysing one Power BI report: identity, objects used, counts and status."""
    # Identity of the report, its workspace and the dataset it is bound to.
    report_id: str
    report_name: str
    workspace_id: str
    workspace_name: str
    dataset_id: str
    # NOTE(review): the allowed values of these two strings are not visible in
    # this part of the notebook - confirm against the code that builds them.
    report_format: str
    extraction_method: str
    # Objects referenced by the report. The report-processing step in this
    # notebook splits column and measure entries on the single quote, i.e. it
    # expects them in the form 'Table'[Field].
    tables: List[str]
    columns: List[str]
    measures: List[str]
    visuals_count: int
    filters_count: int
    # Per-object records are only produced for a report when this is True.
    extraction_success: bool
    # Empty by default; presumably filled in when extraction fails - TODO confirm.
    error_message: str = ""

# This cell only declares the DatasetInfo / ReportMetadata dataclasses, so say so.
print("✅ Utility data structures defined")



StatementMeta(, c6750e83-47d2-4299-9124-248033c7d2f1, 11, Finished, Available, Finished)

✅ Report metadata extraction function defined


In [4]:

class PowerBIMetadataExtractor:
    """Extracts columns, tables, and measures from Power BI report metadata.

    The extractor walks a report layout dictionary (``sections`` ->
    ``visualContainers`` -> ``config.singleVisual``) together with the filters
    attached at report, section and visual level, and collects every table,
    column and measure reference it finds. Columns and measures are recorded
    as ``'Table'[Field]`` strings.

    Query fragments (``prototypeQuery`` and filter definitions) declare their
    tables in a ``From`` list of ``{"Name": alias, "Entity": table}`` entries
    and may then refer to them through ``SourceRef.Source``. Those aliases are
    resolved back to the table name before anything is recorded.
    """

    # Matches an aggregate-style wrapper around a reference, e.g. "Sum(Sales.Amount)".
    _AGGREGATE_WRAPPER = re.compile(r'[A-Za-z_]\w*\((.*)\)', re.DOTALL)

    def __init__(self):
        self.tables = set()
        self.columns = set()
        self.measures = set()
        self.visual_details = []
        self.filter_details = []

    def extract_from_json_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract metadata from a parsed report layout.

        Args:
            data: Report layout dictionary. ``filters`` (report level),
                ``sections[].filters`` and ``visualContainers[].config`` /
                ``.filters`` may each be a JSON string or an already parsed
                value.

        Returns:
            Dict with sorted ``tables``, ``columns`` and ``measures`` lists, a
            ``summary`` of their counts, and ``visual_details`` /
            ``filter_details`` lists. The lists are copies, so a later
            extraction with the same instance does not alter earlier results.

        Raises:
            json.JSONDecodeError: if an embedded JSON string is malformed.
        """
        self._reset()

        # Report-level filters live next to 'sections' at the top of the layout.
        report_filters = self._load_embedded_json(data.get('filters'), [])
        self._extract_from_filters(report_filters, 'report', 'report')

        for section_idx, section in enumerate(data.get('sections') or []):
            if not isinstance(section, dict):
                continue
            section_name = section.get('displayName', f'Section_{section_idx}')

            # Extract from section-level filters
            section_filters = self._load_embedded_json(section.get('filters'), [])
            self._extract_from_filters(section_filters, 'section', section_name)

            # Extract from visual containers
            visual_containers = section.get('visualContainers') or []
            self._extract_from_visual_containers(visual_containers, section_name)

        return {
            'tables': sorted(self.tables),
            'columns': sorted(self.columns),
            'measures': sorted(self.measures),
            'summary': {
                'total_tables': len(self.tables),
                'total_columns': len(self.columns),
                'total_measures': len(self.measures)
            },
            # Copies: _reset() clears the internal lists in place on the next run.
            'visual_details': list(self.visual_details),
            'filter_details': list(self.filter_details)
        }

    def _reset(self):
        """Reset all collections for new extraction"""
        self.tables.clear()
        self.columns.clear()
        self.measures.clear()
        self.visual_details.clear()
        self.filter_details.clear()

    @staticmethod
    def _load_embedded_json(value: Any, default: Any) -> Any:
        """Decode a value that may be stored as a JSON string.

        ``None``, an empty/blank string and a JSON ``null`` all yield ``default``.
        """
        if isinstance(value, str):
            value = json.loads(value) if value.strip() else None
        return default if value is None else value

    @staticmethod
    def _build_alias_map(from_clause: Any) -> Dict[str, str]:
        """Map each alias (``Name``) of a ``From`` list to its table (``Entity``)."""
        aliases = {}
        for from_item in from_clause or []:
            if isinstance(from_item, dict):
                alias = from_item.get('Name')
                entity = from_item.get('Entity')
                if isinstance(alias, str) and alias and entity:
                    aliases[alias] = entity
        return aliases

    @staticmethod
    def _has_balanced_parentheses(text: str) -> bool:
        """Return True when every ')' in ``text`` closes an earlier '('."""
        depth = 0
        for char in text:
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth < 0:
                    return False
        return depth == 0

    def _extract_from_visual_containers(self, visual_containers: List[Dict], section_name: str):
        """Extract from visualContainers array"""
        for visual_idx, visual_container in enumerate(visual_containers or []):
            if not isinstance(visual_container, dict):
                continue
            visual_config = self._load_embedded_json(visual_container.get('config'), {})
            if not isinstance(visual_config, dict):
                visual_config = {}
            visual_name = visual_config.get('name', f'Visual_{visual_idx}')

            # Extract from visual-level filters
            filters = self._load_embedded_json(visual_container.get('filters'), [])
            self._extract_from_filters(filters, 'visual', f"{section_name}->{visual_name}")

            # Extract from singleVisual
            single_visual = visual_config.get('singleVisual') or {}
            if isinstance(single_visual, dict) and single_visual:
                self._extract_from_single_visual(single_visual, section_name, visual_name)

    def _extract_from_single_visual(self, single_visual: Dict, section_name: str, visual_name: str):
        """Extract from singleVisual object"""
        visual_type = single_visual.get('visualType', 'unknown')

        # Extract from projections
        projection_refs = []
        projections = single_visual.get('projections') or {}
        for projection_list in projections.values():
            for proj in projection_list or []:
                query_ref = proj.get('queryRef', '') if isinstance(proj, dict) else ''
                if query_ref:
                    projection_refs.append(query_ref)
                    self._parse_query_ref(query_ref)

        # Extract from prototypeQuery
        prototype_query = single_visual.get('prototypeQuery') or {}
        self._extract_from_prototype_query(prototype_query)

        # Extract from objects (labels and other formatting with field references).
        # The prototype query's aliases are passed along in case a formatting
        # expression refers to a table through one of them.
        objects = single_visual.get('objects') or {}
        if objects:
            self._extract_field_reference(
                objects, self._build_alias_map(prototype_query.get('From'))
            )

        # Store visual details
        self.visual_details.append({
            'section': section_name,
            'visual_name': visual_name,
            'visual_type': visual_type,
            'projection_refs': projection_refs,
            'has_prototype_query': bool(prototype_query)
        })

    def _extract_from_prototype_query(self, prototype_query: Dict):
        """Extract from prototypeQuery object"""
        from_clause = prototype_query.get('From') or []
        aliases = self._build_alias_map(from_clause)

        # Extract tables from 'From' clause
        for from_item in from_clause:
            entity = from_item.get('Entity', '') if isinstance(from_item, dict) else ''
            if entity and self._is_actual_table_name(entity):
                self.tables.add(entity)

        # Extract columns and measures from 'Select' clause using unified extractor
        for select_item in prototype_query.get('Select') or []:
            self._extract_field_reference(select_item, aliases)

    def _extract_from_filters(self, filters: List[Dict], filter_type: str, context: str):
        """Extract from filters array.

        Args:
            filters: Parsed list of filter objects (non-dict entries are skipped).
            filter_type: Label stored with each filter detail ('report',
                'section' or 'visual').
            context: Where the filter is attached, stored with each detail.
        """
        for filter_idx, filter_obj in enumerate(filters or []):
            if not isinstance(filter_obj, dict):
                continue
            filter_name = filter_obj.get('name', f'Filter_{filter_idx}')

            filter_def = filter_obj.get('filter') or {}
            if not isinstance(filter_def, dict):
                filter_def = {}
            from_clause = filter_def.get('From') or []
            aliases = self._build_alias_map(from_clause)

            # Extract from expression
            expression = filter_obj.get('expression') or {}
            self._extract_from_expression(expression, aliases)

            # Extract from filter object (nested structure)
            if filter_def:
                # Extract tables from 'From' clause in filter
                for from_item in from_clause:
                    entity = from_item.get('Entity', '') if isinstance(from_item, dict) else ''
                    if entity:
                        self.tables.add(entity)

                # Extract from 'Where' clause - might contain column references
                for where_item in filter_def.get('Where') or []:
                    self._extract_from_where_condition(where_item, aliases)

            # Store filter details
            self.filter_details.append({
                'filter_type': filter_type,
                'context': context,
                'filter_name': filter_name,
                'has_expression': bool(expression),
                'has_filter_def': bool(filter_def)
            })

    def _extract_field_reference(self, item: Dict, aliases: Optional[Dict[str, str]] = None):
        """Unified field reference extractor for Columns and Measures.

        Works for:
        - prototypeQuery.Select[] items
        - objects.labels[].properties nested structures
        - Any nested structure with Column/Measure patterns

        Args:
            item: Structure to scan; anything that is not a dict is ignored.
            aliases: Optional alias -> table mapping taken from the enclosing
                query's ``From`` list.
        """
        if not isinstance(item, dict):
            return

        # Get Name if available (from Select clause)
        name = item.get('Name', '')

        if isinstance(item.get('Column'), dict):
            self._record_field(self.columns, item['Column'], name, aliases)
        elif isinstance(item.get('Measure'), dict):
            self._record_field(self.measures, item['Measure'], name, aliases)

        # Recursively check nested structures (aggregations, objects.labels, etc.)
        for value in item.values():
            if isinstance(value, dict):
                self._extract_field_reference(value, aliases)
            elif isinstance(value, list):
                for list_item in value:
                    if isinstance(list_item, dict):
                        self._extract_field_reference(list_item, aliases)

    def _record_field(self, target: Set[str], field_def: Dict, name: Any,
                      aliases: Optional[Dict[str, str]] = None):
        """Record one Column/Measure definition into ``target`` and ``self.tables``.

        Resolution order:
        1. a table taken from ``Entity`` or from a resolved alias,
        2. the ``Table.Field`` form of the Select item's ``Name``,
        3. the raw ``Source`` value, subject to the alias heuristics.
        """
        entity, is_resolved = self._resolve_source(field_def, aliases)
        property_name = field_def.get('Property', '')

        if is_resolved and property_name and self._is_actual_table_name(entity):
            table_name, field_name = entity, property_name
        elif isinstance(name, str) and '.' in name:
            table_name, field_name = name.split('.', 1)
            if not self._is_actual_table_name(table_name):
                return
        elif entity and property_name and self._is_actual_table_name(entity):
            table_name, field_name = entity, property_name
        else:
            return

        self.tables.add(table_name)
        target.add(f"'{table_name}'[{field_name}]")

    def _resolve_source(self, field_def: Dict, aliases: Optional[Dict[str, str]] = None):
        """Return ``(table, is_resolved)`` for a Column/Measure definition.

        ``is_resolved`` is True when the table comes from ``SourceRef.Entity``
        or from an alias found in ``aliases``; it is False when only the raw
        ``SourceRef.Source`` value (or nothing) is available.
        """
        expression = field_def.get('Expression', {})
        source_ref = expression.get('SourceRef', {}) if isinstance(expression, dict) else {}
        if not isinstance(source_ref, dict):
            return '', False

        entity = source_ref.get('Entity')
        if entity:
            return entity, True

        source = source_ref.get('Source', '')
        if not isinstance(source, str):
            return '', False
        if aliases and source in aliases:
            return aliases[source], True
        return source, False

    def _get_entity_from_expression(self, field_def: Dict,
                                    aliases: Optional[Dict[str, str]] = None) -> str:
        """Extract entity/table name from Expression.SourceRef.

        Handles both:
        - {"Expression": {"SourceRef": {"Entity": "TableName"}}}  # Actual table
        - {"Expression": {"SourceRef": {"Source": "t"}}}          # Alias

        An alias is translated to its table when ``aliases`` contains it;
        otherwise the alias itself is returned.
        """
        return self._resolve_source(field_def, aliases)[0]

    def _extract_from_expression(self, expression: Any,
                                 aliases: Optional[Dict[str, str]] = None):
        """Extract Column/Measure references from a filter expression.

        The expression is searched recursively, so references wrapped in other
        nodes (aggregations, comparisons, negations, ...) are found too. Only
        tables given by ``Entity`` or by a resolvable alias are recorded.
        """
        if isinstance(expression, list):
            for nested in expression:
                self._extract_from_expression(nested, aliases)
            return
        if not isinstance(expression, dict):
            return

        for key, target in (('Column', self.columns), ('Measure', self.measures)):
            field_def = expression.get(key)
            if not isinstance(field_def, dict):
                continue
            entity, is_resolved = self._resolve_source(field_def, aliases)
            if not (is_resolved and entity):
                continue
            self.tables.add(entity)
            property_name = field_def.get('Property', '')
            if property_name:
                target.add(f"'{entity}'[{property_name}]")

        for value in expression.values():
            if isinstance(value, (dict, list)):
                self._extract_from_expression(value, aliases)

    def _extract_from_where_condition(self, where_item: Dict,
                                      aliases: Optional[Dict[str, str]] = None):
        """Extract from WHERE condition (any condition type, not only 'In')."""
        if not isinstance(where_item, dict):
            return
        self._extract_from_expression(where_item.get('Condition') or {}, aliases)

    def _is_actual_table_name(self, table_name: str) -> bool:
        """Check if table name is an actual table, not a query alias/prefix."""
        if not table_name or not isinstance(table_name, str):
            return False

        # Filter out single character aliases (d, s, c, _, etc.)
        if len(table_name) <= 1:
            return False

        # Filter out common query aliases
        query_aliases = {'subquery', 'temp', 'alias', 'src', 'tgt'}
        if table_name.lower() in query_aliases:
            return False

        return True

    def _parse_query_ref(self, query_ref: str):
        """Record the table of a queryRef such as 'table.column' or 'Sum(table.column)'.

        Aggregate wrappers are peeled off first so that the function name is
        not mistaken for part of the table name. Whether the field is a column
        or a measure is decided from the prototype query, not here.
        """
        if not isinstance(query_ref, str):
            return

        reference = query_ref.strip()
        while True:
            wrapped = self._AGGREGATE_WRAPPER.fullmatch(reference)
            # Only unwrap when the inner text is balanced, so a name that merely
            # contains parentheses (e.g. "Tbl(1).Amt (USD)") is left alone.
            if not wrapped or not self._has_balanced_parentheses(wrapped.group(1)):
                break
            reference = wrapped.group(1).strip()

        if '.' in reference:
            table_name = reference.split('.', 1)[0]
            if self._is_actual_table_name(table_name):
                self.tables.add(table_name)


StatementMeta(, c6750e83-47d2-4299-9124-248033c7d2f1, 12, Finished, Available, Finished)

In [5]:
class FabricWorkspaceAnalyzer:
    """Main analyzer class implementing the complete workflow"""
    
    def __init__(self):
        self.workspaces_df = pd.DataFrame()
        self.datasets_df = pd.DataFrame()
        self.reports_df = pd.DataFrame()
        self.pbi_reports_df = pd.DataFrame()
        self.all_dataset_info = {}
        self.report_metadata_list = []
        self.report_objects_used = []
        self.error_log = []  # Store detailed errors for later display
        
    def sanitize_df_columns(self, df, extra_columns=False, ws_id=None, ds_id=None, ws_name=None, ds_name=None):
        """Replaces spaces in column names with underscore to prevent errors during Spark Dataframe Creation"""
        if df.empty:
            return df
            
        df.columns = [
            re.sub(r'\W+', "_", col.strip().lower())
            for col in df.columns
        ]

        if extra_columns:
            df['workspace_id'] = ws_id
            df['dataset_id'] = ds_id
            df['workspace_name'] = ws_name
            df['dataset_name'] = ds_name
            
        return df

    def save_to_lakehouse(self, df, table_name, description=""):
        """Save DataFrame to lakehouse using Spark"""
        try:
            if df.empty:
                print(f"  ⚠️ Skipping empty DataFrame for table: {table_name}")
                return
                
            # Add analysis timestamp
            df_with_timestamp = df.copy()
            df_with_timestamp['analysis_date'] = datetime.now()
            
            # Convert to Spark DataFrame and save
            spark_df = spark.createDataFrame(df_with_timestamp)
            spark_df.write.mode("overwrite").saveAsTable(table_name)
            
            print(f"  ✅ Saved {len(df)} records to '{table_name}' table")
            if description:
                print(f"     📝 {description}")
                
        except Exception as e:
            print(f"  ❌ Error saving to {table_name}: {str(e)}")
    def _extract_meaningful_error(self, error_msg):
        """
        Extract the meaningful error message from exceptions.
        Removes technical details like stack traces, activity IDs, and timestamps.
        """
        lines = error_msg.split('\n')
        
        # First line is usually the initial error
        brief_error = lines[0].strip()
        
        # Look for the actual error message after "Caused by" or similar patterns
        for i, line in enumerate(lines):
            line_stripped = line.strip()
            
            # Look for patterns that indicate the meaningful error
            if 'Caused by' in line_stripped and i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                # Skip if next line is empty or starts with technical details
                if next_line and not next_line.startswith('Technical Details:'):
                    return next_line
            
            # Stop at technical details section
            if 'Technical Details:' in line_stripped:
                break
            
            # Stop at stack traces
            if line_stripped.startswith('at '):
                break
        
        # Return the first line if nothing better found
        return brief_error

    
    def get_workspaces(self):
        """Step 1: Get Workspaces"""
        print("🔍 STEP 1: Discovering workspaces...")
        
        self.workspaces_df = fabric.list_workspaces()
        self.workspaces_df = self.sanitize_df_columns(self.workspaces_df)
        self.workspaces_df = self.workspaces_df[['id', 'name', 'type']]
        
        print(f"  ✅ Found {len(self.workspaces_df)} workspaces")
        return self.workspaces_df
    
    def get_datasets_and_reports(self):
        """Step 2: Get Datasets and Reports in parallel"""
        print("\n🔍 STEP 2: Getting datasets and reports...")
        
        datasets_all, reports_all = [], []
        
        for _, ws in self.workspaces_df.iterrows():
            ws_id = ws['id']
            ws_name = ws['name']
            ws_type = ws['type']
            
            if ws_type == "AdminInsights":
                continue
                
            print(f"  📦 Scanning workspace: {ws_name}")
            
            # Get Datasets
            try:
                ds = fabric.list_datasets(workspace=ws_id)
                if not ds.empty:
                    ds['workspace_id'] = ws_id
                    ds['workspace_name'] = ws_name
                    datasets_all.append(ds)
            except Exception as e:
                print(f"    ⚠️ Datasets error in {ws_name}: {e}")
            
            # Get Reports
            try:
                rep = fabric.list_reports(workspace=ws_id)
                if not rep.empty:
                    rep['workspace_id'] = ws_id
                    rep['workspace_name'] = ws_name
                    reports_all.append(rep)
            except Exception as e:
                print(f"    ⚠️ Reports error in {ws_name}: {e}")
        
        # Combine results
        self.datasets_df = self.sanitize_df_columns(pd.concat(datasets_all, ignore_index=True) if datasets_all else pd.DataFrame())
        self.reports_df = self.sanitize_df_columns(pd.concat(reports_all, ignore_index=True) if reports_all else pd.DataFrame())
        
        # Filter PowerBI reports
        if not self.reports_df.empty and "report_type" in self.reports_df.columns:
            self.pbi_reports_df = self.reports_df[self.reports_df["report_type"] == "PowerBIReport"].copy()
        else:
            self.pbi_reports_df = self.reports_df
        
        print(f"  ✅ Found {len(self.datasets_df)} datasets and {len(self.reports_df)} reports ({len(self.pbi_reports_df)} PowerBI reports)")
        return self.datasets_df, self.reports_df
    
    def process_all_datasets(self):
        """Step 3: Process all datasets and aggregate all objects (tables, columns, measures, dependencies)"""
        print("\n🔍 STEP 3: Processing all datasets and aggregating objects...")
        
        all_columns_list = []
        all_tables_list = []
        all_measures_list = []
        all_dependencies_list = []
        all_relationships_list = []
        
        for _, ds_row in self.datasets_df.iterrows():
            ds_id = ds_row['dataset_id']
            ds_name = ds_row['dataset_name']
            ws_id = ds_row['workspace_id']
            ws_name = ds_row['workspace_name']
            
            print(f"  📊 Processing dataset: {ds_name}")
            
            # Collect comprehensive dataset info
            dataset_info = self.collect_dataset_info(ds_id, ds_name, ws_id, ws_name)
            self.all_dataset_info[ds_id] = dataset_info
            
            # Aggregate columns
            if dataset_info.columns_df is not None and not dataset_info.columns_df.empty:
                all_columns_list.append(dataset_info.columns_df)
            
            # Aggregate tables
            if dataset_info.tables_df is not None and not dataset_info.tables_df.empty:
                all_tables_list.append(dataset_info.tables_df)
            
            # Aggregate measures
            if dataset_info.measures_df is not None and not dataset_info.measures_df.empty:
                # Add additional context that might not be in the measures_df
                measures_with_context = dataset_info.measures_df.copy()
                if 'dataset_id' not in measures_with_context.columns:
                    measures_with_context['dataset_id'] = ds_id
                if 'dataset_name' not in measures_with_context.columns:
                    measures_with_context['dataset_name'] = dataset_info.ds_name
                if 'workspace_id' not in measures_with_context.columns:
                    measures_with_context['workspace_id'] = dataset_info.ws_id
                if 'workspace_name' not in measures_with_context.columns:
                    measures_with_context['workspace_name'] = dataset_info.ws_name
                all_measures_list.append(measures_with_context)
            
            # Aggregate dependencies
            if dataset_info.dependencies_df is not None and not dataset_info.dependencies_df.empty:
                all_dependencies_list.append(dataset_info.dependencies_df)
            
            # Aggregate relationships
            if dataset_info.relationships_df is not None and not dataset_info.relationships_df.empty:
                relationships_with_context = dataset_info.relationships_df.copy()
                relationships_with_context['dataset_id'] = ds_id
                relationships_with_context['dataset_name'] = dataset_info.ds_name
                relationships_with_context['workspace_id'] = dataset_info.ws_id
                relationships_with_context['workspace_name'] = dataset_info.ws_name
                all_relationships_list.append(relationships_with_context)
        
        # Combine all aggregated data
        all_columns_df = pd.concat(all_columns_list, ignore_index=True) if all_columns_list else pd.DataFrame()
        all_tables_df = pd.concat(all_tables_list, ignore_index=True) if all_tables_list else pd.DataFrame()
        all_measures_df = pd.concat(all_measures_list, ignore_index=True) if all_measures_list else pd.DataFrame()
        all_dependencies_df = pd.concat(all_dependencies_list, ignore_index=True) if all_dependencies_list else pd.DataFrame()
        all_relationships_df = pd.concat(all_relationships_list, ignore_index=True) if all_relationships_list else pd.DataFrame()
        
        print(f"  ✅ Processed {len(self.all_dataset_info)} datasets")
        print(f"    📋 Aggregated: {len(all_columns_df)} columns, {len(all_tables_df)} tables, {len(all_measures_df)} measures")
        print(f"    🔗 Aggregated: {len(all_dependencies_df)} dependencies, {len(all_relationships_df)} relationships")
        
        return all_columns_df, all_tables_df, all_measures_df, all_dependencies_df, all_relationships_df
    
    def get_reports_metadata(self):
        """Step 4: Get Reports metadata (what objects they use)"""
        print("\n🔍 STEP 4: Extracting report metadata...")
        
        if self.pbi_reports_df.empty:
            print("  ⚠️ No PowerBI reports found")
            return []
        
        for idx, report_row in self.pbi_reports_df.iterrows():
            report_id = report_row.get('id', '')
            report_name = report_row.get('name', f'Report_{idx}')
            workspace_id = report_row.get('workspace_id', '')
            workspace_name = report_row.get('workspace_name', '')
            dataset_id = report_row.get('dataset_id', '')
            
            print(f"  📊 Processing report {idx+1}/{len(self.pbi_reports_df)+1}: {report_name}")
            
            # Extract metadata
            report_metadata = self.extract_report_metadata(
                report_id, report_name, workspace_id, workspace_name, dataset_id
            )
            
            self.report_metadata_list.append(report_metadata)
            
            # Create detailed records for each object used by this report
            if report_metadata.extraction_success:
                # Add table records
                for table in report_metadata.tables:
                    self.report_objects_used.append({
                        'report_id': report_id,
                        'report_name': report_name,
                        'workspace_id': workspace_id,
                        'workspace_name': workspace_name,
                        'dataset_id': dataset_id,
                        'object_type': 'Table',
                        'object_name': table,
                        'full_reference': table,
                        'extraction_method': report_metadata.extraction_method
                    })
                
                # Add column records
                for column in report_metadata.columns:
                    table_name = column.split("'")[1]
                    column_name = column.split("'")[2].strip("[]")
                    self.report_objects_used.append({
                        'report_id': report_id,
                        'report_name': report_name,
                        'workspace_id': workspace_id,
                        'workspace_name': workspace_name,
                        'dataset_id': dataset_id,
                        'object_type': 'Column',
                        'object_name': column_name,
                        'full_reference': column,
                        'table_name': table_name,
                        'extraction_method': report_metadata.extraction_method
                    })
                
                # Add measure records
                for measure in report_metadata.measures:
                    table_name = measure.split("'")[1]
                    measure_name = measure.split("'")[2].strip("[]")
                    self.report_objects_used.append({
                        'report_id': report_id,
                        'report_name': report_name,
                        'workspace_id': workspace_id,
                        'workspace_name': workspace_name,
                        'dataset_id': dataset_id,
                        'object_type': 'Measure',
                        'object_name': measure_name,
                        'full_reference': measure,
                        'table_name': table_name,
                        'extraction_method': report_metadata.extraction_method
                    })
        
        print(f"  ✅ Processed {len(self.report_metadata_list)+1} reports, extracted {len(self.report_objects_used)} object references")
        return self.report_metadata_list
    
    def check_dependencies(self, all_columns_df, all_tables_df, all_measures_df):
        """Step 5: Check for dependencies between objects"""
        print("\n🔍 STEP 5: Checking for dependencies...")
        
        # Convert report objects to DataFrame for easier analysis
        report_objects_df = pd.DataFrame(self.report_objects_used) if self.report_objects_used else pd.DataFrame()
        
        # Get all used objects from reports
        used_tables = set()
        used_columns = set()
        used_measures = set()
        
        if not report_objects_df.empty:
            used_tables.update(report_objects_df[report_objects_df['object_type'] == 'Table']['full_reference'].tolist())
            used_columns.update(report_objects_df[report_objects_df['object_type'] == 'Column']['full_reference'].tolist())
            used_measures.update(report_objects_df[report_objects_df['object_type'] == 'Measure']['full_reference'].tolist())
        
        print(f"  📋 Initial objects from reports: {len(used_tables)} tables, {len(used_columns)} columns, {len(used_measures)} measures")
        
        # Check for dependencies within datasets (relationships and transitive dependencies)
        for ds_id, dataset_info in self.all_dataset_info.items():
            # Check relationships - columns used in relationships are required
            if dataset_info.relationships_df is not None and not dataset_info.relationships_df.empty:
                for _, rel in dataset_info.relationships_df.iterrows():
                    if 'qualified_from' in rel:
                        used_columns.add(rel['qualified_from'])
                    if 'qualified_to' in rel:
                        used_columns.add(rel['qualified_to'])
        
        print(f"  🔗 After adding relationship columns: {len(used_columns)} columns")
        
        # Transitive dependency resolution: find what the used objects depend on
        # Keep iterating until no new dependencies are found
        iteration = 0
        max_iterations = 10  # Prevent infinite loops
        
        while iteration < max_iterations:
            iteration += 1
            initial_tables_count = len(used_tables)
            initial_columns_count = len(used_columns)
            initial_measures_count = len(used_measures)
            
            print(f"  🔄 Dependency resolution iteration {iteration}...")
            
            # Check dependencies for all used objects
            for ds_id, dataset_info in self.all_dataset_info.items():
                if dataset_info.dependencies_df is None or dataset_info.dependencies_df.empty:
                    continue
                
                # Iterate through each dependency row
                for _, dep in dataset_info.dependencies_df.iterrows():
                    # Get the full_object_name (the object that has the dependency)
                    full_object_name = dep.get('full_object_name', '')
                    
                    # Check if this object is in our used sets
                    if full_object_name in used_columns or full_object_name in used_measures:
                        # This object is used, so we need to mark its dependencies as used too
                        ref_object_type = dep.get('referenced_object_type', '')
                        referenced_full_object_name = dep.get('referenced_full_object_name', '')
                        
                        if ref_object_type == 'Table':
                            # The used object depends on a table
                            table_name = dep.get('referenced_table', '')
                            if table_name:
                                used_tables.add(table_name)
                        
                        elif ref_object_type == 'Column':
                            # The used object depends on a column
                            if referenced_full_object_name:
                                used_columns.add(referenced_full_object_name)
                        
                        elif ref_object_type == 'Measure':
                            # The used object depends on a measure
                            if referenced_full_object_name:
                                used_measures.add(referenced_full_object_name)
            
            # Check if we found any new dependencies
            new_tables = len(used_tables) - initial_tables_count
            new_columns = len(used_columns) - initial_columns_count
            new_measures = len(used_measures) - initial_measures_count
            
            print(f"    ➕ Added: {new_tables} tables, {new_columns} columns, {new_measures} measures")
            
            # If no new dependencies were found, we're done
            if new_tables == 0 and new_columns == 0 and new_measures == 0:
                print(f"  ✅ Dependency resolution converged after {iteration} iteration(s)")
                break
        
        print(f"  ✅ Final dependencies: {len(used_tables)} tables, {len(used_columns)} columns, {len(used_measures)} measures")
        
        # Display detailed errors if any
        if self.error_log:
            print(f"\n⚠️ Detailed Error Log ({len(self.error_log)} errors):")
            for idx, error_entry in enumerate(self.error_log, 1):
                print(f"\nError #{idx}:")
                print(f"  Dataset: {error_entry['dataset']}")
                print(f"  Operation: {error_entry['operation']}")
                print(f"  Details: {error_entry['error']}")
        
        return {
            'used_tables': used_tables,
            'used_columns': used_columns,
            'used_measures': used_measures,
            'report_objects_df': report_objects_df
        }
    
    def filter_results(self, all_columns_df, all_tables_df, all_measures_df, dependencies):
        """Step 6: split columns, tables and measures into used and unused sets.

        Every non-empty input frame is annotated in place with a boolean
        ``is_used`` column. Columns get a ``qualified_name`` of the form
        ``'table'[column]`` when they do not already have one; measures always
        get theirs (re)built as ``'table'[measure]``.

        Args:
            all_columns_df: Columns frame; needs ``qualified_name`` or both
                ``table_name`` and ``column_name``.
            all_tables_df: Tables frame with a ``name`` column.
            all_measures_df: Measures frame with ``table_name`` and ``measure_name``.
            dependencies: Mapping with ``used_tables``, ``used_columns`` and
                ``used_measures`` collections to test membership against.

        Returns:
            dict with ``used_*`` / ``unused_*`` DataFrames for tables, columns and
            measures. An empty input yields two empty DataFrames for that kind.
        """
        print("\n🔍 STEP 6: Filtering results to identify used vs unused objects...")

        tables_in_use = dependencies['used_tables']
        columns_in_use = dependencies['used_columns']
        measures_in_use = dependencies['used_measures']

        def _split_by_usage(frame):
            """Return independent (used, unused) copies of ``frame`` keyed on ``is_used``."""
            flags = frame['is_used']
            return frame[flags].copy(), frame[~flags].copy()

        # Columns: reuse an existing qualified name, otherwise derive it first
        if all_columns_df.empty:
            used_columns_df, unused_columns_df = pd.DataFrame(), pd.DataFrame()
        else:
            if 'qualified_name' not in all_columns_df.columns:
                all_columns_df['qualified_name'] = (
                    "'" + all_columns_df['table_name'] + "'[" + all_columns_df['column_name'] + ']'
                )
            all_columns_df['is_used'] = all_columns_df['qualified_name'].isin(columns_in_use)
            used_columns_df, unused_columns_df = _split_by_usage(all_columns_df)

        # Tables: matched on their plain name
        if all_tables_df.empty:
            used_tables_df, unused_tables_df = pd.DataFrame(), pd.DataFrame()
        else:
            all_tables_df['is_used'] = all_tables_df['name'].isin(tables_in_use)
            used_tables_df, unused_tables_df = _split_by_usage(all_tables_df)

        # Measures: the qualified name is always rebuilt before the comparison
        if all_measures_df.empty:
            used_measures_df, unused_measures_df = pd.DataFrame(), pd.DataFrame()
        else:
            all_measures_df['qualified_name'] = (
                "'" + all_measures_df['table_name'] + "'[" + all_measures_df['measure_name'] + "]"
            )
            all_measures_df['is_used'] = all_measures_df['qualified_name'].isin(measures_in_use)
            used_measures_df, unused_measures_df = _split_by_usage(all_measures_df)

        print(f"  ✅ Results filtered:")
        print(f"    Used: {len(used_tables_df)} tables, {len(used_columns_df)} columns, {len(used_measures_df)} measures")
        print(f"    Unused: {len(unused_tables_df)} tables, {len(unused_columns_df)} columns, {len(unused_measures_df)} measures")

        return {
            'used_tables': used_tables_df,
            'used_columns': used_columns_df,
            'used_measures': used_measures_df,
            'unused_tables': unused_tables_df,
            'unused_columns': unused_columns_df,
            'unused_measures': unused_measures_df
        }
    
    def collect_dataset_info(self, ds_id: str, ds_name: str, ws_id: str, ws_name: str) -> DatasetInfo:
        """Collect dependencies, tables, relationships, measures and columns for one dataset.

        Each of the five fetches is best-effort: a failure prints a short warning,
        appends the full error text to ``self.error_log`` and leaves the matching
        ``DatasetInfo`` field untouched. The exception is ``dependencies_df``,
        which is always set (to an empty DataFrame when nothing is available).

        Args:
            ds_id: Dataset identifier passed to the ``fabric`` calls.
            ds_name: Dataset name, used in warnings and error-log entries.
            ws_id: Workspace identifier passed to the ``fabric`` calls.
            ws_name: Workspace name, forwarded to ``sanitize_df_columns``.

        Returns:
            The populated ``DatasetInfo``.
        """
        dataset_info = DatasetInfo(ds_id, ds_name, ws_id, ws_name)

        # Identifier keywords forwarded to sanitize_df_columns whenever
        # extra_columns=True is requested.
        identity = {'ws_id': ws_id, 'ds_id': ds_id, 'ws_name': ws_name, 'ds_name': ds_name}

        def _qualified(frame, table_col, object_col):
            """Build the 'table'[object] name Series from two columns of ``frame``."""
            return "'" + frame[table_col] + "'[" + frame[object_col] + "]"

        def _log_failure(operation, error_msg):
            """Keep the full error text for the detailed error report."""
            self.error_log.append({
                'dataset': ds_name,
                'operation': operation,
                'error': error_msg
            })

        # --- Model calculation dependencies ---
        try:
            with fabric.get_model_calc_dependencies(dataset=ds_id, workspace=ws_id) as calc_deps:
                raw_dependencies = getattr(calc_deps, "dependencies_df", None)

            if raw_dependencies is None or raw_dependencies.empty:
                dataset_info.dependencies_df = pd.DataFrame()
            else:
                dataset_info.dependencies_df = self.sanitize_df_columns(
                    df=raw_dependencies, extra_columns=True, **identity
                )
        except Exception as e:
            error_msg = str(e)
            # Only the condensed form is printed; the log keeps the full text
            brief_error = self._extract_meaningful_error(error_msg)
            print(f"    ⚠️ Dependencies unavailable for {ds_name}: {brief_error}")
            _log_failure('get_model_calc_dependencies', error_msg)
            dataset_info.dependencies_df = pd.DataFrame()

        # --- Tables ---
        try:
            tables = fabric.list_tables(dataset=ds_id, workspace=ws_id)
            if not tables.empty:
                dataset_info.tables_df = self.sanitize_df_columns(
                    df=tables, extra_columns=True, **identity
                )
        except Exception as e:
            error_msg = str(e)
            brief_error = self._extract_meaningful_error(error_msg)
            print(f"    ⚠️ Tables unavailable for {ds_name}: {brief_error}")
            _log_failure('list_tables', error_msg)

        # --- Relationships (with qualified endpoint names) ---
        try:
            relationships = fabric.list_relationships(dataset=ds_id, workspace=ws_id, extended=True)
            if not relationships.empty:
                relationships = self.sanitize_df_columns(df=relationships)
                relationships['qualified_from'] = _qualified(relationships, 'from_table', 'from_column')
                relationships['qualified_to'] = _qualified(relationships, 'to_table', 'to_column')
                dataset_info.relationships_df = relationships
        except Exception as e:
            error_msg = str(e)
            brief_error = self._extract_meaningful_error(error_msg)
            print(f"    ⚠️ Relationships unavailable for {ds_name}: {brief_error}")
            _log_failure('list_relationships', error_msg)

        # --- Measures ---
        try:
            measures = fabric.list_measures(dataset=ds_id, workspace=ws_id)
            if not measures.empty:
                dataset_info.measures_df = self.sanitize_df_columns(df=measures)
        except Exception as e:
            error_msg = str(e)
            brief_error = self._extract_meaningful_error(error_msg)
            print(f"    ⚠️ Measures unavailable for {ds_name}: {brief_error}")
            _log_failure('list_measures', error_msg)

        # --- Columns (with qualified names) ---
        try:
            columns = fabric.list_columns(dataset=ds_id, workspace=ws_id, extended=True)
            if not columns.empty:
                columns = self.sanitize_df_columns(df=columns, extra_columns=True, **identity)
                columns['qualified_name'] = _qualified(columns, 'table_name', 'column_name')
                dataset_info.columns_df = columns
        except Exception as e:
            error_msg = str(e)
            brief_error = self._extract_meaningful_error(error_msg)
            print(f"    ⚠️ Columns unavailable for {ds_name}: {brief_error}")
            _log_failure('list_columns', error_msg)

        return dataset_info
    
    def extract_report_metadata(self, report_id: str, report_name: str, workspace_id: str, workspace_name: str, dataset_id: str) -> ReportMetadata:
        """Extract the tables, columns and measures referenced by a report.

        Two methods are tried in order:

        1. PBIR reports only: ``ReportWrapper.list_semantic_model_objects()``.
        2. Every other format, or a PBIR attempt that failed or returned nothing:
           fetch the report JSON and parse it with ``PowerBIMetadataExtractor``.

        Returns:
            A ``ReportMetadata``. ``extraction_success`` is True only when one of
            the two methods produced a result. Otherwise ``error_message`` is set,
            either because the JSON could not be retrieved or because an
            exception escaped the extraction.
        """
        result = ReportMetadata(
            report_id=report_id,
            report_name=report_name,
            workspace_id=workspace_id,
            workspace_name=workspace_name,
            dataset_id=dataset_id,
            report_format="Unknown",
            extraction_method="None",
            tables=[],
            columns=[],
            measures=[],
            visuals_count=0,
            filters_count=0,
            extraction_success=False
        )

        def _qualified_names(frame, object_type):
            """Unique 'table'[object] names for the rows of one object type."""
            subset = frame[frame['Object Type'] == object_type]
            qualified = "'" + subset['Table Name'].fillna('') + "'[" + subset['Object Name'] + "]"
            return qualified.unique().tolist()

        try:
            # Detect the report format first; it decides which method is tried
            report = ReportWrapper(report=report_id, workspace=workspace_id)
            rep_format = report.format
            result.report_format = rep_format
            print(f"  📑 Report Type: {rep_format}")

            if rep_format == "PBIR":
                # Method 1: ask sempy_labs for the semantic model objects directly
                try:
                    objects = report.list_semantic_model_objects()
                    has_objects = objects is not None and not objects.empty

                    if has_objects:
                        # Compute all three lists before touching `result`, so a
                        # failure part-way leaves it unchanged for the fallback.
                        tables = objects['Table Name'].unique().tolist()
                        columns = _qualified_names(objects, 'Column')
                        measures = _qualified_names(objects, 'Measure')

                        result.tables = tables
                        result.columns = columns
                        result.measures = measures
                        result.extraction_method = "sempy_labs_objects"
                        result.extraction_success = True

                        print(f"    ✅ Extracted via sempy_labs: {len(tables)} tables, {len(columns)} columns, {len(measures)} measures")
                        return result

                except NotImplementedError as e:
                    print(f"    ⚠️ sempy_labs method not supported: {str(e)}")
                except Exception as e:
                    print(f"    ⚠️ sempy_labs method failed: {str(e)}")

            # Method 2: fall back to parsing the raw report JSON
            report_json = sempy_labs.report.get_report_json(report=report_id, workspace=workspace_id)

            if not report_json:
                result.error_message = "Could not retrieve report JSON"
                return result

            extraction_results = PowerBIMetadataExtractor().extract_from_json_data(report_json)

            result.tables = extraction_results.get('tables', [])
            result.columns = extraction_results.get('columns', [])
            result.measures = extraction_results.get('measures', [])
            result.visuals_count = len(extraction_results.get('visual_details', []))
            result.filters_count = len(extraction_results.get('filter_details', []))
            result.extraction_method = "json_parsing"
            result.extraction_success = True

            print(f"    ✅ Extracted via JSON: {len(result.tables)} tables, {len(result.columns)} columns, {len(result.measures)} measures")
            return result

        except Exception as e:
            result.error_message = f"Extraction failed: {str(e)}"
            print(f"    ❌ Error extracting metadata: {str(e)}")

        return result
    
    def generate_ai_dataset_context(self, all_tables_df, all_columns_df, all_measures_df, all_relationships_df, filtered_results):
        """Generate AI-optimized dataset context table with health scores.

        One record is produced per entry of ``self.all_dataset_info`` whose id also
        appears in ``self.datasets_df``; other entries are skipped.

        Args:
            all_tables_df: Tables across datasets (``dataset_id``, ``name``).
            all_columns_df: Columns across datasets (``dataset_id``).
            all_measures_df: Measures across datasets (``dataset_id``).
            all_relationships_df: Relationships across datasets (``dataset_id``).
            filtered_results: Mapping with ``unused_tables``, ``unused_columns`` and
                ``unused_measures`` frames (each with ``dataset_id`` when non-empty).

        Returns:
            DataFrame with size, usage and quality counts plus the derived scores
            ``relationship_health``, ``usage_efficiency``, ``model_complexity``
            (all 0-1) and ``optimization_score`` (0-100).
        """
        print("\n🤖 Generating AI dataset context table...")

        def _rows_for(frame, ds_id):
            """Rows of ``frame`` belonging to ``ds_id``; an empty frame is returned as is."""
            return frame if frame.empty else frame[frame['dataset_id'] == ds_id]

        # Build the membership set once instead of filtering datasets_df twice
        # per dataset just to test for existence.
        if 'dataset_id' in self.datasets_df.columns:
            known_dataset_ids = set(self.datasets_df['dataset_id'])
        else:
            known_dataset_ids = set()

        ai_dataset_records = []

        for ds_id, dataset_info in self.all_dataset_info.items():
            if ds_id not in known_dataset_ids:
                continue

            # Count objects for this dataset
            dataset_tables = _rows_for(all_tables_df, ds_id)
            total_tables = len(dataset_tables)
            total_columns = len(_rows_for(all_columns_df, ds_id))
            total_measures = len(_rows_for(all_measures_df, ds_id))
            total_relationships = len(_rows_for(all_relationships_df, ds_id))

            # Count report usage
            report_count = len(_rows_for(self.reports_df, ds_id))

            relationships = dataset_info.relationships_df
            has_relationships = relationships is not None and not relationships.empty

            # Count connected vs isolated tables. Only endpoints that are in this
            # dataset's table list count as connected; otherwise a relationship to
            # a table missing from the list would push isolated_tables below zero
            # and relationship_health above 1.
            table_names = set(dataset_tables['name']) if total_tables else set()
            related_tables = set()
            if has_relationships:
                related_tables.update(relationships['from_table'])
                related_tables.update(relationships['to_table'])
            connected_tables = len(related_tables & table_names)
            isolated_tables = total_tables - connected_tables

            # Count unused objects
            unused_tables = len(_rows_for(filtered_results['unused_tables'], ds_id))
            unused_columns = len(_rows_for(filtered_results['unused_columns'], ds_id))
            unused_measures = len(_rows_for(filtered_results['unused_measures'], ds_id))

            # Simplified circular detection: relationships whose two ends are the same table
            circular_chains = 0
            if has_relationships:
                circular_chains = int((relationships['from_table'] == relationships['to_table']).sum())

            # Calculate health scores (0-1 scale)
            relationship_health = (connected_tables / total_tables) if total_tables > 0 else 0.0
            usage_efficiency = 1 - (unused_columns / total_columns) if total_columns > 0 else 1.0

            # Model complexity score (normalized):
            # more relationships + measures relative to tables = higher complexity
            complexity_raw = (total_relationships + total_measures) / max(total_tables, 1)
            model_complexity = min(complexity_raw / 10, 1.0)  # Normalize to 0-1, cap at 1

            # Overall optimization score (0-100)
            # Weighted combination: relationships (30%), usage (40%), no isolated tables (20%), no unused (10%)
            optimization_score = (
                relationship_health * 30 +
                usage_efficiency * 40 +
                ((total_tables - isolated_tables) / max(total_tables, 1)) * 20 +
                (1 - (unused_tables / max(total_tables, 1))) * 10
            )

            ai_dataset_records.append({
                'workspace_id': dataset_info.ws_id,
                'workspace_name': dataset_info.ws_name,
                'dataset_id': ds_id,
                'dataset_name': dataset_info.ds_name,
                # Size metrics
                'total_tables': total_tables,
                'total_columns': total_columns,
                'total_measures': total_measures,
                'total_relationships': total_relationships,
                # Usage metrics
                'report_count': report_count,
                'dataflow_count': 0,  # Placeholder - can be added if needed
                'connected_tables': connected_tables,
                'isolated_tables': isolated_tables,
                # Quality metrics
                'unused_tables': unused_tables,
                'unused_columns': unused_columns,
                'unused_measures': unused_measures,
                'circular_chains': circular_chains,
                # Calculated health scores
                'relationship_health': round(relationship_health, 3),
                'usage_efficiency': round(usage_efficiency, 3),
                'model_complexity': round(model_complexity, 3),
                'optimization_score': round(optimization_score, 2)
            })

        ai_dataset_context_df = pd.DataFrame(ai_dataset_records)
        print(f"  ✅ Generated {len(ai_dataset_records)} dataset context records")
        return ai_dataset_context_df
    
    def generate_ai_object_features(self, all_columns_df, all_measures_df, all_tables_df, all_relationships_df, filtered_results, ai_dataset_context_df):
        """Generate AI-optimized object features table with rich context.

        Emits one record per column followed by one record per measure. Each
        record combines the object's own properties with counts for its table,
        three scores copied from ``ai_dataset_context_df`` and usage counts
        derived from the dataset's dependency table and the relationships.

        Args:
            all_columns_df: Columns across datasets; rows are read via ``.get`` for
                ``dataset_id``, ``table_name``, ``column_name``, ``qualified_name``,
                ``type``, ``expression``, ``is_hidden`` and ``is_used``.
            all_measures_df: Measures across datasets (same pattern, with
                ``measure_name``).
            all_tables_df: Not used by this method; kept for interface compatibility.
            all_relationships_df: Relationships with ``dataset_id``, ``from_table``,
                ``to_table``, ``qualified_from`` and ``qualified_to``.
            filtered_results: Not used by this method; kept for interface compatibility.
            ai_dataset_context_df: Output of ``generate_ai_dataset_context``; may be
                empty, in which case the dataset-level features default to 0.

        Returns:
            DataFrame of object-level feature records.
        """
        print("\n🤖 Generating AI object features table...")

        def _text(value):
            """Return ``value`` when it is a string, else '' (None/NaN cells carry no text)."""
            return value if isinstance(value, str) else ''

        def _per_table_counts(frame):
            """Map (dataset_id, table_name) -> number of rows of ``frame`` in that table."""
            if frame.empty:
                return {}
            return frame.groupby(['dataset_id', 'table_name']).size().to_dict()

        def _endpoint_counts(from_col, to_col):
            """Map (dataset_id, endpoint) -> number of relationships touching that endpoint."""
            counts = {}
            if all_relationships_df.empty:
                return counts
            endpoints = zip(
                all_relationships_df['dataset_id'],
                all_relationships_df[from_col],
                all_relationships_df[to_col],
            )
            for rel_ds_id, source, target in endpoints:
                # A set, so a relationship with identical ends is counted once
                for endpoint in {source, target}:
                    if isinstance(endpoint, str):
                        key = (rel_ds_id, endpoint)
                        counts[key] = counts.get(key, 0) + 1
            return counts

        has_columns = not all_columns_df.empty
        has_measures = not all_measures_df.empty

        # Lookup tables built once, instead of re-scanning every frame per row
        measures_per_table = _per_table_counts(all_measures_df)
        columns_per_table = _per_table_counts(all_columns_df)
        relationships_per_table = (
            _endpoint_counts('from_table', 'to_table') if (has_columns or has_measures) else {}
        )
        relationships_per_column = (
            _endpoint_counts('qualified_from', 'qualified_to') if has_columns else {}
        )

        # dataset_id -> context row (first occurrence wins). A context frame built
        # from zero records has no columns at all, hence the membership check.
        if 'dataset_id' in ai_dataset_context_df.columns:
            context_by_dataset = (
                ai_dataset_context_df
                .drop_duplicates('dataset_id')
                .set_index('dataset_id')
                .to_dict('index')
            )
        else:
            context_by_dataset = {}

        # dataset_id -> {referenced object name -> dependency rows}, filled lazily
        dependents_by_dataset = {}

        def _dependency_usage(ds_id, qualified_name):
            """Return (used_by_measures, used_by_dependencies, referenced_by_list)."""
            if ds_id not in dependents_by_dataset:
                info = self.all_dataset_info.get(ds_id)
                deps_df = info.dependencies_df if info else None
                if deps_df is None or deps_df.empty:
                    dependents_by_dataset[ds_id] = {}
                else:
                    dependents_by_dataset[ds_id] = {
                        name: rows
                        for name, rows in deps_df.groupby('referenced_full_object_name', sort=False)
                    }

            dependents = dependents_by_dataset[ds_id].get(qualified_name)
            if dependents is None:
                return 0, 0, []

            by_measures = 0
            if 'object_type' in dependents.columns:
                by_measures = int((dependents['object_type'] == 'Measure').sum())
            referenced_by = []
            if 'object_name' in dependents.columns:
                referenced_by = dependents['object_name'].unique().tolist()
            return by_measures, len(dependents), referenced_by

        def _context_features(ds_id, table_name):
            """Table-level and dataset-level features shared by columns and measures."""
            table_key = (ds_id, table_name)
            table_relationships = relationships_per_table.get(table_key, 0)
            dataset_context = context_by_dataset.get(ds_id)
            return {
                # Table context
                'table_measure_count': measures_per_table.get(table_key, 0),
                'table_column_count': columns_per_table.get(table_key, 0),
                'table_relationship_count': table_relationships,
                'table_is_isolated': table_relationships == 0,
                # Dataset context (denormalized)
                'dataset_total_tables': dataset_context['total_tables'] if dataset_context is not None else 0,
                'dataset_relationship_health': dataset_context['relationship_health'] if dataset_context is not None else 0.0,
                'dataset_usage_efficiency': dataset_context['usage_efficiency'] if dataset_context is not None else 0.0,
            }

        ai_object_records = []

        # Process columns
        for _, col_row in all_columns_df.iterrows():
            ds_id = col_row.get('dataset_id', '')
            table_name = col_row.get('table_name', '')
            qualified_name = col_row.get('qualified_name', '')

            # `type` may be None/NaN; only an actual string can say 'calculated'
            is_calculated = _text(col_row.get('type', '')).lower() == 'calculated'

            used_by_measures, used_by_dependencies, referenced_by_list = _dependency_usage(ds_id, qualified_name)
            used_by_relationships = relationships_per_column.get((ds_id, qualified_name), 0)

            # Calculate usage score (0-1)
            usage_score = min((used_by_measures * 0.4 + used_by_relationships * 0.4 + used_by_dependencies * 0.2) / 5, 1.0)

            ai_object_records.append({
                'workspace_id': col_row.get('workspace_id', ''),
                'workspace_name': col_row.get('workspace_name', ''),
                'dataset_id': ds_id,
                'dataset_name': col_row.get('dataset_name', ''),
                'table_name': table_name,
                'object_name': col_row.get('column_name', ''),
                'object_type': 'calculated_column' if is_calculated else 'column',
                # Object properties
                'data_type': col_row.get('data_type', col_row.get('type', 'Unknown')),
                'is_hidden': col_row.get('is_hidden', False),
                'is_calculated': is_calculated,
                # A missing (None/NaN) or blank expression means there is no DAX
                'has_dax': bool(_text(col_row.get('expression', '')).strip()),
                **_context_features(ds_id, table_name),
                # Usage features
                'used_by_measures': used_by_measures,
                'used_by_relationships': used_by_relationships,
                'used_by_dependencies': used_by_dependencies,
                'is_used': col_row.get('is_used', False),
                'usage_score': round(usage_score, 3),
                # Referenced by (as JSON string)
                'referenced_by_list': json.dumps(referenced_by_list) if referenced_by_list else ''
            })

        # Process measures
        for _, meas_row in all_measures_df.iterrows():
            ds_id = meas_row.get('dataset_id', '')
            table_name = meas_row.get('table_name', '')
            qualified_name = meas_row.get('qualified_name', '')

            used_by_measures, used_by_dependencies, referenced_by_list = _dependency_usage(ds_id, qualified_name)

            # Calculate usage score
            usage_score = min((used_by_measures * 0.5 + used_by_dependencies * 0.5) / 3, 1.0)

            ai_object_records.append({
                'workspace_id': meas_row.get('workspace_id', ''),
                'workspace_name': meas_row.get('workspace_name', ''),
                'dataset_id': ds_id,
                'dataset_name': meas_row.get('dataset_name', ''),
                'table_name': table_name,
                'object_name': meas_row.get('measure_name', ''),
                'object_type': 'measure',
                # Object properties
                'data_type': 'Measure',
                'is_hidden': meas_row.get('is_hidden', False),
                'is_calculated': True,
                'has_dax': bool(_text(meas_row.get('expression', '')).strip()),
                **_context_features(ds_id, table_name),
                # Usage features
                'used_by_measures': used_by_measures,
                'used_by_relationships': 0,  # Measures aren't used in relationships
                'used_by_dependencies': used_by_dependencies,
                'is_used': meas_row.get('is_used', False),
                'usage_score': round(usage_score, 3),
                # Referenced by
                'referenced_by_list': json.dumps(referenced_by_list) if referenced_by_list else ''
            })

        ai_object_features_df = pd.DataFrame(ai_object_records)
        print(f"  ✅ Generated {len(ai_object_records)} object feature records")
        return ai_object_features_df
    
    def save_all_results(self, all_columns_df, all_tables_df, all_measures_df, all_dependencies_df, all_relationships_df, filtered_results, dependencies):
        """Save AI-optimized results to lakehouse.

        Builds the two AI tables via ``generate_ai_dataset_context`` and
        ``generate_ai_object_features`` and hands four frames to
        ``self.save_to_lakehouse``: ``ai_dataset_context``, ``ai_object_features``,
        ``dataset_relationships`` and ``dataset_dependencies``.

        Args:
            all_columns_df, all_tables_df, all_measures_df, all_relationships_df:
                Aggregated model objects, forwarded to the two generators.
            all_dependencies_df: Aggregated dependency rows, saved as is.
            filtered_results: Used/unused split, forwarded to the generators.
            dependencies: Not used by this method; kept for interface compatibility.
        """
        print("\n💾 STEP 7: Saving AI-optimized results to lakehouse...")

        # Generate AI-optimized tables
        ai_dataset_context_df = self.generate_ai_dataset_context(
            all_tables_df, all_columns_df, all_measures_df, all_relationships_df, filtered_results
        )

        ai_object_features_df = self.generate_ai_object_features(
            all_columns_df, all_measures_df, all_tables_df, all_relationships_df, filtered_results, ai_dataset_context_df
        )

        # AI-optimized tables for ML/AI consumption
        print("\n🤖 Saving AI-optimized tables for ML training and predictions...")
        self.save_to_lakehouse(ai_dataset_context_df, "ai_dataset_context",
                              "AI-ready dataset features with health scores (0-100) for ML training")
        self.save_to_lakehouse(ai_object_features_df, "ai_object_features",
                              "AI-ready object-level features with full lineage context for predictions")

        # Reference tables
        print("\n🔗 Saving relationships table...")
        self.save_to_lakehouse(all_relationships_df, "dataset_relationships",
                              "All relationships across datasets with qualified column references")

        print("\n🔗 Saving dependencies table...")
        # The table was previously written under the misspelled name
        # "dataste_dependencies"; a table left over from earlier runs under that
        # name is not touched by this method.
        self.save_to_lakehouse(all_dependencies_df, "dataset_dependencies",
                              "All dataset dependencies for reference later")

        print("\n✅ AI-optimized lakehouse tables created successfully!")
        # Report the real column counts so the summary cannot drift from the schema
        print(f"   📊 ai_dataset_context: {len(ai_dataset_context_df)} datasets with {len(ai_dataset_context_df.columns)} columns")
        print(f"   📊 ai_object_features: {len(ai_object_features_df)} objects (columns + measures) with {len(ai_object_features_df.columns)} columns")
        print("\n💡 Use these tables for:")
        print("   - Training schema optimization models")
        print("   - Predicting unused objects")
        print("   - Generating dataset health scores")
        print("   - Recommending model improvements")
    
    def run_complete_analysis(self):
        """Drive the whole workspace analysis from discovery through to saving.

        Calls the step methods in a fixed order (workspaces, datasets/reports,
        dataset processing, report metadata, dependency check, filtering, save),
        times the run with ``time.time()``, and prints a closing summary.

        Returns:
            None. All results are handed to ``save_all_results``; progress and
            the summary are reported via ``print``.
        """
        banner = "=" * 80

        print("🚀 STARTING COMPLETE FABRIC WORKSPACE ANALYSIS")
        print(banner)

        started_at = time.time()

        # Steps 1-2: discovery of workspaces, then datasets and reports.
        self.get_workspaces()
        self.get_datasets_and_reports()

        # Step 3: aggregate every object across all datasets.
        (
            columns_df,
            tables_df,
            measures_df,
            dependencies_df,
            relationships_df,
        ) = self.process_all_datasets()

        # Step 4: report metadata.
        self.get_reports_metadata()

        # Step 5: dependency check over the aggregated objects.
        dependency_info = self.check_dependencies(columns_df, tables_df, measures_df)

        # Step 6: split objects into used / unused.
        usage_split = self.filter_results(columns_df, tables_df, measures_df, dependency_info)

        # Step 7: persist everything.
        self.save_all_results(
            columns_df,
            tables_df,
            measures_df,
            dependencies_df,
            relationships_df,
            usage_split,
            dependency_info,
        )

        elapsed_seconds = time.time() - started_at

        # Closing summary
        print("\n" + banner)
        print("🎉 FABRIC WORKSPACE ANALYSIS COMPLETE!")
        print(banner)
        print(f"⏱️ Total execution time: {elapsed_seconds:.2f} seconds")
        print("\n📊 Summary:")
        print(f"  Workspaces analyzed: {len(self.workspaces_df)}")
        print(f"  Datasets processed: {len(self.datasets_df)}")
        print(f"  Reports analyzed: {len(self.report_metadata_list)}")
        print(f"  Total objects found: {len(columns_df)} columns, {len(tables_df)} tables, {len(measures_df)} measures")
        print(f"  Used objects: {len(usage_split['used_columns'])} columns, {len(usage_split['used_tables'])} tables, {len(usage_split['used_measures'])} measures")
        print(f"  Unused objects: {len(usage_split['unused_columns'])} columns, {len(usage_split['unused_tables'])} tables, {len(usage_split['unused_measures'])} measures")
        print("\n💾 All results saved to lakehouse tables!")
        print(banner)


StatementMeta(, c6750e83-47d2-4299-9124-248033c7d2f1, 13, Finished, Available, Finished)

In [6]:

# ============================================================
# RUN THE COMPLETE ANALYSIS
# ============================================================

# Announce start-up, then build the analyzer; `analyzer` is kept as a
# notebook-level name.
print("🚀 Initializing Fabric Workspace Analyzer...\n")
analyzer = FabricWorkspaceAnalyzer()

# Execute the full workflow end to end.
analyzer.run_complete_analysis()

# Closing message plus copy-paste snippets for loading the two AI tables.
print("\n".join((
    "\n🎉 Analysis complete! AI-optimized tables are ready.",
    "\n📊 Quick Access:",
    "  ai_datasets = spark.table('ai_dataset_context').toPandas()",
    "  ai_objects = spark.table('ai_object_features').toPandas()",
)))



StatementMeta(, c6750e83-47d2-4299-9124-248033c7d2f1, 14, Finished, Available, Finished)

🚀 Initializing Fabric Workspace Analyzer...

🚀 STARTING COMPLETE FABRIC WORKSPACE ANALYSIS
🔍 STEP 1: Discovering workspaces...
  ✅ Found 4 workspaces

🔍 STEP 2: Getting datasets and reports...
  📦 Scanning workspace: Fabric_Demo
  📦 Scanning workspace: BI_Metadata
  📦 Scanning workspace: NLQ_Task
  📦 Scanning workspace: Auto_DP
  ✅ Found 3 datasets and 2 reports (2 PowerBI reports)

🔍 STEP 3: Processing all datasets and aggregating objects...
  📊 Processing dataset: LH_D365FNO
    ⚠️ Dependencies unavailable for LH_D365FNO: An error occurred when running AdomdCommand. AdomdCommandActivityId: '7a02ca9d-13ab-4c42-a420-e5b7305f62b2'

Caused by AdomdErrorResponseException:
The database is empty. The DISCOVER_CALC_DEPENDENCY operation cannot be performed on an empty database.

Technical Details:
RootActivityId: 18ac6a24-6ec0-40e9-bd5c-79487b0a3002
Date (UTC): 10/28/2025 5:12:11 AM
   at Microsoft.AnalysisServices.AdomdClient.XmlaClient.CheckForSoapFault(XmlReader reader, XmlaResult xmlaResu