In [1]:
# Install semantic-link-labs for extended Fabric analytics
!pip install semantic-link-labs

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 3, Finished, Available, Finished)

Collecting semantic-link-labs
  Downloading semantic_link_labs-0.12.3-py3-none-any.whl.metadata (27 kB)
Collecting semantic-link-sempy>=0.12.0 (from semantic-link-labs)
  Downloading semantic_link_sempy-0.12.1-py3-none-any.whl.metadata (11 kB)
Collecting anytree (from semantic-link-labs)
  Downloading anytree-2.13.0-py3-none-any.whl.metadata (8.0 kB)
Collecting polib (from semantic-link-labs)
  Downloading polib-1.2.0-py2.py3-none-any.whl.metadata (15 kB)
Collecting jsonpath_ng (from semantic-link-labs)
  Downloading jsonpath_ng-1.7.0-py3-none-any.whl.metadata (18 kB)
Collecting fabric-analytics-sdk==0.0.1 (from fabric-analytics-sdk[online-notebook]==0.0.1->semantic-link-sempy>=0.12.0->semantic-link-labs)
  Downloading fabric_analytics_sdk-0.0.1-py3-none-any.whl.metadata (14 kB)
Collecting azure-keyvault-secrets>=4.7.0 (from semantic-link-sempy>=0.12.0->semantic-link-labs)
  Downloading azure_keyvault_secrets-4.10.0-py3-none-any.whl.metadata (18 kB)
Collecting fabric-analytics-notebook

In [2]:
import pandas as pd
import sempy_labs
import sempy.fabric as fabric
from sempy_labs.report import ReportWrapper
import re
import sempy

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 4, Finished, Available, Finished)

In [3]:
def sanitize_df_columns(df, extra_columns=False, ws_id=None, ds_id=None):
    """
    Replaces spaces in column names with underscore to prevent errors during Spark Dataframe Creation
    """
    df.columns = [
        re.sub(r'\W+', "_", col.strip().lower())
        for col in df.columns
    ]

    if extra_columns:
        df['workspace_id'] = ws_id
        df['dataset_id'] = ds_id
        
    return df

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 5, Finished, Available, Finished)

In [4]:

# ------------------------------------------------------------
# STEP 1: Object Discovery
# ------------------------------------------------------------

print("🔍 Discovering workspaces...")

workspaces_df = fabric.list_workspaces()
workspaces_df = sanitize_df_columns(workspaces_df)
workspaces_df = workspaces_df[['id', 'name', 'type']]
display(workspaces_df)

datasets_all, reports_all, paginated_all, dataflows_all = [], [], [], []

for _, ws in workspaces_df.iterrows():
    ws_id = ws['id']
    ws_name = ws['name']
    ws_type = ws['type']
    if ws_type == "AdminInsights":
        continue
    print(f"\n📦 Scanning workspace: {ws_name}")

   # --- Datasets
    try:
        ds = fabric.list_datasets(workspace=ws_id)
        if not ds.empty:
            ds['workspace_id'] = ws_id
            ds['workspace_name'] = ws_name
            datasets_all.append(ds)
    except Exception as e:
        print(f"  ⚠️ Datasets error in {ws_name}: {e}")

    # --- Reports (includes both Power BI and Paginated)
    try:
        rep = fabric.list_reports(workspace=ws_id)
        if not rep.empty:
            rep['workspace_id'] = ws_id
            rep['workspace_name'] = ws_name
            reports_all.append(rep)
    except Exception as e:
        print(f"  ⚠️ Reports error in {ws_name}: {e}")

    # --- Dataflows
    try:
        dfs = fabric.list_items(type='Dataflow',workspace=ws_id)
        if not dfs.empty:
            dfs['workspace_id'] = ws_id
            dfs['workspace_name'] = ws_name
            dataflows_all.append(dfs)
    except Exception as e:
        print(f"  ⚠️ Dataflows error in {ws_name}: {e}")

# Combine results
datasets_df  = sanitize_df_columns(pd.concat(datasets_all, ignore_index=True) if datasets_all else pd.DataFrame())
reports_df   = sanitize_df_columns(pd.concat(reports_all, ignore_index=True) if reports_all else pd.DataFrame())
dataflows_df = sanitize_df_columns(pd.concat(dataflows_all, ignore_index=True) if dataflows_all else pd.DataFrame())

# Split report types for clarity
if not reports_df.empty and "report_type" in reports_df.columns:
    pbi_reports_df = reports_df[reports_df["report_type"] == "PowerBIReport"].copy()
    paginated_reports_df = reports_df[reports_df["report_type"] == "PaginatedReport"].copy()
else:
    pbi_reports_df = reports_df
    paginated_reports_df = pd.DataFrame()

print("\n✅ Object discovery complete.")
print(f"  Workspaces: {len(workspaces_df)}")
print(f"  Datasets:   {len(datasets_df)}")
print(f"  Reports:    {len(reports_df)}")
print(f"  Paginated:  {len(paginated_reports_df)}")
print(f"  Dataflows:  {len(dataflows_df)}")



StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 6, Finished, Available, Finished)

🔍 Discovering workspaces...


SynapseWidget(Synapse.DataFrame, 8a43d2b7-37b5-4f00-8096-2a938415a0b6)


📦 Scanning workspace: Test Workspace

📦 Scanning workspace: Admin Test Workspace

📦 Scanning workspace: Modelling Workspace Test

✅ Object discovery complete.
  Workspaces: 4
  Datasets:   8
  Reports:    11
  Paginated:  1
  Dataflows:  1


In [5]:
# ------------------------------------------------------------
# STEP 2: Usage Analysis
# ------------------------------------------------------------

print("\n🔎 Analyzing dataset usage...")

# 1️⃣ Dataset IDs used by any report (Power BI or Paginated)
used_dataset_ids = set()
if not reports_df.empty:
    used_dataset_ids.update(reports_df['dataset_id'].dropna().unique())

# 2️⃣ Dataset IDs used by dataflows (as sources)
dataflow_refs = []

for _, row in dataflows_df.iterrows():
    try:
        refs = labs_df.get_dataflow_references(row['id'], row['workspace_id'])
        if refs is not None and not refs.empty:
            refs['dataflow_id'] = row['id']
            refs['dataflow_name'] = row['name']
            refs['workspace_id'] = row['workspace_id']
            dataflow_refs.append(refs)
    except Exception:
        pass

dataflow_refs_df = pd.concat(dataflow_refs, ignore_index=True) if dataflow_refs else pd.DataFrame()

if not dataflow_refs_df.empty:
    if 'source_dataset_id' in dataflow_refs_df.columns:
        used_dataset_ids.update(dataflow_refs_df['source_dataset_id'].dropna().unique())

# 3️⃣ Determine unused datasets
unused_datasets_df = datasets_df[~datasets_df['dataset_id'].isin(used_dataset_ids)].copy()

print(f"✅ Found {len(unused_datasets_df)} potentially unused datasets.")

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 7, Finished, Available, Finished)


🔎 Analyzing dataset usage...
✅ Found 0 potentially unused datasets.


In [6]:
# ------------------------------------------------------------
# STEP 3: Usage Summary Table
# ------------------------------------------------------------

summary_records = []

for _, ds in datasets_df.iterrows():
    ds_id = ds['dataset_id']
    ds_name = ds['dataset_name']
    ws_name = ds['workspace_name']  # still derived from datasets_df

    # Reports using this dataset
    rep_refs = pbi_reports_df[pbi_reports_df['dataset_id'] == ds_id]
    paginated_refs = rep_refs[rep_refs['report_type'] == 'PaginatedReport']
    normal_refs = rep_refs[rep_refs['report_type'] != 'PaginatedReport']

    # Dataflows referencing this dataset (if any)
    dataflow_refs = []
    if not dataflow_refs_df.empty and 'source_dataset_id' in dataflow_refs_df.columns:
        dataflow_refs = dataflow_refs_df[dataflow_refs_df['source_dataset_id'] == ds_id]

    # Determine usage
    total_refs = len(rep_refs) + len(dataflow_refs)
    usage_status = "Unused" if total_refs == 0 else "Used"

    # Add records for all associated reports
    if not rep_refs.empty:
        for _, r in rep_refs.iterrows():
            summary_records.append({
                "Dataset Workspace": ws_name,
                "Dataset Name": ds_name,
                "Report Name": r['name'],
                "Report Type": r['report_type'],
                "Report Workspace": r['workspace_name']
            })
    # Add records for datasets with no references
    elif total_refs == 0:
        summary_records.append({
            "Dataset Workspace": ws_name,
            "Dataset Name": ds_name,
            "Report Name": None,
            "Report Type": None,
            "Report Workspace": None
        })

usage_summary_df = pd.DataFrame(summary_records)

display(usage_summary_df)

# ------------------------------------------------------------
# STEP 4: Hierarchical Text Summary
# ------------------------------------------------------------

summary_lines = []

for ws_name in workspaces_df['name']:
    ws_datasets = datasets_df[datasets_df['workspace_name'] == ws_name]
    if ws_datasets.empty:
        continue

    summary_lines.append(f"\n🏢 **Workspace:** {ws_name}")

    for _, ds in ws_datasets.iterrows():
        ds_id = ds['dataset_id']
        ds_name = ds['dataset_name']
        summary_lines.append(f"   📘 Dataset: {ds_name}")

        # Power BI Reports referencing this dataset
        rep_refs = pbi_reports_df[pbi_reports_df['dataset_id'] == ds_id]
        for _, r in rep_refs.iterrows():
            summary_lines.append(f"       📊 Report: {r['name']}")

        # Paginated reports
        preg_refs = paginated_reports_df[paginated_reports_df['dataset_id'] == ds_id]
        for _, r in preg_refs.iterrows():
            summary_lines.append(f"       📄 Paginated Report: {r['name']}")

        # Dataflows referencing this dataset
        if not dataflow_refs_df.empty and 'source_dataset_id' in dataflow_refs_df.columns:
            df_refs = dataflow_refs_df[dataflow_refs_df['source_dataset_id'] == ds_id]
            for _, dfr in df_refs.iterrows():
                summary_lines.append(f"       🔄 Dataflow: {dfr['dataflow_name']}")

        # Mark unused if no references found
        if ds_id not in used_dataset_ids:
            summary_lines.append(f"       ❌ Not used by any report or dataflow.")

summary_text = "\n".join(summary_lines)
print(summary_text)

# ------------------------------------------------------------
# STEP 5: Final Outputs
# ------------------------------------------------------------

# Unused dataset list
if not unused_datasets_df.empty:
    print("\n⚠️ UNUSED DATASETS")
    for _, row in unused_datasets_df.iterrows():
        print(f" - {row['workspace_name']} → {row['dataset_name']}")
else:
    print("\n🎉 No unused datasets found!")

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 526b248f-3e90-48b6-8f79-8457eda91aba)


🏢 **Workspace:** Test Workspace
   📘 Dataset: New Waziri Dashboard Report
       📊 Report: New Waziri Dashboard Report
   📘 Dataset: Jaffaa AS Report
       📊 Report: Jaffaa AS Report
   📘 Dataset: maven semantic model
       📊 Report: PBI Service Report
   📘 Dataset: Energy Consumption Dashboard
       📊 Report: Energy Consumption Dashboard
   📘 Dataset: Fabric Analysis SM
       📊 Report: Fabric Analysis Report
       📊 Report: Fabric Lists

🏢 **Workspace:** Admin Test Workspace
   📘 Dataset: U14 GRMFC 2024
       📊 Report: U14 GRMFC 2024
   📘 Dataset: Commuter Challenge Report
       📊 Report: Commuter Challenge Report
   📘 Dataset: ev_names
       📊 Report: Ev Vehicles

🎉 No unused datasets found!


In [12]:
table_usage = []
for _, ds in datasets_df.iterrows():
    ds_id = ds['dataset_id']
    ds_name = ds['dataset_name']
    ws_id = ds['workspace_id']
    ws_name = ds['workspace_name']
    print(f"\n🔹 Dataset: {ds_name} (Workspace: {ws_name})")    
    deps = fabric.get_model_calc_dependencies(dataset=ds_id, workspace=ws_id)
    with deps as calc_deps:
        dependencies_df = getattr(calc_deps, "dependencies_df", None)

    # display(dependencies_df)
    tables = fabric.list_tables(dataset=ds_id,workspace=ws_id)
    tables = sanitize_df_columns(tables)
    tables['workspace_id'] = ws_id
    tables['dataset_id'] = ds_id
    tables['workspace_name'] = ws_name
    tables['dataset_name'] = ds_name

    print(f" Found {len(tables)} total tables")
    relationships = fabric.list_relationships(dataset=ds_id, workspace=ws_id, extended=True)
    relationships['qualified_from'] = "'" + relationships['From Table'] + "'[" + relationships['From Column'] + "]"
    relationships['qualified_to'] = "'" + relationships['To Table'] + "'[" + relationships['To Column'] + "]"

    measures = fabric.list_measures(dataset=ds_id, workspace=ws_id)

    used_tables =  set(dependencies_df['Referenced Table']).union(
        set(relationships['From Table']),
        set(relationships['To Table']),
        set(measures['Table Name'])       
    )

    used_tables= {t for t in used_tables if pd.notna(t)}
    
    print(f" Found {len(used_tables)} used tables")

    for t in set(tables['name']):
        if pd.isna(t):
            continue

        # print(f"Table {t}")
        measures_count = len(measures[measures['Table Name'] == t])
        rel_count = len(relationships[(relationships['From Table'] == t) | (relationships['To Table'] == t)])
        dep_count = len(dependencies_df[dependencies_df['Referenced Table'] == t]) if dependencies_df is not None else 0
        # print(f"  Measures: {measures_count} \n Relationships: {rel_count} \n Calc Dependencies: {dep_count}")
        status = "Unused" if t not in used_tables else "Used"
        
        table_usage.append({
            'workspace': ws_name,
            'dataset': ds_name,
            'table': t,
            'measures': measures_count,
            'relationships': rel_count,
            'depenencies': dep_count,
            "usage": status
        })
display(table_usage)

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 14, Finished, Available, Finished)


🔹 Dataset: New Waziri Dashboard Report (Workspace: Test Workspace)
 Found 17 total tables
 Found 20 used tables

🔹 Dataset: Jaffaa AS Report (Workspace: Test Workspace)
 Found 11 total tables
 Found 10 used tables

🔹 Dataset: maven semantic model (Workspace: Test Workspace)
 Found 2 total tables
 Found 0 used tables

🔹 Dataset: Energy Consumption Dashboard (Workspace: Test Workspace)
 Found 8 total tables
 Found 12 used tables

🔹 Dataset: Fabric Analysis SM (Workspace: Test Workspace)
 Found 8 total tables
 Found 8 used tables

🔹 Dataset: U14 GRMFC 2024 (Workspace: Admin Test Workspace)
 Found 20 total tables
 Found 24 used tables

🔹 Dataset: Commuter Challenge Report (Workspace: Admin Test Workspace)
 Found 3 total tables
 Found 3 used tables

🔹 Dataset: ev_names (Workspace: Admin Test Workspace)
 Found 2 total tables
 Found 2 used tables


SynapseWidget(Synapse.DataFrame, 04011f01-3348-4a01-b4b3-8e1d610ceee7)

In [11]:
columns_usage = []

for _, ds in datasets_df.iterrows():
    ds_id = ds['dataset_id']
    ds_name = ds['dataset_name']
    ws_id = ds['workspace_id']
    ws_name = ds['workspace_name']

    print(f"\n🔹 Dataset: {ds_name} (Workspace: {ws_name})")    

        # --- Get all columns in the dataset ---
    all_columns = fabric.list_columns(dataset=ds_id, workspace=ws_id, extended=True)
    all_columns = sanitize_df_columns(all_columns)
    all_columns['workspace_id'] = ws_id
    all_columns['dataset_id'] = ds_id
    all_columns['workspace_name'] = ws_name
    all_columns['dataset_name'] = ds_name
    all_columns['qualified_name'] = "'" + all_columns['table_name'] + "'[" + all_columns['column_name'] + ']'

    print(f" Found {len(all_columns)} total columns")

    # --- Filtered dependencies to only Columns and Calc Columns ---
    dep_columns_df = (
        dependencies_df[
            dependencies_df['Referenced Object Type'].isin(['Column', 'Calc Column'])
        ]
        if dependencies_df is not None else pd.DataFrame()
    )

    # --- Extract subsets by object type ---
    if not dep_columns_df.empty:
        measures_refs_df = dep_columns_df[dep_columns_df['Object Type'] == 'Measure']
        relationship_refs_df = dep_columns_df[
            dep_columns_df['Object Type'].str.contains('Relationship', case=False, na=False)
        ]
        # hierarchy_refs_df = dep_columns_df[dep_columns_df['Object Type'] == 'Hierarchy']
    else:
        measures_refs_df = pd.DataFrame()
        relationship_refs_df = pd.DataFrame()
        # hierarchy_refs_df = pd.DataFrame()

    # --- Used columns (in dependencies or relationships) ---
    dep_columns = set(dep_columns_df['Referenced Full Object Name']) if not dep_columns_df.empty else set()
    rel_columns = set(relationships['qualified_from']).union(set(relationships['qualified_to']))
    used_columns = dep_columns.union(rel_columns)
    used_columns = {c for c in used_columns if pd.notna(c)}

    print(f" Found {len(used_columns)} used columns")

        # --- Determine usage per column ---
    for _, row in all_columns.iterrows():
        table_name = row['table_name']
        column_name = row['column_name']                           
        qualified_name = row['qualified_name']
        # print(qualified_name)
        if pd.isna(column_name):
            continue

        dep_count = len(dep_columns_df[
            dep_columns_df['Referenced Full Object Name'] == qualified_name
        ]) if not dep_columns_df.empty else 0

        measure_c = len(measures_refs_df[measures_refs_df['Referenced Full Object Name'] == qualified_name])
        relationship_c = len(relationship_refs_df[relationship_refs_df['Referenced Full Object Name'] == qualified_name])
        # hierarchy_c = len(hierarchy_refs_df[hierarchy_refs_df['Referenced Object'] == column_name])

        # Build a referenced-by list (measures, relationships, etc.)
        referenced_by = ", ".join(
            dep_columns_df.loc[
                dep_columns_df['Referenced Full Object Name'] == qualified_name, 'Object Name'
            ].unique().tolist()
        ) if not dep_columns_df.empty else ""

        # print(measure_c, relationship_c, dep_count )
        # Determine usage
        usage_status = 'Used' if any([measure_c, relationship_c,dep_count]) else 'Unused'
        # Append result
        columns_usage.append({
            'workspace': ws_name,
            'dataset': ds_name,
            'table': table_name,
            'column': column_name,
            'measures': measure_c,
            'relationships': relationship_c,
            # 'hierarchies': hierarchy_c,
            'dependencies': dep_count,
            'referenced_by': referenced_by,
            'usage': usage_status
        })
display(columns_usage)

StatementMeta(, 12f73f95-8194-488a-a347-297f411ac3ca, 13, Finished, Available, Finished)


🔹 Dataset: New Waziri Dashboard Report (Workspace: Test Workspace)
 Found 176 total columns
 Found 2 used columns

🔹 Dataset: Jaffaa AS Report (Workspace: Test Workspace)
 Found 124 total columns
 Found 2 used columns

🔹 Dataset: maven semantic model (Workspace: Test Workspace)
 Found 57 total columns
 Found 2 used columns

🔹 Dataset: Energy Consumption Dashboard (Workspace: Test Workspace)
 Found 107 total columns
 Found 2 used columns

🔹 Dataset: Fabric Analysis SM (Workspace: Test Workspace)
 Found 94 total columns
 Found 2 used columns

🔹 Dataset: U14 GRMFC 2024 (Workspace: Admin Test Workspace)
 Found 116 total columns
 Found 2 used columns

🔹 Dataset: Commuter Challenge Report (Workspace: Admin Test Workspace)
 Found 59 total columns
 Found 2 used columns

🔹 Dataset: ev_names (Workspace: Admin Test Workspace)
 Found 134 total columns
 Found 2 used columns


SynapseWidget(Synapse.DataFrame, 4f646d55-3cb6-4b62-aabd-abd8e1cab3a5)