In [2]:
import json
import pandas as pd

# --- Load both files ---
with open('datasets_snapshot.json', 'r') as f:
    datasets_json = json.load(f)

with open('processes_snapshot_full.json', 'r') as f:
    processes_json = json.load(f)

# --- Extract useful info ---
datasets = []
for e in datasets_json.get("entities", []):
    attr = e.get("attributes", {})
    datasets.append({
        "Code": attr.get("name", "").split("]")[0].replace("[", "") if "[" in attr.get("name", "") else "",
        "Type": e.get("typeName"),
        "Name": attr.get("name"),
        "QualifiedName": attr.get("qualifiedName"),
        "Description": attr.get("description", "")
    })

processes = []
for e in processes_json.get("entities", []):
    attr = e.get("attributes", {})
    rel = e.get("relationshipAttributes", {})
    processes.append({
        "Code": attr.get("name", "").split("]")[0].replace("[", "") if "[" in attr.get("name", "") else "",
        "Type": e.get("typeName"),
        "Name": attr.get("name"),
        "QualifiedName": attr.get("qualifiedName"),
        "Description": attr.get("description", ""),
        "Inputs": ", ".join([inp.get("displayText", {}) for inp in rel.get("inputs", [])]) if rel.get("inputs") else "",
        "Outputs": ", ".join([out.get("displayText", {}) for out in rel.get("outputs", [])]) if rel.get("outputs") else ""
    })

# --- Combine and format ---
df_datasets = pd.DataFrame(datasets)
df_processes = pd.DataFrame(processes)
df_all = pd.concat([df_datasets, df_processes], ignore_index=True)

# --- Sort and export to Markdown ---
df_all = df_all[["Code", "Type", "Name", "QualifiedName", "Description", "Inputs", "Outputs"]]
df_all = df_all.sort_values(by="Code")

df_all = df_all[df_all["Code"]!=""]

df_all.to_csv("atlas_lineage.csv", index=False, encoding="utf-8")

markdown_table = df_all.to_markdown(index=False)
with open("atlas_entities_summary.md", "w") as f:
    f.write(markdown_table)

print("✅ Markdown summary saved as atlas_entities_summary.md")

'markdown_table = df_all.to_markdown(index=False)\nwith open("atlas_entities_summary.md", "w") as f:\n    f.write(markdown_table)\n\nprint("✅ Markdown summary saved as atlas_entities_summary.md")\n'

In [3]:
display(df_all)

Unnamed: 0,Code,Type,Name,QualifiedName,Description,Inputs,Outputs
4,D1,DataSet,[D1] - retail_data.csv,retail_data.csv@retail,original CSV from Kaggle with retail data (pri...,,
9,D10,DataSet,[D10] - API Flask remote (ngrok),api_flask_remote@retail,"Remote Flask API exposed via ngrok tunnel, pro...",,
10,D11,DataSet,[D11] - retail_project_db_AzureSQL,retail_project_db_AzureSQL@azure,Final consolidated database in Azure SQL conta...,,
13,D2,DataSet,[D2] - customers_local,customers_local@retail,Customers table imported into SSMS (from retai...,,
6,D3,DataSet,[D3] - products_local,products_local@retail,Products table imported into SSMS (via API Fla...,,
8,D4,DataSet,[D4] - sales_local,sales_local@retail,Facts table 'sales' imported at SSMS from reta...,,
11,D5,DataSet,[D5] - retail_summary_local,retail_summary_local@retail,Final consolidated retail summary table (sales...,,
2,D6,DataSet,[D6] - agg_sales.csv,agg_sales.csv@blobstorage,CSV with sales aggregated data exported from S...,,
1,D7,DataSet,[D7] - agg_customers.csv,agg_customers.csv@blobstorage,CSV with aggregated customers data exported fr...,,
5,D8,DataSet,[D8] - API Flask local,api_flask_local@retail,API Flask local exposes aggregated products da...,,
