In [1]:
import os
import scanpy as sc
import anndata as ad
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score

In [None]:
# --------------------------------------------------
# 1️⃣ LOAD SAVED ANNDATA
# --------------------------------------------------

import scanpy as sc
import pandas as pd
import numpy as np
import os
from sklearn.metrics import normalized_mutual_info_score

# Read the previously saved AnnData object from disk and report its
# dimensions (observations x variables) as a quick sanity check.
adata_path = "/home/znazari/scRNAseq_parkinson/result/adata_geneformer_leiden_sweep.h5ad"
adata = sc.read(adata_path)
print("Loaded:", adata.shape)

# --------------------------------------------------
# 2️⃣ VERIFY METADATA EXISTS
# --------------------------------------------------
# Fail fast if the per-cell annotations used below are absent from adata.obs.

required_cols = ["disease", "cell_type"]
obs_columns = set(adata.obs.columns)
missing = [name for name in required_cols if name not in obs_columns]

# An empty list is falsy, so this only triggers when something is missing.
if missing:
    raise ValueError(f"Missing metadata columns: {missing}")

print("Metadata already present.")

# --------------------------------------------------
# 3️⃣ CREATE celltype_condition
# --------------------------------------------------
# Combine cell type and disease status into one "<cell_type>_<disease>" label.
#
# `.astype(str)` converts missing values into the literal text "nan", which
# would create bogus classes such as "nan_PD" and make every later `.dropna()`
# on this column a no-op. Rows lacking either annotation are therefore kept as
# missing so that downstream `.dropna()` calls really exclude them.

has_both_labels = adata.obs["cell_type"].notna() & adata.obs["disease"].notna()

combined_label = (
    adata.obs["cell_type"].astype(str) + "_" +
    adata.obs["disease"].astype(str)
)

# `where` keeps the combined label only where both inputs were present and
# leaves NaN elsewhere.
adata.obs["celltype_condition"] = combined_label.where(has_both_labels)

# --------------------------------------------------
# 4️⃣ FIND LEIDEN COLUMNS
# --------------------------------------------------
# Only columns named "leiden_r<number>" are usable: the NMI loop parses the
# resolution out of the column name with float(), so a looser prefix match
# (e.g. a plain "leiden" column) would crash it with a ValueError.

def _leiden_resolution(column_name):
    """Return the resolution encoded in a ``leiden_r<float>`` column name.

    Returns ``None`` when the name does not start with ``leiden_r`` or the
    remainder cannot be parsed as a float.
    """
    prefix = "leiden_r"
    if not str(column_name).startswith(prefix):
        return None
    try:
        return float(str(column_name)[len(prefix):])
    except ValueError:
        return None


leiden_cols = [c for c in adata.obs.columns if _leiden_resolution(c) is not None]
print("Found clustering columns:", leiden_cols)

# Without any clustering column the later pivot would fail with an obscure
# KeyError on an empty frame; stop here with an explicit message instead.
if not leiden_cols:
    raise ValueError(
        "No 'leiden_r<resolution>' clustering columns found in adata.obs"
    )

# --------------------------------------------------
# 5️⃣ COMPUTE PER-CELLTYPE NMI
# --------------------------------------------------
# For every Leiden resolution and every celltype_condition class, score how
# well the clustering agrees with a one-vs-rest indicator of that class.
# One record per (resolution, class) pair is collected into `results`.

results = []

unique_classes = adata.obs["celltype_condition"].dropna().unique()

for cluster_key in leiden_cols:

    # Column names look like "leiden_r0.3"; the numeric suffix is the resolution.
    res = float(cluster_key.replace("leiden_r", ""))

    print(f"Processing resolution {res}")

    # Keep only cells that have both a cluster assignment and a class label.
    valid = adata.obs[[cluster_key, "celltype_condition"]].dropna()

    # NMI depends only on how the cells are partitioned, not on the label
    # values themselves. Encoding both labelings as integers once per
    # resolution gives the same scores while sparing scikit-learn from
    # sorting/uniquing large arrays of Python strings on every inner iteration.
    clusters = pd.factorize(valid[cluster_key].astype(str))[0]
    label_codes, label_names = pd.factorize(valid["celltype_condition"].astype(str))
    code_of = {name: code for code, name in enumerate(label_names)}

    for ct in unique_classes:
        # One-vs-rest indicator for this class. A class with no valid cells at
        # this resolution maps to -1, which matches nothing (all-zero vector).
        binary_label = (label_codes == code_of.get(ct, -1)).astype(int)
        nmi_ct = normalized_mutual_info_score(clusters, binary_label)

        results.append({
            "resolution": res,
            "celltype_condition": ct,
            "NMI": nmi_ct
        })

# Long-format table: one row per (resolution, celltype_condition).
nmi_df = pd.DataFrame(results)

print("NMI computation Done.")

# --------------------------------------------------
# 6️⃣ CREATE SORTED PIVOT (ONLY OUTPUT)
# --------------------------------------------------
# Reshape the long NMI table to wide form (one row per celltype_condition,
# one column per resolution), order the rows by their NMI at the smallest
# resolution (highest first), and write the result to CSV.

pivot = nmi_df.pivot(index="celltype_condition", columns="resolution", values="NMI")

# Smallest resolution among the pivot's columns; used as the sort key.
lowest_resolution = min(pivot.columns)

sorted_pivot = pivot.sort_values(lowest_resolution, ascending=False)

output_path = "/home/znazari/scRNAseq_parkinson/result/nmi_table_pivot_geneformer.csv"
# The index holds the celltype_condition labels, so it is written out too.
sorted_pivot.to_csv(output_path, index=True)

print("Final sorted pivot saved at:", output_path)

Loaded: (2096155, 512)
Metadata already present.
Found clustering columns: ['leiden_r0.3', 'leiden_r0.5', 'leiden_r0.8', 'leiden_r1.0']
Processing resolution 0.3


In [None]:
# Display the wide NMI table (rows: celltype_condition, columns: resolution),
# ordered by NMI at the lowest resolution, highest first.
sorted_pivot