In [1]:
import os
import scanpy as sc
import pandas as pd


In [2]:
 adata = sc.read_h5ad('/projects/bioinformatics/DB/scRNAseq_parkinson/dataset.h5ad')

In [3]:
# 1. Keep a copy of raw counts
adata_raw = adata.copy()

# 2. HVG selection on raw counts
sc.pp.highly_variable_genes(
    adata_raw,
    n_top_genes=2000,
    flavor='seurat'
)
hvg_genes = adata_raw.var.highly_variable

# 3. Now normalize and log-transform the original dataset
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# 4. Now subset to HVGs AFTER normalization
adata_hvg = adata[:, hvg_genes].copy()

# 5. Scale
sc.pp.scale(adata_hvg)

# 6. PCA
sc.tl.pca(adata_hvg)



  return dispatch(args[0].__class__)(*args, **kw)


In [None]:
# Rank genes for each cluster or cell type
sc.tl.rank_genes_groups(adata_hvg, groupby='cell_type', method='wilcoxon')
sc.pl.rank_genes_groups(adata_hvg, n_genes=20, sharey=False)


In [4]:
# Build the neighborhood graph

sc.pp.neighbors(adata_hvg, n_pcs=30)


In [None]:
# Run Leiden clustering
sc.tl.leiden(adata_hvg, resolution=1.0)



 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata_hvg, resolution=1.0)


In [None]:
# where my cluster is 
adata_hvg.obs['leiden']


In [None]:
# Compute NMI with cell_type and disease

from sklearn.metrics import normalized_mutual_info_score
import pandas as pd

df_pca = adata_hvg.obs[['leiden', 'cell_type', 'disease']].copy()

# NMI with cell type
nmi_celltype = normalized_mutual_info_score(df_pca['leiden'], df_pca['cell_type'])
print("NMI Leiden vs Cell Type (PCA):", nmi_celltype)

# NMI with disease
nmi_disease = normalized_mutual_info_score(df_pca['leiden'], df_pca['disease'])
print("NMI Leiden vs Disease (PCA):", nmi_disease)


In [None]:
# Plot 

sc.pl.pca_scatter(adata_hvg, color=['leiden', 'cell_type', 'disease'])
