In [6]:
import scanpy as sc
import argparse
import os
import sys
import time
import tracemalloc  # For memory profiling
from scipy import sparse
import scanpy.external as sce
import numpy as np
import doubletdetection
import warnings
import logging

warnings.filterwarnings("ignore", category=DeprecationWarning)
from numba.core.errors import NumbaDeprecationWarning
warnings.simplefilter("ignore", category=NumbaDeprecationWarning)

# Set up logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

sc.settings.verbosity = 0
sc.settings.set_figure_params(dpi=80, facecolor="white", frameon=False)

log = logging.getLogger("clustering.log")
log.setLevel(logging.INFO)

context_path ="/home/sah2p/ondemand/singlecell_data/Arabidopsis/"
dataset = "SRP166333"
filename = dataset+"_bbknn.h5ad"

In [7]:
adata = sc.read_h5ad(context_path+filename)

# Clustering

In [8]:
# Using the igraph implementation and a fixed number of iterations can be significantly faster, especially for larger datasets
sc.tl.leiden(adata, n_iterations=2)
sc.tl.louvain(adata)
sc.pl.umap(adata, color=["leiden"], save=dataset+"_bbknn_leiden.png")
sc.pl.umap(adata, color=["louvain"], save=dataset+"_bbknn_louvain.png")

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


In [9]:
# Re assessing the doublet detection

sc.pl.umap(
    adata,
    color=["leiden", "doublet", "doublet_score"],
    # increase horizontal space between panels
    save = dataset+"_leiden_bbknn_doublet.png",
    wspace=0.5,
    size=3,
)
sc.pl.umap(
    adata,
    color=["leiden", "log1p_total_counts", "pct_counts_mt", "log1p_n_genes_by_counts"],
    wspace=0.5,
    save = dataset+"_leiden_bbknn_qc.png",
    ncols=2,
)

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


In [None]:
# Doing Clustering on differnt resolutions

In [10]:
sc.tl.leiden(adata, key_added="leiden_res0_1", resolution=0.1)
sc.tl.leiden(adata, key_added="leiden_res0_5", resolution=0.5)
sc.tl.leiden(adata, key_added="leiden_res1", resolution=1)

sc.tl.louvain(adata, key_added="louvain_res0_1", resolution=0.1)
sc.tl.louvain(adata, key_added="louvain_res0_5", resolution=0.5)
sc.tl.louvain(adata, key_added="louvain_res1", resolution=1)

In [11]:
sc.pl.umap(
    adata,
    color=["leiden_res0_1", "leiden_res0_5","leiden_res1"],
    legend_loc="on data",
    show = False,
    save = dataset+"_bbknn_leiden_resolutions.png",)

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


[<AxesSubplot: title={'center': 'leiden_res0_1'}, xlabel='UMAP1', ylabel='UMAP2'>,
 <AxesSubplot: title={'center': 'leiden_res0_5'}, xlabel='UMAP1', ylabel='UMAP2'>,
 <AxesSubplot: title={'center': 'leiden_res1'}, xlabel='UMAP1', ylabel='UMAP2'>]

In [12]:
sc.pl.umap(
    adata,
    color=["louvain_res0_1", "louvain_res0_5", "louvain_res1"],
    legend_loc="on data",
    show = False,
    save = dataset+"_bbknn_louvain_resolutions.png",)

  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


[<AxesSubplot: title={'center': 'louvain_res0_1'}, xlabel='UMAP1', ylabel='UMAP2'>,
 <AxesSubplot: title={'center': 'louvain_res0_5'}, xlabel='UMAP1', ylabel='UMAP2'>,
 <AxesSubplot: title={'center': 'louvain_res1'}, xlabel='UMAP1', ylabel='UMAP2'>]

In [13]:
# Perform differential expression analysis
cluster_column = "leiden_res0_5"
sc.tl.rank_genes_groups(adata, groupby=cluster_column, method="wilcoxon")


  return reduction(axis=axis, out=out, **passkwargs)


In [None]:
!pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0


In [14]:
# Extract the results
top_n = 10  # Number of top markers per cluster
ranked_genes = pd.DataFrame()

for cluster in adata.obs[cluster_column].unique():
    top_genes = pd.DataFrame({
        "gene": adata.uns["rank_genes_groups"]["names"][cluster][:top_n],
        "log2FC": adata.uns["rank_genes_groups"]["logfoldchanges"][cluster][:top_n],
        "p_val": adata.uns["rank_genes_groups"]["pvals"][cluster][:top_n],
        "p_val_adj": adata.uns["rank_genes_groups"]["pvals_adj"][cluster][:top_n],
        "cluster": cluster
    })
    ranked_genes = pd.concat([ranked_genes, top_genes])

# Save to CSV
output_file = context_path+"top_markers_per_cluster_scanpy.csv"
ranked_genes.to_csv(output_file, index=False)

print(f"Top markers saved to {output_file}")

Top markers saved to /home/sah2p/ondemand/singlecell_data/Arabidopsis/top_markers_per_cluster_scanpy.csv


In [15]:
sc.pl.dotplot(adata, var_names=ranked_genes["gene"].unique(), groupby=cluster_column, save=dataset+"_clusterGenes_dotplot.png")

  obs_bool.groupby(level=0).sum() / obs_bool.groupby(level=0).count()
  dot_color_df = self.obs_tidy.groupby(level=0).mean()
  dot_ax.scatter(x, y, **kwds)


### Filtering out marker genes

In [16]:
import pandas as pd
df = pd.read_csv(context_path+"arabidopsis_thaliana.marker_fd.csv")


In [17]:
df.shape

(141696, 13)

In [18]:
# Filter out weak markers:
# Remove genes with low avg_log2FC (e.g., below 0.5).
# Remove genes with low p_val_adj (e.g., above 0.05).
df = df[(df["avg_log2FC"] > 0.5) & (df["p_val_adj"] < 0.05)]
df.shape
# Filter out genes that are not in the adata object:
# df = df[df["gene"].isin(adata.var_names)]
#Filter my dataset
df = df[df['dataset'] == 'SRP166333']

In [19]:
TOP_N_GENES = 10
# Get the top N genes for each cluster
top_genes = df.groupby("clusterName").apply(lambda x: x.nlargest(TOP_N_GENES, "avg_log2FC"))
top_genes = top_genes.reset_index(drop=True)
# Save the ranked genes per cluster to a CSV file
output_file = context_path+"top_markers_per_cluster.csv"
top_genes.to_csv(output_file, index=False)

In [None]:
# Subset to only the markers that are in the data
marker_genes_in_data = {}
for ct, markers in marker_genes.items():
    markers_found = []
    for marker in markers:
        if marker in adata.var.index:
            markers_found.append(marker)
    marker_genes_in_data[ct] = markers_found

In [22]:
import pandas as pd
from itertools import combinations
from scipy.spatial.distance import jaccard

# Load clustered genes (Your dataset)
clustered_gene_file = "/home/sah2p/ondemand/singlecell_data/Arabidopsis/top_markers_per_cluster_scanpy.csv"  # Replace with your file
cluster_df = pd.read_csv(clustered_gene_file)  # Assumes two columns: "cluster" & "gene"

# Load top marker genes per cell type (Preprocessed file from previous steps)
marker_gene_file = "/home/sah2p/ondemand/singlecell_data/Arabidopsis/top_markers_per_cluster.csv"  # Replace with your file
marker_df = pd.read_csv(marker_gene_file)  # Assumes two columns: "celltype_id" & "gene"

# # Convert to dictionary for easy lookup
cluster_dict = cluster_df.groupby("cluster")["gene"].apply(set).to_dict()
celltype_dict = marker_df.groupby("clusterName")["gene"].apply(set).to_dict()

# Compute Jaccard Similarity
cluster_annotations = {}

for cluster, cluster_genes in cluster_dict.items():
    best_match = None
    best_score = 0
    
    for celltype, celltype_genes in celltype_dict.items():
        intersection = len(cluster_genes & celltype_genes)
        union = len(cluster_genes | celltype_genes)
        score = intersection / union if union > 0 else 0  # Jaccard similarity

        if score > best_score:
            best_score = score
            best_match = celltype

    cluster_annotations[cluster] = {"best_match": best_match, "jaccard_score": best_score}

# # Convert results to DataFrame
annotations_df = pd.DataFrame.from_dict(cluster_annotations, orient="index")
annotations_df.reset_index(inplace=True)
annotations_df.columns = ["cluster", "predicted_celltype", "jaccard_score"]

# # Save to CSV
# annotations_df.to_csv("cluster_annotations.csv", index=False)

# # Display results
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Cluster Annotations", dataframe=annotations_df)

In [23]:
annotations_df.to_csv(context_path+"cluster_annotations.csv", index=False)

In [24]:
cluster_df

Unnamed: 0,gene,log2FC,p_val,p_val_adj,cluster
0,AT2G46890,5.562217,0.0,0.0,1
1,AT3G48185,5.308359,0.0,0.0,1
2,AT2G40480,4.601549,0.0,0.0,1
3,AT1G25530,5.839280,0.0,0.0,1
4,AT5G57770,5.368637,0.0,0.0,1
...,...,...,...,...,...
145,AT5G22440,3.521461,0.0,0.0,2
146,AT3G58700,3.719462,0.0,0.0,2
147,AT3G45030,3.347976,0.0,0.0,2
148,AT5G58420,3.306624,0.0,0.0,2


In [25]:
import pandas as pd
from scipy.stats import hypergeom

# ==== STEP 1: Load Your Clustered Genes ====
# Ensure your file has two columns: "cluster" and "gene"
cluster_df = pd.read_csv(clustered_gene_file)

# ==== STEP 2: Load Reference Marker Genes ====
# Ensure the file has "clusterName" and "gene"
marker_df = pd.read_csv(marker_gene_file)

# ==== STEP 3: Convert Data to Dictionary ====
# Convert to dictionary for easy lookup
cluster_dict = cluster_df.groupby("cluster")["gene"].apply(set).to_dict()
marker_dict = marker_df.groupby("clusterName")["gene"].apply(set).to_dict()

# Total genes in the genome (N)
N = 27600  # Approximate Arabidopsis gene count

# ==== STEP 4: Compute Hypergeometric Enrichment ====
enrichment_results = []

for cluster, cluster_genes in cluster_dict.items():
    for celltype, marker_genes in marker_dict.items():
        K = len(marker_genes)  # Genes in the reference cell type
        n = len(cluster_genes)  # Genes in your test cluster
        k = len(cluster_genes & marker_genes)  # Overlapping genes

        if k > 0:  # Only compute if there is some overlap
            p_value = hypergeom.sf(k-1, N, K, n)  # Survival function (1 - CDF)
            enrichment_results.append([cluster, celltype, K, n, k, p_value])

# Convert results to DataFrame
enrichment_df = pd.DataFrame(
    enrichment_results,
    columns=["Cluster", "CellType", "Reference_Size", "Cluster_Size", "Overlap", "p_value"]
)

# ==== STEP 5: Adjust p-values (Benjamini-Hochberg FDR) ====
enrichment_df["p_adj"] = enrichment_df["p_value"] * len(enrichment_df) / enrichment_df["p_value"].rank(method="first")

# Save to CSV
output_file = context_path+"hypergeometric_enrichment_results.csv"
enrichment_df.to_csv(output_file, index=False)

print(f"Enrichment analysis completed. Results saved to: {output_file}")

# ==== STEP 6: Display Top Results (Optional) ====
print(enrichment_df.sort_values("p_adj").head(10))  # Show top 10 significant results

Enrichment analysis completed. Results saved to: /home/sah2p/ondemand/singlecell_data/Arabidopsis/hypergeometric_enrichment_results.csv
    Cluster               CellType  Reference_Size  Cluster_Size  Overlap  \
3         3            Root cortex              10            10        5   
7         8  Phloem pole pericycle              10            10        4   
5         5               Non-hair              10            10        4   
4         4        Root endodermis              10            10        4   
1         1        Root procambium              10            10        4   
11       14          Sieve element              10            10        3   
8         9              Root hair              10            10        3   
9        11              Metaxylem              10            10        2   
6         7         Companion cell              10            10        2   
0         0   Xylem pole pericycle              10            10        2   

         p_value

In [26]:
adata.obs

Unnamed: 0,Orig.ident,Celltype,Dataset,Tissue,Organ,Condition,Genotype,Libraries,ACE,n_genes_by_counts,...,batch,n_counts,leiden,louvain,leiden_res0_1,leiden_res0_5,leiden_res1,louvain_res0_1,louvain_res0_5,louvain_res1
SRX8089019@@_AAACCTGAGAAGGTTT-1-0,SRX8089019,Root procambium,SRP166333,Whole root,Root,Normal,Col-0,10x Genomics,7 days old seedling,1550,...,0,3313.0,9,10,0,1,9,0,1,10
SRX8089019@@_AAACCTGAGAGTGACC-1-0,SRX8089019,Root cap,SRP166333,Whole root,Root,Normal,Col-0,10x Genomics,7 days old seedling,1715,...,0,4064.0,4,12,3,6,4,1,7,12
SRX8089019@@_AAACCTGAGCTAGTCT-1-0,SRX8089019,Root cap,SRP166333,Whole root,Root,Normal,Col-0,10x Genomics,7 days old seedling,1502,...,0,2746.0,10,5,3,10,10,1,3,5
SRX8089019@@_AAACCTGAGTGGTAGC-1-0,SRX8089019,Root cap,SRP166333,Whole root,Root,Normal,Col-0,10x Genomics,7 days old seedling,939,...,0,2756.0,14,13,3,12,14,1,7,13
SRX8089019@@_AAACCTGCAATAGCGG-1-0,SRX8089019,Root cap,SRP166333,Whole root,Root,Normal,Col-0,10x Genomics,7 days old seedling,2249,...,0,5080.0,10,5,3,10,10,1,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRX8089021@@_TTTGTCACATGTAAGA-1-2,SRX8089021,Non-hair,SRP166333,Whole root,Root,Heat shock,Col-0,10x Genomics,7 days old seedling,987,...,2,2806.0,1,0,2,5,1,2,4,0
SRX8089021@@_TTTGTCAGTGCCTGTG-1-2,SRX8089021,Xylem pole pericycle,SRP166333,Whole root,Root,Heat shock,Col-0,10x Genomics,7 days old seedling,2665,...,2,8617.0,6,4,1,0,5,0,0,4
SRX8089021@@_TTTGTCAGTTGTTTGG-1-2,SRX8089021,Root hair,SRP166333,Whole root,Root,Heat shock,Col-0,10x Genomics,7 days old seedling,1013,...,2,2123.0,0,6,5,2,0,1,8,6
SRX8089021@@_TTTGTCATCATGCATG-1-2,SRX8089021,Root hair,SRP166333,Whole root,Root,Heat shock,Col-0,10x Genomics,7 days old seedling,1007,...,2,1937.0,0,6,5,2,0,1,8,6


In [27]:
# ==== STEP 3: Assign Best-Matching Cell Type ====
# Select the best cell type for each cluster (lowest adjusted p-value)
best_matches = enrichment_df.sort_values("p_adj").groupby("Cluster").first().reset_index()
best_matches = best_matches[["Cluster", "CellType", "p_adj"]]

# Convert cluster column to string (to match AnnData)
best_matches["Cluster"] = best_matches["Cluster"].astype(str)

# ==== STEP 4: Add Cell Type Annotations to AnnData ====
# Ensure cluster labels exist in AnnData
if "leiden_res0_5" not in adata.obs.columns:
    raise ValueError("Clustering labels ('leiden') not found in adata.obs. Please run clustering first.")

# Create a mapping from cluster to cell type
cluster_to_celltype = dict(zip(best_matches["Cluster"], best_matches["CellType"]))

# Assign the cell type annotation
adata.obs["predicted_celltype"] = adata.obs["leiden_res0_5"].map(cluster_to_celltype)

# ==== STEP 5: Save Updated AnnData Object ====
# output_file = "adata_with_annotations.h5ad"
# adata.write(output_file)

# ==== STEP 6: Plot UMAP with Annotated Clusters ====
sc.pl.umap(adata, color=["leiden_res0_5", "predicted_celltype"], save=dataset+"umap_predicted_celltypes.png", frameon=False)

print(f"✅ Annotated AnnData saved to: {output_file}")

... storing 'predicted_celltype' as categorical
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


✅ Annotated AnnData saved to: /home/sah2p/ondemand/singlecell_data/Arabidopsis/hypergeometric_enrichment_results.csv


In [36]:
# # ==== STEP 7: Generate Heatmap for Marker Gene Expression ====
# # Select marker genes for visualization
# top_marker_genes = best_matches["CellType"].unique()[:15]  # Top 15 representative genes



# # ==== STEP 8: Generate Dot Plot ====
# sc.pl.dotplot(adata, var_names=top_marker_genes, groupby="predicted_celltype", standard_scale="var")

# # ==== STEP 9: Compare Predicted vs. Ground Truth Labels ====
comparison_df = adata.obs[["predicted_celltype", "Celltype"]]
comparison_table = pd.crosstab(comparison_df["predicted_celltype"], comparison_df["Celltype"])

# Save the comparison table
comparison_output = context_path+"predicted_vs_ground_truth_comparison.csv"
comparison_table.to_csv(comparison_output)

# Display comparison table
print("✅ Predicted vs. Ground Truth Comparison Table:")
print(comparison_table)

print(f"✅ Comparison results saved to: {comparison_output}")

✅ Predicted vs. Ground Truth Comparison Table:
Celltype               Companion cell  G2/M phase  Metaxylem  Non-hair  \
predicted_celltype                                                       
Companion cell                    603           0         26         4   
G2/M phase                         22         104         30        16   
Metaxylem                           0           0        496         0   
Non-hair                            1           0          2      1164   
Phloem pole pericycle               6           0          2         0   
Root cap                            0           0          0         1   
Root cortex                         2           0          0         6   
Root endodermis                     0           0          3         1   
Root hair                           0           0          0         7   
Root procambium                     2           0          1         5   
Sieve element                       3           0          0     

In [65]:
adata

AnnData object with n_obs × n_vars = 14238 × 4000
    obs: 'Orig.ident', 'Celltype', 'Dataset', 'Tissue', 'Organ', 'Condition', 'Genotype', 'Libraries', 'ACE', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_pt', 'log1p_total_counts_pt', 'pct_counts_pt', 'doublet', 'doublet_score', 'batch', 'n_counts', 'leiden', 'louvain', 'leiden_res0_1', 'leiden_res0_5', 'leiden_res1', 'louvain_res0_1', 'louvain_res0_5', 'louvain_res1', 'predicted_celltype'
    var: 'features', 'mt', 'pt', 'n_cells_by_counts-0', 'mean_counts-0', 'log1p_mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'log1p_total_counts-0', 'n_cells_by_counts-1', 'mean_counts-1', 'log1p_mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'log1p_total_counts-1', 'n_cells_by_counts-2

In [66]:
sc.pl.umap(adata, color=["leiden", "Celltype"], save=dataset+"umap_celltypes.png", frameon=False)


  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(


In [67]:
#Umap of predicted vs ground truth
sc.pl.umap(adata, color=["leiden_res0_5", "predicted_celltype", "Celltype"], save=dataset+"predicted_vs_gt.png", frameon=False)


  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
  color_vector = pd.Categorical(values.map(color_map))
  cax = scatter(
