In [3]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib as mpl

def plot_adata(adata_to_plot, cluster_key, plot_type="umap", keyword=None):
    """
    Show an interactive Plotly scatter of an AnnData embedding, colored by an obs column.

    Parameters:
    - adata_to_plot: AnnData-like object; this function reads .obsm['spatial'] or
      .obsm['X_umap'], .obs[cluster_key] and .obs_names.
    - cluster_key: key in adata.obs to color by (values are cast to str).
    - plot_type: "spatial" plots .obsm['spatial'] with a reversed y axis; any other
      value plots .obsm['X_umap'].
    - keyword: if provided, only labels containing this substring keep their own
      color; all other labels are grouped as "Other".

    Returns:
    - None. The figure is displayed with fig.show().

    Raises:
    - KeyError if the requested embedding or cluster_key is missing.
    """

    # Per-layout settings: embedding key, axis column names, title prefix,
    # figure size and marker size.
    is_spatial = plot_type == "spatial"
    if is_spatial:
        obsm_key, x_col, y_col = 'spatial', 'x', 'y'
        title_prefix, width, height, marker_size = 'Spatial scatter', 1600, 700, 2
    else:  # UMAP
        obsm_key, x_col, y_col = 'X_umap', 'UMAP1', 'UMAP2'
        title_prefix, width, height, marker_size = 'UMAP embedding', 1400, 1200, 3

    # Only the first two embedding dimensions are drawn. Slicing (instead of
    # naming exactly two columns up front) keeps the function usable when the
    # embedding carries extra components.
    df = pd.DataFrame(
        adata_to_plot.obsm[obsm_key],
        index=adata_to_plot.obs_names
    ).iloc[:, :2].copy()
    df.columns = [x_col, y_col]

    # Original cluster labels as strings
    df[cluster_key] = adata_to_plot.obs[cluster_key].astype(str)

    # If a keyword is provided, keep matching labels and collapse the rest to "Other"
    if keyword:
        matches_keyword = df[cluster_key].str.contains(keyword, regex=False)
        df['plot_label'] = df[cluster_key].where(matches_keyword, 'Other')
        color_key = 'plot_label'
        legend_title = f"Highlighted: {keyword}"
    else:
        color_key = cluster_key
        legend_title = cluster_key

    # 20-color discrete palette, used for both layouts so the same number of
    # groups stays distinguishable in the spatial and the UMAP view.
    tab20 = [mpl.colors.rgb2hex(c) for c in plt.get_cmap('tab20').colors]

    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        color=color_key,
        # The title names the column actually used for coloring.
        title=f'{title_prefix} — {cluster_key}',
        color_discrete_sequence=tab20,
        hover_name=df.index,
        width=width,
        height=height
    )
    fig.update_traces(marker=dict(size=marker_size, opacity=0.8))
    if is_spatial:
        # Flip y so the plot uses image-style orientation (origin at the top).
        fig.update_yaxes(autorange='reversed')

    # Adjust legend and layout: legend sits outside the plot area on the right,
    # with a wide right margin reserved for it.
    fig.update_layout(
        legend_title_text=legend_title,
        legend=dict(
            itemsizing='constant',
            traceorder='normal',
            bgcolor='rgba(255,255,255,0.5)',
            x=1.02,
            y=1
        ),
        margin=dict(l=20, r=200, t=50, b=20)
    )

    fig.show()

In [1]:
import pandas as pd
import scanpy as sc


# Dataset selection: annotation group and tissue region handled by this run.
MajorSet = "ImmuneAnn_SubTypes"
Tissue = "Region2"

# Input file layout: <collection>/<MajorSet>/<Tissue>.<MajorSet>.h5ad
xen_file = (
    "/Volumes/ProstateCancerEvoMain/dbs/Ongoing/Collection/MajorAnnGroups/"
    f"{MajorSet}/{Tissue}.{MajorSet}.h5ad"
)
adata_xen = sc.read_h5ad(xen_file)
adata_xen


AnnData object with n_obs × n_vars = 71932 × 5101
    obs: 'cell_id', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'region', 'z_level', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_10_genes', 'pct_counts_in_top_20_genes', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_150_genes', 'cell_type', 'leiden', 'major_cell_type_xenium_panel'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'cell_type_colors', 'dendrogram_leiden', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'rank_leiden', 'spatialdata_attrs', 'umap'
    obsm: 'X_pca', '

In [3]:
ann_file = f"/Volumes/ProstateCancerEvoMain/dbs/Ongoing/Collection/MajorAnnGroups/{MajorSet}/{MajorSet}.V1.xlsx"

# 1. Load the annotation sheet; the columns read below are 'cellName' and
#    'geneSymbolmore1' (a comma-separated list of gene symbols).
df = pd.read_excel(ann_file)

# 2. Split the comma-separated 'geneSymbolmore1' column into lists
df["gene_list"] = df["geneSymbolmore1"].str.split(",")

# 3. Explode the 'gene_list' column so each (cellName, gene) pair is its own row
df_long = df.explode("gene_list")

# 4. Trim whitespace from each gene symbol, in case there are stray spaces
df_long["gene_list"] = df_long["gene_list"].str.strip()

# 5. Keep only 'cellName' and the exploded gene symbol (renamed to 'geneSymbol')
df_long = df_long[["cellName", "gene_list"]].rename(columns={"gene_list": "geneSymbol"})

# 6. Drop rows without a usable symbol: NaN comes from empty cells in the sheet,
#    "" from trailing or doubled commas.
df_long = df_long[df_long["geneSymbol"].notna() & (df_long["geneSymbol"] != "")]

# 7. Collapse repeated (cellName, geneSymbol) pairs first, so a gene listed twice
#    under the same cell type is not mistaken for a gene shared between cell types
#    in step 9.
df_long = df_long.drop_duplicates()

# 8. Reset the index for a clean range index
markers = df_long.reset_index(drop=True)

# 9. Keep only genes that occur under exactly one cell type: keep=False flags
#    every occurrence of a repeated symbol, so shared genes are removed entirely.
dup_mask = markers["geneSymbol"].duplicated(keep=False)

markers = markers[~dup_mask].copy()


In [4]:
# import spatialdata
# import anndata
# import matplotlib.pyplot as plt
# import seaborn as sns
# import numpy as np
# import decoupler as dc


# # Query Omnipath and get PanglaoDB
# markers = dc.get_resource(
#         name="PanglaoDB",
#         organism='human',
#         license='academic'
# )

# markers

# # Filter by canonical_marker and human
# markers = markers[(markers['human'] == "True") & (markers['canonical_marker'] == "True")]
# markers = markers[~markers.duplicated(['cell_type', 'genesymbol'])]
# markers


# dc.run_ora(
#     mat=adata_xen,
#     net=markers,
#     source='cell_type',
#     target='genesymbol',
#     min_n=3,
#     verbose=True,
#     use_raw=False
# )

# adata_xen.obsm['ora_estimate']

# acts = dc.get_acts(adata_xen, obsm_key='ora_estimate')

# # We need to remove inf and set them to the maximum value observed for pvals=0
# acts_v = acts.X.ravel()
# max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
# acts.X[~np.isfinite(acts.X)] = max_e


# df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')

# res = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')

# n_ctypes = 10
# ctypes_dict = res.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
# ctypes_dict




In [5]:
import spatialdata
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import decoupler as dc

import pandas as pd
import glob

# Marker tables: every CSV in the scImmuCC extdata folder is read as-is; its
# column headers are used as cell-type names further down.
# sorted() makes the file order (and therefore the column order) reproducible,
# since glob returns paths in arbitrary order.
csv_files = sorted(glob.glob('/Users/ugursahin/Downloads/scImmuCC/extdata/*csv'))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No marker CSV files matched '/Users/ugursahin/Downloads/scImmuCC/extdata/*csv'"
    )

dfs = [pd.read_csv(path, index_col=None) for path in csv_files]

# Side-by-side concat; shorter tables are padded with NaN by the outer join.
combined_df = pd.concat(dfs, axis=1, join='outer')

# Long format: one row per (row_index, column header, cell value)
stacked = combined_df.stack().reset_index()
stacked.columns = ['row_index', 'cell_type', 'genesymbol']

# Explicitly drop the NaN padding (whether stack() already removed it depends on
# the pandas version), then collapse repeated (cell_type, genesymbol) pairs so a
# gene listed twice under one cell type is not treated as shared between types.
markers = (
    stacked[["cell_type", "genesymbol"]]
    .dropna(subset=["genesymbol"])
    .drop_duplicates()
)

# Keep only genes that occur under exactly one cell type: keep=False flags every
# occurrence of a repeated symbol, so shared genes are removed entirely.
dup_mask = markers["genesymbol"].duplicated(keep=False)

markers = markers[~dup_mask].copy()


# Over-representation analysis of the marker sets on adata_xen; the result is
# read back from adata_xen.obsm['ora_estimate'] below.
dc.run_ora(
    mat=adata_xen,
    net=markers,
    source='cell_type',
    target='genesymbol',
    min_n=3,
    verbose=True,
    use_raw=False
)

acts = dc.get_acts(adata_xen, obsm_key='ora_estimate')

# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e

# Rank cell-type activities per Leiden cluster against all other clusters.
# Computed once; `df` is kept as a second name for the same table because the
# cell previously bound the identical result to both `df` and `res`.
res = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df = res

# For each cluster, the first n_ctypes names in the ranking, in ranked order.
n_ctypes = 10
ctypes_dict = res.groupby('group').head(n_ctypes).groupby('group')['names'].apply(list).to_dict()
ctypes_dict

  _set_context_ca_bundle_path(ca_bundle_path)
  from .autonotebook import tqdm as notebook_tqdm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Running ora on mat with 11677 samples and 5101 targets for 30 sources.


100%|██████████| 11677/11677 [00:09<00:00, 1187.09it/s]


{'0': ['Neutrophil',
  'Immune',
  'ILC',
  'Treg',
  'CD4_Central_memory',
  'ILC3',
  'ILC1',
  'CD8_Central_memory',
  'CD4_naive',
  'NK_bright'],
 '1': ['Neutrophil',
  'Immune',
  'ILC',
  'Treg',
  'NonClassical_Mono',
  'CD8_Cytotoxic',
  'Mast',
  'NK_bright',
  'Non_Immune',
  'Monocyte'],
 '10': ['Bcell',
  'ILC',
  'Tfh',
  'Th17',
  'Monocyte',
  'Macrophage_M1',
  'CD4_Effector_memory',
  'CD8_Central_memory',
  'NK_bright',
  'Treg'],
 '11': ['DC',
  'Th2',
  'CD4_naive',
  'Macrophage_M1',
  'Th17',
  'Monocyte',
  'Classical_Mono',
  'ILC',
  'Treg',
  'Neutrophil'],
 '12': ['Mast',
  'ILC1',
  'Neutrophil',
  'ILC',
  'Immune',
  'Treg',
  'Classical_Mono',
  'NK_dim',
  'CD8_Cytotoxic',
  'ILC3'],
 '13': ['CD8_Cytotoxic',
  'Non_Immune',
  'CD4_Effector_memory',
  'CD8_Central_memory',
  'NK_bright',
  'Mast',
  'Th17',
  'CD8_Effector_memory',
  'ILC1',
  'Monocyte'],
 '14': ['Immune',
  'Neutrophil',
  'Treg',
  'ILC',
  'Macrophage_M2',
  'NK_bright',
  'CD8_Cytot

In [6]:
# ctypes_dict is keyed by string cluster labels, so an integer-typed 'leiden'
# column is cast to str to make the lookups against those keys match.
if pd.api.types.is_integer_dtype(adata_xen.obs['leiden'].dtype):
    adata_xen.obs['leiden'] = adata_xen.obs['leiden'].astype(str)

# 4) Create a new column "assigned_celltype" by mapping each cell's cluster to ctypes_dict[cluster][0]
def pick_first_celltype(cluster_label):
    """Return the first entry of ctypes_dict[cluster_label], or None for an unknown cluster."""
    try:
        ranked_celltypes = ctypes_dict[cluster_label]
    except KeyError:
        # Cluster has no entry in ctypes_dict
        return None
    return ranked_celltypes[0]

# Label every cell with the first cell type that pick_first_celltype returns for
# its Leiden cluster (None when the cluster is unknown).
# Series.map on a categorical column can itself return a categorical (when every
# cluster maps to a distinct label). A categorical rejects values outside its
# categories, so the result is cast to object to keep the column open to labels
# that are assigned manually afterwards.
adata_xen.obs['assigned_celltype'] = (
    adata_xen.obs['leiden'].map(pick_first_celltype).astype(object)
)


In [7]:
# basal_epithelial_cell_of_prostatic_duct = list(markers[markers["cellName"]=="basal_cell_of_prostate_epithelium"]["geneSymbol"].values)

# present = [g for g in basal_epithelial_cell_of_prostatic_duct if g in list(adata_xen.var_names)]

# # Create the dotplot
# sc.pl.dotplot(
#     adata_xen,
#     present,
#     groupby="leiden",
#     standard_scale='var',   # scale each gene to [0,1] across groups
#     swap_axes=True,
#     dendrogram=False,
#     dot_max=1,
#     figsize=(12, 6),
# )


In [4]:
# UMAP colored by the 'cell_type' column already stored in the loaded object.
plot_adata(adata_xen, cluster_key="cell_type", plot_type="umap")

In [9]:
# UMAP colored by the automatically assigned cell type (before manual edits).
plot_adata(adata_xen, cluster_key="assigned_celltype", plot_type="umap")

In [10]:
# Display the per-cluster ranked cell-type lists for reference while curating labels.
ctypes_dict

{'0': ['Neutrophil',
  'Immune',
  'ILC',
  'Treg',
  'CD4_Central_memory',
  'ILC3',
  'ILC1',
  'CD8_Central_memory',
  'CD4_naive',
  'NK_bright'],
 '1': ['Neutrophil',
  'Immune',
  'ILC',
  'Treg',
  'NonClassical_Mono',
  'CD8_Cytotoxic',
  'Mast',
  'NK_bright',
  'Non_Immune',
  'Monocyte'],
 '10': ['Bcell',
  'ILC',
  'Tfh',
  'Th17',
  'Monocyte',
  'Macrophage_M1',
  'CD4_Effector_memory',
  'CD8_Central_memory',
  'NK_bright',
  'Treg'],
 '11': ['DC',
  'Th2',
  'CD4_naive',
  'Macrophage_M1',
  'Th17',
  'Monocyte',
  'Classical_Mono',
  'ILC',
  'Treg',
  'Neutrophil'],
 '12': ['Mast',
  'ILC1',
  'Neutrophil',
  'ILC',
  'Immune',
  'Treg',
  'Classical_Mono',
  'NK_dim',
  'CD8_Cytotoxic',
  'ILC3'],
 '13': ['CD8_Cytotoxic',
  'Non_Immune',
  'CD4_Effector_memory',
  'CD8_Central_memory',
  'NK_bright',
  'Mast',
  'Th17',
  'CD8_Effector_memory',
  'ILC1',
  'Monocyte'],
 '14': ['Immune',
  'Neutrophil',
  'Treg',
  'ILC',
  'Macrophage_M2',
  'NK_bright',
  'CD8_Cytot

In [17]:
# Manual curation of adata_xen.obs['assigned_celltype'], per tissue region.
# Each region maps a Leiden cluster id (string) to the label written for every
# cell of that cluster. Clusters not listed keep their current label.
manual_celltype_overrides = {
    "Region1": {
        "0": "CD4 T Cell",
        "2": "CD8 T Cell",       # disabled alternative: 'NK cells'
        "13": "Unknown",
        "14": "Unknown",
        "5": "Macrophage_M2",
        "7": "CD8 T Cell",
        "9": "CD4 T Cell",       # disabled alternative: 'T regulatory cells'
    },
    "Region2": {
        "0": "CD8 T Cell",
        "9": "CD8 T Cell",
        "8": "CD4 T Cell",
        "3": "CD4 T Cell",
        "2": "CD8 and CD4 T Cell",
        "17": "Unknown",
        "16": "Unknown",
        "20": "Unknown",
        "14": "Unknown",
        "15": "Unknown",
        "18": "Unknown",
        "19": "Unknown",
        "21": "Unknown",
        "1": "Macrophage_M2",    # disabled alternative: 'Macrophages'
        "11": "Monocyte",
    },
    "Region3": {
        # Label spelled exactly as in Region2 (single spaces); the earlier
        # double-space variant would have produced a separate category.
        "1": "CD8 and CD4 T Cell",
        "4": "CD8 and CD4 T Cell",
        "2": "CD8 and CD4 T Cell",
        "11": "CD8 and CD4 T Cell",
        "7": "Macrophage_M2",
        "0": "Macrophage_M2",
    },
    "Region4": {
        "0": "Macrophage_M2",
        "12": "Macrophage_M2",
        "6": "Macrophage_M2",
        "14": "Unknown",
        "13": "Unknown",
        "17": "Unknown",
        "18": "Unknown",
        "1": "Neutrophil",
        "16": "Unknown",
        "9": "Unknown",
        "2": "CD8 T Cell",
        "4": "CD8 T Cell",
        "15": "CD8 T Cell",
        "19": "CD8 T Cell",
        "5": "CD4 T Cell",
    },
}

# Regions without an entry are left untouched.
region_overrides = manual_celltype_overrides.get(Tissue, {})

if region_overrides:
    # Compare on plain strings so the match does not depend on how 'leiden' is stored.
    leiden_labels = adata_xen.obs["leiden"].astype(str)

    # A categorical column rejects values outside its categories; an object
    # column accepts any of the labels above.
    adata_xen.obs["assigned_celltype"] = adata_xen.obs["assigned_celltype"].astype(object)

    for cluster_id, celltype in region_overrides.items():
        adata_xen.obs.loc[leiden_labels == cluster_id, "assigned_celltype"] = celltype









In [18]:
# UMAP colored by the cell type after the manual per-region edits.
plot_adata(adata_xen, cluster_key="assigned_celltype", plot_type="umap")

In [19]:
# Save the annotated object as <outpath>/<Tissue>/<input file name with .AnnDC.h5ad suffix>.
outpath = "/Volumes/ProstateCancerEvoMain/dbs/Ongoing/Collection/MajorAnnGroups/Results"
adata_xen.write_h5ad(
    "/".join([outpath, Tissue, xen_file.rsplit('/', 1)[-1].replace('.h5ad', '.AnnDC.h5ad')])
)

In [20]:
# Echo the path the annotated file was written to.
"/".join([outpath, Tissue, xen_file.rsplit('/', 1)[-1].replace('.h5ad', '.AnnDC.h5ad')])

'/Volumes/ProstateCancerEvoMain/dbs/Ongoing/Collection/MajorAnnGroups/Results/Region1/Region1.ImmuneAnn_SubTypes.AnnDC.h5ad'