## This script is used to annotate cell types for 8 μm bins based on a marker gene list and to save the results as an H5AD file.

In [1]:
import warnings
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sp
import pandas as pd
import anndata as ad
import scanpy as sc
import squidpy as sq
import seaborn as sns
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
sc.settings.verbosity = 3

import json
import pickle
import os
import argparse

from collections import defaultdict
from skimage.morphology import dilation, square
from multiprocessing import Pool, Manager
from tqdm import tqdm
import matplotlib.colors as mcolors

In [2]:
def compute_relative_abundance(adata, dict_use):
    """Compute one scoring weight per cell type from its markers' expression.

    For every marker found in ``adata.var_names`` the mean of its strictly
    positive values is taken (0 when the gene has no positive value). These
    means are summed per cell type and the weight is
    ``1 / log10(total + 1e-10)``.

    Args:
        adata: AnnData-like object exposing ``var_names`` and supporting
            ``adata[:, gene].X`` (dense or scipy-sparse).
        dict_use: Mapping of cell type -> iterable of marker gene names.

    Returns:
        dict mapping each cell type to its weight.
    """
    relative_abundances = {}
    for cell_type, markers in tqdm(dict_use.items()):
        avg_expressions = []
        for marker in markers:
            if marker not in adata.var_names:
                continue
            # Slice the gene column once; every slice builds a new AnnData view.
            column = adata[:, marker].X
            # Densify the single column so dense and sparse inputs share one
            # code path (one value per observation, so this stays small).
            if sp.issparse(column):
                column = column.toarray()
            column = np.asarray(column)
            expressed_values = column[column > 0]
            if expressed_values.size > 0:
                avg_expressions.append(expressed_values.mean())
            else:
                avg_expressions.append(0)
        # The 1e-10 offset keeps log10 finite when no marker contributes.
        # NOTE(review): the weight is negative whenever the summed mean
        # expression is below 1, because log10 is negative there.
        relative_abundance = np.log10(sum(avg_expressions) + 1e-10)
        relative_abundances[cell_type] = 1 / relative_abundance
    return relative_abundances


def annotate_cells_stage(marker_dict_use, adata, cells_to_annotate=None):
    """Label each cell/bin with the cell type that has the best marker score.

    A cell's score for a cell type is the number of that type's markers with
    a value > 0 in the cell, multiplied by the type's weight from
    ``compute_relative_abundance``. The highest-scoring type wins; cells whose
    best score is not positive are labelled "Others".

    Args:
        marker_dict_use: Mapping of cell type -> iterable of marker gene names.
        adata: AnnData-like object holding the expression matrix in ``.X``.
        cells_to_annotate: Optional sequence of observation names to annotate;
            defaults to every observation in ``adata``.

    Returns:
        Tuple ``(annotations, unannotated)``: a dict mapping cell name ->
        label, and the list of cell names labelled "Others".
    """
    # Weights always come from the full ``adata``, even for a subset of cells.
    WCT = compute_relative_abundance(adata, marker_dict_use)

    if cells_to_annotate is None:
        cells_to_annotate = adata.obs_names
        data_submatrix = adata.X
    else:
        data_submatrix = adata[cells_to_annotate, :].X

    cell_types = list(marker_dict_use.keys())
    all_scores = np.zeros((len(cells_to_annotate), len(cell_types)))
    print(all_scores.shape)

    if not cell_types:
        # With nothing to score, np.max over the empty axis would raise;
        # every cell is simply unannotated.
        labels = np.full(len(cells_to_annotate), "Others")
        return dict(zip(cells_to_annotate, labels)), list(cells_to_annotate)

    for idx, (cell_type, markers) in enumerate(marker_dict_use.items()):
        valid_markers_indices = [
            adata.var_names.get_loc(marker)
            for marker in markers
            if marker in adata.var_names
        ]
        marker_matrix = data_submatrix[:, valid_markers_indices]
        presence_matrix = (marker_matrix > 0).astype(int)
        # Sparse inputs return an (n, 1) np.matrix from sum(); flatten it
        # explicitly instead of relying on assignment broadcasting.
        marker_counts = np.asarray(presence_matrix.sum(axis=1)).ravel()
        all_scores[:, idx] = marker_counts * WCT[cell_type]

    max_scores = np.max(all_scores, axis=1)
    max_score_indices = np.argmax(all_scores, axis=1)
    annotations = np.array(
        [
            cell_types[idx] if score > 0 else "Others"
            for idx, score in zip(max_score_indices, max_scores)
        ]
    )
    unannotated = [cells_to_annotate[i] for i in np.where(annotations == "Others")[0]]
    return dict(zip(cells_to_annotate, annotations)), unannotated


In [3]:
# Load the marker table: a JSON object mapping cell type -> marker genes.
with open("./cell_markers.json", "r") as handle:
    markers = json.load(handle)

# Build the lookup in one pass, dropping duplicate genes within each cell type.
# A defaultdict is used so an unknown cell type resolves to an empty list.
marker_dict = defaultdict(list)
for cell_type, gene_names in markers.items():
    marker_dict[cell_type] = list(set(gene_names))


In [None]:
# Per-sample pipeline: load the 8 um bins, QC-filter, normalise, annotate, save.
# The sample sheet must provide the columns 'sample_id', 'path' and 'group'.
sample_list = pd.read_csv("HD-OV 100.csv", index_col=False)
adata_list = []

for index, row in sample_list.iterrows():
    sample = row['sample_id']
    file_path = row['path']
    group = row['group']

    adata = sc.read_h5ad(f"{file_path}/outs/adata_8um.h5ad")
    adata.obs['in_tissue'] = adata.obs['in_tissue'].astype(float)
    adata.obs['array_row'] = adata.obs['array_row'].astype(float)
    adata.obs['array_col'] = adata.obs['array_col'].astype(float)
    adata.obsm['spatial'] = adata.obsm['spatial'].astype(float)

    # Record where each bin came from: the merge step reads obs['sample'].
    # Existing columns are left untouched so stored metadata is not overwritten.
    if 'sample' not in adata.obs:
        adata.obs['sample'] = sample
    if 'group' not in adata.obs:
        adata.obs['group'] = group

    # QC on the object that was just loaded (the previous name `adata_8um`
    # was never defined); keep bins with > 3 detected genes and > 5 counts.
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    keep = (adata.obs['n_genes_by_counts'] > 3) & (adata.obs['total_counts'] > 5)
    adata = adata[keep].copy()

    # Keep the raw counts before X is normalised and log-transformed in place.
    adata.layers['counts'] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    annotations, unannotated_cells = annotate_cells_stage(marker_dict, adata)
    # The Series is indexed by observation name, so assignment aligns on obs_names.
    adata.obs["annotations"] = pd.Series(annotations).astype("category")
    adata.write_h5ad(f"{file_path}/outs/adata_anno.h5ad")
    adata_list.append(adata)


In [5]:
# Stack all annotated samples along the observation axis.
# NOTE(review): per the anndata docs, merge='same' keeps only the non-obs
# annotations that are identical in every sample — confirm this is intended.
adata_merged = ad.concat(adata_list, axis=0, merge='same', pairwise=True)
# One empty placeholder entry per sample id; requires an obs['sample'] column.
adata_merged.uns['spatial'] = {s: {} for s in adata_merged.obs['sample'].unique()}
adata_merged.write_h5ad("./integrated_adata.h5ad")
adata_merged.obs.to_csv("./integrated_adata_obs.csv")


In [None]:
# Keep only the observations annotated as HGSOC and save them separately.
# The mask must come from obs: indexing the AnnData with the bare string
# "subtype" would be read as an observation name, not a column.
# NOTE(review): assumes obs has a 'subtype' column; nothing in the visible
# code creates it — verify it is present in the input h5ad files.
HGSOC_adata = adata_merged[adata_merged.obs["subtype"] == "HGSOC"].copy()
HGSOC_adata.write_h5ad("./HGSOC_adata.h5ad")
