In [1]:
import pandas as pd
import numpy as np
import random
import os
from Celloc.common import read_data,sample_single_cells,plot_results_bulk_ST_by_spot
import matplotlib.pyplot as plt
import scanpy as sc
import sys
from Celloc import CELLOC
import anndata as ad

Load data

In [2]:
# --- Input files for the DCIS1 example -------------------------------------
# Single-cell reference and spatial data are both stored as .h5ad files.
# cell_type_path and coordinates_path are left as None for this dataset.
scRNA_path = "./data/DCIS1/sc_RNA_DCIS1.h5ad"
cell_type_path = None
st_path = "./data/DCIS1/spatial_DCIS1.h5ad"
coordinates_path = None

# Per-spot estimate of how many cells of each type are present (CSV).
cell_type_numbers_estimation_path = './data/DCIS1/cell_type_number_setimate.csv'

# --- Output ------------------------------------------------------------------
# Folder that receives the mapping table and the plots produced further down.
output_folder = "./Celloc_output/"

In [3]:
# Alternative input: melanoma example stored as plain-text files.
# Uncomment the lines below to overwrite the DCIS1 paths defined in the previous cell
# (output_folder is not redefined here).
# scRNA_path="./data/melanoma/melanoma_scRNA_GEP.txt"
# cell_type_path="./data/melanoma/melanoma_scRNA_celllabels.txt"
# st_path="./data/melanoma/melanoma_STdata_slide1_GEP.txt"
# coordinates_path="./data/melanoma/melanoma_STdata_slide1_coordinates.txt"
# cell_type_numbers_estimation_path='./data/melanoma/cell_type_number_setimate.csv'

In [4]:
# Name of the cell-type annotation column; handed to read_data below.
celltype_col = 'CellType'

# Load every input with a single call. read_data returns, in this order:
#   sc_adata                       - single-cell RNA-seq data
#   spatial_adata                  - spatial transcriptomics data
#   cell_type_number_eachspot_data - estimated number of each cell type in each spot
#   coordinates_data               - coordinates of each spot
(
    sc_adata,
    spatial_adata,
    cell_type_number_eachspot_data,
    coordinates_data,
) = read_data(
    scRNA_path,
    cell_type_path,
    cell_type_numbers_estimation_path,
    st_path,
    coordinates_path,
    celltype_col,
)

# Cell -> cell-type lookup table built from the single-cell annotations.
# The first 5 characters are dropped from both the cell IDs (index) and the
# cell-type labels (presumably a prefix added while loading - TODO confirm).
cell_type = pd.DataFrame(sc_adata.obs['CellType'])
cell_type.index = [str(cell_id)[5:] for cell_id in cell_type.index]
cell_type['CellType'] = [str(label)[5:] for label in cell_type['CellType']]

# Total estimated number of cells per cell type, summed over all spots.
# NOTE: the column is called 'Fraction' but holds these summed counts.
cell_type_numbers_int = pd.DataFrame(cell_type_number_eachspot_data.sum(axis=0))
cell_type_numbers_int.columns = ['Fraction']

# Total estimated number of cells per spot, summed over all cell types and
# cast to int.
cell_number_to_node_assignment = np.array(
    cell_type_number_eachspot_data.sum(axis=1)
).astype(int)

In [5]:
# Drop genes that are not detected in at least one cell (single-cell data)
# or at least one spot (spatial data).
sc.pp.filter_genes(sc_adata, min_cells=1)
sc.pp.filter_genes(spatial_adata, min_cells=1)

# Genes present in both datasets. The list is sorted so the gene order is the
# same on every run: iterating a set of strings can yield a different order in
# each Python process, which would make the column order (and anything that
# depends on it) change between kernel restarts despite the fixed seed below.
intersect_genes = sorted(set(sc_adata.var_names) & set(spatial_adata.var_names))
if not intersect_genes:
    raise ValueError(
        "The single-cell and spatial datasets share no genes; "
        "check that both use the same gene identifiers."
    )

# Restrict both datasets to the shared genes. .copy() turns the resulting
# views into independent AnnData objects, so the in-place normalisation done
# later does not have to convert a view implicitly (which emits a warning).
sc_adata = sc_adata[:, intersect_genes].copy()
spatial_adata = spatial_adata[:, intersect_genes].copy()

# Seed passed to the sampling step.
seed = 1

# Sampling strategy handed to sample_single_cells; with "duplicates" the same
# cell can be drawn more than once.
sampling_method = "duplicates"

# Draw single cells according to the estimated total number of each cell type.
sc_adata_sampled = sample_single_cells(sc_adata, cell_type_numbers_int, sampling_method, seed)

In [6]:
# Put both datasets on the same scale: scale every observation (sampled cell
# or spot) to a total of 1e6 counts, then log1p-transform.
# The return values are discarded, i.e. the calls are used for their in-place
# effect - running this cell a second time would transform the already
# transformed data again.
for dataset in (sc_adata_sampled, spatial_adata):
    sc.pp.normalize_total(dataset, target_sum=10**6)
    sc.pp.log1p(dataset)

# PCA on the sampled single cells only, keeping the first 50 components.
sc.tl.pca(sc_adata_sampled, n_comps=50)

# Summaries of both objects after preprocessing.
print(sc_adata_sampled)
print(spatial_adata)

  view_to_actual(adata)
  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 7402 × 19840
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.1', 'seurat_clusters', 'cell_type', 'CellType'
    var: 'n_cells'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
AnnData object with n_obs × n_vars = 1567 × 19840
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'
    uns: 'spatial', 'log1p'
    obsm: 'spatial'


In [7]:
# Align the sampled single cells with the spatial spots using CELLOC.
#   b_init - per-spot cell numbers (cell_number_to_node_assignment) used as the
#            initial guess for how many cells each spot holds
#   task   - "mapping": map each single cell to a spot
pi_sc_sp = CELLOC.paired_align(
    sc_adata_sampled,
    spatial_adata,
    b_init=cell_number_to_node_assignment,
    task="mapping",
)

# Wrap the result in a labelled table: one row per sampled cell, one column
# per spot.
sc_sp_map_df = pd.DataFrame(
    data=pi_sc_sp,
    index=sc_adata_sampled.obs_names,
    columns=spatial_adata.obs_names,
)

cuda
expression_term: 3.569, space_term: 0.128, density_term: 0.223
expression_term: 3.566, space_term: 0.128, density_term: 0.222
expression_term: 3.563, space_term: 0.128, density_term: 0.220
expression_term: 3.560, space_term: 0.128, density_term: 0.218
expression_term: 3.557, space_term: 0.128, density_term: 0.216
expression_term: 3.555, space_term: 0.128, density_term: 0.215
expression_term: 3.552, space_term: 0.128, density_term: 0.214
expression_term: 3.550, space_term: 0.128, density_term: 0.212
expression_term: 3.548, space_term: 0.128, density_term: 0.211
expression_term: 3.546, space_term: 0.128, density_term: 0.210
expression_term: 3.544, space_term: 0.128, density_term: 0.209
expression_term: 3.542, space_term: 0.128, density_term: 0.208
expression_term: 3.540, space_term: 0.128, density_term: 0.208
expression_term: 3.538, space_term: 0.128, density_term: 0.207
expression_term: 3.537, space_term: 0.128, density_term: 0.206
expression_term: 3.535, space_term: 0.128, density

In [8]:
# Label the mapping matrix with names that have their first 5 characters
# removed. The labels are derived from the AnnData objects (the same names, in
# the same order, that sc_sp_map_df was built with) instead of from the
# DataFrame's current labels, so re-running this cell cannot strip another
# 5 characters from names that were already shortened.
sc_sp_map_df.columns = [str(spot)[5:] for spot in spatial_adata.obs_names]
sc_sp_map_df.index = [str(cell)[5:] for cell in sc_adata_sampled.obs_names]

# For every spot (column), keep the cells with the highest mapping values.
# The number of cells kept for the i-th spot is cell_number_to_node_assignment[i].
result_dict = {
    spot: sc_sp_map_df[spot].nlargest(n_cells).index
    for spot, n_cells in zip(sc_sp_map_df.columns.values, cell_number_to_node_assignment)
}

# Long table with one row per (spot, selected cell):
#   'Predict' - spot name, 'Values' - cell name.
# explode() gives each selected cell its own row; a spot with no selected
# cells yields a single row whose 'Values' entry is NaN.
max_column_names = pd.DataFrame(
    {
        'Predict': list(result_dict.keys()),
        'Values': [list(cells) for cells in result_dict.values()],
    }
).explode('Values')

# Cell names become the (string) index; the NaN placeholder rows of empty
# spots turn into the string "nan" and are dropped.
max_column_names['Values'] = max_column_names['Values'].astype(str)
max_column_names = max_column_names.set_index('Values')
max_column_names = max_column_names[max_column_names.index != "nan"]

# Attach the cell type of every selected cell with one label-based lookup.
# (A cell missing from cell_type raises KeyError, as a per-row lookup would.)
# .assign returns a new frame, so nothing is written into a filtered slice.
max_column_names = max_column_names.assign(
    CellType=cell_type.loc[max_column_names.index, 'CellType'].to_numpy()
)

# Coordinates of the spot each cell was assigned to, in row order.
predict_coordinates = coordinates_data.reindex(max_column_names['Predict'])
max_column_names['predict_x'] = predict_coordinates['X'].values
max_column_names['predict_y'] = predict_coordinates['Y'].values

# Show the final assignment table.
print(max_column_names)

                             Predict    CellType  predict_x  predict_y
Values                                                                
CTTTGCGCACAACTGT  AAACAAGTATCTCCCA-1  Nornal epi       2939       1190
CTTTGCGCACAACTGT  AAACAAGTATCTCCCA-1  Nornal epi       2939       1190
CTTTGCGCACAACTGT  AAACAAGTATCTCCCA-1  Nornal epi       2939       1190
CTTTGCGGTAAGGGAA  AAACAAGTATCTCCCA-1  Nornal epi       2939       1190
CTTTGCGGTAAGGGAA  AAACAAGTATCTCCCA-1  Nornal epi       2939       1190
...                              ...         ...        ...        ...
TTCTTAGTCGGAGGTA  TTGTTTGTGTAAATTC-1     Myeloid        690       2693
TTCTTAGTCGGAGGTA  TTGTTTGTGTAAATTC-1     Myeloid        690       2693
TTCTTAGTCGGAGGTA  TTGTTTGTGTAAATTC-1     Myeloid        690       2693
AGAGCTTGTCTAGGTT  TTGTTTGTGTAAATTC-1     Myeloid        690       2693
AGAGCTTGTCTAGGTT  TTGTTTGTGTAAATTC-1     Myeloid        690       2693

[7402 rows x 4 columns]


In [None]:
# Write the cell-to-spot assignment table to <output_folder>/mapping_results.csv.
# The folder is created first so that a fresh checkout does not fail on a
# missing directory; os.path.join gets the folder and file name as separate
# parts, so the result no longer depends on output_folder ending with "/".
os.makedirs(output_folder, exist_ok=True)
assigned_locations_path = os.path.join(output_folder, "mapping_results.csv")
max_column_names.to_csv(assigned_locations_path)

: 

In [None]:
# Plot the mapping result per spot; dir_out is the output folder and
# output_prefix the prefix handed to the plotting helper.
plot_results_bulk_ST_by_spot(
    assigned_locations=max_column_names,
    coordinates_data=coordinates_data,
    dir_out=output_folder,
    output_prefix="Celloc_mapping",
)

: 