In [1]:
import pandas as pd
import numpy as np
import random
import os
from Celloc.common import read_data,sample_single_cells,plot_results_bulk_ST_by_spot,estimate_cell_number_RNA_reads
import matplotlib.pyplot as plt
import scanpy as sc
import sys
from Celloc import CELLOC
import anndata as ad

Load data

In [2]:
# --- Input / output configuration (DCIS1 dataset, AnnData .h5ad files) ---
# The two *_path values set to None are passed straight through to read_data
# in a later cell.
scRNA_path = "./data/DCIS1/sc_RNA_DCIS1.h5ad"
st_path = "./data/DCIS1/spatial_DCIS1.h5ad"
cell_type_path = None
coordinates_path = None

# Passed to estimate_cell_number_RNA_reads as the second argument.
mean_cell_numbers = 5

# Directory that receives the CSV and plot produced at the end of the notebook.
output_folder = "./Celloc_output/"

In [3]:
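# Alternative inputs: melanoma dataset in plain-text format. Uncomment these lines to use them in place of the DCIS1 paths above.
# scRNA_path="./data/melanoma/melanoma_scRNA_GEP.txt"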
# cell_type_path="./data/melanoma/melanoma_scRNA_celllabels.txt"
# st_path="./data/melanoma/melanoma_STdata_slide1_GEP.txt"
# coordinates_path="./data/melanoma/melanoma_STdata_slide1_coordinates.txt"
# cell_type_numbers_estimation_path='./data/melanoma/cell_type_number_setimate.csv'

In [4]:
# Name of the cell-type column, forwarded to read_data.
celltype_col = 'CellType'

# Load both modalities. read_data returns three objects:
#   sc_adata         - single-cell RNA-seq data
#   spatial_adata    - spatial transcriptomics data
#   coordinates_data - coordinates of each spot
sc_adata, spatial_adata, coordinates_data = read_data(
    scRNA_path,
    cell_type_path,
    st_path,
    coordinates_path,
    celltype_col,
)

# One-column table of cell-type labels, keyed by cell name.
# Both the cell names and the labels have their first 5 characters dropped
# (presumably a prefix added by read_data - TODO confirm against Celloc.common).
cell_type = sc_adata.obs[['CellType']].copy()
cell_type.index = [str(cell_name)[5:] for cell_name in cell_type.index]
cell_type['CellType'] = [str(label)[5:] for label in cell_type['CellType']]

# Per-spot cell-number estimate, computed from the transposed spatial
# expression table and the configured mean number of cells per spot.
cell_number_to_node_assignment = estimate_cell_number_RNA_reads(
    spatial_adata.to_df().T,
    int(mean_cell_numbers),
)

In [5]:
# Drop genes that are not detected in any cell of the single-cell data
# (min_cells=1 keeps genes expressed in at least one cell).
sc.pp.filter_genes(sc_adata, min_cells=1)

# Same filter for the spatial data: keep genes detected in at least one spot.
sc.pp.filter_genes(spatial_adata, min_cells=1)

# Genes present in both datasets. The result is sorted because iterating a
# set of strings depends on the interpreter's hash seed; a fixed order keeps
# the column order (and everything computed from it) identical between runs.
intersect_genes = sorted(set(sc_adata.var_names) & set(spatial_adata.var_names))
if not intersect_genes:
    raise ValueError(
        "No genes are shared between the single-cell and spatial data; "
        "check that both use the same gene identifiers."
    )

# Restrict both datasets to the shared genes. Slicing an AnnData returns a
# view; .copy() materialises it so the in-place normalisation that follows
# operates on a real object instead of triggering an implicit view-to-actual
# conversion (and its warning).
sc_adata = sc_adata[:, intersect_genes].copy()
spatial_adata = spatial_adata[:, intersect_genes].copy()

In [6]:
# Apply the same preprocessing to both modalities, single-cell first:
#   1. scale every observation to a total of 1e6 counts,
#   2. log1p-transform the scaled values.
# Both scanpy calls modify the object they are given.
for modality_adata in (sc_adata, spatial_adata):
    sc.pp.normalize_total(modality_adata, target_sum=10**6)
    sc.pp.log1p(modality_adata)

# PCA on the single-cell data only, keeping 50 components.
sc.tl.pca(sc_adata, n_comps=50)

# Summaries of both objects, one after the other.
print(sc_adata, spatial_adata, sep="\n")

  view_to_actual(adata)


AnnData object with n_obs × n_vars = 3587 × 19840
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.1', 'seurat_clusters', 'cell_type', 'CellType'
    var: 'n_cells'
    uns: 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
AnnData object with n_obs × n_vars = 1567 × 19840
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'
    uns: 'spatial', 'log1p'
    obsm: 'spatial'


In [7]:
# Align the single-cell data with the spatial data using CELLOC.paired_align.
#   b_init - the per-spot cell-number estimate computed earlier
#   task   - "location_recovery": recover a location for each cell
pi_sc_sp = CELLOC.paired_align(
    sc_adata,
    spatial_adata,
    b_init=cell_number_to_node_assignment,
    task="location_recovery",
)

# Wrap the result in a labelled table:
# rows = cell names (single-cell data), columns = spot names (spatial data).
sc_sp_map_df = pd.DataFrame(
    data=pi_sc_sp,
    index=sc_adata.obs_names,
    columns=spatial_adata.obs_names,
)

cuda
expression_term: 2.821, space_term: 0.284, density_term: 0.243
expression_term: 1.557, space_term: 0.293, density_term: 0.423
expression_term: 1.493, space_term: 0.292, density_term: 0.456
expression_term: 1.476, space_term: 0.290, density_term: 0.454
expression_term: 1.469, space_term: 0.287, density_term: 0.450
expression_term: 1.461, space_term: 0.281, density_term: 0.448
expression_term: 1.454, space_term: 0.271, density_term: 0.447
expression_term: 1.449, space_term: 0.257, density_term: 0.449
expression_term: 1.445, space_term: 0.244, density_term: 0.450
expression_term: 1.442, space_term: 0.234, density_term: 0.452
expression_term: 1.439, space_term: 0.227, density_term: 0.454
expression_term: 1.436, space_term: 0.223, density_term: 0.455
expression_term: 1.434, space_term: 0.220, density_term: 0.456
expression_term: 1.432, space_term: 0.217, density_term: 0.457
expression_term: 1.429, space_term: 0.215, density_term: 0.457
expression_term: 1.427, space_term: 0.214, density

In [8]:
# Label the mapping with names that have their first 5 characters removed.
# The labels are rebuilt from the AnnData obs_names (the values the frame was
# created with) rather than from the frame's current labels, so re-running
# this cell does not strip another 5 characters.
sc_sp_map_df.columns = [str(col)[5:] for col in spatial_adata.obs_names]
sc_sp_map_df.index = [str(idx)[5:] for idx in sc_adata.obs_names]

# For every cell (row), take the spot (column) holding the largest value.
# idxmax returns a Series; to_frame() names its single column 0, which is
# renamed to 'Predict'.
max_column_names = sc_sp_map_df.idxmax(axis=1).to_frame()
max_column_names = max_column_names.rename(columns={0: 'Predict'})

# Attach each cell's type with one label-based lookup. This replaces a
# per-row loop that used positional `[0]` on a string-labelled Series, which
# pandas 2.x deprecates. A cell name missing from cell_type still raises
# KeyError.
max_column_names = max_column_names.assign(
    CellType=cell_type.loc[max_column_names.index, 'CellType'].to_numpy()
)

# Look up the coordinates of each predicted spot, in cell order.
# reindex leaves NaN for any spot name that is absent from coordinates_data.
predict_coordinates = coordinates_data.reindex(max_column_names['Predict'])

# .values drops the spot-name index so the coordinates are assigned by position.
max_column_names['predict_x'] = predict_coordinates['X'].values
max_column_names['predict_y'] = predict_coordinates['Y'].values

print(max_column_names)

                             Predict CellType  predict_x  predict_y
AAACCTGAGATCCCGC  GCACAAGTGTCGGAAG-1     NK/T        850       2306
AAACCTGAGGGATGGG  TCTGCCAGAAACTGCA-1    Tumor       2465       1873
AAACCTGCATAACCTG  ACGCGAAGTCAGACGA-1    Tumor       2875       2474
AAACCTGGTTGTTTGG  GCTAACTGAAGTCTGA-1    Tumor       3351       1612
AAACCTGTCACAATGC  TAATACTAGAACAGAC-1    Tumor       3920       1946
...                              ...      ...        ...        ...
TTTGTCACAGCCACCA  TCTGCCAGAAACTGCA-1    Tumor       2465       1873
TTTGTCAGTAAGAGAG  ATTCACTGATGTTGGA-1    Tumor       3242       2209
TTTGTCAGTAAGGATT  CCTCCCGACAATCCCT-1  Myeloid        797       2336
TTTGTCATCACAGGCC  GCTAACTGAAGTCTGA-1    Tumor       3351       1612
TTTGTCATCCGTCAAA  ATTTAACTCGTATTAC-1    Tumor       1054       2667

[3587 rows x 4 columns]


In [None]:
# Write the per-cell predictions to <output_folder>/location_recovery_results.csv.
# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs(output_folder, exist_ok=True)

# Pass the folder and file name as separate components so the path is correct
# whether or not output_folder ends with a separator.
assigned_locations_path = os.path.join(output_folder, "location_recovery_results.csv")
max_column_names.to_csv(assigned_locations_path)

In [None]:
# Plot the assigned cell locations; dir_out and output_prefix are handed to
# the Celloc plotting helper.
plot_results_bulk_ST_by_spot(
    assigned_locations=max_column_names,
    coordinates_data=coordinates_data,
    dir_out=output_folder,
    output_prefix="Celloc_location",
)