In [1]:
import pandas as pd
import scanpy as sc
import numpy as np
import warnings
import datatable as dt
from Celloc.common import estimate_cell_number_RNA_reads,read_file

In [None]:
# Passed to estimate_cell_number_RNA_reads in a later cell
# (presumably the expected mean number of cells per spot -- confirm against Celloc).
mean_cell_numbers=5

DataDir="./data/heart_data"
decon_results_path=DataDir+"/decon_result.txt"

# Load the deconvolution result as a DataFrame; the reader is picked from the file suffix.
if decon_results_path.endswith("txt"):
    # Tab-separated text: the first column becomes the row index
    decon_results = pd.read_csv(decon_results_path,sep='\t',index_col=0)
elif decon_results_path.endswith("h5ad"):
    # AnnData file: rebuild a DataFrame from X, labelled with obs_names / var_names
    coGCNresults= sc.read_h5ad(decon_results_path)
    decon_matrix = coGCNresults.X
    # X may be stored as a sparse matrix, which the DataFrame constructor does not accept
    if hasattr(decon_matrix, "toarray"):
        decon_matrix = decon_matrix.toarray()
    decon_results=pd.DataFrame(decon_matrix,index=coGCNresults.obs_names,columns=coGCNresults.var_names)
else:
    # Fail here instead of with a NameError on decon_results further down
    raise ValueError(
        "Unsupported deconvolution result file (expected a path ending in 'txt' or 'h5ad'): "
        + decon_results_path
    )

# Remove duplicate columns, keeping the first occurrence of each label.
# Selecting with .loc[:, np.unique(...)] alone returns *every* column matching a label,
# so duplicates have to be dropped explicitly first.
decon_results = decon_results.loc[:, ~decon_results.columns.duplicated()]

# Order the (now unique) columns by sorted label, as np.unique returns them
decon_results = decon_results.loc[:, np.unique(decon_results.columns)]

# Normalize each row so that its values sum to 1
decon_results = decon_results.div(decon_results.sum(axis=1), axis=0)

# Rows whose sum was 0 become NaN after the division; set them to 0
decon_results = decon_results.fillna(0)

print(decon_results)

Calculate the cell abundance of each spot separately

In [None]:
# Location of the spatial transcriptomics count table
st_path = f"{DataDir}/spatial_count.txt"

# Load it with the reader imported from Celloc.common
st_data = read_file(st_path)

# Prefix every column label with SPOT_ and every row label with GENE_
st_data.columns = [f"SPOT_{col!s}" for col in st_data.columns]
st_data.index = [f"GENE_{idx!s}" for idx in st_data.index]

# Per-spot cell numbers estimated from the count table and mean_cell_numbers
cell_number_to_node_assignment = estimate_cell_number_RNA_reads(st_data, mean_cell_numbers)

# Grand total over all spots
number_of_cells = np.sum(cell_number_to_node_assignment)

# Show the per-spot estimates, then the smallest one
print(cell_number_to_node_assignment, cell_number_to_node_assignment.min(), sep="\n")

In [None]:
# Plain ndarray view of the normalized deconvolution table
arr_decon_results = decon_results.to_numpy()

# Scale every row of the table by that row's estimated cell number
# (the 1-D estimates are reshaped to a column so they broadcast across the columns).
# The result is a fractional count for each (row, column) pair.
celltype_number_to_node_assignment = np.multiply(
    arr_decon_results,
    cell_number_to_node_assignment.reshape(-1, 1),
)
print(celltype_number_to_node_assignment)

In [None]:
# Column totals of the fractional count matrix, rounded up and stored as integers
celltype_number = np.ceil(celltype_number_to_node_assignment.sum(axis=0)).astype(int)

# One-column table of those totals, labelled with the deconvolution column names
cell_type_numbers = pd.DataFrame(celltype_number, index=decon_results.columns, columns=["cell_number"])

# Compare the rounded-up total with the total estimated from RNA reads
print(cell_type_numbers.iloc[:, 0].sum())
print(number_of_cells)

# Put the whole difference between the two totals on the first row of the table,
# so that the table's column sums to number_of_cells
first_cell_type = cell_type_numbers.index[0]
count_column = cell_type_numbers.columns[0]
cell_type_numbers.loc[first_cell_type, count_column] += number_of_cells - cell_type_numbers.iloc[:, 0].sum()

# Number of cell types, followed by the adjusted table
print(len(celltype_number))
print(cell_type_numbers)

In [None]:
def adjust_matrix(X, row_sums, col_sums, n_passes=5):
    """Round ``X`` to integers and greedily top it up towards target margins.

    ``X`` is rounded element-wise with ``np.round`` and cast to ``int``. Then, for
    up to ``n_passes`` passes, every (row, column) pair whose row *and* column are
    both still below their target receives ``min(row deficit, column deficit)``
    extra counts, visiting pairs in ascending row-major order.

    Only deficits are corrected: a row or column whose rounded sum already meets or
    exceeds its target is never reduced, so the returned margins are not guaranteed
    to equal ``row_sums`` / ``col_sums``.

    Parameters
    ----------
    X : array-like, 2-D
        Matrix to integerise. Not modified.
    row_sums : array-like, 1-D
        Target sum for each row of ``X``. Not modified.
    col_sums : array-like, 1-D
        Target sum for each column of ``X``. Not modified.
    n_passes : int, optional
        Maximum number of correction passes (default 5). Passes stop early as soon
        as no row or no column has a remaining deficit, because further passes
        could not change the matrix.

    Returns
    -------
    numpy.ndarray
        Integer matrix with the same shape as ``X``.
    """
    # Plain arrays so that the positional indexing below works for any array-like input
    row_targets = np.asarray(row_sums)
    col_targets = np.asarray(col_sums)

    # Round the matrix X to the nearest integer
    Z = np.round(X).astype(int)

    for _ in range(n_passes):
        # Remaining deficit (target minus current sum) per row and per column;
        # the subtraction allocates new arrays, so the targets stay untouched.
        row_diff = row_targets - np.sum(Z, axis=1)
        col_diff = col_targets - np.sum(Z, axis=0)

        # Deficits only shrink during a pass, so rows/columns that start a pass
        # without a deficit can be skipped entirely.
        deficit_rows = np.flatnonzero(row_diff > 0)
        deficit_cols = np.flatnonzero(col_diff > 0)
        if deficit_rows.size == 0 or deficit_cols.size == 0:
            # Nothing can be adjusted in this or any later pass
            break

        for i in deficit_rows:
            for j in deficit_cols:
                if row_diff[i] <= 0:
                    # Row i is filled; no further column can receive from it
                    break
                if col_diff[j] > 0:
                    adjustment = min(row_diff[i], col_diff[j])
                    Z[i, j] += adjustment
                    row_diff[i] -= adjustment
                    col_diff[j] -= adjustment

    return Z

# Integerise the fractional count matrix, using the per-spot cell numbers as row
# targets and the rounded-up per-type totals as column targets
Y = adjust_matrix(
    X=celltype_number_to_node_assignment,
    row_sums=cell_number_to_node_assignment,
    col_sums=celltype_number,
)

In [None]:
# Row and column totals of the integer matrix Y
Y_row_sums = Y.sum(axis=1)
Y_col_sums = Y.sum(axis=0)

# Positions of the rows of Y whose total is zero (np.where returns a 1-tuple of indices)
zero_index = np.where(Y_row_sums == 0)

# For those rows, take the matching rows of the fractional count matrix and
# binarise them: entries above 0.3 become 1, everything else 0
replacement = (celltype_number_to_node_assignment[zero_index] > 0.3).astype(int)

# Overwrite the zero-sum rows of Y with the binarised rows
Y[zero_index[0]] = replacement

# From here on celltype_number_to_node_assignment holds the integer matrix (a copy of Y)
celltype_number_to_node_assignment = Y.copy()

# Column totals of the final integer matrix
print(celltype_number_to_node_assignment.sum(axis=0))

In [None]:
# Wrap the integer count matrix in a DataFrame that reuses the row and column
# labels of decon_results
celltype_number_to_node_assignment_df = pd.DataFrame(
    data=celltype_number_to_node_assignment,
    index=decon_results.index,
    columns=decon_results.columns,
)

# Show the table with integer dtype
print(celltype_number_to_node_assignment_df.astype(int))

In [None]:
# Write the integer per-spot cell-type count table as CSV into DataDir.
# NOTE(review): the file name is spelled "setimate"; it is left unchanged here because
# other code may read the file under this exact name -- confirm before renaming.
celltype_number_to_node_assignment_df.astype(int).to_csv(DataDir+"/cell_type_number_setimate.csv")