To generate synthetic VisiumHD data from Xenium, please read and run all the cells below. Thanks!

## Download Xenium output from 10X website
Download the Xenium output bundle (the `*_outs.zip` file) for the sample you want to analyze.

1. Go to Xenium public datasets page:https://www.10xgenomics.com/datasets?query=&page=1&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000&refinementList%5Bproduct.name%5D%5B0%5D=In%20Situ%20Gene%20Expression&refinementList%5Bspecies%5D%5B0%5D=Human&refinementList%5BdiseaseStates%5D%5B0%5D=colorectal%20cancer

2. Select the sample to analyze, scroll down to the downloads section, and click "Batch download".


### Extract the downloaded Xenium output

In [1]:
import zipfile

# Location of the downloaded Xenium output bundle (ZIP archive)
zip_file_path = "/home/wangzhuo/data/Xenium_Human_Colorectal_Cancer/Xenium_V1_Human_Colorectal_Cancer_Addon_FFPE_outs.zip"

# Directory that receives the extracted files; change it to wherever you want them
extract_dir = "/home/wangzhuo/data/Xenium_Human_Colorectal_Cancer"

# Unpack every member of the archive into extract_dir
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

print(f"Extraction completed. Files are extracted to: {extract_dir}")

Extraction completed. Files are extracted to: /home/wangzhuo/data/Xenium_Human_Colorectal_Cancer


### Import Relevant Libraries

In [1]:
import geopandas as gpd # Geopandas for storing Shapely objects
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import scanpy as sc
import pandas as pd
from scipy import sparse
import anndata
import os
import gzip
import numpy as np
import re
import shapely
from shapely.geometry import Polygon, Point # Representing bins and cells as Shapely Polygons and Point objects
from shapely import wkt

### Load Cell & Transcripts Info

In [2]:
# Input tables from the Xenium output folder (both are gzip-compressed CSV files)
transcripts_path = "/home/wangzhuo/data/Xenium_Human_Colorectal_Cancer/transcripts.csv.gz"
cells_path = "/home/wangzhuo/data/Xenium_Human_Colorectal_Cancer/cells.csv.gz"

# Transcript table: decompressed on the fly in text mode and parsed by pandas
with gzip.open(transcripts_path, mode='rt') as transcripts_handle:
    transcripts_df = pd.read_csv(transcripts_handle)

# Cell table: read the same way
with gzip.open(cells_path, mode='rt') as cells_handle:
    cells_df = pd.read_csv(cells_handle)


### Load Cell Boundary Info

In [3]:
import zarr

# Open the zipped Zarr store with the cell boundaries read-only and print its layout
cell_boundaries_store = '/home/wangzhuo/data/Xenium_Human_Colorectal_Cancer/cells.zarr.zip'
zarr_file = zarr.open(cell_boundaries_store, mode='r')
print(zarr_file.tree())

/
 ├── cell_id (388175, 2) uint32
 ├── cell_summary (388175, 8) float64
 ├── masks
 │   ├── 0 (20493, 51115) uint32
 │   ├── 1 (20493, 51115) uint32
 │   └── homogeneous_transform (4, 4) float32
 └── polygon_sets
     ├── 0
     │   ├── cell_index (388175,) uint32
     │   ├── method (388175,) uint32
     │   ├── num_vertices (388175,) int32
     │   └── vertices (388175, 50) float32
     └── 1
         ├── cell_index (388175,) uint32
         ├── method (388175,) uint32
         ├── num_vertices (388175,) int32
         └── vertices (388175, 50) float32


细胞核顶点数据	人 zarr_file['polygon_sets/0/vertices'][:]	小鼠 zarr_file['polygon_vertices'][0, :, :]

整个细胞顶点数据	人 zarr_file['polygon_sets/1/vertices'][:] 小鼠	zarr_file['polygon_vertices'][1, :, :]

In [4]:
# Read the full vertex array of polygon set 0 from the store.
# Polygon set 0 holds the nucleus boundaries; polygon set 1 holds the whole-cell boundaries.
# `file` is consumed later when the Shapely polygons are created.
file = zarr_file['polygon_sets/0/vertices'][:]

### Create folders to store synthetic data

For both `xenium_dir` and `enact_data_dir`, change the path in the cell below to the directory where you want the synthetic data and the ENACT inputs/results to be stored.

In [5]:
# Root directory for the synthetic data; update it to where you want the data saved
xenium_dir = "/home/wangzhuo/data/enact_synthetic_output/chunks"
# Root directory for the inputs and results of the ENACT pipeline
enact_data_dir = "/home/wangzhuo/data/enact_synthetic_output/chunks"

# Per-chunk transcript tables
transcripts_df_chunks_dir = os.path.join(xenium_dir, "transcripts_patches")
# Per-chunk results of the gene-to-bin assignment
output_dir = os.path.join(enact_data_dir, "bins_gdf")
# Per-chunk cell tables with their polygons
cells_df_chunks_dir = os.path.join(enact_data_dir, "cells_gdf")
# Per-chunk ground-truth tables
ground_truth_dir = os.path.join(xenium_dir, "ground_truth_nuclei")

# Create every directory up front; existing directories are left untouched
for directory in (
    xenium_dir,
    enact_data_dir,
    transcripts_df_chunks_dir,
    output_dir,
    cells_df_chunks_dir,
    ground_truth_dir,
):
    os.makedirs(directory, exist_ok=True)

### Generate Synthetic VisiumHD Dataset

#### Break transcripts df to patches (based on location)

Break transcripts df to patches of size 1000um x 1000um (larger patch size may result in memory issue)

In [6]:
# Side length of one square patch, in the units of x_location / y_location
# (the text above describes patches of 1000 um x 1000 um)
patch_size = 1000

# Integer patch coordinates of every transcript, plus a combined "<x>_<y>" label
transcripts_df['x_patch'] = transcripts_df['x_location'].floordiv(patch_size).astype(int)
transcripts_df['y_patch'] = transcripts_df['y_location'].floordiv(patch_size).astype(int)
transcripts_df["patch_id"] = transcripts_df["x_patch"].astype(str).str.cat(
    transcripts_df["y_patch"].astype(str), sep="_"
)

# Write one CSV per occupied patch into transcripts_df_chunks_dir
grouped = transcripts_df.groupby(['x_patch', 'y_patch'])
for (x_patch, y_patch), group in grouped:
    filename = f"patch_{x_patch}_{y_patch}.csv"
    output_loc = os.path.join(transcripts_df_chunks_dir, filename)
    group.to_csv(output_loc)

    print(f"Saved {filename}")

Saved patch_0_0.csv
Saved patch_0_1.csv
Saved patch_0_2.csv
Saved patch_0_3.csv
Saved patch_1_0.csv
Saved patch_1_1.csv
Saved patch_1_2.csv
Saved patch_1_3.csv
Saved patch_2_0.csv
Saved patch_2_1.csv
Saved patch_2_2.csv
Saved patch_2_3.csv
Saved patch_3_0.csv
Saved patch_3_1.csv
Saved patch_3_2.csv
Saved patch_3_3.csv
Saved patch_4_0.csv
Saved patch_4_1.csv
Saved patch_4_2.csv
Saved patch_4_3.csv
Saved patch_4_4.csv
Saved patch_5_0.csv
Saved patch_5_1.csv
Saved patch_5_2.csv
Saved patch_5_3.csv
Saved patch_5_4.csv
Saved patch_6_0.csv
Saved patch_6_1.csv
Saved patch_6_2.csv
Saved patch_6_3.csv
Saved patch_6_4.csv
Saved patch_7_0.csv
Saved patch_7_1.csv
Saved patch_7_2.csv
Saved patch_7_3.csv
Saved patch_7_4.csv
Saved patch_8_0.csv
Saved patch_8_1.csv
Saved patch_8_2.csv
Saved patch_8_3.csv
Saved patch_8_4.csv
Saved patch_9_0.csv
Saved patch_9_1.csv
Saved patch_9_2.csv
Saved patch_9_3.csv
Saved patch_9_4.csv
Saved patch_10_1.csv
Saved patch_10_2.csv
Saved patch_10_3.csv
Saved patch_10_4.

#### Generate synthetic visiumHD for each patch

Each patch is broken into bins of size 2um x 2um. The synthetic data contains transcript counts organized by bin_id. Each row contains the transcript counts for a unique bin. Bins with no transcripts are not included.

In addition to all the gene features, there are two additional columns that hold the row number and column number of the bin, and a column that contains the Shapely polygon representing the bin. The first column is the bin_id.

In [7]:
def generate_synthetic_VisiumHD_data(transcripts_df, bin_size=2, whole_cell=True, QScore20=True):
    """Aggregate transcripts into square bins and count genes per bin.

    Args:
        transcripts_df (pd.DataFrame): transcript table with the columns
            'x_location', 'y_location', 'qv', 'feature_name' and, when
            ``whole_cell`` is False, 'overlaps_nucleus'. It is not modified.
        bin_size (int | float): side length of a bin, in the same units as
            'x_location' / 'y_location'.
        whole_cell (bool): if False, keep only transcripts whose
            'overlaps_nucleus' value equals 1.
        QScore20 (bool): if True, keep only transcripts with 'qv' >= 20.

    Returns:
        pd.DataFrame: indexed by 'assigned_bin_id', one row per bin that
        contains at least one kept transcript. There is one count column per
        'feature_name', followed by the integer 'row' and 'column' bin indices.
    """
    filtered_df = transcripts_df
    # Only count transcripts in the nucleus
    if not whole_cell:
        filtered_df = filtered_df[filtered_df['overlaps_nucleus'] == 1]

    # Only count transcripts with QScore >= 20
    if QScore20:
        filtered_df = filtered_df[filtered_df['qv'] >= 20]

    # Copy once, after filtering, so the helper columns below never touch the
    # caller's frame and the full table is not duplicated needlessly.
    filtered_df = filtered_df.copy()

    # Bin index of each transcript: bin k covers the interval ((k-1)*bin_size, k*bin_size]
    filtered_df['row'] = np.ceil(filtered_df['y_location'] / bin_size).astype(int)
    filtered_df['column'] = np.ceil(filtered_df['x_location'] / bin_size).astype(int)

    # Build the ids straight from the two integer columns. A row-wise
    # DataFrame.apply is far slower on large tables and would format the
    # indices as floats if every column of the frame happened to be numeric.
    filtered_df['assigned_bin_id'] = [
        f"{bin_size}um_" + str(row_idx).zfill(5) + "_" + str(col_idx).zfill(5)
        for row_idx, col_idx in zip(filtered_df['row'], filtered_df['column'])
    ]

    bin_coordinates = (
        filtered_df[['assigned_bin_id', 'row', 'column']]
        .drop_duplicates()
        .set_index('assigned_bin_id')
    )
    # Bin x gene count matrix; gene/bin combinations that never occur become 0
    bin_gene_matrix = (
        filtered_df.groupby(['assigned_bin_id', 'feature_name']).size().unstack(fill_value=0)
    )
    bin_gene_matrix_with_coords = bin_gene_matrix.merge(
        bin_coordinates, left_index=True, right_index=True
    )

    return bin_gene_matrix_with_coords

In [8]:
# Extract row and column number from the bin_id
def extract_numbers(entry):
    """Parse a bin id and return the bin position derived from its indices.

    The id is expected to contain ``_<5 digits>_<5 digits>``; the first group
    is the row index and the second the column index. Each index ``k`` is
    mapped to ``k * 2 - 1`` (the factor 2 is hard-coded here).

    Args:
        entry (str): bin id, e.g. ``"2um_00003_00007"``.

    Returns:
        tuple: ``(column * 2 - 1, row * 2 - 1)`` as ints, or ``(None, None)``
        when ``entry`` does not contain the expected pattern.
    """
    match = re.search(r'_(\d{5})_(\d{5})', entry)
    if match is None:
        return None, None
    # int() already accepts leading zeros. Stripping them first turned an
    # all-zero index ("00000") into "" and made int() raise ValueError.
    row_index = int(match.group(1))
    column_index = int(match.group(2))
    return column_index * 2 - 1, row_index * 2 - 1

In [9]:
from tqdm import tqdm
def generate_bin_polys(bins_df, x_col, y_col, bin_size):
    """Build one axis-aligned square Shapely polygon per bin.

    Each square is centred on the row's (x_col, y_col) value and has a side
    length of ``bin_size``.

    Args:
        bins_df (pd.DataFrame): bins dataframe
        x_col (str): column with the bin centre x-coordinate
        y_col (str): column with the bin centre y-coordinate
        bin_size (int): side length of a bin, in the same units as the
            centre coordinates

    Returns:
        list: Shapely box polygons, one per row of ``bins_df``, in row order
    """
    half_width = bin_size / 2
    centre_x = bins_df[x_col]
    centre_y = bins_df[y_col]

    # (min_x, min_y, max_x, max_y) for every bin
    corners = zip(
        centre_x - half_width,
        centre_y - half_width,
        centre_x + half_width,
        centre_y + half_width,
    )

    # tqdm only reports progress; total is passed because zip has no length
    return [
        shapely.geometry.box(left, bottom, right, top)
        for left, bottom, right, top in tqdm(corners, total=len(bins_df))
    ]

下面这段代码的主要任务是：

处理多个转录本数据文件（chunks）。

对每个转录本数据文件进行筛选和分配到bin中。



调整bin的行列坐标。
将bin转换为Shapely多边形对象。

创建包含地理空间信息的GeoDataFrame。

将处理后的数据保存为CSV文件，并输出成功信息。
这段代码适用于需要对大量单细胞空间转录组数据进行批量处理和空间分析的场景。

In [10]:
# Loop through all the transcripts_df chunks and generate gene-to-bin assignments
patch_size = 1000
bin_size = 2
transcripts_df_chunks = os.listdir(transcripts_df_chunks_dir)
for chunk_fname in transcripts_df_chunks:
    # Only the per-patch CSV files are inputs. Skip the ".ipynb_checkpoints"
    # folder and any other stray entry instead of trying to parse it.
    if not chunk_fname.endswith(".csv"):
        continue
    output_loc = os.path.join(output_dir, chunk_fname)
    transcripts_df_chunk = pd.read_csv(os.path.join(transcripts_df_chunks_dir, chunk_fname))
    bin_df_chunk = generate_synthetic_VisiumHD_data(transcripts_df_chunk, bin_size, whole_cell=True, QScore20=True)
    # Convert the bin indices to bin-centre coordinates. For bin_size == 2 this
    # is index * 2 - 1. Integer arithmetic keeps the saved values integral, so
    # an odd bin_size would need a fractional half-width instead.
    bin_df_chunk['column'] = bin_df_chunk['column'] * bin_size - bin_size // 2
    bin_df_chunk['row'] = bin_df_chunk['row'] * bin_size - bin_size // 2
    # One square polygon per bin; it is written to the CSV as its text form
    bin_df_chunk['geometry'] = generate_bin_polys(bin_df_chunk, 'column', 'row', bin_size)
    bin_df_chunk.to_csv(output_loc)
    print(f"Successfully assigned transcripts to bins for {chunk_fname}")

100%|██████████| 29564/29564 [00:00<00:00, 115719.84it/s]


Successfully assigned transcripts to bins for patch_0_0.csv


100%|██████████| 116474/116474 [00:00<00:00, 134019.36it/s]


Successfully assigned transcripts to bins for patch_0_1.csv


100%|██████████| 114212/114212 [00:00<00:00, 129463.30it/s]


Successfully assigned transcripts to bins for patch_0_2.csv


100%|██████████| 48869/48869 [00:00<00:00, 138620.85it/s]


Successfully assigned transcripts to bins for patch_0_3.csv


100%|██████████| 86057/86057 [00:00<00:00, 132388.23it/s]


Successfully assigned transcripts to bins for patch_1_0.csv


100%|██████████| 207381/207381 [00:01<00:00, 134950.08it/s]


Successfully assigned transcripts to bins for patch_1_1.csv


100%|██████████| 163887/163887 [00:01<00:00, 133049.51it/s]


Successfully assigned transcripts to bins for patch_1_2.csv


100%|██████████| 74159/74159 [00:00<00:00, 121607.85it/s]


Successfully assigned transcripts to bins for patch_1_3.csv


100%|██████████| 164052/164052 [00:01<00:00, 126297.97it/s]


Successfully assigned transcripts to bins for patch_2_0.csv


100%|██████████| 185760/185760 [00:01<00:00, 132233.65it/s]


Successfully assigned transcripts to bins for patch_2_1.csv


100%|██████████| 174597/174597 [00:01<00:00, 131895.80it/s]


Successfully assigned transcripts to bins for patch_2_2.csv


100%|██████████| 99432/99432 [00:00<00:00, 116036.23it/s]


Successfully assigned transcripts to bins for patch_2_3.csv


100%|██████████| 150496/150496 [00:01<00:00, 123305.95it/s]


Successfully assigned transcripts to bins for patch_3_0.csv


100%|██████████| 190170/190170 [00:01<00:00, 133733.40it/s]


Successfully assigned transcripts to bins for patch_3_1.csv


100%|██████████| 212488/212488 [00:01<00:00, 127868.60it/s]


Successfully assigned transcripts to bins for patch_3_2.csv


100%|██████████| 129728/129728 [00:01<00:00, 123264.63it/s]


Successfully assigned transcripts to bins for patch_3_3.csv


100%|██████████| 168501/168501 [00:01<00:00, 133565.26it/s]


Successfully assigned transcripts to bins for patch_4_0.csv


100%|██████████| 208910/208910 [00:01<00:00, 123725.24it/s]


Successfully assigned transcripts to bins for patch_4_1.csv


100%|██████████| 173329/173329 [00:01<00:00, 136394.72it/s]


Successfully assigned transcripts to bins for patch_4_2.csv


100%|██████████| 80077/80077 [00:00<00:00, 131149.28it/s]


Successfully assigned transcripts to bins for patch_4_3.csv


100%|██████████| 4984/4984 [00:00<00:00, 138467.32it/s]


Successfully assigned transcripts to bins for patch_4_4.csv


100%|██████████| 177991/177991 [00:01<00:00, 124432.13it/s]


Successfully assigned transcripts to bins for patch_5_0.csv


100%|██████████| 171183/171183 [00:01<00:00, 135003.90it/s]


Successfully assigned transcripts to bins for patch_5_1.csv


100%|██████████| 182367/182367 [00:01<00:00, 129000.02it/s]


Successfully assigned transcripts to bins for patch_5_2.csv


100%|██████████| 122488/122488 [00:00<00:00, 141821.53it/s]


Successfully assigned transcripts to bins for patch_5_3.csv


100%|██████████| 51598/51598 [00:00<00:00, 122326.63it/s]


Successfully assigned transcripts to bins for patch_5_4.csv


100%|██████████| 164754/164754 [00:01<00:00, 134757.79it/s]


Successfully assigned transcripts to bins for patch_6_0.csv


100%|██████████| 171924/171924 [00:01<00:00, 127001.45it/s]


Successfully assigned transcripts to bins for patch_6_1.csv


100%|██████████| 174107/174107 [00:01<00:00, 125051.94it/s]


Successfully assigned transcripts to bins for patch_6_2.csv


100%|██████████| 194064/194064 [00:01<00:00, 124978.93it/s]


Successfully assigned transcripts to bins for patch_6_3.csv


100%|██████████| 61402/61402 [00:00<00:00, 139783.19it/s]


Successfully assigned transcripts to bins for patch_6_4.csv


100%|██████████| 185415/185415 [00:01<00:00, 128060.82it/s]


Successfully assigned transcripts to bins for patch_7_0.csv


100%|██████████| 228841/228841 [00:01<00:00, 137500.21it/s]


Successfully assigned transcripts to bins for patch_7_1.csv


100%|██████████| 186837/186837 [00:01<00:00, 127801.12it/s]


Successfully assigned transcripts to bins for patch_7_2.csv


100%|██████████| 182744/182744 [00:01<00:00, 128361.98it/s]


Successfully assigned transcripts to bins for patch_7_3.csv


100%|██████████| 64029/64029 [00:00<00:00, 140462.84it/s]


Successfully assigned transcripts to bins for patch_7_4.csv


100%|██████████| 49661/49661 [00:00<00:00, 139690.12it/s]


Successfully assigned transcripts to bins for patch_8_0.csv


100%|██████████| 193570/193570 [00:01<00:00, 133601.75it/s]


Successfully assigned transcripts to bins for patch_8_1.csv


100%|██████████| 200388/200388 [00:01<00:00, 123875.33it/s]


Successfully assigned transcripts to bins for patch_8_2.csv


100%|██████████| 175617/175617 [00:01<00:00, 132072.62it/s]


Successfully assigned transcripts to bins for patch_8_3.csv


100%|██████████| 16818/16818 [00:00<00:00, 139676.70it/s]


Successfully assigned transcripts to bins for patch_8_4.csv


100%|██████████| 2312/2312 [00:00<00:00, 138783.66it/s]

Successfully assigned transcripts to bins for patch_9_0.csv



100%|██████████| 67036/67036 [00:00<00:00, 141940.00it/s]


Successfully assigned transcripts to bins for patch_9_1.csv


100%|██████████| 191105/191105 [00:01<00:00, 131499.74it/s]


Successfully assigned transcripts to bins for patch_9_2.csv


100%|██████████| 225490/225490 [00:01<00:00, 122780.02it/s]


Successfully assigned transcripts to bins for patch_9_3.csv


100%|██████████| 12710/12710 [00:00<00:00, 141762.76it/s]


Successfully assigned transcripts to bins for patch_9_4.csv


100%|██████████| 8766/8766 [00:00<00:00, 134508.16it/s]


Successfully assigned transcripts to bins for patch_10_1.csv


100%|██████████| 80502/80502 [00:00<00:00, 141469.06it/s]


Successfully assigned transcripts to bins for patch_10_2.csv


100%|██████████| 121132/121132 [00:00<00:00, 129271.21it/s]


Successfully assigned transcripts to bins for patch_10_3.csv


100%|██████████| 2829/2829 [00:00<00:00, 128551.47it/s]

Successfully assigned transcripts to bins for patch_10_4.csv





### Generate cell_gdf

This section generates the cell_df patches required to run the ENACT pipeline. The main purpose is to create Shapely polygons that represent the cell outline.

为什么要转换为Shapely多边形对象


几何操作: Shapely多边形对象提供了丰富的几何操作功能，如计算面积、周长、交集、并集、包含关系等。这些功能在空间转录组数据分析中非常有用，例如计算多边形之间的重叠区域，或者判断一个细胞是否位于某个多边形内。
空间分析: 使用多边形对象可以更容易地进行空间分析和可视化。在单细胞空间转录组数据分析中，理解细胞在组织切片中的空间分布是非常重要的。

与其他库的兼容性: Shapely多边形对象与其他空间分析相关的库（如GeoPandas）兼容良好。通过将多边形对象添加到GeoDataFrame中，可以方便地进行进一步的空间分析和处理。

总结来说，这段代码通过将坐标数组转换为Shapely多边形对象，并将其添加到DataFrame中，为后续的空间分析和可视化提供了几何基础。

In [11]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

# Helper that turns the flattened vertex rows into Shapely polygons
def create_polygons(coords_array):
    """Convert each row of flattened vertex coordinates into a Shapely Polygon.

    Every row is reshaped to ``(n_vertices, 2)`` before it is handed to
    ``Polygon``, so the row is read as consecutive coordinate pairs
    (presumably (x, y) -- confirm against the data source).

    Args:
        coords_array: iterable of 1-D arrays, one per polygon.

    Returns:
        list: one Polygon per row, in input order.
    """
    return [Polygon(flat_vertices.reshape(-1, 2)) for flat_vertices in coords_array]

# One polygon per row of the vertex array loaded earlier, stored on cells_df.
# NOTE(review): this relies on the rows of `file` being in the same order as
# the rows of cells_df -- confirm.
polygons = create_polygons(file)
cells_df['polygons'] = polygons

# GeoDataFrame with the polygons as active geometry; centroid columns renamed
cell_gdf = gpd.GeoDataFrame(cells_df, geometry=cells_df['polygons'])
cell_gdf.rename(columns={'x_centroid': 'cell_x', 'y_centroid': 'cell_y'}, inplace=True)

# Same patch size as used for the transcripts
patch_size = 1000

# Patch coordinates of every cell, derived from its centroid
cell_gdf['x_patch'] = cell_gdf['cell_x'].floordiv(patch_size).astype(int)
cell_gdf['y_patch'] = cell_gdf['cell_y'].floordiv(patch_size).astype(int)
cell_gdf['patch_id'] = cell_gdf['x_patch'].astype(str).str.cat(
    cell_gdf['y_patch'].astype(str), sep="_"
)

# Write one CSV per patch, keeping only the columns listed here
columns_to_save = ['cell_id', 'cell_x', 'cell_y', 'geometry']
grouped = cell_gdf.groupby(['x_patch', 'y_patch'])
for (x_patch, y_patch), group in grouped:
    filename = f"patch_{x_patch}_{y_patch}.csv"
    output_loc = os.path.join(cells_df_chunks_dir, filename)

    group[columns_to_save].to_csv(output_loc, index=False)
    print(f"Saved {filename}")

Saved patch_0_0.csv
Saved patch_0_1.csv
Saved patch_0_2.csv
Saved patch_0_3.csv
Saved patch_1_0.csv
Saved patch_1_1.csv
Saved patch_1_2.csv
Saved patch_1_3.csv
Saved patch_2_0.csv
Saved patch_2_1.csv
Saved patch_2_2.csv
Saved patch_2_3.csv
Saved patch_3_0.csv
Saved patch_3_1.csv
Saved patch_3_2.csv
Saved patch_3_3.csv
Saved patch_4_0.csv
Saved patch_4_1.csv
Saved patch_4_2.csv
Saved patch_4_3.csv
Saved patch_4_4.csv
Saved patch_5_0.csv
Saved patch_5_1.csv
Saved patch_5_2.csv
Saved patch_5_3.csv
Saved patch_5_4.csv
Saved patch_6_0.csv
Saved patch_6_1.csv
Saved patch_6_2.csv
Saved patch_6_3.csv
Saved patch_6_4.csv
Saved patch_7_0.csv
Saved patch_7_1.csv
Saved patch_7_2.csv
Saved patch_7_3.csv
Saved patch_7_4.csv
Saved patch_8_0.csv
Saved patch_8_1.csv
Saved patch_8_2.csv
Saved patch_8_3.csv
Saved patch_8_4.csv
Saved patch_9_0.csv
Saved patch_9_1.csv
Saved patch_9_2.csv
Saved patch_9_3.csv
Saved patch_9_4.csv
Saved patch_10_1.csv
Saved patch_10_2.csv
Saved patch_10_3.csv
Saved patch_10_4.

### Generate Ground Truth

The following cells generate and save the ground truth of the synthetic VisiumHD data, which is used to evaluate bin-to-cell assignment methods. The ground truth dataframe has one row per cell holding that cell's transcript counts. Each column represents a gene feature (the gene feature name is also the column name).

#### Generate Cell-gene matrix for evaluation

In [12]:
def generate_ground_truth_table(transcripts_df, cells_df, whole_cell=True, QScore20=True, include_unassigned_transcript=False):
    """Build the cell-by-gene count table used as ground truth.

    Args:
        transcripts_df (pd.DataFrame): transcript table with the columns
            'cell_id', 'feature_name', 'qv' and, when ``whole_cell`` is False,
            'overlaps_nucleus'.
        cells_df (pd.DataFrame): cell table; only its 'cell_id' column is used.
            The result has exactly one row per row of this table, in the same
            order.
        whole_cell (bool): if False, count only transcripts whose
            'overlaps_nucleus' value equals 1.
        QScore20 (bool): if True, count only transcripts with 'qv' >= 20.
        include_unassigned_transcript (bool): if False, drop transcripts whose
            'cell_id' is the string 'UNASSIGNED' before counting.

    Returns:
        pd.DataFrame: indexed by 'cell_id', one integer count column per
        'feature_name' present in the kept transcripts. Cells without any
        kept transcript get a row of zeros.
    """
    filtered_df = transcripts_df

    # Only count transcripts in the nucleus
    if not whole_cell:
        filtered_df = filtered_df[filtered_df['overlaps_nucleus'] == 1]

    # Only count transcripts with QScore >= 20
    if QScore20:
        filtered_df = filtered_df[filtered_df['qv'] >= 20]

    # Only count transcripts that are assigned to specific cells
    if not include_unassigned_transcript:
        filtered_df = filtered_df[filtered_df['cell_id'] != 'UNASSIGNED']

    pivot_df = filtered_df.pivot_table(index='cell_id', columns='feature_name', aggfunc='size', fill_value=0)

    # The right join keeps exactly the cells listed in cells_df, in their order
    merged_df = pivot_df.merge(cells_df[['cell_id']], left_index=True, right_on='cell_id', how='right')
    gene_columns = [col for col in merged_df.columns if col != 'cell_id']
    merged_df = merged_df.set_index('cell_id')[gene_columns]

    # Cells absent from the pivot come out of the join as NaN, which also
    # upcasts every count to float. A cell with no transcripts has count 0,
    # so fill the gaps and restore integer counts.
    merged_df = merged_df.fillna(0).astype(int)

    return merged_df

In [13]:
bin_size = 2
cell_df_chunks = os.listdir(cells_df_chunks_dir)
for chunk_fname in cell_df_chunks:
    output_loc = os.path.join(ground_truth_dir, chunk_fname)
    # Skip chunks whose ground truth already exists, and the Jupyter checkpoint folder
    if os.path.exists(output_loc) or chunk_fname == ".ipynb_checkpoints":
        continue
    cell_df_chunk = pd.read_csv(os.path.join(cells_df_chunks_dir, chunk_fname))
    # The full transcript table is passed in; the function keeps only the
    # cells listed in this chunk.
    # NOTE(review): the filters here (nucleus-only, no QScore cut-off) differ
    # from those used for the bins (whole cell, QScore >= 20) -- confirm this
    # is intended.
    groundtruth_chunk = generate_ground_truth_table(
        transcripts_df,
        cell_df_chunk,
        whole_cell=False,
        QScore20=False,
        include_unassigned_transcript=False,
    )
    groundtruth_chunk.to_csv(output_loc)
    print(f"Successfully generated groundthuth for {chunk_fname}")

Successfully generated groundthuth for patch_0_0.csv
Successfully generated groundthuth for patch_0_1.csv
Successfully generated groundthuth for patch_0_2.csv
Successfully generated groundthuth for patch_0_3.csv
Successfully generated groundthuth for patch_1_0.csv
Successfully generated groundthuth for patch_1_1.csv
Successfully generated groundthuth for patch_1_2.csv
Successfully generated groundthuth for patch_1_3.csv
Successfully generated groundthuth for patch_2_0.csv
Successfully generated groundthuth for patch_2_1.csv
Successfully generated groundthuth for patch_2_2.csv
Successfully generated groundthuth for patch_2_3.csv
Successfully generated groundthuth for patch_3_0.csv
Successfully generated groundthuth for patch_3_1.csv
Successfully generated groundthuth for patch_3_2.csv
Successfully generated groundthuth for patch_3_3.csv
Successfully generated groundthuth for patch_4_0.csv
Successfully generated groundthuth for patch_4_1.csv
Successfully generated groundthuth for patch_4