# Baysor Segmentation
- Tested on Ubuntu. To install Baysor, follow the installation guide at https://kharchenkolab.github.io/Baysor/dev/installation/
- Make sure to adapt xenium.toml according to the structure of your data

## Xenium

In [1]:
import pandas as pd
import scanpy as sc

In [49]:
# Only the transcript table is needed; it is read directly from the points
# parquet file inside the .zarr directory.
# NOTE(review): this is an absolute, user-specific path — point it at your own data.
path_to_xenium_points = "/home/icb/michael.dammann/troutpy/notebooks/spatialdata_tutorials/test_data.zarr/points/transcripts/points.parquet"
data_frame = pd.read_parquet(path_to_xenium_points)

In [50]:
# Spatial window of the transcripts to export. By default it spans the full
# extent of the data; replace these values to export only a sub-region.
#
# Series.min()/.max() are vectorised and skip NaN, whereas the Python builtins
# iterate element by element and give an order-dependent result when NaN is
# present. float() turns the NumPy scalar into a plain Python float so that
# str() of the bounds (used in the output file name) is unchanged.
#min_qv = 20.0  # quality-value threshold; currently unused
min_x = float(data_frame["x"].min())
max_x = float(data_frame["x"].max())
min_y = float(data_frame["y"].min())
max_y = float(data_frame["y"].max())

In [51]:
# Drop transcripts without a gene label (missing feature_name).
data_frame = data_frame.dropna(subset=["feature_name"])

In [52]:
# Work on a copy so the frame loaded earlier is not modified in place.
df = data_frame.copy()

# A lazy (e.g. Dask) frame is materialised here; a pandas DataFrame is used as-is.
if not isinstance(df, pd.DataFrame):
    df = df.compute()

# Collect every cell id except the "UNASSIGNED" placeholder.
mask = df["cell_id"].ne("UNASSIGNED")
unique_ids = df.loc[mask, "cell_id"].unique()

# Number the sorted cell ids 0..n-1; "UNASSIGNED" is kept as the literal string,
# so the new column mixes integers and strings.
# NOTE(review): check against the Baysor docs how a prior-segmentation column
# treats the code 0 and the "UNASSIGNED" label — TODO confirm.
id_map = {cid: code for code, cid in enumerate(sorted(unique_ids))}
df["cell_id_encoded"] = df["cell_id"].apply(lambda cid: id_map.get(cid, "UNASSIGNED"))

data_frame = df

In [54]:
def process(data_frame, x_range=None, y_range=None):
    """Write the transcripts inside a rectangular x/y window to a CSV file.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Transcript table with at least the columns ``x`` and ``y``.
    x_range, y_range : tuple of (low, high), optional
        Inclusive bounds of the window. When omitted, the notebook-level
        ``min_x``/``max_x`` and ``min_y``/``max_y`` are used, which keeps
        ``process(data_frame)`` working exactly as before.

    Returns
    -------
    str
        Name of the CSV file written to the current working directory:
        ``X<low>-<high>_Y<low>-<high>_filtered_transcripts.csv``. This is the
        file to pass to ``baysor run``.

    Notes
    -----
    Only the spatial window is applied. No quality-value (``qv``) filter and
    no recoding of ``cell_id`` happen here; both were disabled on the
    assumption that Baysor handles them — TODO confirm. Rows whose ``x`` or
    ``y`` is NaN never satisfy the comparison and are therefore dropped.
    """
    # Fall back to the notebook-level bounds when no explicit window is given.
    x_lo, x_hi = (min_x, max_x) if x_range is None else x_range
    y_lo, y_hi = (min_y, max_y) if y_range is None else y_range

    # Series.between is inclusive on both ends, i.e. (s >= low) & (s <= high).
    in_window = data_frame["x"].between(x_lo, x_hi) & data_frame["y"].between(y_lo, y_hi)
    filtered_frame = data_frame[in_window]

    # The file name encodes the window so that different regions do not overwrite each other.
    out_name = "_".join([
        "X" + str(x_lo) + "-" + str(x_hi),
        "Y" + str(y_lo) + "-" + str(y_hi),
        "filtered_transcripts.csv",
    ])
    filtered_frame.to_csv(out_name, index=False, encoding="utf-8")
    return out_name


In [55]:
# Writes the filtered transcripts CSV into the current working directory.
process(data_frame)

## Running Baysor

Baysor needs to be installed (see the installation link above). Then run the following command in a terminal:
baysor run -c xenium.toml X5100.015625-5200.0_Y1912.515625-5200.0_filtered_transcripts.csv :cell_id_encoded

If `baysor` is not on your PATH, either add it to PATH or call it by its full path, for example:
/home/.../bin/baysor/bin/baysor run -c xenium.toml X5100.015625-5200.0_Y1912.515625-5200.0_filtered_transcripts.csv :cell_id_encoded

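Depending on where the TOML and CSV files are saved, their file names may need to be replaced by relative or absolute paths. The CSV file name encodes the x/y bounds of the exported transcripts, so it will differ for other datasets.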

In [58]:
# Load the segmentation result table.
# NOTE(review): presumably segmentation.csv is the output of the Baysor run
# above and sits in the working directory — adjust the path if it does not.
results_df = pd.read_csv("segmentation.csv")

In [59]:
# Plain-text preview; pandas abbreviates long frames to the first and last rows.
print(results_df)

                x          y          z   gene     cell_id  nucleus_distance  \
0       5108.5938  2544.7344  19.687500  Snrpn  hfigbpgi-1          0.468750   
1       5115.5156  2549.7344  16.234375  Snrpn  hfhpappo-1          2.109375   
2       5120.7344  2549.6406  16.937500  Snrpn  hfhpappo-1          0.968750   
3       5123.9062  2525.4531  15.546875  Snrpn  hfhfdcbd-1          0.000000   
4       5133.2344  2530.6094  18.359375  Snrpn  hfiofbjb-1          2.265625   
...           ...        ...        ...    ...         ...               ...   
353395  5128.7656  2544.9531  19.359375  Snrpn  hfhfnclc-1          0.000000   
353396  5131.1094  2536.3594  18.750000  Snrpn  hfinedai-1          0.812500   
353397  5147.8280  2516.2344  19.906250  Snrpn  hfgljmbb-1          0.328125   
353398  5170.9688  2537.3438  18.718750  Snrpn  hfgfaonh-1          1.093750   
353399  5177.8280  2501.0781  17.671875  Snrpn  UNASSIGNED          1.921875   

        overlaps_nucleus fov_name  is_g