In [6]:
import os
import tarfile
import spatialdata as sd
import pandas as pd
import spatialdata_db
from spatialdata_io import xenium, visium, visium_hd

In [7]:
def decrompress_if_not_done(path, tar_gz_file):
    decompressed_folder_name = tar_gz_file[:-7]
    decompressed_folder_path = os.path.join(path, decompressed_folder_name)

    # Check if the decompressed folder already exists
    if os.path.exists(decompressed_folder_path):
        print(f"Decompressed folder already exists for: {tar_gz_file}")
    else:
        # Decompress the .tar.gz file
        print(f"Decompressing: {tar_gz_file}")
        try:
            with tarfile.open(os.path.join(path, tar_gz_file), "r:gz") as tar:
                tar.extractall(path=decompressed_folder_path)
            print(f"Decompressed: {tar_gz_file} -> {decompressed_folder_path}")
        except Exception as e:
            print(f"Error decompressing {tar_gz_file}: {e}")

In [3]:
# uids_to_add = ["10vek", "1000j", "1005m", "1012t", 
#  "101cw", "101in", "1025z", "103sm", "103te", 
#  "103uy", "104dm", "104pe", "105tt", "10fna"]

In [None]:
#TODO: go through problem files: 10vek, 10s7d
uids_to_add = ["10s7d"]
# uids_to_add = ["10vek"]

In [9]:
SDDB_KEY = 'sample'
DATA_PATH = '/lustre/groups/ml01/projects/2024_spatialdata_db/data'
CSV_FILE_PATH = './metadata_10x.csv'

df = pd.read_csv(CSV_FILE_PATH, sep='\t')

df = df[df['uid'].isin(uids_to_add)]

In [10]:
for index, row in df.iterrows():
    uid = row['uid']
    assay = row['assay']
    meta = row.drop('uid') 

    folder_path = None
    for folder in os.listdir(DATA_PATH):
        if folder.startswith(str(uid)):
            folder_path = os.path.join(DATA_PATH, folder)
            break

    if folder_path is None:
        print(f"Folder for UID {uid} not found.")
        continue

    # Check if a .zarr file exists in the folder
    if any(file.endswith('.zarr') for file in os.listdir(folder_path)):
        print(f".zarr file already exists for UID {uid} in {folder_path}. Skipping...")
        continue
    
    raw_output = os.path.join(folder_path, 'raw_output')
    print(f"Processing {uid}")
    print(f"Path {raw_output}")

    # Iterate over files in the directory
    for file_name in os.listdir(raw_output):
        if file_name.endswith(".tar.gz"):
            decrompress_if_not_done(raw_output, file_name)

    sdata = None
    if assay == 'Xenium':
        sdata = xenium(raw_output)
    elif assay == 'Visium':
        sdata = visium(raw_output)
    elif assay == 'VisiumHD':
        sdata = visium_hd(raw_output)

    if sdata is None:
        print(f"Assay {assay} not supported.")
        continue

    zarr_filename = folder_path.split('/')[-1] + '.zarr'
    print(f"Writing to {zarr_filename}")
    sdata.write(os.path.join(folder_path, zarr_filename))

.zarr file already exists for UID 10vek in /lustre/groups/ml01/projects/2024_spatialdata_db/data/10vek__10X__VisiumHD__Human__lung__20240329__v3.0.0. Skipping...
Processing 10s7d
Path /lustre/groups/ml01/projects/2024_spatialdata_db/data/10s7d__10X__VisiumHD__Mouse__intestine__20240325__v3.0.0/raw_output
Decompressed folder already exists for: Visium_HD_Mouse_Small_Intestine_square_008um_binned_outputs.tar.gz
Decompressed folder already exists for: Visium_HD_Mouse_Small_Intestine_spatial.tar.gz
Decompressed folder already exists for: Visium_HD_Mouse_Small_Intestine_square_002um_binned_outputs.tar.gz
Decompressed folder already exists for: Visium_HD_Mouse_Small_Intestine_square_016um_binned_outputs.tar.gz


ValueError: tuple.index(x): x not in tuple