In [1]:
import scanpy as sc
from pathlib import Path
import pandas as pd
import os
import numpy as np
import anndata as ad
from tqdm import tqdm

# from jax.config import config
# config.update("jax_enable_x64", True)

from joblib import Parallel, delayed
import squidpy as sq
import scanpy as sc
from anndata import AnnData
from scipy.sparse import csr_matrix

%load_ext autoreload
%autoreload 2
%load_ext lab_black

path_read = Path("/lustre/groups/ml01/workspace/giovanni.palla/moscot/data/DataUpload")
path_write = Path("/lustre/groups/ml01/workspace/giovanni.palla/moscot/processed_data")

In [2]:
def check_write(dataset: str):
    dataset_read = dataset
    dataset_write = dataset_read.lower()

    sc_count = pd.read_csv(
        path_read / dataset_read / "scRNA_count.txt", sep="\t", header=0, index_col=0
    ).T

    spatial_count = pd.read_csv(
        path_read / dataset_read / "Spatial_count.txt", sep="\t"
    )

    insitu_count = pd.read_csv(path_read / dataset_read / "Insitu_count.txt", sep="\t")

    test_list = np.load(path_read / dataset_read / "test_list.npy", allow_pickle=True)
    train_list = np.load(path_read / dataset_read / "train_list.npy", allow_pickle=True)

    adata_sc = AnnData(csr_matrix(sc_count.to_numpy()))
    adata_sc.var_names = sc_count.columns.tolist()

    adata_sp = AnnData(csr_matrix(spatial_count.to_numpy()))
    adata_sp.var_names = spatial_count.columns.tolist()

    adata_insitu = AnnData(csr_matrix(insitu_count.to_numpy()))
    adata_insitu.var_names = insitu_count.columns.tolist()

    for i in range(len(test_list)):
        assert set(test_list[i]).issubset(set(spatial_count.columns))
        assert set(train_list[i]).issubset(set(spatial_count.columns))
        adata_sp.var[f"test_{i}"] = np.in1d(adata_sp.var_names, test_list[i])
        adata_sp.var[f"train_{i}"] = np.in1d(adata_sp.var_names, train_list[i])

    adata_sc = process_normalize(adata_sc)
    adata_sp = process_normalize(adata_sp)
    adata_insitu = process_normalize(adata_insitu)

    try:
        sc.pp.pca(adata_sc)
        sc.pp.pca(adata_sp)
        sc.pp.pca(adata_insitu)
        locations = pd.read_csv(
            path_read / dataset_read / "Locations.txt",
            sep="\t",
        )
        adata_sp.obsm["spatial"] = locations.to_numpy()
        adata_insitu.obsm["spatial"] = locations.to_numpy()
    except ValueError:
        return

    adata_sc.write(path_write / f"{dataset_write}_sc.h5ad")
    adata_sp.write(path_write / f"{dataset_write}_sp.h5ad")
    adata_insitu.write(path_write / f"{dataset_write}_insitu.h5ad")

    del adata_sc
    del adata_sp
    del adata_insitu
    del sc_count
    del spatial_count
    del insitu_count

    print(dataset_write)

    return


def process_normalize(adata: AnnData):
    sc.pp.filter_cells(adata, min_counts=10)
    sc.pp.filter_genes(adata, min_counts=2)
    adata.layers["counts"] = adata.X.copy()
    if adata.shape[1] > 2000:
        sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000)
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)

    return adata.copy()

In [3]:
# dataset = "Dataset36"
# check_write(dataset)

In [4]:
# os.listdir(path_read)[:3]

In [5]:
result = Parallel(n_jobs=6)(delayed(check_write)(dataset) for dataset in tqdm(os.listdir(path_read)))

  np.log1p(X, out=X)
  np.log1p(X, out=X)


dataset45
dataset2




dataset36
dataset4




dataset14
dataset3




dataset23
dataset10


100%|██████████| 45/45 [10:37<00:00, 14.17s/it]


AssertionError: 

In [10]:
dataset = "dataset3"
adata_sc = ad.read(path_write / f"{dataset.lower()}_sc.h5ad")
adata_sp = ad.read(path_write / f"{dataset.lower()}_sp.h5ad")

In [14]:
adata_sp.shape

(3585, 249)

In [16]:
adata_sp[:, adata_sp.var.test_0].shape

(3585, 17)

In [17]:
adata_sp[:, adata_sp.var.train_0].shape

(3585, 153)

In [18]:
153 + 17

170

In [19]:
adata_sc.shape

(8596, 16384)

In [13]:
adata_sp.var

Unnamed: 0,test_0,train_0,test_1,train_1,test_2,train_2,test_3,train_3,test_4,train_4,...,train_5,test_6,train_6,test_7,train_7,test_8,train_8,test_9,train_9,n_counts
Tal1,False,True,True,False,False,True,False,True,False,True,...,True,False,True,False,True,False,True,False,True,45815.0
Dmbx1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,36514.0
Emx2,False,True,False,True,False,True,False,True,False,True,...,True,False,True,True,False,False,True,False,True,51325.0
Uncx,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,42842.0
Paxip1,False,True,False,True,False,True,False,True,False,True,...,True,False,True,False,True,False,True,True,False,60061.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ngef,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,40229.0
tiam1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,76686.0
slc1a2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,77793.0
gja1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,26414.0
