Accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM4150378

In [None]:
import gzip
import os

import pandas as pd
from anndata import AnnData

from utils import download_binary_file
from scipy.sparse import csr_matrix

In [None]:
def download_srivatsan_2019_sciplex3(output_path: str) -> None:
    """
    Download Srivatsan et al. 2019 sciplex3 data from the hosting URLs.

    Three files attached to GEO sample GSM4150378 are requested: the UMI count
    matrix, the cell metadata (pData), and the gene annotations. Each file is
    handed to `download_binary_file` together with a destination path inside
    output_path that reuses the file's GEO name (the files are left as the
    `.gz` archives served by GEO; nothing is decompressed here).

    Args:
    ----
        output_path: Directory in which to store the downloaded files. It is
        created if it does not exist yet.

    Returns
    -------
        None. Files are downloaded to output_path.
    """
    base_url = (
        "https://www.ncbi.nlm.nih.gov/geo/download/"
        "?acc=GSM4150378&format=file&file="
    )
    filenames = (
        "GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz",
        "GSM4150378_sciPlex3_pData.txt.gz",
        "GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz",
    )

    # An empty output_path means "current directory"; os.makedirs("") would raise.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for filename in filenames:
        destination = os.path.join(output_path, filename)
        download_binary_file(base_url + filename, destination)


def read_srivatsan_2019_sciplex3(file_directory: str) -> pd.DataFrame:
    """
    Load the sciplex3 UMI count matrix of Srivatsan et al. 2019 from disk.

    Args:
    ----
        file_directory: Directory holding the downloaded Srivatsan et al. 2019
        files; the gzipped count matrix is looked up there by its GEO name.

    Returns
    -------
        A data frame with one row per entry of the tab-separated count file.
        The file has no header line; its three fields are exposed as the
        columns (i, j, x), i.e. (row, column, count) in triplet format.
    """
    matrix_name = "GSM4150378_sciPlex3_A549_MCF7_K562_screen_UMI.count.matrix.gz"
    matrix_path = os.path.join(file_directory, matrix_name)
    triplet_columns = ["i", "j", "x"]

    # Decompress on the fly and let pandas parse the stream directly.
    with gzip.open(matrix_path, "rb") as handle:
        return pd.read_csv(handle, sep="\t", header=None, names=triplet_columns)

In [None]:
download_path = "./srivatsan_2019_sciplex3"

os.makedirs(download_path, exist_ok=True)
download_srivatsan_2019_sciplex3(download_path)
df = read_srivatsan_2019_sciplex3(download_path)

# The Srivatsan count data is in a sparse triplet format represented
# by three columns 'i', 'j', and 'x'. 'i' refers to a row number, 'j' refers to
# a column number, and 'x' refers to a count value.
counts = df["x"]
# Indices were originally 1-base-indexed --> switch to 0-base-indexing
rows = df["i"] - 1
cols = df["j"] - 1

# Use the vectorized Series.max() rather than the builtin max(), which would
# walk every entry of the triplet table in a Python-level loop.
num_rows = int(rows.max()) + 1
num_cols = int(cols.max()) + 1

# This dataset is large enough that we need to store it as a scipy sparse matrix
# for preprocessing (>600 GB in RAM as a dense matrix)
count_matrix = csr_matrix(
    (counts.values, (rows.values, cols.values)),
    shape=(num_rows, num_cols),
)

# Switch matrix from gene rows and cell columns to cell rows and gene columns
count_matrix = count_matrix.T

cell_metadata = pd.read_csv(
    os.path.join(
        download_path,
        "GSM4150378_sciPlex3_pData.txt.gz",
    ),
    sep=" ",
)

gene_metadata = pd.read_csv(
    os.path.join(
        download_path,
        "GSM4150378_sciPlex3_A549_MCF7_K562_screen_gene.annotations.txt.gz",
    ),
    sep=" ",
    index_col=0,
)

# The gene list contains both mouse and human genes due to quirks
# in how the authors saved their data. We only care about human genes
# (since the cell lines we're using are from humans), so we discard
# the mouse genes. The human genes come before the mouse genes in the
# data, so we can just compute the number of human genes (x) and then subset
# the data to the first x genes.
num_human_genes = sum("ENSG" in gene_id for gene_id in gene_metadata.index.values)
count_matrix = count_matrix[:, :num_human_genes]
gene_metadata = gene_metadata.head(num_human_genes)

adata = AnnData(X=count_matrix, obs=cell_metadata, var=gene_metadata)

# Filter out cells for which we don't have data. Both conditions are applied
# in a single subsetting step, and the result is copied so that `adata` is a
# standalone object rather than a view: the assignment to `obs` below would
# otherwise make anndata copy the view implicitly and emit a warning.
has_annotations = adata.obs["cell_type"].notna() & adata.obs["product_name"].notna()
adata = adata[has_annotations].copy()

# For readability, keep only the part of the product name before the first space
adata.obs["product_name"] = [x.split(" ")[0] for x in adata.obs["product_name"]]

In [None]:
# Persist the assembled AnnData object in h5ad format in the working directory.
output_filename = "Srivatsan_2019_sciplex3_raw.h5ad"
adata.write_h5ad(output_filename)