Accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139944

In [1]:
import gzip
import os

import numpy as np
import pandas as pd
from anndata import AnnData

from utils import download_binary_file

In [2]:
def download_srivatsan_2019_sciplex2(output_path: str) -> None:
    """
    Download Srivatsan et al. 2019 sciplex-2 data from the hosting URLs.

    Args:
    ----
        output_path: Output path to store the downloaded and unzipped
        directories.

    Returns
    -------
        None. File directories are downloaded to output_path.
    """

    count_matrix_url = (
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4150377&format=file&file="
        "GSM4150377_sciPlex2_A549_Transcription_Modulators_UMI.count.matrix.gz"
    )
    count_matrix_filename = os.path.join(output_path, count_matrix_url.split("=")[-1])
    download_binary_file(count_matrix_url, count_matrix_filename)

    cell_metadata_url = (
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4150377&format=file"
        "&file=GSM4150377_sciPlex2_pData.txt.gz"
    )
    cell_metadata_filename = os.path.join(output_path, cell_metadata_url.split("=")[-1])
    download_binary_file(cell_metadata_url, cell_metadata_filename)

    gene_metadata_url = (
        "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4150377&format=file&file="
        "GSM4150377_sciPlex2_A549_Transcription_Modulators_gene.annotations.txt.gz"
    )
    cell_metadata_filename = os.path.join(output_path, gene_metadata_url.split("=")[-1])
    download_binary_file(gene_metadata_url, cell_metadata_filename)


def read_srivatsan_2019_sciplex2(file_directory: str) -> pd.DataFrame:
    """
    Read the sciplex-2 expression data for Srivatsan et al. 2019 in the given directory.

    Args:
    ----
        file_directory: Directory containing Srivatsan et al. 2019 data.

    Returns
    -------
        A data frame containing single-cell gene expression counts. The count
        matrix is stored in triplet format. I.e., each row of the data frame
        has the format (row, column, count) stored in columns (i, j, x) respectively.
    """

    with gzip.open(
        os.path.join(
            file_directory,
            "GSM4150377_sciPlex2_A549_Transcription_Modulators_UMI.count.matrix.gz",
        ),
        "rb",
    ) as f:
        df = pd.read_csv(f, sep="\t", header=None, names=["i", "j", "x"])

    return df

In [3]:
download_path = "./srivatsan_2019_sciplex2"

os.makedirs(download_path, exist_ok=True)
download_srivatsan_2019_sciplex2(download_path)
df = read_srivatsan_2019_sciplex2(download_path)

# The Srivatsan count data is in a sparse triplet format represented
# by three columns 'i', 'j', and 'x'. 'i' refers to a row number, 'j' refers to
# a column number, and 'x' refers to a count value.
counts = df["x"]
rows = (
    df["i"] - 1
)  # Indices were originally 1-base-indexed --> switch to 0-base-indexing
cols = df["j"] - 1

# Convert the triplets into a numpy array
count_matrix = np.zeros([max(rows) + 1, max(cols) + 1])
count_matrix[rows, cols] = counts

# Switch matrix from gene rows and cell columns to cell rows and gene columns
count_matrix = count_matrix.T

cell_metadata = pd.read_csv(
    os.path.join(
        download_path,
        "GSM4150377_sciPlex2_pData.txt.gz",
    ),
    sep=" ",
)

gene_metadata = pd.read_csv(
    os.path.join(
        download_path,
        "GSM4150377_sciPlex2_A549_Transcription_Modulators_gene.annotations.txt.gz",
    ),
    sep="\t",
    header=None,
    index_col=0,
)

adata = AnnData(
    X=count_matrix, obs=cell_metadata, var=pd.DataFrame(index=gene_metadata.index)
)

# Index needs string names or else the write_h5ad call will throw an error
adata.var.index.name = "gene_id"

# Treatment information is contained in the `top_oligo` column
# with the format <drug>_<dose>. There exist some NaN values
# in the column (which will break the next few lines), so we
# convert them to strings first.
adata.obs["top_oligo"] = [str(x) for x in adata.obs["top_oligo"]]
adata.obs["drug"] = [
    treatment.split("_")[0] for treatment in adata.obs["top_oligo"]
]
adata = adata[adata.obs["drug"] != "nan"]
adata.obs["dose"] = [
    treatment.split("_")[1] for treatment in adata.obs["top_oligo"]
]
adata.obs["dose"] = adata.obs["dose"].apply(pd.to_numeric, args=("coerce",))

# If a drug is listed with dosage of 0, the cell was only exposed to vehicle control
adata.obs["drug"][adata.obs["dose"] == 0.0] = "Vehicle"

File ./srivatsan_2019_sciplex2/GSM4150377_sciPlex2_A549_Transcription_Modulators_UMI.count.matrix.gz already exists. No files downloaded to overwrite the existing file.
File ./srivatsan_2019_sciplex2/GSM4150377_sciPlex2_pData.txt.gz already exists. No files downloaded to overwrite the existing file.
File ./srivatsan_2019_sciplex2/GSM4150377_sciPlex2_A549_Transcription_Modulators_gene.annotations.txt.gz already exists. No files downloaded to overwrite the existing file.


  adata = AnnData(
  adata.obs["dose"] = [
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs["drug"][adata.obs["dose"] == 0.0] = "Vehicle"


In [4]:
adata.write_h5ad("Srivatsan_2019_sciplex2_raw.h5ad")

  df[key] = c
  df[key] = c
