Replogle et al., 2020
https://doi.org/10.1038/s41587-020-0470-y

In [None]:
import os

from shared import download_file


def download_raw_data(dir_path: str) -> None:
    """Fetch the GSE146194 raw archive from GEO.

    The archive is written to ``GSE146194_RAW.tar`` inside ``dir_path``.

    Args:
        dir_path: Directory in which the downloaded archive is placed.
    """
    archive_url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE146194&format=file"
    archive_path = os.path.join(dir_path, "GSE146194_RAW.tar")
    download_file(url=archive_url, path=archive_path)

In [None]:
dataset_dir_path = "replogle2020/raw"
# exist_ok=True keeps this cell re-runnable: with exist_ok=False, running it a
# second time (or on a fresh kernel with the directory left over from an
# earlier run) raises FileExistsError before anything else happens.
os.makedirs(name=dataset_dir_path, exist_ok=True)
# Skip the download when the archive is already on disk.
# NOTE(review): an interrupted download would leave a truncated file that this
# check treats as complete; delete the file manually to force a re-download.
if not os.path.exists(os.path.join(dataset_dir_path, "GSE146194_RAW.tar")):
    download_raw_data(dir_path=dataset_dir_path)

In [None]:
import tarfile

# Archive written by the download step, and the directory it is unpacked into.
tar_file = "replogle2020/raw/GSE146194_RAW.tar"
target_dir_path = "replogle2020/raw/extracted"

# Unpack every member of the archive into target_dir_path.
with tarfile.open(name=tar_file, mode="r") as tar:
    tar.extractall(path=target_dir_path)

In [None]:
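# TODO: Rename *matrix.mtx.gz to *.mtx.gz (no code in this notebook performs this step yet; presumably done by hand -- confirm whether it is still required).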

In [20]:
import scanpy as sc

# Read the 10x-format matrix whose files carry the "GSM4367989_exp11." prefix
# from the extraction directory, indexing variables by gene ID.
adata = sc.read_10x_mtx(
    path=target_dir_path,
    var_names="gene_ids",
    cache=False,
    prefix="GSM4367989_exp11.",
)

Only considering the two last: ['.mtx', '.gz'].
Only considering the two last: ['.mtx', '.gz'].


In [21]:
# Print the AnnData summary (dimensions and annotation fields).
print(adata)

AnnData object with n_obs × n_vars = 204520 × 33694
    var: 'gene_symbols', 'feature_types'


In [23]:
# Save the data to an H5AD file.
preprocessed_dir_path = os.path.join(dataset_dir_path, "preprocessed")
# exist_ok=True keeps this cell re-runnable: with exist_ok=False, a second run
# raises FileExistsError before the file can be written again.
os.makedirs(name=preprocessed_dir_path, exist_ok=True)
h5ad_file_path = os.path.join(preprocessed_dir_path, "adata.h5ad")
print(f"Saving the preprocessed data to: {h5ad_file_path}")
adata.write(filename=h5ad_file_path)

Saving the preprocessed data to: replogle2020/raw/preprocessed/adata.h5ad


In [None]:
# NOTE(review): this redefines download_raw_data from an earlier cell with the
# same URL and destination file name; the later definition is the one in effect.
def download_raw_data(dir_path: str) -> None:
    """Download the GSE146194 raw archive into ``dir_path``.

    Args:
        dir_path: Directory that receives ``GSE146194_RAW.tar``.
    """
    destination_path = os.path.join(dir_path, "GSE146194_RAW.tar")
    source_url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE146194&format=file"
    download_file(url=source_url, path=destination_path)


def load_raw_data(dir_path: str) -> None:
    """Extract the GSE146194 archive and load one experiment with cell identities.

    The archive ``GSE146194_RAW.tar`` in ``dir_path`` is unpacked into
    ``dir_path/GSE146194_RAW``. The 10x matrix with the ``GSM4367985_exp7.``
    prefix is read into an AnnData object, and the rows of the matching
    ``cell_identities.csv.gz`` file are attached as ``obs``, aligned to the
    order of the barcodes file.

    Args:
        dir_path: Directory containing ``GSE146194_RAW.tar``.

    NOTE(review): as far as the visible body shows, the AnnData object is
    built but neither returned nor saved -- confirm the intended result.
    """
    # Imported locally because no visible notebook cell imports pandas;
    # without this the function raises NameError on a fresh kernel.
    import pandas as pd

    # Extract the tar file.
    tar_file_path = os.path.join(dir_path, "GSE146194_RAW.tar")
    extract_dir_path = os.path.join(dir_path, "GSE146194_RAW")
    with tarfile.open(name=tar_file_path, mode="r") as tar:
        tar.extractall(path=extract_dir_path)

    # Load the data.
    adata = sc.read_10x_mtx(
        path=extract_dir_path,
        var_names="gene_ids",
        cache=False,
        prefix="GSM4367985_exp7.",
    )

    # Add info from the "*_cell_identities.csv.gz" file to the AnnData object.
    cell_identities_file_path = os.path.join(
        extract_dir_path, "GSM4367985_exp7.cell_identities.csv.gz"
    )
    barcodes_file_path = os.path.join(
        extract_dir_path, "GSM4367985_exp7.barcodes.tsv.gz"
    )
    # pandas decompresses gzip itself, so no separate gzip handle is needed.
    cell_identities_df = pd.read_csv(
        filepath_or_buffer=cell_identities_file_path, compression="gzip"
    )
    barcodes_df = pd.read_csv(
        filepath_or_buffer=barcodes_file_path,
        compression="gzip",
        header=None,
        names=["cell_barcode"],
    )

    # Left merge keeps every barcode, in barcodes-file order, even when it has
    # no entry in the cell-identities file.
    # NOTE(review): assumes at most one identity row per cell_barcode; duplicate
    # rows would make merged_df longer than the barcode list -- confirm.
    merged_df = pd.merge(
        left=barcodes_df,
        right=cell_identities_df,
        on="cell_barcode",
        how="left",
    )

    # Index by barcode so the frame lines up with the obs_names of adata, and
    # convert all columns to strings (missing values become the text "nan").
    merged_df = merged_df.set_index("cell_barcode").astype(str)

    # Add the merged_df as obs to adata.
    adata.obs = merged_df