# RAPIDS & Scanpy Single-Cell RNA-seq Workflow on mouse NAc cells

Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0 

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

This notebook demonstrates a single-cell RNA analysis workflow that begins with preprocessing a count matrix of size `(n_gene, n_cell)` and results in a visualization of the clustered cells for further analysis.

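For demonstration purposes, we use a mouse NAc single-cell dataset downloaded from GEO (samples GSM5482106–GSM5482109: two WT and two Setd1aHet replicates), with Unified Virtual Memory enabled so that GPU memory can be oversubscribed.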

## Import requirements

In [2]:
import numpy as np
import scanpy as sc
import anndata

import time
import os, wget

import cupy as cp
import cudf

from cuml.decomposition import PCA
from cuml.manifold import TSNE
from cuml.cluster import KMeans
from cuml.preprocessing import StandardScaler

import rapids_scanpy_funcs
import utils

import warnings
warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')
import pandas as pd
from sh import gunzip
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import rmm
rmm.reinitialize(managed_memory=True)
from rmm.allocators.cupy import rmm_cupy_allocator
import cupy
cupy.cuda.set_allocator(rmm_cupy_allocator)

import calculation_tool as ct

  from .autonotebook import tqdm as notebook_tqdm


We use the RAPIDS memory manager to enable Unified Virtual Memory management, which allows us to oversubscribe the GPU memory.

## Input data

In the cell below, we provide the path to the sparse `.h5ad` file containing the count matrix to analyze. Please see the README for instructions on how to download the dataset we use here.

To run this notebook using your own dataset, please see the README for instructions to convert your own count matrix into this format. Then, replace the path in the cell below with the path to your generated `.h5ad` file.

In [2]:
import os
import urllib.request
import gzip
import shutil

# GEO supplementary-file download endpoint. The `file` query value is
# percent-encoded ("%5F" is "_", "%2E" is ".").
_GEO_URL_TEMPLATE = (
    "https://www.ncbi.nlm.nih.gov/geo/download/"
    "?acc={acc}&format=file"
    "&file={acc}%5FNAc%5F{genotype}%5F{replicate}%5F{suffix}"
)

# (accession, genotype, replicate) of each sample, in download order.
_GEO_SAMPLES = (
    ("GSM5482106", "WT", "Rep1"),
    ("GSM5482107", "WT", "Rep2"),
    ("GSM5482108", "Setd1aHet", "Rep1"),
    ("GSM5482109", "Setd1aHet", "Rep2"),
)

# Barcodes, features and matrix files are fetched for every sample.
_GEO_FILE_SUFFIXES = (
    "barcodes%2Etsv%2Egz",
    "features%2Etsv%2Egz",
    "matrix%2Emtx%2Egz",
)

# One URL per (sample, file) pair: all files of a sample are listed together.
url_list = [
    _GEO_URL_TEMPLATE.format(
        acc=acc, genotype=genotype, replicate=replicate, suffix=suffix
    )
    for acc, genotype, replicate in _GEO_SAMPLES
    for suffix in _GEO_FILE_SUFFIXES
]

def download_file(url, dir, file_name):
    """Fetch ``url`` and store it as ``file_name`` inside ``dir``.

    Args:
        url: Address of the resource to retrieve.
        dir: Directory that receives the file (it is not created here).
        file_name: Name given to the downloaded file within ``dir``.
    """
    destination = os.path.join(dir, file_name)
    urllib.request.urlretrieve(url, filename=destination)
    # The file is stored exactly as received. If an unpacked copy of a gzip
    # download is wanted, a gzip.open + shutil.copyfileobj step can be added
    # here, writing to the destination path without its ".gz" suffix.

from urllib.parse import parse_qs, urlsplit

# Root directory that receives one sub-folder per genotype and accession.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because other cells may read it.
dir = "/data/mouse_NAc/"

for url in url_list:
    # parse_qs percent-decodes the query values, so the file name reads e.g.
    # "GSM5482106_NAc_WT_Rep1_barcodes.tsv.gz" instead of the "%5F"/"%2E"
    # encoded form (on which splitting at "_" would never match).
    query = parse_qs(urlsplit(url).query)
    acc = query["acc"][0]
    file_name_from_url = query["file"][0]

    # Group the samples into WT and Setd1aHet sub-folders, one folder per
    # accession.
    subfolder = "WT" if "WT" in file_name_from_url else "Setd1aHet"
    acc_dir = os.path.join(dir, subfolder, acc)
    os.makedirs(acc_dir, exist_ok=True)

    # Save under the standard name: the text after the last "_", i.e.
    # barcodes.tsv.gz, features.tsv.gz or matrix.mtx.gz. The ".gz" suffix is
    # kept because download_file stores the file as received.
    standard_file_name = file_name_from_url.rsplit("_", 1)[-1]

    # Download the file.
    download_file(url, acc_dir, standard_file_name)


In [None]:
# Directory the 10x files were downloaded into.
base_dir = "/data/mouse_NAc/"

def read_10x_mtx(path, label):
    """Read 10x mtx-format data and attach the given label to it.

    Args:
        path: Directory handed to ``sc.read_10x_mtx``.
        label: Value stored in the ``label`` column of ``obs``.

    Returns:
        The object read from ``path``, with ``obs['label']`` set to ``label``.
    """
    labelled = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)
    labelled.obs['label'] = label
    return labelled

labels = ["WT", "Setd1aHet"]

all_adatas = []

for label in labels:
    label_dir = os.path.join(base_dir, label)
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of all_adatas (and so of the combined object) the same on every run.
    gsm_dirs = [
        os.path.join(label_dir, name)
        for name in sorted(os.listdir(label_dir))
        if os.path.isdir(os.path.join(label_dir, name))
    ]

    for gsm_dir in gsm_dirs:
        all_adatas.append(read_10x_mtx(gsm_dir, label))

# Fail with a clear message instead of an IndexError on all_adatas[0].
if not all_adatas:
    raise FileNotFoundError(
        f"No 10x sample directories found under {base_dir!r}"
    )

# Merge every AnnData object into one.
combined_adata = all_adatas[0].concatenate(all_adatas[1:], join='outer')

# Save in .h5ad format next to the downloaded samples.
combined_adata.write(os.path.join(base_dir, "combined_data.h5ad"))

: 

In [None]:
# Tab-separated expression table; the first row is the header and the first
# column becomes the DataFrame index.
file_path = "/data/mouse_hippocampus/GSE60361_C1-3005-Expression.txt"
df = pd.read_csv(file_path, sep="\t", header=0, index_col=0)

# The same file read directly with anndata.
# NOTE(review): the DataFrame index holds gene names and its columns hold
# cell ids, so this object presumably has genes as observations -- confirm
# whether a transpose is needed before downstream use.
adata = anndata.read_csv(file_path, delimiter="\t")

In [None]:
df.columns

Index(['1772071015_C02', '1772071017_G12', '1772071017_A05', '1772071014_B06',
       '1772067065_H06', '1772071017_E02', '1772067065_B07', '1772067060_B09',
       '1772071014_E04', '1772071015_D04',
       ...
       '1772066110_D12', '1772071017_A07', '1772063071_G10', '1772058148_C03',
       '1772063061_D09', '1772067059_B04', '1772066097_D04', '1772063068_D01',
       '1772066098_A12', '1772058148_F03'],
      dtype='object', length=3005)

In [18]:
df.index

Index(['Tspan12', 'Tshz1', 'Fnbp1l', 'Adamts15', 'Cldn12', 'Rxfp1',
       '2310042E22Rik', 'Sema3c', 'Jam2', 'Apbb1ip',
       ...
       'Gm20826_loc1', 'Gm20826_loc2', 'Gm20877_loc2', 'Gm20877_loc1',
       'Gm20865_loc4', 'Gm20738_loc4', 'Gm20738_loc6', 'Gm21943_loc1',
       'Gm21943_loc3', 'Gm20738_loc3'],
      dtype='object', name='cell_id', length=19972)

## Load data

In [None]:
import calculation_tool as ct

# Combined .h5ad file written earlier in this notebook; it is handed to the
# project's bulk preprocessing routine with no label filter (label=None).
file_path = "/data/mouse_NAc/combined_data.h5ad"
adata = ct.preprocess_adata_in_bulk(file_path, label=None)

  from .autonotebook import tqdm as notebook_tqdm


preprocess_in_bulk
HTR1E is not included
HTR1E is removed from marker list
['CX3CR1', 'CLDN5', 'GLUL', 'NDRG2', 'PCDH15', 'PLP1', 'MBP', 'SATB2', 'SLC17A7', 'SLC17A6', 'GAD2', 'GAD1', 'SNAP25', 'HTR1A', 'HTR1B', 'HTR1D', 'HTR2A', 'HTR2B', 'HTR2C', 'HTR3A', 'HTR4', 'HTR5A', 'HTR6', 'HTR7', 'DRD1', 'DRD2', 'DRD3', 'DRD4', 'DRD5', 'HRH1', 'HRH2', 'HRH3', 'CHRM1', 'CHRM2', 'CHRM3', 'CHRM4', 'CHRM5', 'ADRA1A', 'ADRA1B', 'ADRA2A', 'ADRA2B', 'ADRA2C', 'ADRB1', 'ADRB2']
perform regression
perform scale
float32
Total Preprocessing time: 19.9202721118927
shape of adata: (38956, 21594)
shape of adata: (38956, 21594)
perform PCA
100


In [None]:
# Fetch the four parameter objects from calculation_tool, then build the
# preprocessing parameters from the GPCR list.
(D_R_mtx, GPCR_type_df, drug_list, GPCR_list) = ct.load_parameters()
params = ct.set_parameters_for_preprocess(GPCR_list)

In [None]:
# Draw the UMAP plot for adata, coloured by the "leiden" annotation.
sc.pl.umap(adata, color=["leiden"])

In [None]:
import math

import matplotlib.gridspec as gridspec
import seaborn as sns  # sns.heatmap is used below; it was never imported

PANELS_PER_ROW = 4       # heatmaps drawn side by side in one figure row
PANEL_SIZE_INCHES = 10   # width and height reserved for each heatmap

# 1. Labels to process, one figure per label.
unique_labels = adata.obs['label'].unique()

# 2. For every label, compute the correlation matrix of the drug-response
#    columns within each cluster.
drug_response_columns = [f'cAMP_{drug}' for drug in drug_list]

# label -> {cluster -> correlation matrix}
correlation_matrices_per_label = {}

for label in unique_labels:
    # Restrict adata to the cells carrying this label.
    adata_subset = adata[adata.obs["label"] == label]

    # Clusters known to the subset.
    clusters = adata_subset.obs['leiden'].cat.categories
    correlation_matrices = {}

    for cluster in clusters:
        subset = adata_subset.obs.loc[
            adata_subset.obs['leiden'] == cluster, drug_response_columns
        ]
        # Correlation is undefined for fewer than two rows; skip such
        # clusters rather than drawing an all-NaN heatmap.
        if len(subset) < 2:
            continue
        correlation_matrices[cluster] = subset.corr()

    correlation_matrices_per_label[label] = correlation_matrices

# 3. Plot all correlation matrices of a label in a single figure.
for label, correlation_matrices in correlation_matrices_per_label.items():
    # Nothing to draw; a zero-row grid would make figure creation fail.
    if not correlation_matrices:
        continue

    num_clusters = len(correlation_matrices)

    # Start a new grid row after every PANELS_PER_ROW clusters.
    rows = math.ceil(num_clusters / PANELS_PER_ROW)
    fig = plt.figure(
        figsize=(PANELS_PER_ROW * PANEL_SIZE_INCHES, rows * PANEL_SIZE_INCHES)
    )
    spec = gridspec.GridSpec(rows, PANELS_PER_ROW, figure=fig)

    for i, (cluster, corr_matrix) in enumerate(correlation_matrices.items()):
        ax = fig.add_subplot(spec[i // PANELS_PER_ROW, i % PANELS_PER_ROW])
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
        ax.set_title(f'Cluster {cluster}')

    fig.suptitle(f'Correlation Matrices for Label {label}', fontsize=20, y=1.02)
    plt.tight_layout()
    plt.show()

In [None]:
# Gene names present in adata. These come from var_names (not obs_names).
var_names = adata.var_names.tolist()
# Build the lookup set once so each membership test below is O(1).
available_genes = set(var_names)

# Keep only the markers that exist in adata, preserving the marker order.
filtered_genes = [gene for gene in params['markers'] if gene in available_genes]
print(filtered_genes)
sc.pl.dotplot(adata, var_names=filtered_genes, groupby='leiden')