# RAPIDS & Scanpy Single-Cell RNA-seq Workflow on mouse PFC cells

Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License") you may not use this file except in compliance with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0 

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

This notebook demonstrates a single-cell RNA analysis workflow that begins with preprocessing a count matrix of size `(n_gene, n_cell)` and results in a visualization of the clustered cells for further analysis.

For demonstration purposes, we use a dataset of 1.3 M brain cells with Unified Virtual Memory to oversubscribe GPU memory.

## Import requirements

In [1]:
import numpy as np
import scanpy as sc
import anndata

import time
import os, wget

import cupy as cp
import cudf

from cuml.decomposition import PCA
from cuml.manifold import TSNE
from cuml.cluster import KMeans
from cuml.preprocessing import StandardScaler

import rapids_scanpy_funcs
import utils

import warnings
warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')
import pandas as pd
from sh import gunzip
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

We use the RAPIDS memory manager to enable Unified Virtual Memory management, which allows us to oversubscribe the GPU memory.

In [2]:
import rmm

rmm.reinitialize(managed_memory=True)
cp.cuda.set_allocator(rmm.rmm_cupy_allocator)

In [8]:
from scipy import sparse
def load_text(path,label,sample_ID):
    adata = sc.read_text(path)
    adata=adata.transpose()
    sparse_X = sparse.csr_matrix(adata.X)
    adata.X = sparse_X
    adata.obs['label'] = label
    adata.obs['sampleID'] = sample_ID
    return adata

## load data

In [9]:
path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/snRNA-seq_UMI_sample1C.txt"
adata1=load_text(path,"control","sample1C")

path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/snRNA-seq_UMI_sample3C.txt"
adata2=load_text(path,"control","sample3C")

path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/snRNA-seq_UMI_sample5C.txt"
adata3=load_text(path,"control","sample5C")

path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/snRNA-seq_UMI_sample2E.txt"
adata4=load_text(path,"SD","sample2E")

path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/snRNA-seq_UMI_sample4E.txt"
adata5=load_text(path,"SD","sample4E")

path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/snRNA-seq_UMI_sample8E.txt"
adata6=load_text(path,"SD","sample8E")

In [12]:
all_adatas = []
all_adatas.append(adata1)
all_adatas.append(adata2)
all_adatas.append(adata3)
all_adatas.append(adata4)
all_adatas.append(adata5)
all_adatas.append(adata6)

combined_adata = all_adatas[0].concatenate(all_adatas[1:], join='outer')
combined_adata.write("/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/combined_data.h5ad")

... storing 'label' as categorical
... storing 'sampleID' as categorical


In [3]:
h5path=r"/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/combined_data.h5ad"
adata = anndata.read_h5ad(h5path)

In [16]:
print(adata.obs)
adata.obs.to_csv("/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/genes.csv")

Empty DataFrame
Columns: []
Index: [ENSMUSG00000102693.1, ENSMUSG00000064842.1, ENSMUSG00000102851.1, ENSMUSG00000089699.1, ENSMUSG00000103147.1, ENSMUSG00000102348.1, ENSMUSG00000102592.1, ENSMUSG00000104238.1, ENSMUSG00000102269.1, ENSMUSG00000096126.1, ENSMUSG00000102735.1, ENSMUSG00000103922.1, ENSMUSG00000025903.14, ENSMUSG00000104217.1, ENSMUSG00000033813.15, ENSMUSG00000085623.1, ENSMUSG00000091665.1, ENSMUSG00000033793.12, ENSMUSG00000104046.1, ENSMUSG00000102907.1, ENSMUSG00000025905.14, ENSMUSG00000025907.14, ENSMUSG00000087247.3, ENSMUSG00000103355.1, ENSMUSG00000033740.17, ENSMUSG00000104385.1, ENSMUSG00000102135.1, ENSMUSG00000103282.1, ENSMUSG00000102534.1, ENSMUSG00000102213.1, ENSMUSG00000103629.1, ENSMUSG00000051285.17, ENSMUSG00000103509.1, ENSMUSG00000048538.7, ENSMUSG00000077244.1, ENSMUSG00000102768.1, ENSMUSG00000103498.1, ENSMUSG00000103067.1, ENSMUSG00000102320.1, ENSMUSG00000104226.1, ENSMUSG00000103903.1, ENSMUSG00000103933.1, ENSMUSG00000076135.1, ENSMUSG0000

In [15]:
# Assuming adata.var.index contains your Ensembl gene IDs
print(adata.var.index)
geneIDs = pd.DataFrame(adata.var.index, columns=['GeneID'])  # Create a DataFrame with a proper column name

# Extract the base Ensembl gene IDs (before the dot)
base_gene_ids = geneIDs['GeneID'].str.split('.', expand=True)[0]  # You need to specify the column to split
print(base_gene_ids)


Index(['ENSMUSG00000102693.1', 'ENSMUSG00000064842.1', 'ENSMUSG00000102851.1',
       'ENSMUSG00000089699.1', 'ENSMUSG00000103147.1', 'ENSMUSG00000102348.1',
       'ENSMUSG00000102592.1', 'ENSMUSG00000104238.1', 'ENSMUSG00000102269.1',
       'ENSMUSG00000096126.1',
       ...
       'ENSMUSG00000099436.1-I', 'ENSMUSG00000101694.1-I',
       'ENSMUSG00000101402.1-I', 'ENSMUSG00000100574.1-I',
       'ENSMUSG00000100608.1-I', 'ENSMUSG00000102011.1-I',
       'ENSMUSG00000100964.1-I', 'ENSMUSG00000099619.6-I',
       'ENSMUSG00000099399.6-I', 'ENSMUSG00000095366.2-I'],
      dtype='object', length=87308)
0        ENSMUSG00000102693
1        ENSMUSG00000064842
2        ENSMUSG00000102851
3        ENSMUSG00000089699
4        ENSMUSG00000103147
                ...        
87303    ENSMUSG00000102011
87304    ENSMUSG00000100964
87305    ENSMUSG00000099619
87306    ENSMUSG00000099399
87307    ENSMUSG00000095366
Name: 0, Length: 87308, dtype: object


In [23]:
def parse_gtf_to_dict(file_path):
    """GTFファイルから遺伝子IDと遺伝子名のマッピングを抽出する関数"""
    # GTFファイルの読み込み
    col_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
    df = pd.read_csv(file_path, sep='\t', comment='#', names=col_names, usecols=['feature', 'attributes'])
    
    # 遺伝子レコードのみを抽出
    df_genes = df[df['feature'] == 'gene']
    
    # 属性から遺伝子IDと遺伝子名を抽出
    gene_id_name_map = {}
    for attributes in df_genes['attributes']:
        attribute_dict = {item.split(' ')[0]: item.split(' ')[1].strip('"') for item in attributes.split('; ') if item}
        gene_id = attribute_dict.get('gene_id')
        gene_name = attribute_dict.get('gene_name', 'Unknown')  # 遺伝子名がない場合は'Unknown'を使用
        gene_id_name_map[gene_id] = gene_name
    
    return gene_id_name_map

def get_gene_names_from_ids(gene_ids, gene_id_name_map):
    """遺伝子IDのリストを受け取り、対応する遺伝子名のリストを返す関数"""
    return [gene_id_name_map.get(gene_id, 'Unknown') for gene_id in gene_ids]

# GTFファイルパスを定義
gtf_file_path = r"/temp/data/mouse_PFD_sleepdeprivation/Mus_musculus.GRCm39.111.gtf"

# GTFファイルから遺伝子IDと遺伝子名のマッピングを作成
gene_id_name_map = parse_gtf_to_dict(gtf_file_path)

# 遺伝子名のリストを取得
gene_names = get_gene_names_from_ids(base_gene_ids, gene_id_name_map)

In [24]:
print(gene_names)

['4933401J01Rik', 'Gm26206', 'Gm18956', 'Gm1992', 'Gm7341', 'Gm10568', 'Gm38385', 'Gm37587', 'Gm7357', 'Unknown', 'Gm7369', 'Gm6123', 'Lypla1', 'Gm37988', 'Tcea1', 'Gm16041', 'Gm17101', 'Atp6v1h', 'Gm37567', 'Gm38264', 'Oprk1', 'Rb1cc1', 'Alkal1', 'Gm2147', 'St18', 'Gm7449', 'Gm37108', 'Gm37275', 'Gm37225', 'Gm37489', 'Gm5694', 'Pcmtd1', 'Gm38372', 'Gm9826', 'Gm23274', 'Gm19002', 'Gm18984', 'Gm30414', 'Gm37791', 'Gm7470', 'Rps2-ps2', 'Gm36964', 'Unknown', 'Gm7445', 'Gm37143', 'Gm7512', 'Gm24765', 'Rrs1', 'Adhfe1', 'Gm6161', 'Vxn', '1700034P13Rik', 'Sgk3', 'Mcmdc2', 'E330040D14Rik', 'Gm15818', 'Cspp1', 'Unknown', 'Gm15603', 'Gm5522', 'Gm28659', 'Unknown', 'Prex2', 'A830018L16Rik', 'Gm38178', 'Gm38069', 'Gm7560', 'Unknown', 'Mir6341', 'Gm2383', 'Gm6216', 'Gm17644', 'Sulf1', 'Gm5250', 'Gm29283', 'Unknown', 'Gm29570', 'Xkr9', 'Unknown', 'Unknown', 'Gm9947', 'Gm37412', 'Rpl5-ps1', 'Kcnb2', 'Gm38116', 'Gm37138', 'Gm25168', 'Gm28669', 'Terf1', 'Gm37509', 'Unknown', 'Gm7634', 'Rdh10', 'Gm28095

In [25]:
adata.var.index=gene_names

In [26]:
adata.write("/temp/data/mouse_PFD_sleepdeprivation/GSE211088_snRNA_txt/adata_genename.h5ad")

In [27]:
# marker genes
MITO_GENE_PREFIX = "mt-" # Prefix for mitochondrial genes to regress out
markers = ["CX3CR1","CLDN5","GLUL","NDRG2","PCDH15","PLP1","MBP","SATB2","SLC17A7",
"SLC17A6","GAD2","GAD1","SNAP25"] # Marker genes for visualization
#print(d_r_column_rep[1:])

markers = [str.upper() for str in markers]

# filtering cells
min_genes_per_cell = 200 # Filter out cells with fewer genes than this expressed 
max_genes_per_cell = 6000 # Filter out cells with more genes than this expressed 

# filtering genes
min_cells_per_gene = 1 # Filter out genes expressed in fewer cells than this
n_top_genes = 4000 # Number of highly variable genes to retain

# PCA
n_components = 50 # Number of principal components to compute

# Batched PCA
pca_train_ratio = 0.35 # percentage of cells to use for PCA training
n_pca_batches = 10

# t-SNE
tsne_n_pcs = 20 # Number of principal components to use for t-SNE

# k-means
k = 35 # Number of clusters for k-means

# KNN
n_neighbors = 15 # Number of nearest neighbors for KNN graph
knn_n_pcs = 50 # Number of principal components to use for finding nearest neighbors

# UMAP
umap_min_dist = 0.3 
umap_spread = 1.0