# RAPIDS & Scanpy Single-Cell RNA-seq Workflow on MnPO cells

Copyright (c) 2020, NVIDIA CORPORATION.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0 

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

This notebook demonstrates a single-cell RNA analysis workflow that begins with preprocessing a count matrix of size `(n_gene, n_cell)` and results in a visualization of the clustered cells for further analysis.

For demonstration purposes, we use a dataset of 1.3 M brain cells with Unified Virtual Memory to oversubscribe GPU memory.

## Import requirements

In [1]:
# Notebook-wide imports plus one-time configuration (warnings, matplotlib fonts,
# CuPy allocator). Statements are order-sensitive, so they are left as written.
import numpy as np
import scanpy as sc
import anndata
import scipy.io
import scipy.sparse

import time
import os, wget


import cudf

from cuml.decomposition import PCA
from cuml.manifold import TSNE
from cuml.cluster import KMeans
from cuml.preprocessing import StandardScaler

import cuml
import rapids_scanpy_funcs
import utils

import warnings
# Ignore warnings whose message starts with 'Expected '. The blanket
# simplefilter('ignore') on the next line then silences every warning anyway.
warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
# Font type 42 (TrueType) for PDF/PS output, so text in exported figures
# is embedded as fonts rather than outlines.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import rmm

from rmm.allocators.cupy import rmm_cupy_allocator
import cupy
# Route CuPy device allocations through the RMM allocator.
# NOTE(review): no rmm.reinitialize(managed_memory=True) call is visible in this
# cell — confirm managed (unified) memory is actually enabled elsewhere.
cupy.cuda.set_allocator(rmm_cupy_allocator)
from scipy import sparse
import gc
import cupy as cp
# Run a garbage-collection pass and release the blocks cached by CuPy's
# default memory pool before the analysis starts.
gc.collect()
cp.get_default_memory_pool().free_all_blocks()
import calculation_tool as ct

We use the RAPIDS memory manager to enable Unified Virtual Memory management, which allows us to oversubscribe the GPU memory.

## Input data

In the cell below, we provide the path to the sparse `.h5ad` file containing the count matrix to analyze. Please see the README for instructions on how to download the dataset we use here.

To run this notebook using your own dataset, please see the README for instructions to convert your own count matrix into this format. Then, replace the path in the cell below with the path to your generated `.h5ad` file.

In [None]:

# GEO sample GSM4663167: SFO single-cell count matrix (gzip-compressed CSV).
url = r"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4663167&format=file&file=GSM4663167%5F1%2DSFO%2Dsingle%2Dcell%2Ecsv%2Egz"
input_file = "/temp/data/mouse_SFO/GSM4663167_1-SFO-single-cell.csv.gz"
input_csv = "/temp/data/mouse_SFO/GSM4663167_1-SFO-single-cell.csv"
results_file = "/temp/data/mouse_SFO/sfo_ctrl.h5ad"

# The check is on the decompressed CSV, while the download writes the .gz archive.
# NOTE(review): the archive lands under /temp/data/mouse_SFO but the decompression
# cell globs /data/mouse_SFO — confirm both resolve to the same directory.
csv_already_present = os.path.exists(input_csv)
if not csv_already_present:
    print('Downloading import file...')
    os.makedirs('/temp/data/mouse_SFO', exist_ok=True)
    wget.download(url, input_file)

In [None]:

# GEO sample GSM4663169: stimulus-to-cell-type mapping, water-sated condition.
url = r"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4663169&format=file&file=GSM4663169%5F6%2DSFO%2Dstimulus%2Dto%2Dcell%2Dtype%2Dmapping%2Dwater%5Fsated%2Ecsv%2Egz"
input_file = "/temp/data/mouse_SFO/GSM4663169_6-SFO-stimulus-to-cell-type-mapping-water_sated.csv.gz"
input_csv = "/temp/data/mouse_SFO/GSM4663169_6-SFO-stimulus-to-cell-type-mapping-water_sated.csv"

# The check is on the decompressed CSV, while the download writes the .gz archive.
# NOTE(review): the archive lands under /temp/data/mouse_SFO but the decompression
# cell globs /data/mouse_SFO — confirm both resolve to the same directory.
csv_already_present = os.path.exists(input_csv)
if not csv_already_present:
    print('Downloading import file...')
    os.makedirs('/temp/data/mouse_SFO', exist_ok=True)
    wget.download(url, input_file)

In [None]:

# GEO sample GSM4663170: stimulus-to-cell-type mapping, osmotic-thirst condition.
url = r"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4663170&format=file&file=GSM4663170%5F7%2DSFO%2Dstimulus%2Dto%2Dcell%2Dtype%2Dmapping%2Dosmotic%5Fthirst%2Ecsv%2Egz"
input_file = "/temp/data/mouse_SFO/GSM4663170_7-SFO-stimulus-to-cell-type-mapping-osmotic_thirst.csv.gz"
input_csv = "/temp/data/mouse_SFO/GSM4663170_7-SFO-stimulus-to-cell-type-mapping-osmotic_thirst.csv"

# The check is on the decompressed CSV, while the download writes the .gz archive.
# NOTE(review): the archive lands under /temp/data/mouse_SFO but the decompression
# cell globs /data/mouse_SFO — confirm both resolve to the same directory.
csv_already_present = os.path.exists(input_csv)
if not csv_already_present:
    print('Downloading import file...')
    os.makedirs('/temp/data/mouse_SFO', exist_ok=True)
    wget.download(url, input_file)

In [None]:

# GEO sample GSM4663171: stimulus-to-cell-type mapping, hypovolemic-thirst condition.
url = r"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4663171&format=file&file=GSM4663171%5F8%2DSFO%2Dstimulus%2Dto%2Dcell%2Dtype%2Dmapping%2Dhypovolemic%5Fthirst%2Ecsv%2Egz"
input_file = "/temp/data/mouse_SFO/GSM4663171_8-SFO-stimulus-to-cell-type-mapping-hypovolemic_thirst.csv.gz"
input_csv = "/temp/data/mouse_SFO/GSM4663171_8-SFO-stimulus-to-cell-type-mapping-hypovolemic_thirst.csv"

# The check is on the decompressed CSV, while the download writes the .gz archive.
# NOTE(review): the archive lands under /temp/data/mouse_SFO but the decompression
# cell globs /data/mouse_SFO — confirm both resolve to the same directory.
csv_already_present = os.path.exists(input_csv)
if not csv_already_present:
    print('Downloading import file...')
    os.makedirs('/temp/data/mouse_SFO', exist_ok=True)
    wget.download(url, input_file)

In [None]:

# GEO sample GSM4663172: stimulus-to-cell-type mapping, 36 h water-deprivation condition.
url = r"https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM4663172&format=file&file=GSM4663172%5F9%2DSFO%2Dstimulus%2Dto%2Dcell%2Dtype%2Dmapping%2D36H%5Fwater%5Fdeprivation%2Ecsv%2Egz"
input_file = "/temp/data/mouse_SFO/GSM4663172_9-SFO-stimulus-to-cell-type-mapping-36H_water_deprivation.csv.gz"
input_csv = "/temp/data/mouse_SFO/GSM4663172_9-SFO-stimulus-to-cell-type-mapping-36H_water_deprivation.csv"

# The check is on the decompressed CSV, while the download writes the .gz archive.
# NOTE(review): the archive lands under /temp/data/mouse_SFO but the decompression
# cell globs /data/mouse_SFO — confirm both resolve to the same directory.
csv_already_present = os.path.exists(input_csv)
if not csv_already_present:
    print('Downloading import file...')
    os.makedirs('/temp/data/mouse_SFO', exist_ok=True)
    wget.download(url, input_file)

In [None]:
import glob
from sh import gunzip

# Directory that is searched for the GEO archives.
# NOTE(review): the download cells write to /temp/data/mouse_SFO — confirm this
# path points at the same location.
dir = "/data/mouse_SFO/"
gz_pattern = os.path.join(dir, "GSM*.gz")
gz_list = glob.glob(gz_pattern)

# Decompress each matching archive with the `gunzip` command exposed by `sh`.
for gz_path in gz_list:
    gunzip(gz_path)


In [None]:
import glob

# Collect the decompressed GEO CSV files; glob returns them in no guaranteed order.
dir = "/data/mouse_SFO/"
csv_pattern = os.path.join(dir, "GSM*.csv")
csv_list = glob.glob(csv_pattern)


In [None]:
import os
from scipy import sparse

# Build one AnnData per GEO CSV, tag every cell with its experimental condition,
# and write the concatenation of all of them to disk.
#
# The condition label is derived from the file name rather than from the file's
# position in `csv_list`: glob returns paths in arbitrary, filesystem-dependent
# order, so positional labels can silently attach the wrong condition to a file.
label_by_file_tag = {
    "single-cell": "plane",
    "hypovolemic_thirst": "hypovolemic_thirst",
    "water_sated": "water_sated",
    "osmotic_thirst": "osmotic_thirst",
    "water_deprivation": "water_deprivation",
}
# Kept for backward compatibility with cells that may still read `label_list`.
label_list = list(label_by_file_tag.values())

all_adatas = []
# Sorted so the concatenation order (and therefore the batch index assigned by
# `concatenate`) is the same on every run.
for csv_path in sorted(csv_list):
    print(csv_path)
    file_name = os.path.basename(csv_path)
    matching_labels = [
        label for tag, label in label_by_file_tag.items() if tag in file_name
    ]
    if len(matching_labels) != 1:
        raise ValueError(
            f"Cannot determine a unique condition label for {csv_path!r}; "
            f"candidates: {matching_labels}"
        )

    # The matrix is transposed right after reading.
    # NOTE(review): presumably the CSVs are genes x cells — confirm.
    adata = sc.read_csv(csv_path).transpose()
    # Store the counts as CSR sparse instead of a dense array.
    adata.X = sparse.csr_matrix(adata.X)
    adata.obs['label'] = matching_labels[0]
    all_adatas.append(adata)

if not all_adatas:
    raise FileNotFoundError(
        "csv_list is empty: no GSM*.csv count matrices were found to combine"
    )

# Outer join keeps the union of variables across files.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in
# favour of anndata.concat; kept here so the resulting obs names/batch column
# stay as before.
combined_adata = all_adatas[0].concatenate(all_adatas[1:], join='outer')
combined_adata.write("/data/mouse_SFO/combined_data.h5ad")

In [None]:
# Display the variable names of the combined AnnData object.
combined_adata.var_names

In [None]:
# Write the variable names of `adata` to CSV.
# NOTE(review): at this point `adata` is whatever was bound last (the final
# per-file object from the loading loop), not `combined_adata` — confirm that
# this is the intended gene list.
genes = adata.var_names
gene_table = pd.DataFrame(genes)
gene_table.to_csv("/temp/data/mouse_SFO/WT_SFO_adata_genes.csv")

In [None]:
import calculation_tool as ct

# Run the project's bulk preprocessing on the combined file (CPU path, no label
# filter) and save the returned GPCR table next to it.
file_path = "/data/mouse_SFO/combined_data.h5ad"
add_markers = [
    "NOS1",
    "CAMK2A",
    "CAMK2B",
    "ETV1",
    "RXFP1",
    "PDYN",
]
preprocess_outputs = ct.preprocess_adata_in_bulk(
    file_path,
    label=None,
    add_markers=add_markers,
    is_gpu=False,
)
adata, GPCR_df = preprocess_outputs
GPCR_df.to_csv("/data/mouse_SFO/combined_data_GPCR_df.csv")

In [2]:
# Reload the processed AnnData and the saved GPCR table, then fetch the
# project-level parameters used by the later cells.
# NOTE(review): no visible cell writes WT_SFO_adata_processed.h5ad — confirm
# where this file is produced.
file_path = "/data/mouse_SFO/WT_SFO_adata_processed.h5ad"
adata = sc.read_h5ad(file_path)

# NOTE(review): the table was saved with its index column, so reading it back
# without index_col adds an extra unnamed column — confirm this is expected.
GPCR_df = pd.read_csv("/data/mouse_SFO/combined_data_GPCR_df.csv")

loaded_parameters = ct.load_parameters()
D_R_mtx, GPCR_type_df, drug_list, GPCR_list = loaded_parameters
params = ct.set_parameters_for_preprocess(GPCR_list)

In [None]:
# UMAP embedding coloured by the "CAMK2A_raw" key.
sc.pl.umap(adata, color=["CAMK2A_raw"])

In [None]:
# UMAP embedding coloured by the "RXFP1_raw" key.
sc.pl.umap(adata, color=["RXFP1_raw"])

In [None]:
# UMAP embedding coloured by the "PDYN_raw" key.
sc.pl.umap(adata, color=["PDYN_raw"])

In [None]:
# List the observation annotation keys. `obs_keys` is a method, so it must be
# called; the bare attribute only displays the bound-method object.
adata.obs_keys()

In [None]:
# UMAP embedding coloured by "is_clz_selective", drawn with a grey/red palette.
sc.pl.umap(adata, color=["is_clz_selective"],palette=["grey","red"])

In [None]:
# Per-cell values of the "NOS1_raw" observation column.
nos1_expression=adata.obs["NOS1_raw"]

In [None]:
# Histogram of the NOS1_raw values (matplotlib's default binning).
plt.hist(nos1_expression)

In [None]:
import matplotlib.pyplot as plt

# Marker keys to plot per Leiden cluster.
add_markers = [
    "NOS1",
    "CAMK2A",
    "CAMK2B",
    "ETV1",
    "RXFP1",
    "PDYN",
    "SLC17A6",  # SLC17A6 = VGLUT2
    "SLC32A1",  # SLC32A1 = VGAT
]

for marker in add_markers:
    # show=False returns the Axes object instead of drawing immediately.
    violin_ax = sc.pl.violin(adata, marker, groupby='leiden', show=False)

    # Drop the legend from the Axes when one was created.
    existing_legend = violin_ax.get_legend()
    if existing_legend is not None:
        existing_legend.remove()

    # Render the figure.
    plt.show()

In [None]:
# Flag cells that belong to the Leiden clusters treated as thirst cells.
# Previously tried cluster set: [4, 12]
thirst_cell_leiden = [11, 16]

# Compare on the string form of the cluster id: scanpy stores Leiden labels as
# string categories ('0', '1', ...), so testing them against integers with
# `isin` would match nothing. Casting both sides to str works for either dtype.
thirst_cluster_ids = {str(cluster) for cluster in thirst_cell_leiden}
in_thirst_cluster = adata.obs["leiden"].astype(str).isin(thirst_cluster_ids)

# Stored as a 0/1 categorical so it can be plotted with a two-colour palette.
adata.obs["is_thirst"] = np.where(in_thirst_cluster, 1, 0)
adata.obs["is_thirst"] = adata.obs["is_thirst"].astype("category")

In [None]:
# UMAP embedding coloured by the "is_thirst" flag, drawn with a grey/red palette.
sc.pl.umap(adata, color=["is_thirst"],palette=["grey","red"])

In [None]:
# UMAP embedding coloured by "is_clz_inhibited", drawn with a grey/red palette.
sc.pl.umap(adata, color=["is_clz_inhibited"],palette=["grey","red"])

In [None]:
# List the observation annotation keys. `obs_keys` is a method, so it must be
# called; the bare attribute only displays the bound-method object.
adata.obs_keys()

In [None]:

import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Restrict the data to the selected conditions.
label_include_list = ["plane", "water_sated"]
# Alternative selection: ["hypovolemic_thirst", "osmotic_thirst", "water_deprivation"]
in_selected_conditions = adata.obs['label'].isin(label_include_list)
adata_sated = adata[in_selected_conditions]

# Keep only the cells whose is_thirst flag equals 1.
thirst_flag = adata_sated.obs['is_thirst'] == 1
thirsty_cells = adata_sated[thirst_flag]

# Reshape to long form: one row per (cell, drug column) pair.
drug_columns = ['cAMP_CLOZAPINE', 'cAMP_OLANZAPINE', 'cAMP_HALOPERIDOL']
long_df = pd.melt(thirsty_cells.obs, id_vars=['is_thirst'], value_vars=drug_columns)

# Exclude entries whose value is exactly 0.
nonzero_rows = long_df["value"] != 0
long_df = long_df[nonzero_rows]

# Violin plot of the remaining values per drug column.
# (A notched box plot with whis=(25, 75) was tried here previously.)
# NOTE(review): the title names two drugs while three columns are plotted.
plt.figure(figsize=(10, 6))
sns.violinplot(data=long_df, x='variable', y='value')
plt.title('Comparison of cAMP_CLOZAPINE and cAMP_OLANZAPINE in Thirsty Cells')
plt.xlabel('Treatment')
plt.ylabel('cAMP Levels')
plt.show()

# Statistical tests on clozapine vs. olanzapine.
# NOTE(review): these use the unfiltered columns, so zero-valued cells that were
# dropped from the plot are still included here — confirm this is intended.
thirsty_obs = thirsty_cells.obs
clozapine_values = thirsty_obs['cAMP_CLOZAPINE']
olanzapine_values = thirsty_obs['cAMP_OLANZAPINE']

# t-test with equal_var=False (Welch's variant).
welch_result = stats.ttest_ind(clozapine_values, olanzapine_values, equal_var=False)
t_stat, t_pval = welch_result

# Mann-Whitney U test.
mann_whitney_result = stats.mannwhitneyu(clozapine_values, olanzapine_values)
u_stat, u_pval = mann_whitney_result

print(f"T-test statistic: {t_stat}, p-value: {t_pval}")
print(f"Mann-Whitney U test statistic: {u_stat}, p-value: {u_pval}")

In [None]:
# Display the long-form table built for the violin plot.
long_df