### Author: yangshichen
### 注意：脚本仅供参考，使用前请仔细阅读

### 加载cpdb

In [10]:
import os
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
from scipy.io import mmread
from scipy.sparse import csr_matrix
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
import cellphonedb

import warnings
warnings.filterwarnings("ignore")

In [11]:
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

In [12]:
# Scanpy log level: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3

# Print the scanpy header line (package versions) into the cell output.
sc.logging.print_header()

# Figure defaults applied through scanpy: 100 dpi on a white background.
sc.settings.set_figure_params(dpi=100, facecolor="white")

scanpy==1.9.8 anndata==0.9.2 umap==0.5.6 numpy==1.24.4 scipy==1.10.1 pandas==2.0.3 scikit-learn==1.3.2 statsmodels==0.14.1 pynndescent==0.5.12


### 读取数据（100%采样）

In [4]:
# Load the annotated PBMC object and reset .X to a copy of the 'counts' layer
# (presumably the raw counts -- confirm) before normalising.
adata = sc.read("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/HIV-pbmc2/pbmc_celltype.h5ad")
adata.X = adata.layers["counts"].copy()

# Normalise every cell to a total of 1e4, then apply log1p; both modify adata in place.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Bare last expression: display the AnnData summary as the cell output.
adata

normalizing counts per cell
    finished (0:06:09)


AnnData object with n_obs × n_vars = 2744009 × 21679
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'doublet', 'batch', 'sample', 'stage', 'experiments', 'age', 'total_counts_rp', 'pct_counts_rp', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ncRNA', 'pct_counts_ncRNA', 'total_counts_LOC', 'pct_counts_LOC', 'total_counts_erccs', 'pct_counts_erccs', 'celltype_L1', 'phase_ordered', 'celltype_L3', 'celltype_L1_5', 'celltype_L2'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'age_colors', 'celltype_L2_colors', 'experiments_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'phase_ordered_colors', 'stage_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

### cpdb数据分组提取count和meta信息

In [5]:
# Keep only the cells whose `stage` is "HDs". Boolean indexing yields a view of
# `adata` (the cell output reads "View of AnnData object"), not an independent copy.
adata_HDs = adata[adata.obs["stage"].isin(["HDs"])]
adata_HDs

View of AnnData object with n_obs × n_vars = 1167681 × 21679
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'doublet', 'batch', 'sample', 'stage', 'experiments', 'age', 'total_counts_rp', 'pct_counts_rp', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ncRNA', 'pct_counts_ncRNA', 'total_counts_LOC', 'pct_counts_LOC', 'total_counts_erccs', 'pct_counts_erccs', 'celltype_L1', 'phase_ordered', 'celltype_L3', 'celltype_L1_5', 'celltype_L2'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'age_colors', 'celltype_L2_colors', 'experiments_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'phase_ordered_colors', 'stage_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [6]:
# Keep only the cells whose `stage` is "IRs". Boolean indexing yields a view of
# `adata` (the cell output reads "View of AnnData object"), not an independent copy.
adata_IRs = adata[adata.obs["stage"].isin(["IRs"])]
adata_IRs

View of AnnData object with n_obs × n_vars = 846332 × 21679
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'doublet', 'batch', 'sample', 'stage', 'experiments', 'age', 'total_counts_rp', 'pct_counts_rp', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ncRNA', 'pct_counts_ncRNA', 'total_counts_LOC', 'pct_counts_LOC', 'total_counts_erccs', 'pct_counts_erccs', 'celltype_L1', 'phase_ordered', 'celltype_L3', 'celltype_L1_5', 'celltype_L2'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'age_colors', 'celltype_L2_colors', 'experiments_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'phase_ordered_colors', 'stage_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [7]:
# Keep only the cells whose `stage` is "INRs". Boolean indexing yields a view of
# `adata` (the cell output reads "View of AnnData object"), not an independent copy.
adata_INRs = adata[adata.obs["stage"].isin(["INRs"])]
adata_INRs

View of AnnData object with n_obs × n_vars = 729996 × 21679
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_score', 'doublet', 'batch', 'sample', 'stage', 'experiments', 'age', 'total_counts_rp', 'pct_counts_rp', 'total_counts_hb', 'pct_counts_hb', 'total_counts_ncRNA', 'pct_counts_ncRNA', 'total_counts_LOC', 'pct_counts_LOC', 'total_counts_erccs', 'pct_counts_erccs', 'celltype_L1', 'phase_ordered', 'celltype_L3', 'celltype_L1_5', 'celltype_L2'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'age_colors', 'celltype_L2_colors', 'experiments_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'phase_ordered_colors', 'stage_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

#### Count

In [8]:
# Save each per-stage subset as an .h5ad file; these paths are later passed to
# CellPhoneDB as `counts_file_path`.
# Despite the "_count" file names, .X at this point is the matrix produced by
# normalize_total + log1p in the loading cell.
for stage_subset, counts_path in (
    (adata_HDs, '/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_HDs_count.h5ad'),
    (adata_IRs, '/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_IRs_count.h5ad'),
    (adata_INRs, '/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_INRs_count.h5ad'),
):
    stage_subset.write(counts_path)

In [9]:
# Remove the notebook-level name for the full object; only the per-stage subsets
# are used from here on.
# NOTE(review): adata_HDs / adata_IRs / adata_INRs were created as views of `adata`
# and presumably still hold a reference to it, so this may not actually release
# the memory -- confirm against the anndata documentation on views.
del adata

#### Meta

In [10]:
# One single-column table (celltype_L2) per stage, indexed like the subset's .obs.
adata_HDs_obs_L2, adata_IRs_obs_L2, adata_INRs_obs_L2 = (
    stage_subset.obs[['celltype_L2']]
    for stage_subset in (adata_HDs, adata_IRs, adata_INRs)
)

In [11]:
# Write each celltype_L2 table as a tab-separated file (index first, then the
# celltype_L2 column); these paths are later passed to CellPhoneDB as `meta_file_path`.
for meta_table, meta_path in (
    (adata_HDs_obs_L2, "/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_HDs_meta_L2.txt"),
    (adata_IRs_obs_L2, "/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_IRs_meta_L2.txt"),
    (adata_INRs_obs_L2, "/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_INRs_meta_L2.txt"),
):
    meta_table.to_csv(meta_path, sep='\t')

In [12]:
# One single-column table (celltype_L3) per stage, indexed like the subset's .obs.
adata_HDs_obs_L3, adata_IRs_obs_L3, adata_INRs_obs_L3 = (
    stage_subset.obs[['celltype_L3']]
    for stage_subset in (adata_HDs, adata_IRs, adata_INRs)
)

In [13]:
# Write each celltype_L3 table as a tab-separated file (index first, then the
# celltype_L3 column) under the L3 meta directory.
for meta_table, meta_path in (
    (adata_HDs_obs_L3, "/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L3/meta_file_path/adata_HDs_meta_L3.txt"),
    (adata_IRs_obs_L3, "/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L3/meta_file_path/adata_IRs_meta_L3.txt"),
    (adata_INRs_obs_L3, "/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L3/meta_file_path/adata_INRs_meta_L3.txt"),
):
    meta_table.to_csv(meta_path, sep='\t')

### 分组运行cpdb

#### HDs

In [4]:
# CellPhoneDB statistical analysis for the HDs subset, using the celltype_L2 meta file.
# The recorded run took about 6 h 15 min for the 1000 iterations on 60 threads.
# No debug_seed is passed (the run log reports Debug-seed:-1), so the permutations
# are presumably unseeded -- confirm if exact reproducibility is required.
cpdb_results_HDs = cpdb_statistical_analysis_method.call(
    # Inputs: interaction database, cell -> cell-type table, expression matrix.
    cpdb_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/cellphonedb_NATMI.ZIP",
    meta_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_HDs_meta_L2.txt",
    counts_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_HDs_count.h5ad",
    counts_data="hgnc_symbol",   # presumably: genes in the counts file are named by HGNC symbol
    # Statistical settings.
    iterations=1000,
    threshold=0.1,
    pvalue=0.05,
    result_precision=3,
    score_interactions=True,     # the run also saves an interaction_scores table
    # Execution and output.
    threads=60,
    separator="|",               # presumably joins the two cell-type names in result columns -- confirm
    output_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs",
    output_suffix="HDs",         # appended to every saved file name, e.g. ..._pvalues_HDs.txt
)

Reading user files...
The following user files were loaded successfully:
/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_HDs_count.h5ad
/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_HDs_meta_L2.txt
[ ][CORE][21/06/24-11:18:05][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:-1 Threads:60 Precision:3
[ ][CORE][21/06/24-11:18:44][INFO] Running Real Analysis
[ ][CORE][21/06/24-11:18:44][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [6:14:50<00:00, 22.49s/it]   


[ ][CORE][21/06/24-17:33:48][INFO] Building Pvalues result
[ ][CORE][21/06/24-17:33:51][INFO] Building results
[ ][CORE][21/06/24-17:33:53][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 28/28 [00:38<00:00,  1.36s/it]

[ ][CORE][21/06/24-17:34:37][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 28/28 [00:09<00:00,  2.86it/s]


[ ][CORE][21/06/24-17:34:50][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 784/784 [01:24<00:00,  9.25it/s]


Saved deconvoluted to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/statistical_analysis_deconvoluted_HDs.txt
Saved deconvoluted_percents to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/statistical_analysis_deconvoluted_percents_HDs.txt
Saved means to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/statistical_analysis_means_HDs.txt
Saved pvalues to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/statistical_analysis_pvalues_HDs.txt
Saved significant_means to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/statistical_analysis_significant_means_HDs.txt
Saved interaction_scores to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/statistical_analysis_interaction_scores_HDs.txt


In [5]:
# ktplotspy is imported here (not in the top import cell); the IRs and INRs
# heatmap cells further down reuse this `kpy` name.
import ktplotspy as kpy

# Build the significant-interaction summary tables from the HDs p-values.
# Fix: the title previously read "IRs ..." (copy-paste slip) although the data is
# the HDs result.
# NOTE(review): with return_tables=True ktplotspy appears to return the tables
# instead of drawing the heatmap, so figsize/title may be unused -- confirm.
result = kpy.plot_cpdb_heatmap(
    pvals=cpdb_results_HDs['pvalues'],
    degs_analysis=False,
    figsize=(15, 15),
    title="HDs Sum of significant interactions",
    symmetrical=False,
    return_tables=True,
)

# Pull the two tables of interest out of the returned mapping.
interaction_count = result["interaction_count"]
count_network = result["count_network"]

# Save both tables as CSV files, keeping the index column.
interaction_count.to_csv("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/interaction_count_HDs.csv", index=True)
count_network.to_csv("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/HDs/count_network_HDs.csv", index=True)

#### IRs

In [6]:
# CellPhoneDB statistical analysis for the IRs subset, using the celltype_L2 meta file.
# The recorded run took about 4 h 35 min for the 1000 iterations on 60 threads.
# No debug_seed is passed (the run log reports Debug-seed:-1), so the permutations
# are presumably unseeded -- confirm if exact reproducibility is required.
cpdb_results_IRs = cpdb_statistical_analysis_method.call(
    # Inputs: interaction database, cell -> cell-type table, expression matrix.
    cpdb_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/cellphonedb_NATMI.ZIP",
    meta_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_IRs_meta_L2.txt",
    counts_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_IRs_count.h5ad",
    counts_data="hgnc_symbol",   # presumably: genes in the counts file are named by HGNC symbol
    # Statistical settings.
    iterations=1000,
    threshold=0.1,
    pvalue=0.05,
    result_precision=3,
    score_interactions=True,     # the run also saves an interaction_scores table
    # Execution and output.
    threads=60,
    separator="|",               # presumably joins the two cell-type names in result columns -- confirm
    output_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs",
    output_suffix="IRs",         # appended to every saved file name, e.g. ..._pvalues_IRs.txt
)

Reading user files...
The following user files were loaded successfully:
/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_IRs_count.h5ad
/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_IRs_meta_L2.txt
[ ][CORE][21/06/24-17:44:22][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:-1 Threads:60 Precision:3
[ ][CORE][21/06/24-17:45:04][INFO] Running Real Analysis
[ ][CORE][21/06/24-17:45:04][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [4:34:37<00:00, 16.48s/it] 


[ ][CORE][21/06/24-22:19:57][INFO] Building Pvalues result
[ ][CORE][21/06/24-22:19:58][INFO] Building results
[ ][CORE][21/06/24-22:19:59][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 28/28 [00:26<00:00,  1.04it/s]

[ ][CORE][21/06/24-22:20:28][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 28/28 [00:06<00:00,  4.05it/s]


[ ][CORE][21/06/24-22:20:37][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 784/784 [00:43<00:00, 18.04it/s]


Saved deconvoluted to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/statistical_analysis_deconvoluted_IRs.txt
Saved deconvoluted_percents to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/statistical_analysis_deconvoluted_percents_IRs.txt
Saved means to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/statistical_analysis_means_IRs.txt
Saved pvalues to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/statistical_analysis_pvalues_IRs.txt
Saved significant_means to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/statistical_analysis_significant_means_IRs.txt
Saved interaction_scores to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/statistical_analysis_interaction_scores_IRs.txt


In [7]:
# Build the significant-interaction summary tables from the IRs p-values;
# return_tables=True asks ktplotspy to hand back the underlying tables.
result = kpy.plot_cpdb_heatmap(
    pvals=cpdb_results_IRs['pvalues'],
    degs_analysis=False,
    figsize=(15, 15),
    title="IRs Sum of significant interactions",
    symmetrical=False,
    return_tables=True,
)

# Pull the two tables of interest out of the returned mapping.
interaction_count = result["interaction_count"]
count_network = result["count_network"]

# Save both tables as CSV files, keeping the index column.
interaction_count.to_csv("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/interaction_count_IRs.csv", index=True)
count_network.to_csv("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/IRs/count_network_IRs.csv", index=True)

#### INRs

In [8]:
# CellPhoneDB statistical analysis for the INRs subset, using the celltype_L2 meta file.
# The recorded run took about 3 h 47 min for the 1000 iterations on 60 threads.
# No debug_seed is passed (the run log reports Debug-seed:-1), so the permutations
# are presumably unseeded -- confirm if exact reproducibility is required.
cpdb_results_INRs = cpdb_statistical_analysis_method.call(
    # Inputs: interaction database, cell -> cell-type table, expression matrix.
    cpdb_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/cellphonedb_NATMI.ZIP",
    meta_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_INRs_meta_L2.txt",
    counts_file_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_INRs_count.h5ad",
    counts_data="hgnc_symbol",   # presumably: genes in the counts file are named by HGNC symbol
    # Statistical settings.
    iterations=1000,
    threshold=0.1,
    pvalue=0.05,
    result_precision=3,
    score_interactions=True,     # the run also saves an interaction_scores table
    # Execution and output.
    threads=60,
    separator="|",               # presumably joins the two cell-type names in result columns -- confirm
    output_path="/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs",
    output_suffix="INRs",        # appended to every saved file name, e.g. ..._pvalues_INRs.txt
)

Reading user files...
The following user files were loaded successfully:
/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/counts_file_path/adata_INRs_count.h5ad
/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/meta_file_path/adata_INRs_meta_L2.txt
[ ][CORE][21/06/24-22:25:25][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:-1 Threads:60 Precision:3
[ ][CORE][21/06/24-22:25:48][INFO] Running Real Analysis
[ ][CORE][21/06/24-22:25:48][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [3:46:38<00:00, 13.60s/it] 


[ ][CORE][22/06/24-02:12:36][INFO] Building Pvalues result
[ ][CORE][22/06/24-02:12:38][INFO] Building results
[ ][CORE][22/06/24-02:12:38][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 28/28 [00:23<00:00,  1.18it/s]

[ ][CORE][22/06/24-02:13:04][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 28/28 [00:06<00:00,  4.63it/s]


[ ][CORE][22/06/24-02:13:12][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 784/784 [00:40<00:00, 19.13it/s]


Saved deconvoluted to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/statistical_analysis_deconvoluted_INRs.txt
Saved deconvoluted_percents to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/statistical_analysis_deconvoluted_percents_INRs.txt
Saved means to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/statistical_analysis_means_INRs.txt
Saved pvalues to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/statistical_analysis_pvalues_INRs.txt
Saved significant_means to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/statistical_analysis_significant_means_INRs.txt
Saved interaction_scores to /media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/statistical_analysis_interaction_scores_INRs.txt


In [9]:
# Build the significant-interaction summary tables from the INRs p-values;
# return_tables=True asks ktplotspy to hand back the underlying tables.
result = kpy.plot_cpdb_heatmap(
    pvals=cpdb_results_INRs['pvalues'],
    degs_analysis=False,
    figsize=(15, 15),
    title="INRs Sum of significant interactions",
    symmetrical=False,
    return_tables=True,
)

# Pull the two tables of interest out of the returned mapping.
interaction_count = result["interaction_count"]
count_network = result["count_network"]

# Save both tables as CSV files, keeping the index column.
interaction_count.to_csv("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/interaction_count_INRs.csv", index=True)
count_network.to_csv("/media/AnalysisDisk1/Yanshichen/0_HIV_RNA/cpdb/L2/out_file_path/INRs/count_network_INRs.csv", index=True)