# Cell-Cell Communication

Comparing cell-to-cell communication in plaques between sorted and unsorted samples for the sort vs. unsort paper.

Downsample each cell type to the same number of cells in both conditions to make comparisons easier:


In [1]:
#Imports:
import pickle
from datetime import datetime

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
import session_info
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Set up data working directory:
#Both roots are absolute paths and end in '/', so sub-paths can be appended directly.
data_path = '/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data/'
#figure_path previously lacked the leading '/', which made it relative to the current working directory.
figure_path = '/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/figures/'
figure_data = figure_path + "figure_data/"

%matplotlib inline

In [2]:
#Output session Info
#(session_info is imported with the other packages in the first cell)
session_info.show()

In [3]:
#Output package versions:
#Prints the scanpy / anndata / numpy / ... versions so the environment is recorded with the outputs.
sc.logging.print_header()

scanpy==1.10.4 anndata==0.11.2 umap==0.5.7 numpy==2.0.2 scipy==1.15.0 pandas==2.2.3 scikit-learn==1.6.0 statsmodels==0.14.4 pynndescent==0.5.13


In [4]:
#Record when this notebook was run
#Timestamp is formatted as day/month/year followed by a 24-hour clock time.
now = datetime.now()
dt_string = f"{now:%d/%m/%Y %H:%M:%S}"
print("date and time =", dt_string)

date and time = 04/02/2025 19:37:32


In [5]:
#Load data:
#Reads the AnnData object from data_path.
adata = sc.read_h5ad(data_path + "all_sort_unsort_fine_new_colors.h5ad")

In [6]:
#Replace .X with the raw data (i.e. the log-normalised data):
#NOTE(review): the object returned by to_adata() presumably no longer carries .raw, so re-running
#this cell without reloading adata first would fail - confirm.
adata = adata.raw.to_adata()

In [7]:
#Number of cells per condition (obs column 'type') before downsampling:
adata.obs['type'].value_counts()

type
unsorted    12088
sorted       8205
Name: count, dtype: int64

In [8]:
#Cells per fine cluster in each condition before downsampling:
pd.crosstab(adata.obs['fine_clustering'],
           adata.obs['type'])

type,sorted,unsorted
fine_clustering,Unnamed: 1_level_1,Unnamed: 2_level_1
C1Q+ Macro.,120,836
CD4+ Teff,409,529
CD4+ Tnaive,1913,2534
CD4+ Treg,290,359
CD4+/CD8+ Trm/exh,128,138
CD8+ Tem,1980,2585
CD8+ Temra,278,350
CD8+ Tnaive,104,102
CD8+ Trm,418,595
CD16+ NK,681,692


In [9]:
##DOWNSAMPLING CODE##
# For every cell type, keep the same number of cells from each condition:
# the smaller of the two per-condition counts, drawn at random without replacement.
condition_col = 'type'
cell_type_col = 'fine_clustering'

# Fixed seed so the same cells are drawn on every run
# (Used 87 previously | 837 2nd sample)
random_seed = 87
np.random.seed(random_seed)

# One view of the data per condition
adata_cond1 = adata[adata.obs[condition_col] == 'sorted']
adata_cond2 = adata[adata.obs[condition_col] == 'unsorted']

# Cell types present in the full dataset
cell_types = adata.obs[cell_type_col].unique()

downsampled_cells = []
for cell_type in cell_types:
    # Cells of this type within each condition (sorted first, then unsorted)
    per_condition = [
        cond[cond.obs[cell_type_col] == cell_type]
        for cond in (adata_cond1, adata_cond2)
    ]

    # Sample size is set by the condition with fewer cells of this type
    n_cells = min(subset.shape[0] for subset in per_condition)

    # Draw row positions for the sorted subset first, then the unsorted one,
    # so the random stream is consumed in a fixed order
    for subset in per_condition:
        picked_rows = np.random.choice(subset.shape[0], n_cells, replace=False)
        downsampled_cells.append(subset[picked_rows])

# Stack all sampled subsets into a single anndata object
downsampled_adata = ad.concat(downsampled_cells)

In [10]:
#Check the total number of cells per condition after downsampling:
downsampled_adata.obs['type'].value_counts()

type
sorted      8057
unsorted    8057
Name: count, dtype: int64

In [11]:
#Check the per-cluster cell counts in each condition after downsampling:
pd.crosstab(downsampled_adata.obs['fine_clustering'],
           downsampled_adata.obs['type'])

type,sorted,unsorted
fine_clustering,Unnamed: 1_level_1,Unnamed: 2_level_1
C1Q+ Macro.,120,120
CD4+ Teff,409,409
CD4+ Tnaive,1913,1913
CD4+ Treg,290,290
CD4+/CD8+ Trm/exh,128,128
CD8+ Tem,1980,1980
CD8+ Temra,278,278
CD8+ Tnaive,102,102
CD8+ Trm,418,418
CD16+ NK,681,681


In [12]:
#Rebind adata to an independent copy of the downsampled object; every cell below works on the downsampled data.
#NOTE: this overwrites the full dataset held in adata - re-run the load cells to get it back.
adata = downsampled_adata.copy()

In [13]:
#Display a summary of the object now bound to adata:
adata

AnnData object with n_obs × n_vars = 16114 × 30671
    obs: 'sample', 'species', 'gene_count', 'tscp_count', 'mread_count', 'bc1_wind', 'bc2_wind', 'bc3_wind', 'bc1_well', 'bc2_well', 'bc3_well', 'n_counts', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier', 'leiden', 'predicted_labels', 'majority_voting', 'conf_score', 'over_clustering', 'celltypist_cell_label_coarse', 'celltypist_conf_score_coarse', 'celltypist_cell_label_fine', 'celltypist_conf_score_fine', 'type', 'plaque', 'initial_clustering', 'fine_temp', 'fine_clustering', 'fine_temp2', 'fine_temp3', 'fine_temp4', 'fine_temp5', 'pie_clustering'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'

### Sorted

In [14]:
#Subset the data to the 'sorted' cells (independent copy); the cpdb input files are written in the following cells:
bdata = adata[adata.obs['type']=='sorted'].copy()

In [15]:
#Write required files to cpdb analysis folder to use later:
#Counts object - carries only the expression matrix, indexed by the cell barcodes and gene names of bdata:
adata1 = ad.AnnData(
    X=bdata.X,
    obs=pd.DataFrame(index=bdata.obs.index),
    var=pd.DataFrame(index=bdata.var.index),
)
adata1.write_h5ad(data_path + "cpdb/sorted_log_norm_down.h5ad", compression='gzip')
adata1

AnnData object with n_obs × n_vars = 8057 × 30671

In [16]:
#Write required files to cpdb analysis folder to use later:
#Metadata object - one row per cell barcode (index 'Cell') with its 'cell_type' label taken from 'fine_clustering'.
#Labels are read straight from bdata.obs; re-indexing bdata by its own barcodes was a no-op.
df_meta = pd.DataFrame(
    {'cell_type': list(bdata.obs['fine_clustering'])},
    index=pd.Index(bdata.obs.index, name='Cell'),
)
df_meta.to_csv(data_path + 'cpdb/sorted_meta_down.tsv', sep = '\t')

In [17]:
#Define variables for cpdb:
#data_path already ends in '/', so sub-paths are appended without a leading slash
#(avoids '//' in the resolved paths and matches how the input files were written above).
cpdb_file_path = '/home/jo388/rds/hpc-work/dependencies/cpdb/cellphonedb_v5.0.zip'
meta_file_path = data_path + 'cpdb/sorted_meta_down.tsv'
counts_file_path = data_path + 'cpdb/sorted_log_norm_down.h5ad'
microenvs_file_path = None
active_tf_path = None
out_path = data_path + 'cpdb/results'

In [18]:
#Run the CellphoneDB statistical analysis on the sorted, downsampled cells:
cpdb_results = cpdb_statistical_analysis_method.call(
    # --- input files ---
    cpdb_file_path=cpdb_file_path,              # mandatory: CellphoneDB database zip file.
    meta_file_path=meta_file_path,              # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path=counts_file_path,          # mandatory: normalized count matrix - path to the counts file, or an in-memory AnnData object.
    counts_data='hgnc_symbol',                  # gene annotation used in the counts matrix.
    active_tfs_file_path=active_tf_path,        # optional: cell types and their active TFs.
    microenvs_file_path=microenvs_file_path,    # optional (default: None): cells per microenvironment.
    # --- statistical analysis ---
    iterations=1000,                            # number of shufflings performed in the analysis.
    threshold=0.1,                              # min proportion of cells expressing a gene for it to be used in the analysis.
    pvalue=0.05,                                # p-value threshold for significance.
    score_interactions=True,                    # optional: whether to score interactions or not.
    result_precision=3,                         # rounding of the mean values in significant_means.
    debug_seed=42,                              # debug random seed. NOTE(review): presumably a negative value disables seeding - confirm against the CellphoneDB docs.
    threads=10,                                 # number of threads to use in the analysis.
    # --- subsampling (switched off) ---
    subsampling=False,                          # enable subsampling of the data (geometric sketching).
    subsampling_log=False,                      # (mandatory) enable subsampling log1p for non log-transformed data inputs.
    subsampling_num_pc=100,                     # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells=1000,                 # number of cells to subsample (integer) (default: 1/3 of the dataset).
    # --- output ---
    separator='|',                              # string separating the cells in the results dataframes "cellA|CellB".
    debug=False,                                # saves all intermediate tables employed during the analysis in pkl format.
    output_path=out_path,                       # path to save results.
    output_suffix='sorted_down',                # replaces the timestamp in the output file names with a user-defined string (default: None).
)

Reading user files...
The following user files were loaded successfully:
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/sorted_log_norm_down.h5ad
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/sorted_meta_down.tsv
[ ][CORE][04/02/25-19:37:40][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:42 Threads:10 Precision:3
[ ][CORE][04/02/25-19:37:41][INFO] Running Real Analysis
[ ][CORE][04/02/25-19:37:41][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [00:57<00:00, 17.53it/s]

[ ][CORE][04/02/25-19:38:38][INFO] Building Pvalues result





[ ][CORE][04/02/25-19:38:39][INFO] Building results
[ ][CORE][04/02/25-19:38:39][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 27/27 [00:00<00:00, 206.14it/s]

[ ][CORE][04/02/25-19:38:40][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 27/27 [00:00<00:00, 727.71it/s]


[ ][CORE][04/02/25-19:38:40][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 729/729 [00:36<00:00, 20.15it/s]


Saved deconvoluted to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_sorted_down.txt
Saved deconvoluted_percents to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_percents_sorted_down.txt
Saved means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_means_sorted_down.txt
Saved pvalues to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_pvalues_sorted_down.txt
Saved significant_means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_significant_means_sorted_down.txt
Saved interaction_scores to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_interaction_scores_sorted_down.txt


In [19]:
#Save cpdb_results object to import later for analysis:
#(pickle is imported with the other packages in the first cell)
#NOTE: only unpickle this file from a trusted location - pickle.load can execute arbitrary code.
with open(data_path + 'cpdb/cpdb_sorted_data_down.pkl', 'wb') as pkl_file:
    pickle.dump(cpdb_results, pkl_file)

### Filtered:

In [20]:
#Now remove SMC and endothelial cells as these are enriched in bead sorted compared to facs sorted:
#.copy() materialises the subset (as when bdata was first created) instead of leaving bdata as a view.
bdata = bdata[~bdata.obs['fine_clustering'].isin(['Endothelial','SMCs'])].copy()

In [21]:
#Write required files to cpdb analysis folder to use later:
#Counts object - carries only the expression matrix, indexed by the cell barcodes and gene names of bdata:
adata1 = ad.AnnData(
    X=bdata.X,
    obs=pd.DataFrame(index=bdata.obs.index),
    var=pd.DataFrame(index=bdata.var.index),
)
adata1.write_h5ad(data_path + "cpdb/sorted_log_norm_filtered_down.h5ad", compression='gzip')
adata1

AnnData object with n_obs × n_vars = 8028 × 30671

In [22]:
#Write required files to cpdb analysis folder to use later:
#Metadata object - one row per cell barcode (index 'Cell') with its 'cell_type' label taken from 'fine_clustering'.
#Labels are read straight from bdata.obs; re-indexing bdata by its own barcodes was a no-op.
df_meta = pd.DataFrame(
    {'cell_type': list(bdata.obs['fine_clustering'])},
    index=pd.Index(bdata.obs.index, name='Cell'),
)
df_meta.to_csv(data_path + 'cpdb/sorted_meta_filtered_down.tsv', sep = '\t')

In [23]:
#Define variables for cpdb:
#data_path already ends in '/', so sub-paths are appended without a leading slash
#(avoids '//' in the resolved paths and matches how the input files were written above).
cpdb_file_path = '/home/jo388/rds/hpc-work/dependencies/cpdb/cellphonedb_v5.0.zip'
meta_file_path = data_path + 'cpdb/sorted_meta_filtered_down.tsv'
counts_file_path = data_path + 'cpdb/sorted_log_norm_filtered_down.h5ad'
microenvs_file_path = None
active_tf_path = None
out_path = data_path + 'cpdb/results'

In [24]:
#Run the CellphoneDB statistical analysis on the sorted, downsampled cells after filtering:
cpdb_results = cpdb_statistical_analysis_method.call(
    # --- input files ---
    cpdb_file_path=cpdb_file_path,              # mandatory: CellphoneDB database zip file.
    meta_file_path=meta_file_path,              # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path=counts_file_path,          # mandatory: normalized count matrix - path to the counts file, or an in-memory AnnData object.
    counts_data='hgnc_symbol',                  # gene annotation used in the counts matrix.
    active_tfs_file_path=active_tf_path,        # optional: cell types and their active TFs.
    microenvs_file_path=microenvs_file_path,    # optional (default: None): cells per microenvironment.
    # --- statistical analysis ---
    iterations=1000,                            # number of shufflings performed in the analysis.
    threshold=0.1,                              # min proportion of cells expressing a gene for it to be used in the analysis.
    pvalue=0.05,                                # p-value threshold for significance.
    score_interactions=True,                    # optional: whether to score interactions or not.
    result_precision=3,                         # rounding of the mean values in significant_means.
    debug_seed=42,                              # debug random seed. NOTE(review): presumably a negative value disables seeding - confirm against the CellphoneDB docs.
    threads=10,                                 # number of threads to use in the analysis.
    # --- subsampling (switched off) ---
    subsampling=False,                          # enable subsampling of the data (geometric sketching).
    subsampling_log=False,                      # (mandatory) enable subsampling log1p for non log-transformed data inputs.
    subsampling_num_pc=100,                     # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells=1000,                 # number of cells to subsample (integer) (default: 1/3 of the dataset).
    # --- output ---
    separator='|',                              # string separating the cells in the results dataframes "cellA|CellB".
    debug=False,                                # saves all intermediate tables employed during the analysis in pkl format.
    output_path=out_path,                       # path to save results.
    output_suffix='sorted_filtered_down',       # replaces the timestamp in the output file names with a user-defined string (default: None).
)

Reading user files...
The following user files were loaded successfully:
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/sorted_log_norm_filtered_down.h5ad
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/sorted_meta_filtered_down.tsv
[ ][CORE][04/02/25-19:39:22][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:42 Threads:10 Precision:3
[ ][CORE][04/02/25-19:39:22][INFO] Running Real Analysis
[ ][CORE][04/02/25-19:39:22][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [00:54<00:00, 18.22it/s]

[ ][CORE][04/02/25-19:40:17][INFO] Building Pvalues result





[ ][CORE][04/02/25-19:40:18][INFO] Building results
[ ][CORE][04/02/25-19:40:18][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 25/25 [00:00<00:00, 228.11it/s]

[ ][CORE][04/02/25-19:40:19][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 25/25 [00:00<00:00, 678.19it/s]


[ ][CORE][04/02/25-19:40:19][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 625/625 [00:29<00:00, 21.18it/s]


Saved deconvoluted to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_sorted_filtered_down.txt
Saved deconvoluted_percents to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_percents_sorted_filtered_down.txt
Saved means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_means_sorted_filtered_down.txt
Saved pvalues to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_pvalues_sorted_filtered_down.txt
Saved significant_means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_significant_means_sorted_filtered_down.txt
Saved interaction_scores to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_interaction_scores_sorted_filtered_down.txt


In [25]:
#Save cpdb_results object to import later for analysis:
#(pickle is imported with the other packages in the first cell)
#NOTE: only unpickle this file from a trusted location - pickle.load can execute arbitrary code.
with open(data_path + 'cpdb/cpdb_sorted_data_filtered_down.pkl', 'wb') as pkl_file:
    pickle.dump(cpdb_results, pkl_file)

### Unsorted:

In [26]:
#Subset the data to the 'unsorted' cells (independent copy); the cpdb input files are written in the following cells:
bdata = adata[adata.obs['type']=='unsorted'].copy()

In [27]:
#Write required files to cpdb analysis folder to use later:
#Counts object - carries only the expression matrix, indexed by the cell barcodes and gene names of bdata:
adata1 = ad.AnnData(
    X=bdata.X,
    obs=pd.DataFrame(index=bdata.obs.index),
    var=pd.DataFrame(index=bdata.var.index),
)
adata1.write_h5ad(data_path + "cpdb/unsorted_log_norm_down.h5ad", compression='gzip')
adata1

AnnData object with n_obs × n_vars = 8057 × 30671

In [28]:
#Write required files to cpdb analysis folder to use later:
#Metadata object - one row per cell barcode (index 'Cell') with its 'cell_type' label taken from 'fine_clustering'.
#Labels are read straight from bdata.obs; re-indexing bdata by its own barcodes was a no-op.
df_meta = pd.DataFrame(
    {'cell_type': list(bdata.obs['fine_clustering'])},
    index=pd.Index(bdata.obs.index, name='Cell'),
)
df_meta.to_csv(data_path + 'cpdb/unsorted_meta_down.tsv', sep = '\t')

In [29]:
#Define variables for cpdb:
#data_path already ends in '/', so sub-paths are appended without a leading slash
#(avoids '//' in the resolved paths and matches how the input files were written above).
cpdb_file_path = '/home/jo388/rds/hpc-work/dependencies/cpdb/cellphonedb_v5.0.zip'
meta_file_path = data_path + 'cpdb/unsorted_meta_down.tsv'
counts_file_path = data_path + 'cpdb/unsorted_log_norm_down.h5ad'
microenvs_file_path = None
active_tf_path = None
out_path = data_path + 'cpdb/results'

In [30]:
#Run the CellphoneDB statistical analysis on the unsorted, downsampled cells.
#(cpdb_statistical_analysis_method is imported in the first cell; the repeated import here was redundant.)
cpdb_results = cpdb_statistical_analysis_method.call(
    # --- input files ---
    cpdb_file_path=cpdb_file_path,              # mandatory: CellphoneDB database zip file.
    meta_file_path=meta_file_path,              # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path=counts_file_path,          # mandatory: normalized count matrix - path to the counts file, or an in-memory AnnData object.
    counts_data='hgnc_symbol',                  # gene annotation used in the counts matrix.
    active_tfs_file_path=active_tf_path,        # optional: cell types and their active TFs.
    microenvs_file_path=microenvs_file_path,    # optional (default: None): cells per microenvironment.
    # --- statistical analysis ---
    iterations=1000,                            # number of shufflings performed in the analysis.
    threshold=0.1,                              # min proportion of cells expressing a gene for it to be used in the analysis.
    pvalue=0.05,                                # p-value threshold for significance.
    score_interactions=True,                    # optional: whether to score interactions or not.
    result_precision=3,                         # rounding of the mean values in significant_means.
    debug_seed=42,                              # debug random seed. NOTE(review): presumably a negative value disables seeding - confirm against the CellphoneDB docs.
    threads=10,                                 # number of threads to use in the analysis.
    # --- subsampling (switched off) ---
    subsampling=False,                          # enable subsampling of the data (geometric sketching).
    subsampling_log=False,                      # (mandatory) enable subsampling log1p for non log-transformed data inputs.
    subsampling_num_pc=100,                     # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells=1000,                 # number of cells to subsample (integer) (default: 1/3 of the dataset).
    # --- output ---
    separator='|',                              # string separating the cells in the results dataframes "cellA|CellB".
    debug=False,                                # saves all intermediate tables employed during the analysis in pkl format.
    output_path=out_path,                       # path to save results.
    output_suffix='unsorted_down',              # replaces the timestamp in the output file names with a user-defined string (default: None).
)

Reading user files...
The following user files were loaded successfully:
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/unsorted_log_norm_down.h5ad
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/unsorted_meta_down.tsv
[ ][CORE][04/02/25-19:40:54][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:42 Threads:10 Precision:3
[ ][CORE][04/02/25-19:40:54][INFO] Running Real Analysis
[ ][CORE][04/02/25-19:40:55][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [00:56<00:00, 17.71it/s]

[ ][CORE][04/02/25-19:41:51][INFO] Building Pvalues result





[ ][CORE][04/02/25-19:41:52][INFO] Building results
[ ][CORE][04/02/25-19:41:52][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 27/27 [00:00<00:00, 230.43it/s]

[ ][CORE][04/02/25-19:41:53][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 27/27 [00:00<00:00, 704.79it/s]


[ ][CORE][04/02/25-19:41:53][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 729/729 [00:34<00:00, 21.04it/s]


Saved deconvoluted to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_unsorted_down.txt
Saved deconvoluted_percents to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_percents_unsorted_down.txt
Saved means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_means_unsorted_down.txt
Saved pvalues to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_pvalues_unsorted_down.txt
Saved significant_means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_significant_means_unsorted_down.txt
Saved interaction_scores to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_interaction_scores_unsorted_down.txt


### Filtered

In [31]:
#Save cpdb_results object to import later for analysis:
#(pickle is imported with the other packages in the first cell)
#NOTE: only unpickle this file from a trusted location - pickle.load can execute arbitrary code.
with open(data_path + 'cpdb/cpdb_unsorted_data_down.pkl', 'wb') as pkl_file:
    pickle.dump(cpdb_results, pkl_file)

In [32]:
#Now remove SMC and endothelial cells as these are enriched in bead sorted compared to facs sorted:
#.copy() materialises the subset (as when bdata was first created) instead of leaving bdata as a view.
bdata = bdata[~bdata.obs['fine_clustering'].isin(['Endothelial','SMCs'])].copy()

In [33]:
#Write required files to cpdb analysis folder to use later:
#Counts object - carries only the expression matrix, indexed by the cell barcodes and gene names of bdata:
adata1 = ad.AnnData(
    X=bdata.X,
    obs=pd.DataFrame(index=bdata.obs.index),
    var=pd.DataFrame(index=bdata.var.index),
)
adata1.write_h5ad(data_path + "cpdb/unsorted_log_norm_filtered_down.h5ad", compression='gzip')
adata1

AnnData object with n_obs × n_vars = 8028 × 30671

In [34]:
#Write required files to cpdb analysis folder to use later:
#Metadata object - one row per cell barcode (index 'Cell') with its 'cell_type' label taken from 'fine_clustering'.
#Labels are read straight from bdata.obs; re-indexing bdata by its own barcodes was a no-op.
df_meta = pd.DataFrame(
    {'cell_type': list(bdata.obs['fine_clustering'])},
    index=pd.Index(bdata.obs.index, name='Cell'),
)
df_meta.to_csv(data_path + 'cpdb/unsorted_meta_filtered_down.tsv', sep = '\t')

In [35]:
#Define variables for cpdb:
#data_path already ends in '/', so sub-paths are appended without a leading slash
#(avoids '//' in the resolved paths and matches how the input files were written above).
cpdb_file_path = '/home/jo388/rds/hpc-work/dependencies/cpdb/cellphonedb_v5.0.zip'
meta_file_path = data_path + 'cpdb/unsorted_meta_filtered_down.tsv'
counts_file_path = data_path + 'cpdb/unsorted_log_norm_filtered_down.h5ad'
microenvs_file_path = None
active_tf_path = None
out_path = data_path + 'cpdb/results'

In [36]:
#Run the CellphoneDB statistical analysis on the unsorted, downsampled cells after filtering:
cpdb_results = cpdb_statistical_analysis_method.call(
    # --- input files ---
    cpdb_file_path=cpdb_file_path,              # mandatory: CellphoneDB database zip file.
    meta_file_path=meta_file_path,              # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path=counts_file_path,          # mandatory: normalized count matrix - path to the counts file, or an in-memory AnnData object.
    counts_data='hgnc_symbol',                  # gene annotation used in the counts matrix.
    active_tfs_file_path=active_tf_path,        # optional: cell types and their active TFs.
    microenvs_file_path=microenvs_file_path,    # optional (default: None): cells per microenvironment.
    # --- statistical analysis ---
    iterations=1000,                            # number of shufflings performed in the analysis.
    threshold=0.1,                              # min proportion of cells expressing a gene for it to be used in the analysis.
    pvalue=0.05,                                # p-value threshold for significance.
    score_interactions=True,                    # optional: whether to score interactions or not.
    result_precision=3,                         # rounding of the mean values in significant_means.
    debug_seed=42,                              # debug random seed. NOTE(review): presumably a negative value disables seeding - confirm against the CellphoneDB docs.
    threads=10,                                 # number of threads to use in the analysis.
    # --- subsampling (switched off) ---
    subsampling=False,                          # enable subsampling of the data (geometric sketching).
    subsampling_log=False,                      # (mandatory) enable subsampling log1p for non log-transformed data inputs.
    subsampling_num_pc=100,                     # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells=1000,                 # number of cells to subsample (integer) (default: 1/3 of the dataset).
    # --- output ---
    separator='|',                              # string separating the cells in the results dataframes "cellA|CellB".
    debug=False,                                # saves all intermediate tables employed during the analysis in pkl format.
    output_path=out_path,                       # path to save results.
    output_suffix='unsorted_filtered_down',     # replaces the timestamp in the output file names with a user-defined string (default: None).
)

Reading user files...
The following user files were loaded successfully:
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/unsorted_log_norm_filtered_down.h5ad
/home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/unsorted_meta_filtered_down.tsv
[ ][CORE][04/02/25-19:42:34][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:42 Threads:10 Precision:3
[ ][CORE][04/02/25-19:42:34][INFO] Running Real Analysis
[ ][CORE][04/02/25-19:42:34][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [00:53<00:00, 18.59it/s]

[ ][CORE][04/02/25-19:43:28][INFO] Building Pvalues result





[ ][CORE][04/02/25-19:43:29][INFO] Building results
[ ][CORE][04/02/25-19:43:29][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 25/25 [00:00<00:00, 208.92it/s]

[ ][CORE][04/02/25-19:43:29][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 25/25 [00:00<00:00, 618.24it/s]


[ ][CORE][04/02/25-19:43:29][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 625/625 [00:29<00:00, 21.25it/s]


Saved deconvoluted to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_unsorted_filtered_down.txt
Saved deconvoluted_percents to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_deconvoluted_percents_unsorted_filtered_down.txt
Saved means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_means_unsorted_filtered_down.txt
Saved pvalues to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_pvalues_unsorted_filtered_down.txt
Saved significant_means to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_significant_means_unsorted_filtered_down.txt
Saved interaction_scores to /home/jo388/rds/rds-zhao-group-HvBBlpwx4dc/ELLIPSE/sort_unsort/data//cpdb/results/statistical_analysis_interaction_scores_unsorted_filtered_dow

In [37]:
#Save cpdb_results object to import later for analysis:
#(pickle is imported with the other packages in the first cell)
#NOTE: only unpickle this file from a trusted location - pickle.load can execute arbitrary code.
with open(data_path + 'cpdb/cpdb_unsorted_data_filtered_down.pkl', 'wb') as pkl_file:
    pickle.dump(cpdb_results, pkl_file)