## B. CellphoneDB:  Identify candidate cell-cell interactions

In [1]:
library(dplyr)
library(reshape2)
library(RColorBrewer)
library(harmony)
library(ggplot2)
library(pheatmap)
library(cowplot)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Rcpp



In [2]:
# Analysis parameters
filter_int_user_curated <- TRUE # keep only interactions whose annotation_strategy is 'user_curated'?
per_cutoff <- 0.1 # min % of cells in the cluster required with expression > 0 for the gene
pval_cutoff <- 0.001 # max adjusted p-value required to consider a gene as a DEG
logFC_cutoff <- 0.2 # min logFC to consider a gene as a DEG
# TRUE: both partners of an interaction must contain a differentially expressed gene.
# FALSE: one partner containing a differentially expressed gene is enough.
strict_DEG <- FALSE

In [3]:
# ---- CellphoneDB reference tables ----
# Gene table: provides the `uniprot` and `gene_name` columns used below
genes_cpDB <- read.delim('~/gonads/main/cellphoneDB/hsa_uniprot.txt', stringsAsFactors = FALSE, sep = '\t')

# Complex table: drop duplicated rows and prefix every complex name with 'complex:'
com_cpDB <- unique(read.csv('~/gonads/main/cellphoneDB/database/complex_curated_032021.tsv', stringsAsFactors = FALSE, sep = '\t'))
com_cpDB$complex_name <- paste0('complex:', com_cpDB$complex_name)

# Dictionary: complex name -> unique gene symbols of its members.
# Member identifiers are taken from columns 2 to 5 of the complex table.
Com2Gene <- lapply(com_cpDB$complex_name, function(cx) {
    member_ids <- unlist(com_cpDB[com_cpDB$complex_name == cx, 2:5])
    unique(genes_cpDB$gene_name[genes_cpDB$uniprot %in% member_ids])
})
names(Com2Gene) <- com_cpDB$complex_name

# ---- Interactions from the cellphoneDB means.txt output (first 10 columns only) ----
int_cpDB <- read.delim('~/gonads/main/cellphoneDB/out/means.txt', stringsAsFactors = FALSE)[, 1:10]
# MANDATORY: discard 'curated' interactions, which have been cleaned and renamed
int_cpDB <- subset(int_cpDB, annotation_strategy != 'curated')
# OPTIONAL: restrict to 'user_curated' interactions
if (filter_int_user_curated) {
    int_cpDB <- subset(int_cpDB, annotation_strategy == 'user_curated')
}

# Dictionary: interacting pair -> list(partner_a = genes, partner_b = genes).
# Fields are read by position from each row: 3/4 hold the partner names and
# 5/6 the gene names. An empty gene field marks a complex, whose member genes
# are then looked up in Com2Gene.
Int2Gene <- apply(int_cpDB, 1, function(int_row) {
    fields <- unname(int_row)
    genes_a <- if (fields[5] == '') Com2Gene[[fields[3]]] else fields[5]
    genes_b <- if (fields[6] == '') Com2Gene[[fields[4]]] else fields[6]
    list(partner_a = genes_a, partner_b = genes_b)
})
names(Int2Gene) <- int_cpDB$interacting_pair

In [4]:
# Expression percentages for the clusters included in the cellphoneDB run.
# percent.csv is read with its first column as row names (genes); every other
# column is treated as one cluster.
path_Exp <- 'cellphoneDB/input/percent.csv'

# Read the table and reshape it to long format: gene, cluster, percent
Per_df <- read.csv(path_Exp, stringsAsFactors = FALSE, row.names = 1)
# NOTE(review): reshape2's melt() method for matrices appears to take `as.is`
# rather than `factorsAsStrings`, so gene/cluster may come back as factors - confirm.
Per_df <- melt(as.matrix(Per_df), factorsAsStrings = TRUE)
names(Per_df) <- c('gene', 'cluster', 'percent')

# Keep gene/cluster entries reaching the expression cutoff, then sort by gene
Per_df <- Per_df[which(Per_df$percent >= per_cutoff), ]
Per_df <- Per_df[order(Per_df$gene), ]

# Dictionary: cluster -> genes expressed in that cluster
clusters <- unique(Per_df$cluster)
is_Exp <- lapply(clusters, function(cl) {
    Per_df$gene[which(Per_df$cluster == cl)]
})
names(is_Exp) <- clusters
# Number of expressed genes per cluster
sapply(is_Exp, length)
# Rewrite the 'pre.spermatogonia' label as 'pre-spermatogonia'
names(is_Exp) <- gsub('pre.spermatogonia', 'pre-spermatogonia', names(is_Exp))

In [5]:
# Differential-expression input.
# merged.csv must provide at least the columns used below:
# - gene: gene symbol
# - cluster: cell type
# - p_val_adj: adjusted p-value from the DE test
# - avg_logFC: log fold change from the DE test
path_DEGs <- 'cellphoneDB/input/DEGs/merged.csv'

# Load and keep cellphoneDB genes passing both DE cutoffs
DE_df <- read.csv(path_DEGs, stringsAsFactors = FALSE) %>%
    filter(gene %in% genes_cpDB$gene_name,
           p_val_adj < pval_cutoff,
           avg_logFC > logFC_cutoff)

# Build dictionary: cluster -> DE genes.
# Cluster labels are taken from the percent table, where 'pre-spermatogonia'
# is spelled 'pre.spermatogonia'. The `cluster` column of the DE table may use
# either spelling, so both are accepted; matching on the dotted form alone
# would leave that cluster without any DEG when the DE table uses the hyphen.
clusters <- unique(Per_df$cluster)
cluster_ids <- as.character(clusters)
cluster_labels <- gsub('pre.spermatogonia', 'pre-spermatogonia', cluster_ids)
is_DE <- lapply(seq_along(cluster_ids), function(i) {
    DE_df$gene[DE_df$cluster %in% c(cluster_ids[i], cluster_labels[i])]
})
names(is_DE) <- cluster_labels

# Number of DE genes per cluster
lengths(is_DE)

In [6]:
# All unordered pairs of distinct clusters, one pair per column (rows: celltype_A, celltype_B)
cluster_combinations <- combn(names(is_Exp), 2, simplify = TRUE)
ncol(cluster_combinations)

# Add self interactions: A--A
self_pairs <- vapply(names(is_Exp), function(x) rep(x, 2), character(2))
cluster_combinations <- cbind(self_pairs, cluster_combinations)


# We only want to test pairs including at least one cluster that has DEGs.
# is_DE has an entry for every cluster, so testing membership in names(is_DE)
# would keep every pair; only clusters with at least one DEG count here.
de_clusters <- names(is_DE)[lengths(is_DE) > 0]
keep <- apply(cluster_combinations, 2, function(x) any(x %in% de_clusters))
# drop = FALSE keeps the 2-row matrix shape even if a single pair survives
cluster_combinations <- cluster_combinations[, keep, drop = FALSE]

# # We only want to test pairs including at least one cluster named 'media'
# keep = apply(cluster_combinations, 2, function(x) any(x %in% 'Media'))
# cluster_combinations = cluster_combinations[, keep]
# ncol(cluster_combinations)

# Consider the reverse interaction too: A--B but B--A as well.
# Transposing lets unique() drop repeated pairs (rows), e.g. the self pairs.
cluster_combinations <- cbind(cluster_combinations, cluster_combinations[2:1, , drop = FALSE]) %>%
    t(.) %>%
    unique(.) %>%
    t(.)

In [7]:
# Microenvironment table: uses the columns `microenviroment` and `celltype`
micro_df <- read.csv('cellphoneDB/input/microenviroments/microenviroments.csv', stringsAsFactors = FALSE)
# One entry per microenvironment holding its cell types
microenviroments <- split(micro_df, micro_df$microenviroment) %>% sapply(., select, celltype)

# Keep only cluster pairs whose two members share a microenvironment.
# For every microenvironment, collect the column indices of the pairs with
# both cell types inside it.
within_idx <- lapply(microenviroments, function(mienv) {
    which(cluster_combinations[1, ] %in% mienv & cluster_combinations[2, ] %in% mienv)
})
# A pair found in several microenvironments must be selected once only,
# otherwise its column (and later its label) would be duplicated.
within_idx <- unique(unlist(within_idx, use.names = FALSE))
# drop = FALSE keeps the 2-row matrix shape even if a single pair survives
cluster_combinations <- cluster_combinations[, within_idx, drop = FALSE]
              

In [8]:
# Label every cluster pair as celltypeA---celltypeB
cluster_combinations_labels <- apply(cluster_combinations, 2, function(pair) {
    paste(pair, collapse = '---')
})
# Number of cluster pairs
ncol(cluster_combinations)

# Use the labels as column names so that pairs can be looked up by label
colnames(cluster_combinations) <- cluster_combinations_labels

In [9]:
# Zero-filled scaffold: one row per L/R interaction, one column per celltype pair
interaction_ids <- names(Int2Gene)
E <- matrix(0,
            nrow = length(interaction_ids),
            ncol = length(cluster_combinations_labels),
            dimnames = list(interaction_ids, cluster_combinations_labels))
# Size check
dim(E)

In [10]:
# E is a binary matrix: 1 means every gene of partner_a is expressed in
# celltype_A and every gene of partner_b is expressed in celltype_B.
for (Int in rownames(E)) { # iterate over interactions
    partners <- Int2Gene[[Int]] # looked up once per interaction
    # all() on an empty set is TRUE, so a partner without any resolved gene
    # (e.g. a complex missing from the dictionary) would be flagged as
    # expressed in every celltype. Leave such interactions at 0.
    if (length(partners$partner_a) == 0 || length(partners$partner_b) == 0) {
        next
    }
    for (clpair in colnames(E)) { # iterate over celltype pairs
        cl_A <- cluster_combinations[1, clpair] # celltype_A
        cl_B <- cluster_combinations[2, clpair] # celltype_B
        are_all_expressed <- all(partners$partner_a %in% is_Exp[[cl_A]]) && # partner_A genes in celltype_A
                             all(partners$partner_b %in% is_Exp[[cl_B]])    # partner_B genes in celltype_B
        if (are_all_expressed) {
            E[Int, clpair] <- 1
        }
    }
}

In [11]:
# Keep celltype pairs with at least one expressed interaction.
# drop = FALSE keeps E a matrix even when a single column survives.
keep <- colSums(E) > 0
E <- E[, keep, drop = FALSE]
# is it big?
dim(E)
sum(E)

In [12]:
# Keep interactions expressed in at least one celltype pair.
# drop = FALSE keeps E a matrix even when a single row survives.
keep <- rowSums(E) > 0
E <- E[keep, , drop = FALSE]
# is it big?
dim(E)
sum(E)

In [13]:
# DE starts as an all-zero copy of E (same dimensions and names).
# It will become a binary matrix where 1 marks an interaction that is
# expressed in the celltype pair (E == 1) and also has DE partner genes.
DE <- 0 * E

In [14]:
# Fill DE with 1 where an expressed interaction (E == 1) is supported by DEGs:
# - strict_DEG TRUE : a partner_a gene is DE in celltype_A AND a partner_b gene is DE in celltype_B
# - strict_DEG FALSE: a partner_a gene is DE in celltype_A OR a partner_b gene is DE in celltype_B
for (Int in rownames(DE)) { # iterate over interactions
    partners <- Int2Gene[[Int]] # looked up once per interaction
    for (clpair in colnames(DE)) { # iterate over celltype pairs
        # every member must be expressed, otherwise the DE status is irrelevant
        if (E[Int, clpair] != 1) {
            next
        }
        cl_A <- cluster_combinations[1, clpair] # celltype_A
        cl_B <- cluster_combinations[2, clpair] # celltype_B
        a_is_DE <- any(partners$partner_a %in% is_DE[[cl_A]]) # is any partner_A gene DE in celltype_A
        b_is_DE <- any(partners$partner_b %in% is_DE[[cl_B]]) # is any partner_B gene DE in celltype_B
        any_DE <- if (strict_DEG) a_is_DE && b_is_DE else a_is_DE || b_is_DE
        if (any_DE) {
            DE[Int, clpair] <- 1
        }
    }
}

In [15]:
# Keep celltype pairs with at least one DE-supported interaction.
# drop = FALSE keeps DE a matrix even when a single column or row survives.
keep <- colSums(DE) > 0
DE <- DE[, keep, drop = FALSE]

# Keep interactions that are DE-supported in at least one celltype pair
keep <- rowSums(DE) > 0
DE <- DE[keep, , drop = FALSE]

# is it big?
dim(DE)
sum(DE)


In [16]:
# Save the DE matrix; the file name records the p-value and logFC cutoffs used
out_file <- paste0('cellphoneDB/out/interactions_', pval_cutoff, '_logFC', logFC_cutoff, '.csv')
write.csv(DE, file = out_file)

In [17]:
# Show the retained interactions whose name contains 'RAr'
grep('RAr', rownames(DE), value = TRUE)



In [18]:
# Show the column names of DE (the retained celltype-pair labels)
colnames(DE)