In [153]:
library(dplyr)
library(RColorBrewer)
library(harmony)
library(ggplot2)
library(pheatmap)
library(cowplot)

In [154]:
# Define the analysis parameters used by the cells below
# TRUE is spelled out: T is an ordinary variable that can be reassigned
filter_int_user_curated = TRUE # Use only user_curated interactions?
# A gene counts as expressed in a cluster when percentExpr_cluster > per_cutoff
# NOTE(review): 0.1 is compared directly with percentExpr_cluster; confirm that
# column is a fraction (0-1) and not a percentage (0-100)
per_cutoff = 0.1
pval_cutoff = 0.001 # a gene is considered a DEG when adj.P.Val < pval_cutoff ...
logFC_cutoff = 0 # ... and logFC > logFC_cutoff

In [155]:
# CellPhoneDB reference tables: gene annotation and curated complexes
# Gene table (tab separated); its uniprot and gene_name columns are used below
genes_cpDB = read.delim('~/cellphoneDB/hsa_uniprot.txt', sep = '\t', stringsAsFactors = FALSE)

# Curated complexes: drop duplicated rows and prefix every name with 'complex:'
com_cpDB = read.csv('~/farm/CellPhoneDB-data_smallmolecules/data/sources/complex_curated.csv', stringsAsFactors = FALSE)
com_cpDB = unique(com_cpDB)
com_cpDB$complex_name = paste0('complex:', com_cpDB$complex_name)

# Dictionary: complex name -> unique gene symbols of its members.
# The member identifiers are the values found in columns 2 to 5 of com_cpDB;
# they are matched against the uniprot column of the gene table.
Com2Gene = lapply(com_cpDB$complex_name, function(cx) {
    member_ids = unlist(com_cpDB[com_cpDB$complex_name == cx, 2:5])
    unique(genes_cpDB$gene_name[genes_cpDB$uniprot %in% member_ids])
})
names(Com2Gene) = com_cpDB$complex_name
                  
# Interactions tested by cellphoneDB: first 10 columns of the out/means.txt file
int_cpDB = read.delim('data/cellphoneDB/out/means.txt', stringsAsFactors = FALSE)[, 1:10]
# MANDATORY: drop the "curated" interactions because they were cleaned and renamed
int_cpDB = int_cpDB[which(int_cpDB$annotation_strategy != 'curated'), ]
# OPTIONAL: restrict to the user_curated interactions
if (filter_int_user_curated) {
    int_cpDB = int_cpDB[which(int_cpDB$annotation_strategy == 'user_curated'), ]
}

# Dictionary: interaction -> list(partner_a = genes, partner_b = genes)
# Columns are read by position: 3/4 hold the partner A/B identifier and
# 5/6 hold the partner A/B gene name.
Int2Gene = lapply(seq_len(nrow(int_cpDB)), function(i) {
    genes_of = function(gene_col, partner_col) {
        gene = int_cpDB[[gene_col]][i]
        if (gene == '') {
            # empty gene name: the partner is a complex, take its members from the dictionary
            Com2Gene[[int_cpDB[[partner_col]][i]]]
        } else {
            # single gene: take the name from the file
            gene
        }
    }
    members_a = genes_of(5, 3)
    members_b = genes_of(6, 4)
    list(partner_a = members_a, partner_b = members_b)
})
names(Int2Gene) = int_cpDB$interacting_pair

In [156]:
# Percentage-expression tables, one file per cluster included in cellphone
# Put all the files in one folder. Each file needs at least these three columns
# (the DEG output files can be reused):
# - Gene: gene symbol
# - cluster: cell type
# - percentExpr_cluster: % of cells in the cluster with expression > 0 for the gene
path_Exp = '/home/jovyan/farm/endometrium/data/cellphoneDB/clusterDEGs_and_TFs/DEGs/'

# Read every file, use the gene symbols as row names (for convenience) and
# keep only the genes that are present in the cellphone gene table
# NOTE(review): read.csv guesses column types, so a cluster column holding only
# 'T' is presumably read as logical TRUE (a 'TRUE' label is renamed to 'Tcell'
# further down) - confirm
Per_files = list.files(path_Exp, pattern = 'DEGs.csv', full.names = TRUE)
Per_df = lapply(Per_files, function(csv_file) {
    tab = read.csv(csv_file, stringsAsFactors = FALSE)
    rownames(tab) = tab$Gene
    tab[tab$Gene %in% genes_cpDB$gene_name, , drop = FALSE]
})

# Dictionary: cluster -> expressed genes
# Each table is named after the first value of its cluster column
names(Per_df) = sapply(Per_df, function(x) x$cluster[1])
is_Exp = lapply(Per_df, function(x) x$Gene[which(x$percentExpr_cluster > per_cutoff)])

length(Per_df)
names(Per_df)

In [157]:
# Differential-expression tables, one file per cluster of interest
# Put the DE files of the clusters you are interested in in one folder.
# Each file needs at least these four columns (the DEG output files can be reused):
# - Gene: gene symbol
# - cluster: cell type
# - adj.P.Val: adjusted p-value from the DE test
# - logFC: from the DE test
path_DE = '/home/jovyan/farm/endometrium/data/cellphoneDB/clusterDEGs_and_TFs/DEGs_withinEpithelial/'

# Read every file, use the gene symbols as row names (for convenience) and
# keep only the genes that are present in the cellphone gene table
DE_files = list.files(path_DE, pattern = 'DEGs.csv', full.names = TRUE)
DE_df = lapply(DE_files, function(csv_file) {
    tab = read.csv(csv_file, stringsAsFactors = FALSE)
    rownames(tab) = tab$Gene
    tab[tab$Gene %in% genes_cpDB$gene_name, , drop = FALSE]
})

# Dictionary: cluster -> DEGs (adj.P.Val below pval_cutoff and logFC above logFC_cutoff)
# Each table is named after the first value of its cluster column
names(DE_df) = sapply(DE_df, function(x) x$cluster[1])
is_DE = lapply(DE_df, function(x) x$Gene[which(x$adj.P.Val < pval_cutoff & x$logFC > logFC_cutoff)])

length(DE_df)
names(DE_df)

In [158]:
# Enumerate the ordered cluster pairs (A, B) to test, one pair per column
cluster_ids = names(is_Exp)

# All pairs of two different clusters (each pair once; the reverse order is added below)
cluster_combinations = combn(cluster_ids, 2, simplify = TRUE)
ncol(cluster_combinations)

# Prepend the self interactions A--A
self_pairs = sapply(cluster_ids, function(x) rep(x, 2))
cluster_combinations = cbind(self_pairs, cluster_combinations)


# Only pairs with at least one cluster from the DE folder (here, epithelial) are tested
member_in_DE = matrix(cluster_combinations %in% names(is_DE), nrow = 2)
keep = colSums(member_in_DE) > 0
cluster_combinations = cluster_combinations[, keep]

# # We only want to test pairs including at least one cluster named 'media'
# keep = apply(cluster_combinations, 2, function(x) any(x %in% 'Media'))
# cluster_combinations = cluster_combinations[, keep]
# ncol(cluster_combinations)

# Add the reverse direction (B--A next to A--B) and drop the duplicated pairs
both_directions = cbind(cluster_combinations, cluster_combinations[2:1, ])
cluster_combinations = t(unique(t(both_directions)))

# Pair labels: celltypeA---celltypeB, carrying the column names of the pair matrix
cluster_combinations_labels = paste(cluster_combinations[1, ], cluster_combinations[2, ], sep = '---')
names(cluster_combinations_labels) = colnames(cluster_combinations)
ncol(cluster_combinations)

In [159]:
# Scaffold matrix of zeros: one row per L/R interaction, one column per celltype pair
scaffold_dimnames = list(names(Int2Gene), cluster_combinations_labels)
E = matrix(0,
           nrow = length(Int2Gene),
           ncol = length(cluster_combinations_labels),
           dimnames = scaffold_dimnames)
dim(E) # is it big?

In [160]:
# E is a binary matrix: E[interaction, pair] is 1 when every gene of partner A is
# expressed in celltype A and every gene of partner B is expressed in celltype B.

# Map every column of E to its cluster pair through the column label, so the
# lookup stays correct even if columns of E have been removed.
pair_idx = match(colnames(E), cluster_combinations_labels)
stopifnot(!anyNA(pair_idx))
pair_cl_A = cluster_combinations[1, pair_idx] # celltype_A of each column
pair_cl_B = cluster_combinations[2, pair_idx] # celltype_B of each column

for (Int in rownames(E)){ # iterate over interactions
    genes_A = Int2Gene[[Int]]$partner_a
    genes_B = Int2Gene[[Int]]$partner_b
    # A partner without any gene (e.g. a complex that is missing from the
    # dictionary) cannot be called expressed: all() of an empty vector is TRUE,
    # so it would otherwise pass the test below in every cluster.
    if (length(genes_A) == 0 || length(genes_B) == 0)
        next
    for (clpair in seq_len(ncol(E))){ # iterate over celltype pairs
        are_all_expressed = all(genes_A %in% is_Exp[[pair_cl_A[clpair]]]) &&  # are all partner_A genes expressed in celltype_A
                            all(genes_B %in% is_Exp[[pair_cl_B[clpair]]])     # are all partner_B genes expressed in celltype_B
        if (are_all_expressed)
            E[Int, clpair] = 1
    }
}

In [161]:
# keep celltypepairs (columns) with at least one expressed interaction
# drop = FALSE keeps E a matrix even when a single column survives
keep = colSums(E) > 0
E = E[, keep, drop = FALSE]
# is it big?
dim(E)
sum(E)

In [162]:
# keep interactions (rows) expressed in at least one celltype pair
# drop = FALSE keeps E a matrix even when a single row survives
keep = rowSums(E) > 0
E = E[keep, , drop = FALSE]
# is it big?
dim(E)
sum(E)

In [163]:
# DE starts as an all-zero copy of E (same dimensions and dimnames; E only holds 0/1)
# It will be a binary matrix where 1 means that all the genes of the interaction
# are expressed in the corresponding celltypes and at least one of them is a DEG
# in the celltypes of interest
DE = 0 * E

In [164]:
# Fill DE with 1 where at least one gene of the interaction is DE in its celltype
# and all the genes of the interaction are expressed (E is 1).

# DE inherits its columns from the filtered E, while cluster_combinations still
# holds every pair: map each DE column to its pair through the column label
# instead of its position, otherwise the clusters are shifted as soon as one
# celltype pair has been removed from E.
pair_idx = match(colnames(DE), cluster_combinations_labels)
stopifnot(!anyNA(pair_idx))

for (Int in rownames(DE)){ # iterate over interactions
    genes_A = Int2Gene[[Int]]$partner_a
    genes_B = Int2Gene[[Int]]$partner_b
    for (clpair in seq_len(ncol(DE))){ # iterate over celltype pairs
        cl_A = cluster_combinations[1, pair_idx[clpair]] # celltype_A
        cl_B = cluster_combinations[2, pair_idx[clpair]] # celltype_B
        any_DE = any(genes_A %in% is_DE[[cl_A]]) ||  # is any partner_A gene DE in celltype_A
                 any(genes_B %in% is_DE[[cl_B]])     # is any partner_B gene DE in celltype_B
        # E is looked up by label as well, so both matrices refer to the same pair
        if (any_DE && E[Int, colnames(DE)[clpair]] == 1) # check all members are expressed
            DE[Int, clpair] = 1
    }
}

In [165]:
# keep interactions (rows) flagged in at least one celltype pair
# drop = FALSE keeps DE a matrix even when a single row or column survives
keep = rowSums(DE) > 0
DE = DE[keep, , drop = FALSE]
# keep celltypepairs (columns) with at least one interaction
keep = colSums(DE) > 0
DE = DE[, keep, drop = FALSE]
# is it big?
dim(DE)
sum(DE)

In [166]:
# Display the filtered DE matrix (interactions in rows, celltype pairs in columns)
DE

Unnamed: 0,epithelial_ciliated---epithelial_ciliated,epithelial_glandular---epithelial_glandular,epithelial_LGR5---epithelial_LGR5,epithelial_LYPD1---epithelial_LYPD1,epithelial_proliferative---epithelial_proliferative,epithelial_WIF1_1---epithelial_WIF1_1,epithelial_WIF1_2---epithelial_WIF1_2,epithelial_ciliated---epithelial_glandular,epithelial_ciliated---epithelial_LGR5,epithelial_ciliated---epithelial_LYPD1,⋯,stromal_eS_S-G2M---epithelial_WIF1_1,TRUE---epithelial_WIF1_1,fibroblasts---epithelial_WIF1_2,myeloid---epithelial_WIF1_2,NK---epithelial_WIF1_2,stromal_dS---epithelial_WIF1_2,stromal_eS---epithelial_WIF1_2,stromal_eS_G2M---epithelial_WIF1_2,stromal_eS_S-G2M---epithelial_WIF1_2,TRUE---epithelial_WIF1_2
CCL4_CCR5,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CCL5_CCR5,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CCL5_CCR1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
HLA-C_KIR2DL3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
IGF2_IGF1R,1,0,0,0,1,0,0,1,1,1,⋯,0,0,0,0,0,0,0,0,0,0
IGF1_IGF1R,1,1,0,0,1,0,0,1,1,1,⋯,0,0,0,0,0,0,0,0,0,0
IGF2_IGF2R,1,0,0,0,1,0,0,1,1,1,⋯,0,0,0,0,0,0,0,0,0,0
KLRB1_CLEC2D,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CCL5_ACKR1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CCL2_ACKR1,0,0,0,1,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [167]:
# Rename the cluster labelled 'TRUE' to 'Tcell' in the celltype pair labels.
# Only a whole cluster name is replaced (at the start of the label or right
# after the '---' separator), so a cluster that merely contains 'TRUE' in its
# name is left alone.
# NOTE(review): the 'TRUE' label presumably comes from a cluster named 'T' that
# read.csv parsed as a logical - confirm
colnames(DE) = gsub('(^|---)TRUE(?=---|$)', '\\1Tcell', colnames(DE), perl = TRUE)

In [168]:
# DE is the output matrix
# It is big: filter it as needed for visualization
# The file name records the p-value and logFC cutoffs that were used
DE_out_file = paste0('/home/jovyan/farm/endometrium/data/cellphoneDB/out/DEapproach_results_pval',
                     pval_cutoff,
                     '_logFC',
                     logFC_cutoff,
                     '.csv')
write.csv(DE, file = DE_out_file)

### Report DE filtered by LR in interactions

In [169]:
# Clusters that have a DE table and still appear in a celltype pair of DE.
# The labels are built as celltypeA---celltypeB, so they are split on the exact
# '---' separator: splitting on the pattern '--' leaves a leading '-' on
# celltypeB, which then never matches a cluster name.
cls = intersect(names(DE_df), unlist(strsplit(as.character(colnames(DE)), split = '---', fixed = TRUE)))
cls

In [170]:
# These are just helper files
# For every cluster in cls, write the DEGs of that cluster that take part in an
# interaction kept in DE (as partner A when the cluster is celltype A, as
# partner B when it is celltype B).

# Split each column label once into its two clusters and compare them exactly:
# a pattern such as '^cl' would also pick up clusters whose name merely starts
# with cl, and would treat special characters in the name as a regex.
pair_members = strsplit(as.character(colnames(DE)), split = '---', fixed = TRUE)
pair_cl_A = vapply(pair_members, function(p) p[1], character(1))
pair_cl_B = vapply(pair_members, function(p) p[2], character(1))

for (cl in cls ){
    print(cl)
    # Interactions flagged in at least one pair where cl is celltype A
    # (computed from scratch for every cluster; drop = FALSE covers 0 or 1 column)
    cols_A = which(pair_cl_A == cl)
    int_A = rownames(DE)[rowSums(DE[, cols_A, drop = FALSE]) > 0]
    # Extract genes at partnerA
    partnersA = unlist(lapply(int_A, function(i) Int2Gene[[i]]$partner_a))
    # Interactions flagged in at least one pair where cl is celltype B
    cols_B = which(pair_cl_B == cl)
    int_B = rownames(DE)[rowSums(DE[, cols_B, drop = FALSE]) > 0]
    # Extract genes at partnerB
    partnersB = unlist(lapply(int_B, function(i) Int2Gene[[i]]$partner_b))
    # Get DEGs for cl
    df = DE_df[[cl]]
    # Keep the significant DEGs that are part of the selected interactions
    df = subset(df, adj.P.Val < pval_cutoff & logFC > logFC_cutoff & Gene %in% c(partnersA, partnersB) )
    write.csv(file = paste0(path_DE, cl, '_DEG_inCellphoneResult.csv'), df, row.names = FALSE )
}

[1] "epithelial_ciliated"
[1] "epithelial_glandular"
[1] "epithelial_LGR5"
[1] "epithelial_LYPD1"
[1] "epithelial_proliferative"
[1] "epithelial_WIF1_1"
[1] "epithelial_WIF1_2"
