# Germ lineage (human) - TF prioritization

In [1]:
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(cowplot)
library(reshape2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Print the R version, platform and attached package versions used for this run
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS:   /home/jovyan/my-conda-envs/R_Seurat3/lib/libblas.so.3.8.0
LAPACK: /home/jovyan/my-conda-envs/R_Seurat3/lib/liblapack.so.3.8.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] reshape2_1.4.4     cowplot_1.1.1      pheatmap_1.0.12    ggplot2_3.3.3     
[5] RColorBrewer_1.1-2 dplyr_1.0.5       

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6          plyr_1.8.6          pillar_1.5.1       
 [4] compiler_4.

In [3]:
# Root directory holding the RNA-derived tables read below (keeps its trailing slash)
path_to_gonads <- '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'

# Load measurements
## RNA derived TFs info

In [4]:
# Load TF activities computed with Dorothea
# `gene` holds "<TF>-<Dorothea score>" (the score is the last '-'-separated field),
# so the score is the text after the LAST hyphen and the TF symbol is everything
# before it. Splitting on every hyphen and keeping only the first piece would
# truncate hyphenated symbols (e.g. "NKX2-1-E" -> "NKX2") and collapse distinct
# TFs onto the same ID.
# New columns are appended in a fixed order (Dorothea_score, TF, ID, RNA_TFact);
# keep that order, since downstream code may address columns by position.
TFact <- read.csv(paste0(path_to_gonads,'/FCA_germcells_TFact.tsv'), stringsAsFactors = FALSE, sep ='\t')
TFact$Dorothea_score <- sub('^.*-', '', TFact$gene) # dorothea score
TFact$TF <- sub('-[^-]*$', '', TFact$gene) # remove dorothea score
TFact$ID <- paste0(TFact$cluster, '__', TFact$TF)
# 1 = activity significantly higher in the cluster (adjusted p < 0.01, logFC > 0.1), else 0
TFact$RNA_TFact <- ifelse(TFact$p_val_adj < 0.01 & TFact$avg_logFC > 0.1, 1, 0)


# Load differentially expressed genes
DEGs <- read.csv(paste0(path_to_gonads,'/FCA-gonad_germcells_DEGs.tsv'), stringsAsFactors = FALSE, sep ='\t')
DEGs$TF <- DEGs$gene
DEGs$ID <- paste0(DEGs$cluster, '__', DEGs$TF)
# 1 = gene significantly up-regulated in the cluster (adjusted p < 0.01, logFC > 0.15), else 0
DEGs$RNA_DEGs <- ifelse(DEGs$p_val_adj < 0.01 & DEGs$avg_logFC > 0.15, 1, 0)

## ATAC derived TFs info

In [5]:
# ATAC-derived per-cluster TF motif statistics
TFmotifs <- read.csv('/nfs/team292/vl6/my_MULTIOME_dir/germcells_july2021/TFs/DARs/all_DA_peaks_LR.csv', stringsAsFactors = FALSE)
# Use the same fold-change column name as the RNA tables
names(TFmotifs)[names(TFmotifs) == 'avg_log2FC'] <- 'avg_logFC'

# Cluster labels: hyphens become underscores
TFmotifs$cluster <- gsub('-', '_', TFmotifs$cluster)
# Disabled relabelling kept for reference:
# TFmotifs$cluster = gsub('oogonia_meiotic', 'oogonia_meiosis', TFmotifs$cluster)
# TFmotifs$cluster = gsub('pre_spermatogonia', 'pre-spermatogonia', TFmotifs$cluster)

# TF symbol = first '.'-separated field of the PWM name, without the '-HUMAN' tag
TFmotifs$TF <- gsub('-HUMAN', '', sapply(strsplit(TFmotifs$X, split = '\\.'), head, 1))
# Keep only motifs whose TF appears in the TF-activity or DEG tables
TFmotifs <- TFmotifs[TFmotifs$TF %in% c(TFact$TF, DEGs$TF), ]
TFmotifs$ID <- paste0(TFmotifs$cluster, '__', TFmotifs$TF)
# Disabled stricter filter kept for reference:
# TFmotifs = subset(TFmotifs, TF %in% TFact$TF)

# One row per cluster/TF: sort by decreasing fold change and keep the first PWM of each ID
TFmotifs <- TFmotifs[order(TFmotifs$avg_logFC, decreasing = TRUE), ]
TFmotifs <- TFmotifs[!duplicated(TFmotifs$ID), ]

# 1 = motif significantly more accessible in the cluster (adjusted p < 0.01, logFC > 0.1), else 0
TFmotifs$ATAC_TFact <- ifelse(TFmotifs$p_val_adj < 0.01 & TFmotifs$avg_logFC > 0.1, 1, 0)

### Check clusters are shared

In [6]:
# Clusters present in all three tables
shared <- intersect(intersect(TFmotifs$cluster, TFact$cluster), DEGs$cluster)
# DEG clusters missing from at least one other table ...
print(setdiff(DEGs$cluster, shared))
# ... and DEG clusters available in every table
print(intersect(DEGs$cluster, shared))

[1] "GC_mitotic"
[1] "PGC"               "GC"                "oocyte"           
[4] "oogonia_STRA8"     "pre_oocyte"        "oogonia_meiotic"  
[7] "pre_spermatogonia"


# Merge into a dataframe

In [7]:
# add prefix to column names to identify origin
# Columns are picked by NAME rather than by position, so a change in the column
# order of an input file cannot silently mislabel a statistic, and re-running
# this cell does not prefix the same columns twice.
prefix_columns <- function(tab, cols, prefix) {
    hit <- names(tab) %in% cols
    names(tab)[hit] <- paste0(prefix, names(tab)[hit])
    # Fail loudly if an expected statistic is absent (neither raw nor already prefixed)
    stopifnot(all(paste0(prefix, cols) %in% names(tab)))
    tab
}
TFact <- prefix_columns(TFact, c('p_val', 'avg_logFC', 'p_val_adj', 'Dorothea_score'), 'RNA_TFact.')
DEGs <- prefix_columns(DEGs, c('p_val', 'avg_logFC', 'pct.1', 'pct.2', 'p_val_adj'), 'RNA_DEGs.')
TFmotifs <- prefix_columns(TFmotifs, c('p_val', 'avg_logFC', 'p_val_adj'), 'ATAC_TFact.')
# The PWM name column (read in as 'X') gets an explicit label
names(TFmotifs)[names(TFmotifs) == 'X'] <- 'ATAC_PWM'

In [8]:
# Keep DEGs only for TFs that also have a TF-activity or motif measurement
DEGs <- subset(DEGs, TF %in% c(TFact$TF, TFmotifs$TF) )

# Full outer join of two measurement tables on `ID`.
# For annotation columns present in both tables (cluster / TF / gene) the left
# value is kept and, for rows that exist only in `right`, replaced by the right
# value. Right-hand duplicates ('.y') are then dropped and the left-hand suffix
# ('.x') is stripped.
merge_measurements <- function(left, right) {
    merged <- merge(left, right, by = 'ID', all = TRUE)
    # Rows contributed only by `right` have no left-hand cluster
    only_right <- is.na(merged[['cluster.x']])
    for (key in c('cluster', 'TF', 'gene')) {
        from_left <- paste0(key, '.x')
        from_right <- paste0(key, '.y')
        # A key is only suffixed when both tables carry it; skip it otherwise
        if (all(c(from_left, from_right) %in% names(merged)))
            merged[[from_left]][only_right] <- merged[[from_right]][only_right]
    }
    # The dot is escaped so that only merge() suffixes match: a plain 'y$' would
    # drop every column whose name ends in "y", and '.x$' would strip any
    # "<char>x" ending.
    merged <- merged[, grep('\\.y$', names(merged), invert = TRUE) ] # remove unwanted columns
    names(merged) <- sub('\\.x$', '', names(merged))
    merged
}

# merge RNA measurements
df <- merge_measurements(DEGs, TFact)
# append ATAC measurements
df <- merge_measurements(df, TFmotifs)

# Format
df <- df[, ! colnames(df) %in% c('pct.1', 'pct.2') ]
df$gene <- df$TF
# Clean NAs: a missing call or fold change counts as "no evidence" (0),
# a missing adjusted p-value as non-significant (1)
for (i in c('RNA_DEGs', 'RNA_TFact', 'ATAC_TFact')) {
    df[[i]][ is.na(df[[i]]) ] <- 0
}
for (i in c('RNA_DEGs.avg_logFC', 'RNA_TFact.avg_logFC', 'ATAC_TFact.avg_logFC')) {
    df[[i]][ is.na(df[[i]]) ] <- 0
}
for (i in c('RNA_DEGs.p_val_adj', 'RNA_TFact.p_val_adj', 'ATAC_TFact.p_val_adj')) {
    df[[i]][ is.na(df[[i]]) ] <- 1
}

head(df)

Unnamed: 0_level_0,ID,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,RNA_DEGs.p_val_adj,cluster,gene,TF,RNA_DEGs,RNA_TFact.p_val,RNA_TFact.avg_logFC,RNA_TFact.p_val_adj,RNA_TFact.Dorothea_score,RNA_TFact,ATAC_PWM,ATAC_TFact.p_val,ATAC_TFact.avg_logFC,ATAC_TFact.p_val_adj,ATAC_TFact
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,GC__ADNP,0.0002855294,0.1002194,0.528,0.423,1,GC,ADNP,ADNP,0,6.268939e-07,0.1929231,0.0008669943,D,1,,,0.0,1,0
2,GC__ADNP2,,0.0,,,1,GC,ADNP2,ADNP2,0,0.2163933,-0.2144921,1.0,E,0,,,0.0,1,0
3,GC__AEBP2,,0.0,,,1,GC,AEBP2,AEBP2,0,0.07413126,-0.1902172,1.0,E,0,,,0.0,1,0
4,GC__AHR,,0.0,,,1,GC,AHR,AHR,0,0.001214789,-0.1550981,1.0,B,0,AHR-HUMAN.H10MO.B.1,0.5656201,-0.2206721,1,0
5,GC__AHRR,,0.0,,,1,GC,AHRR,AHRR,0,3.53059e-05,0.1745275,0.048828053,E,0,,,0.0,1,0
6,GC__AIRE,,0.0,,,1,GC,AIRE,AIRE,0,0.6313612,-0.2040962,1.0,E,0,AIRE.SwissRegulon.1,0.4149412,-0.7937911,1,0


In [9]:
# compute agreement
df$N_agreement = apply(df[, c('RNA_DEGs', 'RNA_TFact', 'ATAC_TFact') ], 1, function(x){
    x[ is.na(x) ] = 0
    sum(x)
})
# reorganize columns
coln = c('TF', 'cluster', 'N_agreement', 'ID', 'ATAC_PWM', 'gene')
df = df[ , c(coln, setdiff(colnames(df), coln) ) ]


# order by agreement
df = df[ order(df$N_agreement, decreasing = T) , ]

# save
write.csv(df, file = 'figures_germcells/human_TFs_measurements.csv', row.names=F)
head(df)

Unnamed: 0_level_0,TF,cluster,N_agreement,ID,ATAC_PWM,gene,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,⋯,RNA_DEGs,RNA_TFact.p_val,RNA_TFact.avg_logFC,RNA_TFact.p_val_adj,RNA_TFact.Dorothea_score,RNA_TFact,ATAC_TFact.p_val,ATAC_TFact.avg_logFC,ATAC_TFact.p_val_adj,ATAC_TFact
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
758,SOX4,GC,3,GC__SOX4,SOX4.MA0867.1.1,SOX4,9.26364e-40,0.433477,0.941,0.654,⋯,1,5.9664069999999996e-21,0.3307047,8.251541e-18,E,1,6.632225e-09,0.2671011,1.273387e-05,1
2884,BARX2,oocyte,3,oocyte__BARX2,BARX2-HUMAN.H10MO.D.5,BARX2,2.407859e-07,0.180969,0.116,0.047,⋯,1,6.066193e-81,0.8863681,8.389545e-78,E,1,1.9193059999999998e-19,1.1599274,3.685067e-16,1
3025,FIGLA,oocyte,3,oocyte__FIGLA,FIGLA.MA0820.1.5,FIGLA,1.00096e-194,1.3789022,0.859,0.14,⋯,1,7.400333e-153,1.8039547,1.023466e-149,B,1,0.0,16.8569292,0.0,1
3120,HEY2,oocyte,3,oocyte__HEY2,HEY2.MA0649.1.5,HEY2,1.031171e-11,0.4092867,0.409,0.273,⋯,1,1.049073e-42,0.5778002,1.450868e-39,E,1,1.220886e-25,0.6699419,2.344102e-22,1
3536,RORA,oocyte,3,oocyte__RORA,RORA-HUMAN.H10MO.B.5,RORA,2.040031e-07,0.2381747,0.212,0.116,⋯,1,1.56204e-12,0.3656247,2.160301e-09,D,1,1.7984059999999998e-20,0.754082,3.45294e-17,1
3627,STAT1,oocyte,3,oocyte__STAT1,STAT1.SwissRegulon.5,STAT1,1.127996e-07,0.3694292,0.347,0.242,⋯,1,9.035608e-15,0.3059179,1.249625e-11,AA,1,5.24105e-09,0.402596,1.006282e-05,1


# Plot - shared

In [10]:
# Select TFs of interest
# Fold changes are rounded to 2 significant digits BEFORE thresholding, so the
# cut-offs below (and the tables written afterwards) use the rounded values
df$RNA_DEGs.avg_logFC <- signif(df$RNA_DEGs.avg_logFC, 2)
df$ATAC_TFact.avg_logFC <- signif(df$ATAC_TFact.avg_logFC, 2)
df$RNA_TFact.avg_logFC <- signif(df$RNA_TFact.avg_logFC, 2)

# TFs need to be strongly supported: either all 3 lines of evidence agree, or
# 2 agree (one of them being differential expression) with a high fold change
# in at least one measurement
idx_tf_1 <- which(df$N_agreement == 3 |
                (df$N_agreement == 2 & df$RNA_DEGs == 1 &
                    (df$RNA_DEGs.avg_logFC > 0.75 |
                     df$ATAC_TFact.avg_logFC > 0.75 |
                     df$RNA_TFact.avg_logFC > 0.7))
               )
# Expression changes need to be robust
idx_tf_2 <- which( df$RNA_DEGs.avg_logFC >= 0.2 & df$RNA_DEGs.pct.1 >= 0.25)

idx_tf <- intersect(idx_tf_1, idx_tf_2)

# These are next to the thresholds - add manually
# NOTE(review): rows are picked by their rank among the TF's rows in `df`
# (1st ZHX3 row, 2nd KLF4 row), which depends on the current row order - confirm
# the intended clusters.
i <- which(df$TF == 'ZHX3')[1]
idx_tf <- c(idx_tf, i)
i <- which(df$TF == 'KLF4')[2]
idx_tf <- c(idx_tf, i)

# which(...)[k] is NA when the TF has fewer than k rows; drop such entries so
# that no all-NA row is selected
idx_tf <- unique(idx_tf[ ! is.na(idx_tf) ])

In [11]:
# Selected rows, sorted alphabetically by TF symbol
df_selected <- df[idx_tf, ]
df_selected <- df_selected[ order(df_selected$TF) , ]

# Clusters to plot, in display order
CLofinterest <- c('PGC', 'GC', 'oogonia_STRA8', 'oogonia_meiotic', 'pre_oocyte', 'oocyte',  'pre_spermatogonia')

# Regroup the rows cluster by cluster, following CLofinterest; the alphabetical
# TF order is kept within each cluster and rows of other clusters are dropped
df_selected <- do.call(rbind, lapply(CLofinterest, function(cl) {
    df_selected[ which(df_selected$cluster == cl) , ]
}))

# Unique TF symbols in order of first appearance
TFofinterest <- unique(df_selected$TF)

In [12]:
# Number of distinct prioritized TFs
length(unique(TFofinterest))

In [13]:
# Save the prioritized TF rows (one row per cluster/TF), without row names
write.csv(df_selected, file = 'figures_germcells/human_relevantTFs_priotitized.csv', row.names=F)


## as heatmaps

In [14]:
# Show each column name of df_selected next to its position
data.frame(colnames(df_selected), 1:ncol(df_selected))

colnames.df_selected.,X1.ncol.df_selected.
<chr>,<int>
TF,1
cluster,2
N_agreement,3
ID,4
ATAC_PWM,5
gene,6
RNA_DEGs.p_val,7
RNA_DEGs.avg_logFC,8
RNA_DEGs.pct.1,9
RNA_DEGs.pct.2,10


In [15]:
# Build the RNA z-score matrix (clusters x TFs) and a matching matrix of
# evidence labels for heatmap plotting.
#
# Arguments:
#   TFofinterest  character vector of TF symbols to keep (columns of the result)
#   df_selected   measurement table with one row per cluster/TF; must contain
#                 TF, cluster, N_agreement, RNA_DEGs(.p_val_adj),
#                 RNA_TFact(.p_val_adj) and ATAC_TFact(.p_val_adj)
#   nonstrict     if TRUE, cells left unlabelled by the strict rules still get
#                 'o' / 'a' when the ATAC / TF-activity flag is set
#   clusters      clusters to keep (rows of the result); defaults to the global
#                 CLofinterest, as before
#   expr_file     CSV of average expression; after transposition its rows are
#                 matched against `clusters` (last '.'-separated field of the
#                 name) and its columns against `TFofinterest`
#
# Returns list(zscores = numeric matrix scaled per TF across the kept clusters,
#              labels  = character matrix of the same shape) where
#   '*' = all three evidences agree,
#   'o' = ATAC motif flag set, 'a' = RNA TF-activity flag set ('a' wins if both),
#   ''  = no label.
load_RNAzscores <- function(TFofinterest, df_selected, nonstrict = FALSE,
                            clusters = CLofinterest,
                            expr_file = 'figures/data_utils/average_log.csv'){
    # Columns are addressed by name, so the result does not depend on column order
    needed <- c('TF', 'cluster', 'N_agreement', 'RNA_DEGs.p_val_adj', 'RNA_DEGs',
                'RNA_TFact.p_val_adj', 'RNA_TFact', 'ATAC_TFact.p_val_adj', 'ATAC_TFact')
    absent <- setdiff(needed, colnames(df_selected))
    if (length(absent) > 0) {
        stop('df_selected lacks column(s): ', paste(absent, collapse = ', '), call. = FALSE)
    }

    # Load Expr matrix for heatmap plotting
    x_Expre <- t(read.csv(expr_file, stringsAsFactors = FALSE, row.names = 1))
    # Row names: keep the text after the last '.'
    rownames(x_Expre) <- sub('^.*\\.', '', rownames(x_Expre))

    # drop = FALSE keeps a matrix even when a single cluster or TF matches
    x_Expre <- x_Expre[ rownames(x_Expre) %in% clusters, colnames(x_Expre) %in% TFofinterest, drop = FALSE ]
    if (nrow(x_Expre) == 0 || ncol(x_Expre) == 0) {
        stop('None of the requested clusters/TFs were found in ', expr_file, call. = FALSE)
    }
    # z-score every TF (column) across the kept clusters; scale() keeps dimnames
    x_Expre <- scale(x_Expre)
    attr(x_Expre, 'scaled:center') <- NULL
    attr(x_Expre, 'scaled:scale') <- NULL

    # TRUE when the binary call is set and its adjusted p-value is available
    has_call <- function(call, padj) ! is.na(padj) && ! is.na(call) && call == 1
    # 'o' for an ATAC call, overridden by 'a' for a TF-activity call
    modality_label <- function(hit, current) {
        if (has_call(hit[['ATAC_TFact']], hit[['ATAC_TFact.p_val_adj']])) current <- 'o'
        if (has_call(hit[['RNA_TFact']], hit[['RNA_TFact.p_val_adj']])) current <- 'a'
        current
    }

    # Add labels
    x_Expre_Lab <- x_Expre
    x_Expre_Lab[] <- ''
    for (i in rownames(x_Expre)) {
        for (j in colnames(x_Expre)) {
            x <- df_selected[ which(df_selected$cluster == i & df_selected$TF == j), , drop = FALSE ]
            # Only unambiguous cluster/TF pairs are labelled
            if (nrow(x) != 1) next
            lab <- ''
            n_agree <- x[['N_agreement']]
            if (n_agree > 1 && ! is.na(x[['RNA_DEGs.p_val_adj']])) {
                if (n_agree == 3) lab <- '*'
                if (n_agree == 2 && x[['RNA_DEGs']] == 1) lab <- modality_label(x, lab)
            }
            if (nonstrict && lab == '') lab <- modality_label(x, lab)
            x_Expre_Lab[i, j] <- lab
        }
    }
    list(zscores = x_Expre, labels = x_Expre_Lab)
}
            
            
# Dorothea TF-activity matrix for heatmap plotting
x_TFact <- read.csv('figures_germcells/TFact_percluster.csv', stringsAsFactors = FALSE, row.names = 1)
# Column names: keep the text before the first '.'
colnames(x_TFact) <- sapply(strsplit(colnames(x_TFact), split = '\\.'), head, 1)
# Restrict to the plotted clusters (rows) and prioritized TFs (columns)
x_TFact <- x_TFact[ rownames(x_TFact) %in% CLofinterest, colnames(x_TFact) %in% TFofinterest ]
# z-score each TF across clusters; apply() loses the row names, so put them back
rn <- rownames(x_TFact)
x_TFact <- apply(x_TFact, 2, scale)
rownames(x_TFact) <- rn


# TF-motif z-score matrix for heatmap plotting, restricted to the plotted clusters
x_TFmotifs <- read.csv('/nfs/team292/vl6/my_MULTIOME_dir/germcells_july2021/germcells__chromvar_celltype_zscore.csv', stringsAsFactors = FALSE, row.names = 1)
x_TFmotifs <- x_TFmotifs[ rownames(x_TFmotifs) %in% CLofinterest, ]

In [16]:
# Draw a blue-white-red heatmap of a clusters x TFs matrix with pheatmap and
# save it to `pdf_file`.
#
# Arguments:
#   X                matrix with clusters as row names and TFs as column names
#   TFs              TFs to plot, in plotting order (those absent from X are skipped)
#   cl               clusters to plot, in plotting order (those absent from X are skipped)
#   pdf_file         output file passed to pheatmap(filename = )
#   display_numbers  optional matrix of cell labels with the same dimnames as X
#
# Prints `cl` and the dimensions of the plotted matrix (and of the labels).
# NOTE(review): with `display_numbers` the matrix is plotted transposed (TFs as
# rows); without it X is plotted as is (clusters as rows) - confirm this
# difference is intended.
myHeatmap <- function(X, TFs, cl, pdf_file, display_numbers = NULL){
    print(cl)
    tfs <- intersect(TFs, colnames(X))
    cls <- intersect(cl, rownames(X))
    # drop = FALSE keeps two dimensions when a single TF or cluster is left
    X <- X[ cls, tfs, drop = FALSE ]
    print(dim(X))

    paletteLength <- 50
    myColor <- colorRampPalette(c(brewer.pal(n = 5, name = 'Blues')[4], "white", brewer.pal(n = 5, name = 'Reds')[4]))(paletteLength)
    # Breaks are centred on 0 so that white always means z-score 0:
    # length(breaks) == paletteLength + 1
    # use floor and ceiling to deal with even/odd length pallettelengths
    # na.rm = TRUE: NaN/NA cells (e.g. a TF with constant expression) must not
    # turn the range into NA, which would make seq() fail
    lowest <- min(X, na.rm = TRUE)
    highest <- max(X, na.rm = TRUE)
    myBreaks <- c(seq(lowest, 0, length.out = ceiling(paletteLength/2) + 1),
                  seq(highest/paletteLength, highest, length.out = floor(paletteLength/2)))
    if( is.null(display_numbers) ){
        pheatmap(X, cellheight = 10, cellwidth = 10, cluster_rows = FALSE, cluster_cols = FALSE,
                 color = myColor, breaks = myBreaks,
                 filename = pdf_file)
    }else{
        display_numbers <- display_numbers[ cls, tfs, drop = FALSE ]
        print(dim(display_numbers))
        pheatmap(t(X), cellheight = 10, cellwidth = 10, cluster_rows = FALSE, cluster_cols = FALSE, display_numbers = t(display_numbers),
                 color = myColor, breaks = myBreaks,
                 filename = pdf_file)
    }
}
# RNA z-score heatmap of the prioritized TFs, annotated with the evidence labels
RNAexpr <- load_RNAzscores(TFofinterest, df_selected)
myHeatmap(
    RNAexpr$zscores, TFofinterest, CLofinterest,
    'figures_germcells/heatmap_human_relevantTFs_RNA_avgLog.pdf',
    display_numbers = RNAexpr$labels
)

[1] "PGC"               "GC"                "oogonia_STRA8"    
[4] "oogonia_meiotic"   "pre_oocyte"        "oocyte"           
[7] "pre_spermatogonia"
[1]  7 80
[1]  7 80


# Plot CellSign TFs

In [17]:
# TFs coming from the CellSign analysis
# NOTE(review): 'ESR2' is listed twice - harmless for the plot (TFs are matched
# as a set), but confirm no other TF was intended.
cellsign_TFs <- c('STAT3',
                  'ZGLP1', 'RARA', 'RARB', 'ESR1',
                  'STAT5B',
                  'SMAD2', 'ESR2', 'OTX2', 'ESR2')
# Labels are taken from the full table with the relaxed (nonstrict) rules
RNAexpr <- load_RNAzscores(cellsign_TFs, df, nonstrict = TRUE)
# The last cluster of CLofinterest is left out of this plot
myHeatmap(
    RNAexpr$zscores, cellsign_TFs, head(CLofinterest, -1),
    'figures_germcells/heatmap_human_TFsfromCellSign_RNA_avgLog.pdf',
    display_numbers = RNAexpr$labels
)

[1] "PGC"             "GC"              "oogonia_STRA8"   "oogonia_meiotic"
[5] "pre_oocyte"      "oocyte"         
[1] 6 9
[1] 6 9
