# Granulosa lineage (human) - TF prioritization

In [1]:
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(cowplot)
library(reshape2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Record the R version and the attached package versions used for this run
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS:   /home/jovyan/my-conda-envs/R_Seurat3/lib/libblas.so.3.8.0
LAPACK: /home/jovyan/my-conda-envs/R_Seurat3/lib/liblapack.so.3.8.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] reshape2_1.4.4     cowplot_1.1.1      pheatmap_1.0.12    ggplot2_3.3.3     
[5] RColorBrewer_1.1-2 dplyr_1.0.5       

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6          plyr_1.8.6          pillar_1.5.1       
 [4] compiler_4.

In [3]:
# Root directory holding the precomputed TF-activity and DEG tables
path_to_gonads = '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'

# Load measurements
## RNA derived TFs info

In [4]:
# Load TF activities computed with Dorothea.
# `gene` is "<TF>-<Dorothea score>" (scores such as 'D' or 'AA'): the score is
# the text after the LAST '-', the TF symbol is everything before it.
TFact <- read.csv(paste0(path_to_gonads,'/FCA_somaticOvary_TFact.tsv'), stringsAsFactors = FALSE, sep ='\t')
TFact$Dorothea_score <- sub('^.*-', '', TFact$gene) # dorothea score
# Strip only the trailing score. Splitting on every '-' and keeping the first
# piece would truncate hyphenated symbols (e.g. 'NKX2-1-D' -> 'NKX2'), which
# then can never match the DEG / motif tables.
TFact$TF <- sub('-[^-]*$', '', TFact$gene) # remove dorothea score
TFact$ID <- paste0(TFact$cluster, '__', TFact$TF) # join key: <cluster>__<TF>
# 1 = TF significantly more active in the cluster (adjusted p < 0.01, logFC > 0.1)
TFact$RNA_TFact <- ifelse(TFact$p_val_adj < 0.01 & TFact$avg_logFC > 0.1 , 1, 0)
# NOTE: columns are appended in this exact order (Dorothea_score, TF, ID,
# RNA_TFact) because a later cell renames TFact columns by position.


# Load differentially expressed genes
DEGs <- read.csv(paste0(path_to_gonads,'/FCA-gonad_somaticOvary_DEGs.tsv'), stringsAsFactors = FALSE, sep ='\t')
DEGs$TF <- DEGs$gene
DEGs$ID <- paste0(DEGs$cluster, '__', DEGs$TF) # join key: <cluster>__<TF>
# 1 = up-regulated in the cluster (adjusted p < 0.01, logFC >= 0.1) with
# pct.1 > 0.2 (presumably the fraction of cluster cells expressing the gene)
DEGs$RNA_DEGs <- ifelse(DEGs$p_val_adj < 0.01 & DEGs$avg_logFC >= 0.1 & DEGs$pct.1 > 0.2, 1, 0)

## ATAC derived TFs info

In [5]:
# Differential TF-motif results from the ATAC data, one row per motif and cluster
TFmotifs <- read.csv('/nfs/team292/vl6/my_MULTIOME_dir/supporting_july2021/TFs/DARs/all_DA_peaks_LR_for_Luz.csv', stringsAsFactors = FALSE)
names(TFmotifs)[names(TFmotifs) == 'avg_log2FC'] <- 'avg_logFC'

# Harmonise cluster labels with the RNA tables: '-' becomes '_' first, then
# 'sPAX8' is expanded to 'early_sPAX8'
TFmotifs$cluster <- gsub('sPAX8', 'early_sPAX8', gsub('-', '_', TFmotifs$cluster))

# TF symbol = motif id up to the first '.', with the '-HUMAN' tag removed
TFmotifs$TF <- gsub('-HUMAN', '', sapply(strsplit(TFmotifs$X, split = '\\.'), head, 1))

# Restrict to TFs that also have an RNA measurement (activity or expression)
TFmotifs <- TFmotifs[TFmotifs$TF %in% c(TFact$TF, DEGs$TF), ]
TFmotifs$ID <- paste(TFmotifs$cluster, TFmotifs$TF, sep = '__')

# A TF can have several motifs: sort by effect size so that de-duplication
# keeps, for each cluster/TF pair, the motif with the highest logFC
TFmotifs <- TFmotifs[order(TFmotifs$avg_logFC, decreasing = TRUE), ]
TFmotifs <- TFmotifs[!duplicated(TFmotifs$ID), ]

# 1 = motif significantly enriched in the cluster (adjusted p < 0.01, logFC > 0.1)
TFmotifs$ATAC_TFact <- as.numeric(TFmotifs$p_val_adj < 0.01 & TFmotifs$avg_logFC > 0.1)

### Check clusters are shared

In [6]:
# Clusters present in all three tables (ATAC motifs, Dorothea activities, DEGs)
shared <- intersect(intersect(TFmotifs$cluster, TFact$cluster), DEGs$cluster)
# DEG clusters that are missing from at least one of the other tables
print(unique(DEGs$cluster[!(DEGs$cluster %in% shared)]))
print('shared')
# DEG clusters found everywhere
print(unique(DEGs$cluster[DEGs$cluster %in% shared]))

[1] "early_somatic"
[1] "shared"
[1] "preGC_IIa"        "preGC_I"          "CoelEpi_LHX9"     "OSE"             
[5] "early_supporting" "preGC_IIb"        "granulosa"       


# Merge into a dataframe

In [7]:
# Tag the statistic columns of every table with their assay of origin so they
# remain distinguishable after merging. Columns are addressed by position, so
# running this cell twice on the same tables would prefix them twice.
add_prefix <- function(tab, cols, prefix) {
    names(tab)[cols] <- paste0(prefix, names(tab)[cols])
    tab
}
TFact <- add_prefix(TFact, c(1:2, 5, 8), 'RNA_TFact.')
DEGs <- add_prefix(DEGs, 1:5, 'RNA_DEGs.')
TFmotifs <- add_prefix(TFmotifs, c(2:3, 6), 'ATAC_TFact.')
# the first TFmotifs column holds the motif (PWM) identifier
names(TFmotifs)[1] <- 'ATAC_PWM'

In [8]:
# Keep only DEGs that are TFs, i.e. genes also present in the TF-activity or
# TF-motif tables
DEGs <- subset(DEGs, TF %in% c(TFact$TF, TFmotifs$TF) )

# merge RNA measurements (outer join on the <cluster>__<TF> key)
df <- merge(DEGs, TFact, by = 'ID', all = TRUE)
# rows without a DEG record take their annotation from the TF-activity table
idx <- is.na(df$cluster.x)
df$cluster.x[ idx ] <- df$cluster.y[ idx ]
df$TF.x[ idx ] <- df$TF.y[ idx ]
df$gene.x[ idx ] <- df$gene.y[ idx ]

# Drop the right-hand duplicates and strip the '.x' suffix. The dot is escaped
# so that only merge() suffixes match: an unescaped 'y$' / '.x$' would also
# hit any column whose name merely ends in "y" or "<any char>x".
df <- df[, grep('\\.y$', names(df), invert = TRUE) ] # remove unwanted columns
colnames(df) <- sub('\\.x$', '', colnames(df))

# append ATAC measurements
df <- merge(df, TFmotifs, by = 'ID', all = TRUE)
# rows without any RNA record take their annotation from the motif table
# (TFmotifs has no `gene` column; `gene` is rebuilt from `TF` below)
idx <- is.na(df$cluster.x)
df$cluster.x[ idx ] <- df$cluster.y[ idx ]
df$TF.x[ idx ] <- df$TF.y[ idx ]

df <- df[, grep('\\.y$', names(df), invert = TRUE) ] # remove unwanted columns
colnames(df) <- sub('\\.x$', '', colnames(df))

# Format
df <- df[, ! colnames(df) %in% c('pct.1', 'pct.2') ]
df$gene <- df$TF
# Clean NAs: a missing measurement counts as "no evidence"
# (flag 0, logFC 0, adjusted p-value 1)
na_defaults <- c(RNA_DEGs = 0, RNA_TFact = 0, ATAC_TFact = 0,
                 RNA_DEGs.avg_logFC = 0, RNA_TFact.avg_logFC = 0, ATAC_TFact.avg_logFC = 0,
                 RNA_DEGs.p_val_adj = 1, RNA_TFact.p_val_adj = 1, ATAC_TFact.p_val_adj = 1)
for (i in names(na_defaults)) {
    df[ is.na( df[ , i]) , i] <- na_defaults[[i]]
}

head(df)

Unnamed: 0_level_0,ID,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,RNA_DEGs.p_val_adj,cluster,gene,TF,RNA_DEGs,RNA_TFact.p_val,RNA_TFact.avg_logFC,RNA_TFact.p_val_adj,RNA_TFact.Dorothea_score,RNA_TFact,ATAC_PWM,ATAC_TFact.p_val,ATAC_TFact.avg_logFC,ATAC_TFact.p_val_adj,ATAC_TFact
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1,CoelEpi_LHX9__ADNP,,0.0,,,1,CoelEpi_LHX9,ADNP,ADNP,0,7.75142e-12,-0.4139814,1.066595e-08,D,0,,,0.0,1.0,0
2,CoelEpi_LHX9__ADNP2,,0.0,,,1,CoelEpi_LHX9,ADNP2,ADNP2,0,2.800632e-14,-0.4738519,3.853669e-11,E,0,,,0.0,1.0,0
3,CoelEpi_LHX9__AEBP2,0.599066883,0.0003403847,0.208,0.224,1,CoelEpi_LHX9,AEBP2,AEBP2,0,1.119509e-05,-0.2891948,0.01540445,E,0,,,0.0,1.0,0
4,CoelEpi_LHX9__AHR,0.039467137,0.0658903044,0.185,0.145,1,CoelEpi_LHX9,AHR,AHR,0,7.900296e-08,-0.2770679,0.0001087081,B,0,AHR.SwissRegulon,3.759244e-21,0.3963754,7.217748e-18,1
5,CoelEpi_LHX9__AHRR,0.001146924,0.0668011286,0.03,0.01,1,CoelEpi_LHX9,AHRR,AHRR,0,2.589658e-44,-0.8486817,3.563369e-41,E,0,,,0.0,1.0,0
6,CoelEpi_LHX9__AIRE,,0.0,,,1,CoelEpi_LHX9,AIRE,AIRE,0,1.897572e-05,-0.1880948,0.02611059,E,0,AIRE.SwissRegulon,0.9047466,-0.3017886,1.0,0


In [9]:
# Agreement score: how many of the three measurements (DEG, Dorothea activity,
# ATAC motif) flag the TF in the cluster; missing flags count as 0
df$N_agreement <- rowSums(df[, c('RNA_DEGs', 'RNA_TFact', 'ATAC_TFact') ], na.rm = TRUE)

# Put the identifying columns first, keeping the remaining ones in their order
coln <- c('TF', 'cluster', 'N_agreement', 'ID', 'ATAC_PWM', 'gene')
df <- df[ , union(coln, colnames(df)) ]

# Best-supported TF/cluster pairs on top
df <- df[ order(df$N_agreement, decreasing = TRUE) , ]

# Export the complete measurements table and preview it
write.csv(df, file = 'figures_somatic/granulosa_human_TFs_measurements.csv', row.names = FALSE)
head(df)

Unnamed: 0_level_0,TF,cluster,N_agreement,ID,ATAC_PWM,gene,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,⋯,RNA_DEGs,RNA_TFact.p_val,RNA_TFact.avg_logFC,RNA_TFact.p_val_adj,RNA_TFact.Dorothea_score,RNA_TFact,ATAC_TFact.p_val,ATAC_TFact.avg_logFC,ATAC_TFact.p_val_adj,ATAC_TFact
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
506,NFIB,CoelEpi_LHX9,3,CoelEpi_LHX9__NFIB,NFIB.SwissRegulon,NFIB,1.027223e-16,0.271445,0.73,0.606,⋯,1,4.518503e-22,0.4696849,6.217459999999999e-19,E,1,1.1744379999999999e-35,4.4674621,2.25492e-32,1
509,NFIX,CoelEpi_LHX9,3,CoelEpi_LHX9__NFIX,NFIX.SwissRegulon,NFIX,4.012526e-09,0.2164568,0.235,0.127,⋯,1,3.6170099999999996e-21,1.0180643,4.977006e-18,E,1,1.1744379999999999e-35,4.4674621,2.25492e-32,1
2895,CREB3L2,early_supporting,3,early_supporting__CREB3L2,CREB3L2.SwissRegulon.1,CREB3L2,1.542641e-11,0.1658625,0.318,0.165,⋯,1,3.5301780000000005e-33,0.4482661,4.857524e-30,D,1,3.4113030000000004e-43,0.24154,6.549702e-40,1
2939,E2F4,early_supporting,3,early_supporting__E2F4,E2F4.MA0470.1.1,E2F4,4.355112e-07,0.1349259,0.47,0.311,⋯,1,7.731151e-13,0.2314748,1.063806e-09,AA,1,4.638742e-227,0.5437334,8.906383999999999e-224,1
2940,E2F5,early_supporting,3,early_supporting__E2F5,E2F5-HUMAN.H10MO.B.1,E2F5,2.5847969999999998e-26,0.2982455,0.33,0.126,⋯,1,3.111133e-09,0.3700149,4.280919e-06,C,1,5.398421e-07,0.11203,0.001036497,1
2951,EGR3,early_supporting,3,early_supporting__EGR3,EGR3.MA0732.1.1,EGR3,9.809083e-13,0.2123276,0.235,0.107,⋯,1,6.246046e-07,0.3249447,0.0008594559,E,1,2.637957e-205,0.5456717,5.0648780000000005e-202,1


# Plot - shared

In [10]:
# Select TFs of interest
# Round the effect sizes to two significant digits; the thresholds below are
# applied to the rounded values.
logfc_cols <- c('RNA_DEGs.avg_logFC', 'ATAC_TFact.avg_logFC', 'RNA_TFact.avg_logFC')
df[logfc_cols] <- lapply(df[logfc_cols], signif, digits = 2)

# Keep a TF/cluster pair when all three measurements agree, or when it is a DEG
# confirmed by one other measurement and at least one effect size is large
# (expression logFC >= 0.5, or motif / activity logFC >= 0.75).
idx_tf <- with(df, which(
    N_agreement == 3 |
    (N_agreement == 2 & RNA_DEGs == 1 &
        (RNA_DEGs.avg_logFC >= 0.5 |
         ATAC_TFact.avg_logFC >= 0.75 |
         RNA_TFact.avg_logFC >= 0.75))
))




In [11]:
# Prioritized TF/cluster pairs, sorted alphabetically by TF symbol
df_selected <- df[idx_tf, ]
df_selected <- df_selected[ order(df_selected$TF) , ]

# Clusters shown in the heatmaps ("early_sPAX8" is currently disabled)
CLofinterest <- c('CoelEpi_LHX9', 'early_supporting', 'preGC_I',
                  'OSE', 'preGC_IIa', 'preGC_IIb', 'granulosa')
# Clusters whose prioritized TFs are kept: CLofinterest without
# 'CoelEpi_LHX9' and 'early_supporting'
CLofinterest2 <- c('preGC_I', 'OSE', 'preGC_IIa', 'preGC_IIb', 'granulosa')

# Stack the selected rows cluster by cluster, following CLofinterest2; rows from
# any other cluster are dropped and the alphabetical TF order is kept within
# each cluster
df_selected <- do.call(
    rbind,
    lapply(CLofinterest2, function(cl) df_selected[which(df_selected$cluster == cl), ])
)

TFofinterest <- unique(df_selected$TF)

In [12]:
# Number of distinct prioritized TFs
length(unique(TFofinterest))

In [13]:
# Export the prioritized TF/cluster pairs with all their measurements
write.csv(df_selected, file = 'figures_somatic/granulosa_human_relevantTFs_priotitized.csv', row.names=F)

## as heatmaps

In [14]:
# List every column of df_selected next to its position
data.frame(colnames(df_selected), 1:ncol(df_selected))

colnames.df_selected.,X1.ncol.df_selected.
<chr>,<int>
TF,1
cluster,2
N_agreement,3
ID,4
ATAC_PWM,5
gene,6
RNA_DEGs.p_val,7
RNA_DEGs.avg_logFC,8
RNA_DEGs.pct.1,9
RNA_DEGs.pct.2,10


In [15]:
# Build the matrices behind the RNA-expression heatmap.
#
# Reads the per-cluster average expression table, keeps the requested clusters
# and TFs, z-scores every TF across the kept clusters, and derives a
# one-character evidence label for each cluster/TF cell from `df_selected`.
#
# Arguments:
#   TFofinterest  character vector of TF symbols to keep.
#   df_selected   measurements table with (at most) one row per cluster/TF
#                 pair. Columns are looked up BY NAME and must include: TF,
#                 cluster, N_agreement, RNA_DEGs, RNA_DEGs.p_val_adj,
#                 RNA_TFact, RNA_TFact.p_val_adj, ATAC_TFact,
#                 ATAC_TFact.p_val_adj.
#   nonstrict     if TRUE, cells left unlabelled by the strict rules are still
#                 labelled when motif or activity support exists on its own.
#   clusters      clusters to keep; defaults to the global `CLofinterest`.
#   expr_file     CSV of average expression whose first column holds the gene
#                 names and whose remaining columns are clusters.
#
# Returns a list with
#   zscores  numeric matrix (clusters x TFs) of per-TF z-scores;
#   labels   character matrix of the same shape:
#            '*' all three measurements agree,
#            'o' supported by the ATAC motif, 'a' supported by Dorothea
#            activity (strict mode: only together with the DEG flag when
#            exactly two measurements agree; 'a' wins over 'o'),
#            ''  no label.
#
# Stops with an error when required columns are missing or when none of the
# requested clusters/TFs is found in `expr_file`.
load_RNAzscores <- function(TFofinterest, df_selected, nonstrict = FALSE,
                            clusters = CLofinterest,
                            expr_file = 'figures/data_utils/average_log.csv'){
    required <- c('TF', 'cluster', 'N_agreement',
                  'RNA_DEGs', 'RNA_DEGs.p_val_adj',
                  'RNA_TFact', 'RNA_TFact.p_val_adj',
                  'ATAC_TFact', 'ATAC_TFact.p_val_adj')
    missing_cols <- setdiff(required, colnames(df_selected))
    if (length(missing_cols) > 0)
        stop('df_selected lacks column(s): ', paste(missing_cols, collapse = ', '), call. = FALSE)

    # Load Expr matrix for heatmap plotting (transposed to clusters x genes)
    x_Expre <- t(read.csv(expr_file, stringsAsFactors = FALSE, row.names = 1))
    # keep the part of the cluster label that follows the last '.'
    rownames(x_Expre) <- vapply(strsplit(rownames(x_Expre), '\\.'),
                                function(p) tail(p, 1), character(1))

    # drop = FALSE keeps a matrix even when a single cluster or TF matches
    x_Expre <- x_Expre[ rownames(x_Expre) %in% clusters,
                        colnames(x_Expre) %in% TFofinterest, drop = FALSE ]
    if (nrow(x_Expre) == 0 || ncol(x_Expre) == 0)
        stop('none of the requested clusters/TFs found in ', expr_file, call. = FALSE)

    # z-score each TF across clusters; apply() loses the row names (and the
    # matrix shape for a single row), so both are restored explicitly
    x_Expre <- matrix(apply(x_Expre, 2, scale),
                      nrow = nrow(x_Expre), dimnames = dimnames(x_Expre))

    # Add labels
    x_Expre_Lab <- matrix('', nrow = nrow(x_Expre), ncol = ncol(x_Expre),
                          dimnames = dimnames(x_Expre))

    # 'o' for ATAC motif support, overridden by 'a' for Dorothea activity support
    support_label <- function(hit, current){
        if (!is.na(hit[['ATAC_TFact.p_val_adj']]) && hit[['ATAC_TFact']] == 1)
            current <- 'o'
        if (!is.na(hit[['RNA_TFact.p_val_adj']]) && hit[['RNA_TFact']] == 1)
            current <- 'a'
        current
    }

    for (i in rownames(x_Expre))
        for (j in colnames(x_Expre)){
            hit <- df_selected[which(df_selected[['cluster']] == i & df_selected[['TF']] == j), , drop = FALSE]
            # only unambiguous cluster/TF pairs are labelled
            if (nrow(hit) != 1)
                next
            lab <- ''
            n_agree <- hit[['N_agreement']]
            if (n_agree > 1 && !is.na(hit[['RNA_DEGs.p_val_adj']])){
                if (n_agree == 3)
                    lab <- '*'
                if (n_agree == 2 && hit[['RNA_DEGs']] == 1)
                    lab <- support_label(hit, lab)
            }
            if (nonstrict && lab == '')
                lab <- support_label(hit, lab)
            x_Expre_Lab[i, j] <- lab
        }

    list(zscores = x_Expre, labels = x_Expre_Lab)
}
                        
# Load TF act Dorothea matrix for heatmap plotting
x_TFact <- read.csv('figures_somatic/somaticOvary_TFact_percluster.csv', stringsAsFactors = FALSE, row.names = 1)
# keep the part of each column name before the first '.'
# (presumably "<TF>.<Dorothea score>" -- TODO confirm against the file)
colnames(x_TFact) <- vapply(strsplit(colnames(x_TFact), split = '\\.'),
                            function(p) head(p, 1), character(1))
# drop = FALSE keeps a table even when a single TF matches
x_TFact <- x_TFact[ rownames(x_TFact) %in% CLofinterest , colnames(x_TFact) %in% TFofinterest, drop = FALSE ]
rn <- rownames(x_TFact)
x_TFact <- apply(x_TFact, 2, scale) # z-score each TF across clusters (loses row names)
rownames(x_TFact) <- rn



# Load TFmotifs matrix for heatmap plotting
x_TFmotifs <- read.csv('/nfs/team292/vl6/my_MULTIOME_dir/supporting_july2021/supporting__chromvar_celltype_zscore_for_Luz.csv', stringsAsFactors = FALSE, row.names = 1)
x_TFmotifs <- x_TFmotifs[ rownames(x_TFmotifs) %in% CLofinterest , , drop = FALSE ]

# Keep only the motifs retained in TFmotifs (one per cluster/TF pair).
# The motif-id column is 'X' as read from disk but is renamed 'ATAC_PWM' before
# the merge; looking up `TFmotifs$X` after that renaming yields NULL, matches
# nothing and silently empties the matrix, so the renamed column is preferred.
motif_ids <- if ('ATAC_PWM' %in% names(TFmotifs)) TFmotifs$ATAC_PWM else TFmotifs$X
# read.csv() sanitises headers with make.names() (e.g. '-' becomes '.'), so the
# sanitised form of every motif id is accepted as well
idx <- colnames(x_TFmotifs) %in% c(motif_ids, make.names(motif_ids))
x_TFmotifs <- x_TFmotifs[ , idx, drop = FALSE ] # remove duplicated motifs to match TFmotifs
colnames(x_TFmotifs) <- vapply(strsplit(colnames(x_TFmotifs), split = '\\.'),
                               function(p) head(p, 1), character(1))




In [17]:
# Draw a blue-white-red heatmap of z-scores and save it to `pdf_file`.
#
# Arguments:
#   X                numeric matrix / data frame, clusters in rows, TFs in columns.
#   TFs              TFs to plot; those absent from colnames(X) are ignored.
#   cl               clusters to plot, in plotting order; those absent from
#                    rownames(X) are ignored.
#   pdf_file         output file passed to pheatmap(filename = ).
#   display_numbers  optional matrix of cell labels with the same dimnames as X.
#
# Orientation differs between the two modes: with `display_numbers` the heatmap
# is drawn transposed (TFs as rows); without it X is drawn as is.
# Prints `cl` and the dimensions of the plotted data as a sanity check.
myHeatmap <- function(X, TFs, cl, pdf_file, display_numbers = NULL){
    print(cl)
    tfs <- intersect(TFs, colnames(X))
    cls <- intersect(cl, rownames(X))
    # drop = FALSE: a single TF or cluster must not collapse X to a vector
    X <- X[ cls,  tfs, drop = FALSE ]
    print(dim(X))

    paletteLength <- 50
    myColor <- colorRampPalette(c(brewer.pal(n = 5, name = 'Blues')[4], "white", brewer.pal(n = 5, name = 'Reds')[4]))(paletteLength)
    # Breaks centre the white colour on 0: half of them span [min, 0], the other
    # half (0, max]; pheatmap needs length(breaks) == paletteLength + 1.
    # floor/ceiling deal with even/odd palette lengths.
    # na.rm = TRUE: zero-variance TFs give NaN z-scores, which would otherwise
    # turn the range into NA and make seq() fail.
    # NOTE(review): assumes X holds both negative and positive values (true for
    # z-scores); otherwise the breaks are not increasing.
    lowest <- min(X, na.rm = TRUE)
    highest <- max(X, na.rm = TRUE)
    myBreaks <- c(seq(lowest, 0, length.out = ceiling(paletteLength/2) + 1),
                  seq(highest/paletteLength, highest, length.out = floor(paletteLength/2)))
    if( is.null(display_numbers) ){
        pheatmap(X, cellheight = 10, cellwidth = 10, cluster_rows = FALSE, cluster_cols = FALSE,
                 color = myColor, breaks = myBreaks,
                 filename = pdf_file)
    }else{
        # align the labels with the plotted cells
        display_numbers <- display_numbers[ cls,  tfs, drop = FALSE ]
        print(dim(display_numbers))
        pheatmap(t(X), cellheight = 10, cellwidth = 10, cluster_rows = FALSE, cluster_cols = FALSE,
                 display_numbers = t(display_numbers),
                 color = myColor, breaks = myBreaks,
                 filename = pdf_file)
    }
}
# Disabled: heatmap of the Dorothea TF-activity scores
# myHeatmap(x_TFact, TFofinterest, CLofinterest, 'figures_somatic/heatmap_human_relevantTFs_RNA_TFact_scores.pdf')

# RNA expression heatmap of the prioritized TFs, annotated with evidence labels
RNAexpr <- load_RNAzscores(TFofinterest = TFofinterest, df_selected = df_selected)
myHeatmap(
    X = RNAexpr$zscores,
    TFs = TFofinterest,
    cl = CLofinterest,
    pdf_file = 'figures_somatic/granulosa_heatmap_human_relevantTFs_RNA_avgLog.pdf',
    display_numbers = RNAexpr$labels
)

# Disabled: heatmap of the ATAC TF-motif scores
# myHeatmap(x_TFmotifs, TFofinterest, CLofinterest, 'figures-germcells/0_FS1_heatmap_TFintersection_snATAC_TFact.pdf')

[1] "CoelEpi_LHX9"     "early_supporting" "preGC_I"          "OSE"             
[5] "preGC_IIa"        "preGC_IIb"        "granulosa"       
[1]  7 46
[1]  7 46


# Plot CellSign TFs

In [21]:
# TFs highlighted by the CellSign analysis
cellsign_TFs <- c(
    'AR', 'SMAD1', 'STAT1',
    'ID4', 'STAT3',
    'ESR1', 'NR1H4',
    'STAT5B', 'ELK1', 'HES1', 'HEY1', 'HEY2', 'HES4'
)
# Labels come from the full measurements table in non-strict mode, so motif or
# activity support on its own is also marked
RNAexpr <- load_RNAzscores(TFofinterest = cellsign_TFs, df_selected = df, nonstrict = TRUE)
myHeatmap(
    X = RNAexpr$zscores,
    TFs = cellsign_TFs,
    cl = c('preGC_I', 'granulosa', 'preGC_IIb', 'preGC_IIa', 'OSE', 'early_supporting', 'CoelEpi_LHX9'),
    pdf_file = 'figures_somatic/granulosa_heatmap_human_TFsfromCellSign_RNA_avgLog.pdf',
    display_numbers = RNAexpr$labels
)

[1] "preGC_I"          "granulosa"        "preGC_IIb"        "preGC_IIa"       
[5] "OSE"              "early_supporting" "CoelEpi_LHX9"    
[1]  7 13
[1]  7 13


In [19]:
# Inspect all measurements for SMAD1 in every cluster, hiding columns 4-5
# (ID and ATAC_PWM)
subset(df, TF == 'SMAD1')[, - c(4:5) ]

Unnamed: 0_level_0,TF,cluster,N_agreement,gene,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,RNA_DEGs.p_val_adj,RNA_DEGs,RNA_TFact.p_val,RNA_TFact.avg_logFC,RNA_TFact.p_val_adj,RNA_TFact.Dorothea_score,RNA_TFact,ATAC_TFact.p_val,ATAC_TFact.avg_logFC,ATAC_TFact.p_val_adj,ATAC_TFact
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
3520,SMAD1,early_supporting,2,SMAD1,,0.0,,,1.0,0,1.196736e-09,0.37,1.646709e-06,C,1,2.875578e-191,0.5,5.5211100000000005e-188,1
7756,SMAD1,preGC_I,2,SMAD1,1.855508e-13,0.34,0.39,0.245,2.891809e-09,1,9.532185e-07,-0.28,0.001311629,C,0,4.670586e-06,0.28,0.008967525,1
709,SMAD1,CoelEpi_LHX9,0,SMAD1,0.02438399,0.12,0.305,0.258,1.0,0,0.003883213,0.1,1.0,C,0,0.045547,0.079,1.0,0
2109,SMAD1,early_somatic,0,SMAD1,,0.0,,,1.0,0,0.790859,-0.022,1.0,C,0,,0.0,1.0,0
4932,SMAD1,granulosa,0,SMAD1,0.9382557,0.02,0.254,0.265,1.0,0,0.8542069,0.014,1.0,C,0,3.88174e-05,0.19,0.07452941,0
6344,SMAD1,OSE,0,SMAD1,,0.0,,,1.0,0,0.6414122,-0.013,1.0,C,0,4.812801999999999e-137,-0.67,9.24058e-134,0
9168,SMAD1,preGC_IIa,0,SMAD1,0.2420391,0.019,0.292,0.26,1.0,0,3.000927e-08,-0.34,4.129275e-05,C,0,1.470835e-11,0.066,2.824003e-08,0
10580,SMAD1,preGC_IIb,0,SMAD1,,0.0,,,1.0,0,0.1339438,0.043,1.0,C,0,3.611342e-45,-0.35,6.933775999999999e-42,0


In [20]:
# Show the clusters included in the heatmaps
CLofinterest