# Germ lineage HUMANs - TFs prioritization

In [1]:
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(cowplot)
library(reshape2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Print the R version, platform, locale and attached package versions of this session.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS:   /home/jovyan/my-conda-envs/R_Seurat3/lib/libblas.so.3.8.0
LAPACK: /home/jovyan/my-conda-envs/R_Seurat3/lib/liblapack.so.3.8.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] reshape2_1.4.4     cowplot_1.1.1      pheatmap_1.0.12    ggplot2_3.3.3     
[5] RColorBrewer_1.1-2 dplyr_1.0.5       

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6          plyr_1.8.6          pillar_1.5.1       
 [4] compiler_4.

In [3]:
# Directory prefix (with trailing slash) that the mouse and monkey DEG tables
# are read from.
path_to_gonads <- "/nfs/team292/lg18/with_valentina/gonadsV2_revision/"

# Load measurements
## human TFs from multiomics integration

In [4]:
# Read the human TF measurement table, discard rows without a gene symbol and
# build a "<cluster>__<TF>" key used later to join the other species.
HUMAN <- read.csv(
  "figures_germcells/human_TFs_measurements.csv",
  stringsAsFactors = FALSE
)
HUMAN <- HUMAN[!is.na(HUMAN$gene), , drop = FALSE]
HUMAN$ID <- paste(HUMAN$cluster, HUMAN$TF, sep = "__")
dim(HUMAN)
length(unique(HUMAN[["gene"]]))

In [5]:
# Read the table of prioritized human TFs and give it a "<cluster>__<gene>" key.
# NOTE(review): this key is built from `gene`, whereas HUMAN$ID is built from
# `TF`; the two only line up where TF and gene carry the same symbol - confirm.
HUMAN_prioritized <- read.csv(
  "figures_germcells/human_relevantTFs_priotitized.csv",
  stringsAsFactors = FALSE
)
# HUMAN = subset(HUMAN, N_agreement > 1)
HUMAN_prioritized$ID <- paste(
  HUMAN_prioritized$cluster,
  HUMAN_prioritized$gene,
  sep = "__"
)

## mouse

In [6]:
# Load the mouse germ-cell DEG table and flag up-regulated, significant genes
# (adjusted p-value < 0.05 and positive log fold-change) with mouse_DEG = 1.
MOUSE <- read.csv(paste0(path_to_gonads, 'mouse-gonads_germcells_DEGs.tsv'), stringsAsFactors = FALSE, sep = '\t')
MOUSE$gene_mouse <- MOUSE$gene  # keep the original mouse symbol
MOUSE$mouse_DEG <- 0
MOUSE$mouse_DEG[ MOUSE$p_val_adj < 0.05 & MOUSE$avg_logFC > 0 ] <- 1
dim(MOUSE)


# Load the human/mouse ortholog table
orthologous <- read.csv('/nfs/team292/vl6/Mouse_Niu2020/biomart_human_mouse_orthologs.tsv', stringsAsFactors = FALSE, sep = '\t')
# Remove duplicates, prioritizing pairs whose upper-cased mouse symbol equals
# the human symbol: those rows are sorted first, so duplicated() keeps them.
orthologous$match <- (toupper(orthologous$Mouse.gene.name) == orthologous$Gene.name) * 1
orthologous <- orthologous[ order(orthologous$match, decreasing = TRUE),  ]
orthologous <- orthologous[ ! duplicated(orthologous$Mouse.gene.name),  ]
orthologous <- orthologous[ ! duplicated(orthologous$Gene.name),  ]
rownames(orthologous) <- orthologous$Mouse.gene.name

# Reannotate `gene` with the human ortholog symbol.
# The lookup uses match() instead of orthologous[ MOUSE$gene_mouse, ] because
# character row indexing of a data.frame partially matches row names: a mouse
# symbol missing from the table could silently receive the ortholog of a longer
# symbol it is a unique prefix of. match() is exact and yields NA when absent.
ortholog_row <- match(MOUSE$gene_mouse, orthologous$Mouse.gene.name)
MOUSE$gene <- orthologous$Gene.name[ ortholog_row ]
# Fall back to the upper-cased mouse symbol when no ortholog was found
MOUSE$gene[ is.na(MOUSE$gene) ] <- toupper(MOUSE$gene_mouse[ is.na(MOUSE$gene) ])

# Make id: "<cluster>__<human gene symbol>"
MOUSE$ID <- paste0(MOUSE$cluster, '__', MOUSE$gene)

## monkey

In [7]:
# Read the monkey germ-cell DEG table, mark genes that are significant
# (adjusted p-value < 0.05) with a positive log fold-change as monkey_DEG = 1,
# and build the "<cluster>__<gene>" key.
MONKEY <- read.csv(
  paste0(path_to_gonads, "monkey-gonads_germcells_DEGs.tsv"),
  stringsAsFactors = FALSE,
  sep = "\t"
)
MONKEY$monkey_DEG <- 0
MONKEY$monkey_DEG[which(MONKEY$p_val_adj < 0.05 & MONKEY$avg_logFC > 0)] <- 1
MONKEY$ID <- paste(MONKEY$cluster, MONKEY$gene, sep = "__")
dim(MONKEY)

### Check clusters are shared

In [8]:
# Cluster labels present in all three species tables.
shared <- intersect(intersect(MOUSE$cluster, HUMAN$cluster), MONKEY$cluster)
# Human clusters that are missing from at least one other species ...
message("different")
print(setdiff(HUMAN$cluster, shared))
# ... and the human clusters found in every species.
message("shared")
print(intersect(HUMAN$cluster, shared))

different



[1] "GC"                "pre_spermatogonia" "GC_mitotic"       


shared



[1] "oocyte"          "oogonia_meiotic" "oogonia_STRA8"   "PGC"            
[5] "pre_oocyte"     


# Merge into a dataframe

In [9]:
# Tag the first five columns of each species table with the species name so
# their origin stays identifiable after merging.
# NOTE(review): the monkey prefix has no separating dot (columns become e.g.
# "monkeyp_val"), unlike "mouse."; left unchanged because the saved tables
# already use these names.
colnames(MOUSE)[seq_len(5)] <- paste0("mouse.", colnames(MOUSE)[seq_len(5)])
colnames(MONKEY)[seq_len(5)] <- paste0("monkey", colnames(MONKEY)[seq_len(5)])

In [10]:
# Restrict the mouse and monkey tables to genes present in the human table.
# HUMAN = subset(HUMAN, gene %in% c(MOUSE$gene, MONKEY$gene) )
MOUSE <- subset(MOUSE, gene %in% c(HUMAN$gene) )
MONKEY <- subset(MONKEY, gene %in% c(HUMAN$gene) )
length(unique(HUMAN$gene))

# Merge human and mouse measurements (full outer join on ID).
# merge() renames columns shared by both inputs to "<name>.x" / "<name>.y".
# The right-hand ".y" copies are dropped and the ".x" suffix is stripped.
# The patterns are anchored on a literal dot: the unescaped 'y$' / '.x$' would
# also drop any column that merely ends in "y" and truncate any name ending
# in "x".
df <- merge(HUMAN, MOUSE, by = 'ID', all = TRUE)
df <- df[, grep('\\.y$', names(df), invert = TRUE), drop = FALSE ]
colnames(df) <- sub('\\.x$', '', colnames(df))
length(unique(df$gene))

# Merge in the monkey measurements the same way.
df <- merge(df, MONKEY, by = 'ID', all = TRUE)
df <- df[, grep('\\.y$', names(df), invert = TRUE), drop = FALSE ]
colnames(df) <- sub('\\.x$', '', colnames(df))
length(unique(df$gene))

## format and save table

In [11]:
# Keep rows that carry a human N_agreement value, treat a missing species flag
# as "not a DEG" (0), and count in how many of the two species the gene is a DEG.
df <- df[!is.na(df[["N_agreement"]]), , drop = FALSE]
df$mouse_DEG <- replace(df$mouse_DEG, is.na(df$mouse_DEG), 0)
df$monkey_DEG <- replace(df$monkey_DEG, is.na(df$monkey_DEG), 0)
df$N_agreement_species <- df$mouse_DEG + df$monkey_DEG
# df = subset(df, N_agreement_species > 0 )
length(unique(df[["gene"]]))

In [12]:
# reorganize columns
coln = c('TF', 'cluster', 'N_agreement', 'ID')
df = df[ , c(coln, setdiff(colnames(df), coln) ) ]

# order by agreement
df = df[ order(df$N_agreement, decreasing = T) , ]

In [13]:
# Number of distinct genes left in the merged table.
length(unique(df[["gene"]]))

In [14]:
# Write the merged cross-species table to disk, without row names.
write.csv(
  df,
  "figures_germcells/crosspecie_TF_measurements.csv",
  row.names = FALSE
)

In [15]:
# Spot-check: show every row of the merged table for the gene ID4.
df[df$gene %in% "ID4", , drop = FALSE]

Unnamed: 0_level_0,TF,cluster,N_agreement,ID,ATAC_PWM,gene,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,⋯,mouse.p_val_adj,gene_mouse,mouse_DEG,monkeyp_val,monkeyavg_logFC,monkeypct.1,monkeypct.2,monkeyp_val_adj,monkey_DEG,N_agreement_species
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
10374,ID4,pre_spermatogonia,3,pre_spermatogonia__ID4,ID4.SwissRegulon.6,ID4,7.858912000000001e-75,0.72091062,0.812,0.398,⋯,,,0,,,,,,0,0
3195,ID4,oocyte,2,oocyte__ID4,ID4.MA0824.1.5,ID4,,0.0,,,⋯,,,0,0.628283583,0.01008425,0.088,0.105,1.0,0,0
354,ID4,GC,1,GC__ID4,ID4.SwissRegulon.1,ID4,1.362251e-09,0.21378466,0.6,0.429,⋯,,,0,,,,,,0,0
4630,ID4,oogonia_meiotic,1,oogonia_meiotic__ID4,ID4.SwissRegulon.3,ID4,,0.0,,,⋯,,,0,,,,,,0,0
8938,ID4,pre_oocyte,1,pre_oocyte__ID4,ID4.SwissRegulon.4,ID4,,0.0,,,⋯,1.991178e-42,Id4,1,0.003078191,0.22306552,0.169,0.084,1.0,0,1
1776,ID4,GC_mitotic,0,GC_mitotic__ID4,,ID4,8.585621e-06,0.06223148,0.622,0.425,⋯,,,0,,,,,,0,0
6067,ID4,oogonia_STRA8,0,oogonia_STRA8__ID4,ID4.SwissRegulon.2,ID4,,0.0,,,⋯,,,0,,,,,,0,0
7503,ID4,PGC,0,PGC__ID4,ID4-HUMAN.H10MO.D,ID4,3.293004e-07,0.09836821,0.628,0.425,⋯,,,0,,,,,,0,0


## Plot heatmap summary for selected human TFs

In [16]:
# Rows of the merged table whose cluster/TF key is in the prioritized human list.
selectedTF_df <- df[df$ID %in% HUMAN_prioritized$ID, , drop = FALSE]

In [17]:
# Arrange the selected TFs cluster by cluster (in `cluster_order`), with genes
# sorted alphabetically inside each cluster. Rows whose cluster is not listed
# in `cluster_order` (e.g. "GC_mitotic") are left out of the result.
cluster_order <- c(
  "PGC", "GC", "oogonia_STRA8", "oogonia_meiotic",
  "pre_oocyte", "oocyte", "pre_spermatogonia"
)
selectedTF_df <- selectedTF_df[order(selectedTF_df$gene), ]
sorted_tfs <- do.call(
  rbind,
  lapply(cluster_order, function(cl) {
    subset(selectedTF_df, cluster == cl)
  })
)

In [18]:
# Write the cluster-ordered table of prioritized human TFs, without row names.
write.csv(
  sorted_tfs,
  "figures_germcells/crosspecie_TF_measurements_humanTFs.csv",
  row.names = FALSE
)

In [19]:
# Reduce to the summary columns and keep one row per gene - the first one in
# the current (cluster-ordered) row order - then index rows by gene symbol.
sorted_tfs <- sorted_tfs[, c(
  "gene", "ID", "N_agreement",
  "monkey_DEG", "mouse_DEG", "N_agreement_species"
)]
sorted_tfs <- sorted_tfs[!duplicated(sorted_tfs$gene), ]
row.names(sorted_tfs) <- sorted_tfs$gene

In [20]:
# Number of distinct genes that will appear in the heatmap.
length(unique(sorted_tfs[["gene"]]))

In [21]:
# Heatmap input: one row per gene, columns = human agreement and species flags.
xx <- sorted_tfs[, c("N_agreement", "monkey_DEG", "mouse_DEG")]

In [22]:
# Colour ramp from white to a light red, in `paletteLength` steps.
paletteLength <- 50
myColor <- colorRampPalette(
  c("white", brewer.pal(n = 5, name = "Reds")[2])
)(paletteLength)

# Binarise the first column (N_agreement): any positive value becomes 1.
xx[xx[, 1] > 0, 1] <- 1

# Cell labels: "*" where the value is positive, empty otherwise.
sym <- xx
sym[] <- ""
sym[xx > 0] <- "*"

# Draw the unclustered heatmap straight to a PDF file.
pheatmap(
  xx,
  cellheight = 10,
  cellwidth = 10,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  display_numbers = sym,
  color = myColor,
  filename = "figures_germcells/human_relevanTFs_crosspecie_summary.pdf"
)

## Plot heatmap summary for non-selected human TFs that are conserved

In [23]:
# Rows of the merged table whose cluster/TF key is NOT in the prioritized list.
selectedTF_df <- df[!(df$ID %in% HUMAN_prioritized$ID), , drop = FALSE]

In [24]:
# Two ways for a non-prioritized TF to count as conserved:
#   1. N_agreement above 1 and a DEG in at least one other species;
#   2. RNA_DEGs positive and a DEG in both other species.
selectedTF_df1 <- selectedTF_df[
  which(selectedTF_df$N_agreement > 1 & selectedTF_df$N_agreement_species > 0),
  ,
  drop = FALSE
]
selectedTF_df2 <- selectedTF_df[
  which(selectedTF_df$RNA_DEGs > 0 & selectedTF_df$N_agreement_species == 2),
  ,
  drop = FALSE
]
# Stack both selections and drop rows that satisfied both criteria.
selectedTF_df <- unique(rbind(selectedTF_df1, selectedTF_df2))

In [25]:
# Arrange the conserved TFs cluster by cluster (in `cluster_order`), with genes
# sorted alphabetically inside each cluster. Rows whose cluster is not listed
# in `cluster_order` (e.g. "GC_mitotic") are left out of the result.
cluster_order <- c(
  "PGC", "GC", "oogonia_STRA8", "oogonia_meiotic",
  "pre_oocyte", "oocyte", "pre_spermatogonia"
)
selectedTF_df <- selectedTF_df[order(selectedTF_df$gene), ]
sorted_tfs <- do.call(
  rbind,
  lapply(cluster_order, function(cl) {
    subset(selectedTF_df, cluster == cl)
  })
)

In [26]:
# Write the cluster-ordered table of other conserved TFs, without row names.
write.csv(
  sorted_tfs,
  "figures_germcells/crosspecie_TF_measurements_otherconservedTFs.csv",
  row.names = FALSE
)

In [27]:
# Reduce to the summary columns and keep one row per gene - the first one in
# the current (cluster-ordered) row order - then index rows by gene symbol.
sorted_tfs <- sorted_tfs[, c(
  "gene", "ID", "N_agreement",
  "monkey_DEG", "mouse_DEG", "N_agreement_species"
)]
sorted_tfs <- sorted_tfs[!duplicated(sorted_tfs$gene), ]
row.names(sorted_tfs) <- sorted_tfs$gene

In [28]:
# Heatmap input: one row per gene, columns = human agreement and species flags.
xx <- sorted_tfs[, c("N_agreement", "monkey_DEG", "mouse_DEG")]

In [29]:
# Colour ramp from white to a light red, in `paletteLength` steps.
paletteLength <- 50
myColor <- colorRampPalette(
  c("white", brewer.pal(n = 5, name = "Reds")[2])
)(paletteLength)

# Binarise the first column (N_agreement): any positive value becomes 1.
xx[xx[, 1] > 0, 1] <- 1

# Cell labels: "*" where the value is positive, empty otherwise.
sym <- xx
sym[] <- ""
sym[xx > 0] <- "*"

# Draw the unclustered heatmap straight to a PDF file.
pheatmap(
  xx,
  cellheight = 10,
  cellwidth = 10,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  display_numbers = sym,
  color = myColor,
  filename = "figures_germcells/otherconservedTFs_crosspecie_summary.pdf"
)