# Granulosa lineage HUMANs - TFs prioritization

In [1]:
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(cowplot)
library(reshape2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Print the R version, platform and attached package versions for this session
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.1 LTS

Matrix products: default
BLAS:   /home/jovyan/my-conda-envs/R_Seurat3/lib/libblas.so.3.8.0
LAPACK: /home/jovyan/my-conda-envs/R_Seurat3/lib/liblapack.so.3.8.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] reshape2_1.4.4     cowplot_1.1.1      pheatmap_1.0.12    ggplot2_3.3.3     
[5] RColorBrewer_1.1-2 dplyr_1.0.5       

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.6          plyr_1.8.6          pillar_1.5.1       
 [4] compiler_4.

In [3]:
# Directory prefix for the mouse and monkey DEG tables loaded in later cells
path_to_gonads <- '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'

# Load measurements
## human TFs from multiomics integration

In [4]:
# Read the human granulosa TF measurement table and drop rows without a gene
HUMAN <- read.csv(
  'figures_somatic/granulosa_human_TFs_measurements.csv',
  stringsAsFactors = FALSE
)
HUMAN <- HUMAN[!is.na(HUMAN$gene), ]

# Row key of the form <cluster>__<TF>, used later as the merge key
# NOTE(review): the key is built from `TF` while the filter above uses `gene`
# -- confirm the two columns always agree
HUMAN$ID <- paste0(HUMAN$cluster, '__', HUMAN$TF)

# Table dimensions and number of distinct genes
dim(HUMAN)
length(unique(HUMAN$gene))

In [5]:
# Read the table of prioritized human TFs
HUMAN_prioritized <- read.csv(
  'figures_somatic/granulosa_human_relevantTFs_priotitized.csv',
  stringsAsFactors = FALSE
)
# HUMAN = subset(HUMAN, N_agreement > 1)

# Row key of the form <cluster>__<gene>, compared against the ID column of
# the merged table when selecting TFs
HUMAN_prioritized$ID <- paste0(
  HUMAN_prioritized$cluster, '__', HUMAN_prioritized$gene
)

## mouse

In [6]:
# Load the mouse pre-granulosa DEG table and rename clusters so the labels
# match the ones used in the human table
MOUSE <- read.csv(paste0(path_to_gonads, 'mouse-gonads_pregranulosa_DEGs.tsv'),
                  stringsAsFactors = FALSE, sep = '\t')
MOUSE$cluster[ MOUSE$cluster == 'preGC_II' ] <- 'preGC_IIb'
MOUSE$cluster[ MOUSE$cluster == 'CoelEpi_Lhx9' ] <- 'CoelEpi_LHX9'
# MOUSE = subset(MOUSE, p_val_adj < 0.01 & avg_logFC > 0.1)

# Keep the original mouse symbol; `gene` is overwritten with the human
# ortholog further down
MOUSE$gene_mouse <- MOUSE$gene

# Flag rows that are significantly upregulated (adjusted p < 0.05, logFC > 0)
MOUSE$mouse_DEG <- 0
MOUSE$mouse_DEG[ MOUSE$p_val_adj < 0.05 & MOUSE$avg_logFC > 0 ] <- 1
dim(MOUSE)


# Load the human <-> mouse ortholog table
orthologous <- read.csv('/nfs/team292/vl6/Mouse_Niu2020/biomart_human_mouse_orthologs.tsv',
                        stringsAsFactors = FALSE, sep = '\t')
# Remove duplicates, keeping first the pairs whose symbols are identical up
# to case so that each mouse gene and each human gene appears only once
orthologous$match <- (toupper(orthologous$Mouse.gene.name) == orthologous$Gene.name) * 1
orthologous <- orthologous[ order(orthologous$match, decreasing = TRUE),  ]
orthologous <- orthologous[ ! duplicated(orthologous$Mouse.gene.name),  ]
orthologous <- orthologous[ ! duplicated(orthologous$Gene.name),  ]
rownames(orthologous) <- orthologous$Mouse.gene.name

# Reannotate `gene` with the human ortholog.
# The lookup uses match() on purpose: indexing a data.frame by character row
# names (`orthologous[symbols, ]`) matches row names PARTIALLY, so a mouse
# symbol missing from the table could silently pick up the ortholog of a
# longer symbol it is a unique prefix of. match() is exact and yields NA for
# symbols without an entry.
ortholog_row <- match(MOUSE$gene_mouse, orthologous$Mouse.gene.name)
MOUSE$gene <- orthologous$Gene.name[ ortholog_row ]
# Fall back to the upper-cased mouse symbol when no ortholog is listed
MOUSE$gene[ is.na(MOUSE$gene) ] <- toupper(MOUSE$gene_mouse[ is.na(MOUSE$gene) ])

# Make id: <cluster>__<human gene symbol>
MOUSE$ID <- paste0(MOUSE$cluster, '__', MOUSE$gene)

## monkey

In [7]:
# Read the monkey granulosa DEG table
MONKEY <- read.csv(
  paste0(path_to_gonads, 'monkey-gonads_granulosa_DEGs.tsv'),
  stringsAsFactors = FALSE,
  sep = '\t'
)

# Rename the cluster so its label matches the one used in the human table
MONKEY$cluster[MONKEY$cluster %in% 'preGC_II'] <- 'preGC_IIb'
# MONKEY = subset(MONKEY, p_val_adj < 0.01 & avg_logFC > 0.1)

# 1 for rows with adjusted p < 0.05 and positive logFC, 0 otherwise
MONKEY$monkey_DEG <- 0
MONKEY$monkey_DEG[which(MONKEY$p_val_adj < 0.05 & MONKEY$avg_logFC > 0)] <- 1

# Row key of the form <cluster>__<gene>
MONKEY$ID <- paste0(MONKEY$cluster, '__', MONKEY$gene)
dim(MONKEY)

### Check clusters are shared

In [8]:
# List the distinct cluster labels in the mouse table
unique(MOUSE$cluster)

In [9]:
# List the distinct cluster labels in the monkey table
unique(MONKEY$cluster)

In [10]:
# Cluster labels present in the mouse, human and monkey tables alike
shared <- intersect(
  intersect(MOUSE$cluster, HUMAN$cluster),
  MONKEY$cluster
)

# Human clusters missing from at least one other species
message('different')
print(setdiff(HUMAN$cluster, shared))

# Human clusters found in all three species
message('shared')
print(intersect(HUMAN$cluster, shared))

different



[1] "early_supporting" "early_somatic"    "preGC_IIa"       


shared



[1] "CoelEpi_LHX9" "granulosa"    "OSE"          "preGC_I"      "preGC_IIb"   


# Merge into a dataframe

In [11]:
# add prefix to column names to identify origin
names(MOUSE)[c(1:5)] = paste0('mouse.', names(MOUSE)[c(1:5)])
names(MONKEY)[c(1:5)] = paste0('monkey', names(MONKEY)[c(1:5)])

In [12]:
# HUMAN = subset(HUMAN, gene %in% c(MOUSE$gene, MONKEY$gene) )
# Restrict the mouse and monkey tables to genes present in the human table
MOUSE <- subset(MOUSE, gene %in% HUMAN$gene)
MONKEY <- subset(MONKEY, gene %in% HUMAN$gene)
length(unique(HUMAN$gene))

# Merge the mouse measurements onto the human table (full outer join on ID).
# merge() appends `.x` / `.y` to column names shared by both inputs; keep the
# left-hand (`.x`) copy and drop the right-hand (`.y`) one.
# The patterns are anchored on a literal dot: an unescaped 'y$' would drop
# every column whose name merely ends in "y", and '.x$' would strip the last
# two characters of every column ending in "x".
df <- merge(HUMAN, MOUSE, by = 'ID', all = TRUE)
df <- df[, !grepl('\\.y$', names(df)), drop = FALSE]
names(df) <- sub('\\.x$', '', names(df))
length(unique(df$gene))

# Merge the monkey measurements the same way
df <- merge(df, MONKEY, by = 'ID', all = TRUE)
df <- df[, !grepl('\\.y$', names(df)), drop = FALSE]
names(df) <- sub('\\.x$', '', names(df))
length(unique(df$gene))

## format and save table

In [13]:
# Keep only rows that carry a human N_agreement value
df <- df[!is.na(df$N_agreement), ]

# Rows without a mouse / monkey record count as "not a DEG" in that species
df$mouse_DEG <- replace(df$mouse_DEG, is.na(df$mouse_DEG), 0)
df$monkey_DEG <- replace(df$monkey_DEG, is.na(df$monkey_DEG), 0)

# Number of other species (0, 1 or 2) in which the row is flagged as a DEG
df$N_agreement_species <- with(df, mouse_DEG + monkey_DEG)
# df = subset(df, N_agreement_species > 0 )
length(unique(df$gene))

In [14]:
# Move the identifying columns to the front, keeping the rest in place
coln <- c('gene', 'cluster', 'N_agreement', 'ID')
df <- df[, union(coln, colnames(df))]

# Sort rows so the highest N_agreement comes first
df <- df[order(df$N_agreement, decreasing = TRUE), ]

In [15]:
# Number of distinct genes in the merged table
length(unique(df$gene))

In [16]:
# Write the merged cross-species table to disk, without row names
write.csv(
  x = df,
  file = 'figures_somatic/granulosa_crosspecie_TF_measurements.csv',
  row.names = FALSE
)

In [17]:
# Spot check: show every row of the merged table for the gene ID4
subset(df, gene == 'ID4')

Unnamed: 0_level_0,gene,cluster,N_agreement,ID,TF,ATAC_PWM,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,⋯,mouse.p_val_adj,gene_mouse,mouse_DEG,monkeyp_val,monkeyavg_logFC,monkeypct.1,monkeypct.2,monkeyp_val_adj,monkey_DEG,N_agreement_species
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
350,ID4,CoelEpi_LHX9,2,CoelEpi_LHX9__ID4,ID4,ID4.SwissRegulon,5.199681e-30,0.510867,0.642,0.405,⋯,2.2437259999999998e-49,Id4,1,1.444076e-22,1.1543969,0.633,0.1,1.692457e-18,1,2
3161,ID4,early_supporting,2,early_supporting__ID4,ID4,ID4.MA0824.1.1,2.6164569999999996e-19,0.3271443,0.63,0.407,⋯,,,0,,,,,,0,0
5986,ID4,OSE,2,OSE__ID4,ID4,ID4.SwissRegulon.3,2.5967e-08,0.253718,0.542,0.42,⋯,,,0,0.0002072936,0.1072702,0.184,0.093,1.0,0,0
4573,ID4,granulosa,1,granulosa__ID4,ID4,ID4-HUMAN.H10MO.D.6,,0.0,,,⋯,,,0,,,,,,0,0
8811,ID4,preGC_IIa,1,preGC_IIa__ID4,ID4,ID4.SwissRegulon.4,,0.0,,,⋯,,,0,,,,,,0,0
1755,ID4,early_somatic,0,early_somatic__ID4,ID4,,,0.0,,,⋯,,,0,,,,,,0,0
7399,ID4,preGC_I,0,preGC_I__ID4,ID4,ID4-HUMAN.H10MO.D.2,,0.0,,,⋯,0.001746769,Id4,1,,,,,,0,1
10223,ID4,preGC_IIb,0,preGC_IIb__ID4,ID4,ID4.SwissRegulon.5,,0.0,,,⋯,,,0,,,,,,0,0


## Plot heatmap summary for selected human TFs

In [18]:
# Rows of the merged table whose ID appears in the prioritized human TF list
selectedTF_df <- df[df$ID %in% HUMAN_prioritized$ID, ]

In [19]:
# Order in which clusters are stacked; rows from clusters not listed here
# do not end up in sorted_tfs
cluster_order <- c(
  'CoelEpi_LHX9', 'early_supporting',
  'preGC_I',
  'OSE', 'preGC_IIa', 'preGC_IIb', 'granulosa'
)

# Sort alphabetically by gene, then stack the per-cluster slices in the order
# given above
selectedTF_df <- selectedTF_df[order(selectedTF_df$gene), ]
sorted_tfs <- do.call(
  rbind,
  lapply(cluster_order, function(cl) {
    selectedTF_df[which(selectedTF_df$cluster == cl), ]
  })
)

In [20]:
# Write the cluster-ordered table of prioritized human TFs, without row names
write.csv(
  x = sorted_tfs,
  file = 'figures_somatic/granulosa_crosspecie_TF_measurements_humanTFs.csv',
  row.names = FALSE
)

In [21]:
# Keep only the summary columns
sorted_tfs <- subset(
  sorted_tfs,
  select = c('gene', 'ID', 'N_agreement', 'monkey_DEG', 'mouse_DEG', 'N_agreement_species')
)
# One row per gene: the first occurrence in the current row order is kept
sorted_tfs <- subset(sorted_tfs, !duplicated(gene))
# Gene symbols become the row names
row.names(sorted_tfs) <- sorted_tfs$gene

In [22]:
# Number of distinct genes kept for the summary
length(unique(sorted_tfs$gene))

In [23]:
# The three columns that make up the summary heatmap
xx <- subset(sorted_tfs, select = c('N_agreement', 'monkey_DEG', 'mouse_DEG'))

In [24]:
# Colour ramp from white to a light red
paletteLength <- 50
myColor <- colorRampPalette(
  c("white", brewer.pal(n = 5, name = 'Reds')[2])
)(paletteLength)

# Collapse the first column to presence/absence (any positive value -> 1)
is_positive <- xx[, 1] > 0
xx[is_positive, 1] <- 1

# Cell labels: '*' where the value is positive, empty string otherwise
sym <- xx
sym[] <- lapply(xx, function(column) ifelse(column > 0, '*', ''))

# Draw the heatmap without clustering and write it to a PDF
pheatmap(
  xx,
  cellheight = 10,
  cellwidth = 10,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  display_numbers = sym,
  color = myColor,
  filename = 'figures_somatic/granulosa_human_relevanTFs_crosspecie_summary.pdf'
)

In [25]:
# Display the merged-table rows selected for the prioritized human TFs
selectedTF_df

Unnamed: 0_level_0,gene,cluster,N_agreement,ID,TF,ATAC_PWM,RNA_DEGs.p_val,RNA_DEGs.avg_logFC,RNA_DEGs.pct.1,RNA_DEGs.pct.2,⋯,mouse.p_val_adj,gene_mouse,mouse_DEG,monkeyp_val,monkeyavg_logFC,monkeypct.1,monkeypct.2,monkeyp_val_adj,monkey_DEG,N_agreement_species
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
4244,ARNT2,granulosa,3,granulosa__ARNT2,ARNT2,ARNT2-HUMAN.H10MO.D.6,2.025863e-10,0.287711,0.262,0.144,⋯,4.256818e-24,Arnt2,1,,,,,,0,1
5660,ARX,OSE,2,OSE__ARX,ARX,ARX-HUMAN.H10MO.D.3,7.428609000000001e-17,0.2917867,0.302,0.139,⋯,2.192757e-53,Arx,1,,,,,,0,1
4270,BBX,granulosa,2,granulosa__BBX,BBX,BBX.SwissRegulon.6,1.110236e-09,0.1808088,0.757,0.691,⋯,3.321182e-35,Bbx,1,,,,,,0,1
5694,BNC1,OSE,2,OSE__BNC1,BNC1,,3.727822e-62,0.6074326,0.49,0.147,⋯,1.108752e-129,Bnc1,1,5.676002e-28,0.43665094,0.268,0.036,6.652275e-24,1,2
4371,ELK1,granulosa,2,granulosa__ELK1,ELK1,ELK1-HUMAN.H10MO.A.6,1.184003e-14,0.2837151,0.685,0.549,⋯,,,0,,,,,,0,0
4376,EMX2,granulosa,2,granulosa__EMX2,EMX2,EMX2.SwissRegulon.6,6.923518e-16,0.2446977,0.807,0.689,⋯,,,0,0.04548041,0.04420203,0.792,0.64,1.0,0,0
5789,EMX2,OSE,2,OSE__EMX2,EMX2,EMX2.MA0886.1.3,6.559718e-20,0.2620331,0.845,0.682,⋯,,,0,9.832716e-06,0.24279627,0.744,0.656,0.1152394,0,0
4390,ETS1,granulosa,3,granulosa__ETS1,ETS1,ETS1.MA0098.3.6,5.650235e-29,0.4448545,0.257,0.079,⋯,0.6631871,Ets1,0,,,,,,0,0
4396,ETV5,granulosa,2,granulosa__ETV5,ETV5,ETV5.SwissRegulon.6,4.501048e-18,0.4104564,0.34,0.169,⋯,1.0,Etv5,0,,,,,,0,0
4430,FOXK1,granulosa,2,granulosa__FOXK1,FOXK1,FOXK1.MA0852.2.6,2.237827e-09,0.2461814,0.262,0.148,⋯,1.0,Foxk1,0,,,,,,0,0


## Plot heatmap summary for non-selected human TFs that are conserved

In [26]:
# Rows of the merged table whose ID is NOT in the prioritized human TF list
selectedTF_df <- df[!(df$ID %in% HUMAN_prioritized$ID), ]

In [27]:
# Set 1: N_agreement above 1 and flagged as DEG in at least one other species
selectedTF_df1 <- selectedTF_df[
  with(selectedTF_df, which(N_agreement > 1 & N_agreement_species > 0)),
]
# Set 2: positive RNA_DEGs value and flagged as DEG in both other species
selectedTF_df2 <- selectedTF_df[
  with(selectedTF_df, which(RNA_DEGs > 0 & N_agreement_species == 2)),
]
# Union of the two sets with duplicated rows removed
selectedTF_df <- unique(rbind(selectedTF_df1, selectedTF_df2))

In [28]:
# Sort alphabetically by gene, then stack the per-cluster slices following
# the existing cluster_order vector
selectedTF_df <- selectedTF_df[order(selectedTF_df$gene), ]
sorted_tfs <- do.call(
  rbind,
  lapply(cluster_order, function(cl) {
    selectedTF_df[which(selectedTF_df$cluster == cl), ]
  })
)

In [29]:
# Write the cluster-ordered table of the other conserved TFs, without row names
write.csv(
  x = sorted_tfs,
  file = 'figures_somatic/granulosa_crosspecie_TF_measurements_otherconservedTFs.csv',
  row.names = FALSE
)

In [30]:
# Keep only the summary columns
sorted_tfs <- subset(
  sorted_tfs,
  select = c('gene', 'ID', 'N_agreement', 'monkey_DEG', 'mouse_DEG', 'N_agreement_species')
)
# One row per gene: the first occurrence in the current row order is kept
sorted_tfs <- subset(sorted_tfs, !duplicated(gene))
# Gene symbols become the row names
row.names(sorted_tfs) <- sorted_tfs$gene

In [31]:
# The three columns that make up the summary heatmap
xx <- subset(sorted_tfs, select = c('N_agreement', 'monkey_DEG', 'mouse_DEG'))

In [32]:
# Colour ramp from white to a light red
paletteLength <- 50
myColor <- colorRampPalette(
  c("white", brewer.pal(n = 5, name = 'Reds')[2])
)(paletteLength)

# Collapse the first column to presence/absence (any positive value -> 1)
is_positive <- xx[, 1] > 0
xx[is_positive, 1] <- 1

# Cell labels: '*' where the value is positive, empty string otherwise
sym <- xx
sym[] <- lapply(xx, function(column) ifelse(column > 0, '*', ''))

# Draw the heatmap without clustering and write it to a PDF
pheatmap(
  xx,
  cellheight = 10,
  cellwidth = 10,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  display_numbers = sym,
  color = myColor,
  filename = 'figures_somatic/granulosa_otherconservedTFs_crosspecie_summary.pdf'
)

In [33]:
# Display the cluster ordering vector
cluster_order