In [1]:
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(pheatmap)
library(cowplot)
library(reshape2)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Load the per-cluster average log-expression table; the first CSV column
# supplies the row names.
avgLog2 <- read.csv("figures/data_utils/germcells_average_log.csv",
                    row.names = 1, stringsAsFactors = FALSE)

# Strip the lineage prefixes from the column names.
# NOTE(review): these patterns are regular expressions, so the trailing "."
# matches any single character and the match is not anchored to the start of
# the name -- confirm this is intended before tightening it.
for (prefix_pattern in c("Germ.", "Somatic.")) {
  colnames(avgLog2) <- gsub(prefix_pattern, "", colnames(avgLog2))
}

In [3]:
# ZGLP1 is taken from the Niu + Sanger dataset only, because the Meyer
# dataset has no information for this gene.
avgLog2_ZGLP1 <- read.csv("figures/data_utils/average_log.csv",
                          row.names = 1, stringsAsFactors = FALSE)

# Strip the lineage prefixes from the column names.
# NOTE(review): these patterns are regular expressions, so the trailing "."
# matches any single character and the match is not anchored to the start of
# the name -- confirm this is intended before tightening it.
for (prefix_pattern in c("Germ.", "Somatic.")) {
  colnames(avgLog2_ZGLP1) <- gsub(prefix_pattern, "", colnames(avgLog2_ZGLP1))
}

In [4]:
# Load the human <-> mouse ortholog table (tab-separated; columns used below:
# Mouse.gene.name and Gene.name).
orthologous <- read.csv("/nfs/team292/vl6/Mouse_Niu2020/biomart_human_mouse_orthologs.tsv",
                        stringsAsFactors = FALSE, sep = "\t")

# Reduce the table to a one-to-one mapping. Pairs whose mouse symbol equals the
# human symbol once upper-cased are sorted first, so duplicated() keeps them in
# preference to any other pairing of the same gene.
orthologous$match <- (toupper(orthologous$Mouse.gene.name) == orthologous$Gene.name) * 1
orthologous <- orthologous[order(orthologous$match, decreasing = TRUE), ]
orthologous <- orthologous[!duplicated(orthologous$Mouse.gene.name), ]
orthologous <- orthologous[!duplicated(orthologous$Gene.name), ]
rownames(orthologous) <- orthologous$Mouse.gene.name

# Re-annotate the rows of avgLog2 (mouse symbols) with the human ortholog symbol.
# match() does an exact lookup; indexing a data.frame by character row name
# would fall back to partial matching.
human_symbol <- orthologous$Gene.name[match(rownames(avgLog2), orthologous$Mouse.gene.name)]

# Drop genes without a usable human symbol BEFORE renaming: data.frame row
# names cannot be NA, so filtering afterwards can never take effect.
has_ortholog <- !is.na(human_symbol) & nzchar(human_symbol)
avgLog2 <- avgLog2[has_ortholog, , drop = FALSE]
rownames(avgLog2) <- human_symbol[has_ortholog]

In [5]:
# Reload the raw ortholog table (this replaces the de-duplicated `orthologous`
# built in the previous cell) and display the rows for three genes of interest.
orthologous <- read.csv("/nfs/team292/vl6/Mouse_Niu2020/biomart_human_mouse_orthologs.tsv",
                        sep = "\t", stringsAsFactors = FALSE)
orthologous[orthologous$Gene.name %in% c("ZGLP1", "RARG", "ID4"), , drop = FALSE]

Unnamed: 0_level_0,Gene.stable.ID,Mouse.gene.stable.ID,Mouse.gene.name,Gene.name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>
5319,ENSG00000172201,ENSMUSG00000021379,Id4,ID4
8580,ENSG00000220201,ENSMUSG00000079681,Zglp1,ZGLP1
13322,ENSG00000172819,ENSMUSG00000001288,Rarg,RARG


In [6]:
# Cross-species TF measurements table; the `cluster` and `TF` columns are used below.
TF_interspecie <- read.csv(file = "../human_v2/figures_germcells/crosspecie_TF_measurements_humanTFs.csv",
                           stringsAsFactors = FALSE)

In [7]:
# Clusters of interest: clusters listed in the TF table that are also columns
# of the expression table.
CLofInterest <- intersect(unique(TF_interspecie$cluster), colnames(avgLog2))

# Genes of interest: every distinct TF listed in the TF table.
TFofInterest <- unique(TF_interspecie$TF)
# TFofInterest_mouse = TF_interspecie[ ! duplicated(TF_interspecie$TF) , ]$gene_mouse

In [8]:
# Restrict the expression table to the clusters of interest and z-score each
# gene (row) across those clusters.
# drop = FALSE keeps a data.frame even when a single cluster is selected.
avgLog2 <- avgLog2[, CLofInterest, drop = FALSE]

# scale() standardises columns, so transpose to put genes in columns, scale,
# and transpose back to genes x clusters. Row and column names carry through.
avgLog2_scaled <- t(scale(t(as.matrix(avgLog2))))

# Remove the bookkeeping attributes scale() attaches so that the result is a
# plain numeric matrix.
attr(avgLog2_scaled, "scaled:center") <- NULL
attr(avgLog2_scaled, "scaled:scale") <- NULL

In [9]:
# Zero-filled template with one row per TF of interest and one column per
# cluster of interest.
M <- array(0,
           dim = c(length(TFofInterest), length(CLofInterest)),
           dimnames = list(TFofInterest, CLofInterest))

In [10]:
# Start from the zero template and copy in the scaled values for every TF that
# has a row in the expression table; TFs without one keep their all-zero row.
MavgLog2 <- M

idx <- TFofInterest %in% rownames(avgLog2)
tf_present <- TFofInterest[idx]
MavgLog2[tf_present, CLofInterest] <- avgLog2_scaled[tf_present, CLofInterest]

In [11]:
# Add ZGLP1 information from the Sanger + Niu datasets.
# Only clusters present in both the ZGLP1 table and the heatmap matrix are used.
cname <- intersect(colnames(avgLog2_ZGLP1), colnames(MavgLog2))

# z-score the mouse Zglp1 row across those clusters
zglp1_z <- as.numeric(scale(as.numeric(avgLog2_ZGLP1["Zglp1", cname])))

# Store the scaled values back in the source table and in the human ZGLP1 row
# of the heatmap matrix.
avgLog2_ZGLP1["Zglp1", cname] <- zglp1_z
MavgLog2["ZGLP1", cname] <- zglp1_z

In [12]:
# Plot the TF x cluster z-score matrix as a heatmap and write it to a PDF.
paletteLength <- 50
# Diverging palette: blue -> white -> red
myColor <- colorRampPalette(c(brewer.pal(n = 5, name = "Blues")[4],
                              "white",
                              brewer.pal(n = 5, name = "Reds")[4]))(paletteLength)

# scale() yields NaN for a row with zero variance; ignore such cells when
# computing the colour range, otherwise seq() fails on a non-finite endpoint.
zMin <- min(MavgLog2, na.rm = TRUE)
zMax <- max(MavgLog2, na.rm = TRUE)

# length(myBreaks) must equal paletteLength + 1.
# The lower half runs from the minimum to 0 and the upper half from just above
# 0 to the maximum; ceiling/floor handle odd and even palette lengths.
myBreaks <- c(seq(zMin, 0, length.out = ceiling(paletteLength / 2) + 1),
              seq(zMax / paletteLength, zMax, length.out = floor(paletteLength / 2)))

pheatmap(MavgLog2, cellheight = 10, cellwidth = 10,
         cluster_rows = FALSE, cluster_cols = FALSE,
         color = myColor, breaks = myBreaks,
         filename = "figures_germcells/heatmap_humanTFs_Zscores.pdf")