### GO Term Analysis

In [None]:
# BiocManager::install(c("topGO", "KEGGREST", "org.At.tair.db", "Rgraphviz","edgeR"))
# BiocManager::install("org.Hs.eg.db")
library(edgeR)
library(topGO)
library(KEGGREST)
library(Seurat)
library(magrittr)
library(dplyr)
library(Matrix)
library(xtable)
library(tidyr)
library(ggplot2)
library(org.Hs.eg.db)
library(ComplexHeatmap)

In [None]:
# Load the serialized object that the cells below use as a Seurat object
# (it is passed to levels(), RenameIdents() and FindAllMarkers()).
# NOTE(review): the file name suggests merged, already-scaled data -- confirm.
SO <- readRDS('Scaled_merged_data.rds')

In [None]:
# Load the raw per-cell table only to recover the complete list of genes.
# NOTE(review): read.csv() defaults to check.names = TRUE, which rewrites
# column names that are not syntactic R names (e.g. names containing "-").
# Confirm the gene symbols in this file come through unchanged.
ts5 <- read.csv('aaq0681_TableS5.csv') #Healthy

# Every column that is not per-cell metadata is kept as a gene column.
count_table <- ts5[, setdiff(names(ts5),
                             c('cell_id', 'ident', 'orig.ident',
                               'tSNE_1', 'tSNE_2', 'nGene'))]

# Background gene universe used by the GO enrichment cells below.
allGenes <- colnames(count_table) #NEED THIS!

In [None]:
# Merge replicate identities so each time point becomes a single identity.
# Labels are assigned by position: the first four levels of SO become 'CT',
# then two levels each for '18dpa', '25dpa' and '38dpa'.
timepoints <- setNames(rep(c('CT', '18dpa', '25dpa', '38dpa'), times = c(4, 2, 2, 2)),
                       levels(SO))
SO <- RenameIdents(SO, timepoints)

# Keep the collapsed identity as a metadata column as well.
SO[["time.points"]] <- Idents(object = SO)

In [None]:
#Finds top DE genes
#Documentation: https://rpubs.com/kshekhar/349874
# `thresh.use` (used in the tutorial above) is the old Seurat v2 argument name.
# This notebook reads the `avg_log2FC` column, i.e. it runs on a Seurat release
# where the fold-change cutoff is called `logfc.threshold`; the old name would
# be swallowed by `...` and the cutoff would silently fall back to the default.
SO.markers <- FindAllMarkers(object = SO, only.pos = TRUE, min.pct = 0.25,
                             logfc.threshold = 0.25)

# Preview: the two markers with the largest log2 fold change per group
SO.markers %>% group_by(cluster) %>% top_n(2, avg_log2FC)

In [None]:
# Ten markers with the largest log2 fold change in each group
top10 <- top_n(group_by(SO.markers, cluster), 10, avg_log2FC)

# Expression heatmap of those markers; small fonts so the gene labels fit.
# (remove.key = FALSE was left disabled in the original call.)
DoHeatmap(object = SO, features = top10$gene, label = TRUE, size = 2) +
    theme(text = element_text(size = 4),
          axis.text.y = element_text(size = 4))

In [None]:
# Documentation: https://github.com/SingleCellTranscriptomics/2018_scWorkshop
#' GO enrichment of a foreground gene set against a background using topGO
#'
#' Builds a 0/1 indicator over the background genes (1 = gene is also in the
#' foreground), wraps it in a `topGOdata` object annotated by gene SYMBOL, runs
#' the requested test and tabulates the top-scoring GO terms.
#'
#' @param fg.genes Vector of foreground gene symbols.
#' @param bg.genes Vector of background gene symbols (the gene universe).
#'   Duplicated symbols are collapsed.
#' @param organism `"Mouse"` (org.Mm.eg.db) or `"Human"` (org.Hs.eg.db).
#' @param ontology.use One of `"BP"`, `"CC"` or `"MF"`.
#' @param stats.use Test statistic passed to `runTest()`.
#' @param algorithm.use Algorithm passed to `runTest()`.
#' @param topnodes.print Number of top GO terms to put in the result table.
#' @param num.char Maximum number of characters kept for each term name.
#' @return A list with elements `GOdata` (the topGOdata object), `res.result`
#'   (the `runTest()` result) and `res.table` (the `GenTable()` data frame).
topGOterms <- function(fg.genes = NULL,
                       bg.genes = NULL,
                       organism = "Mouse",
                       ontology.use = "BP",
                       stats.use = "fisher",
                       algorithm.use = "weight01",
                       topnodes.print = 20,
                       num.char = 100) {

  # --- Validate arguments before loading any package -----------------------
  # `||` is the scalar, short-circuiting operator that belongs in `if`.
  if (is.null(fg.genes) || is.null(bg.genes)) {
    stop("Error : fg.genes and bg.genes must both be supplied")
  }

  mapping.use <- switch(organism,
    Mouse = "org.Mm.eg.db",
    Human = "org.Hs.eg.db",
    stop("Error : Organism must be \"Mouse\" or \"Human\"")
  )

  if (!(ontology.use %in% c("BP", "CC", "MF"))) {
    stop("Error: Ontology not available. Should be one of BP, CC or MF")
  }

  # --- Foreground indicator over the background ----------------------------
  # Duplicated background symbols would give duplicated names, of which only
  # the first would ever be flagged, so collapse them first.
  bg.genes <- unique(as.character(bg.genes))
  geneList <- integer(length(bg.genes))
  names(geneList) <- bg.genes
  geneList[intersect(bg.genes, as.character(fg.genes))] <- 1L

  n.fg <- sum(geneList)
  print(paste0("Total ", length(geneList), " genes. ", n.fg, " genes in the foreground"))

  # The indicator is turned into a factor below; with no selected genes, or
  # only selected genes, it would have a single level and the topGOdata
  # constructor would fail with a much less helpful message.
  if (n.fg == 0 || n.fg == length(geneList)) {
    stop("Error : the foreground must overlap the background without covering all of it")
  }
  geneList <- factor(geneList)

  # --- Packages ------------------------------------------------------------
  # library() fails loudly when a package is missing; require() only warns.
  library(topGO)
  library(mapping.use, character.only = TRUE)

  print(paste0("Using Ontology : ", ontology.use))

  # Make GO object
  GOdata <- new("topGOdata",
                description = "GOanalysis",
                ontology = ontology.use,
                allGenes = geneList,
                annot = annFUN.org,
                mapping = mapping.use,
                ID = "SYMBOL",
                nodeSize = 10)

  print(paste0("Using the ", stats.use, " statistic with the ", algorithm.use, " algorithm"))
  res.result <- runTest(GOdata, statistic = stats.use, algorithm = algorithm.use)

  list(
    GOdata = GOdata,
    res.result = res.result,
    res.table = GenTable(GOdata, pval = res.result,
                         topNodes = topnodes.print, numChar = num.char)
  )
}

In [None]:
#Creates list of diff expressed genes by their pvalue
# geneList <- rep(0,length(allGenes))
# names(geneList) <- allGenes
# geneList[SO.markers$gene] <- SO.markers$p_val
# geneList

# SO.time.marker.gene <- unique(SO.markers$gene) # Takes all the unique cell type specific genes

# myGOdata <- new("topGOdata",
#                 description = 'GOanalysis',
#                 ontology = "BP",
#                 allGenes = geneList,
#                 geneSelectionFun = SO.time.marker.gene, # Selects for significant genes
#                 annot = annFUN.org,
#                 mapping = "org.Hs.eg.db",
#                 ID = "SYMBOL",
#                nodeSize=10)

In [None]:
# GO enrichment over the whole dataset: the foreground is the union of the
# marker genes found for every group in SO.markers, the background is allGenes.
SO.time.marker.gene <- unique(SO.markers$gene)
GOterms <- topGOterms(
    fg.genes = SO.time.marker.gene,
    bg.genes = allGenes,
    organism = "Human",
    topnodes.print = 20
)

In [None]:
# Display the top GO terms table (the `res.table` element returned by
# topGOterms) for the whole-dataset enrichment.
GOterms$res.table

In [None]:
#Performs GO on each groupings
clust <- levels(SO.markers$cluster) #Time points list

# Run the enrichment once per time point and collect the tables in a list.
# Binding them in a single step avoids growing a data frame with rbind()
# inside a loop, and the per-iteration results stay local instead of
# overwriting the global `GOterms` from the whole-dataset run.
per.time.tables <- lapply(clust, function(time.point) {
    markers.at.time <- SO.markers[which(SO.markers$cluster == time.point), ]
    enrichment <- topGOterms(fg.genes = markers.at.time$gene, bg.genes = allGenes,
                             organism = "Human", topnodes.print = 100)
    enrichment$res.table$time <- time.point #Adds column to delineate time point for each result
    enrichment$res.table
})

combined.table <- do.call(rbind, per.time.tables) #Combines table
head(combined.table)

In [None]:
# GenTable() returns p-values as text and writes values below its display
# limit as strings such as "< 1e-30". A plain as.numeric() turns those into NA,
# and top_n() then drops exactly the most significant terms. Strip the "<"
# marker so they are kept at the reported bound.
combined.table$pval <- as.numeric(sub("^\\s*<\\s*", "", combined.table$pval))

# 20 terms with the lowest p-value per time point (negative n = smallest values)
top20 <- combined.table %>%
    group_by(time) %>%
    top_n(-20, pval) %>%
    ungroup()
top20

In [None]:
#Creates a matrix of pvals for the top terms
# One row per GO term, one column per time point. A term that was not reported
# for a time point is filled with p = 1, which becomes 0 after the transform.
GOmatrix <- pivot_wider(combined.table[c('Term','pval','time')],
                        names_from = time,
                        values_from = pval,
                        values_fill = 1)
GOterms <- GOmatrix$Term
GOmatrix <- data.matrix(GOmatrix[,-1])
rownames(GOmatrix) <- GOterms

#Scales the pvals by -log10 for easier visualization
# (the previous code used the natural log although log10 was intended;
# larger values now mean more significant terms on a base-10 scale)
GOmatrix <- -log10(GOmatrix)

In [None]:
# Keep only the rows for terms that appear in top20.
# drop = FALSE keeps the result a matrix even when a single term is selected;
# otherwise it would collapse to a plain vector.
GOmatrix <- GOmatrix[unique(top20$Term), , drop = FALSE]


In [None]:
# options(repr.plot.width=14, repr.plot.height=10)

# Heatmap of the transformed p-values: GO terms in rows, time points in columns.
# GOmatrix holds negated log p-values, so the legend is labelled accordingly.
GOheatmap <- Heatmap(GOmatrix,
        name = '-Log(pval)',
        col = c('white','red'),
        rect_gp = gpar(col = 'grey85'),
        cluster_rows = TRUE,

        #Row features
        show_row_dend = TRUE,
        row_title = 'Top GO Terms',
        row_title_side = 'left',
        row_title_gp = gpar(fontsize = 11, fontface = 'bold'),
        row_title_rot = 90,
        show_row_names = TRUE,
        row_names_gp = gpar(fontsize = 8),
        row_names_side = 'right',
        row_dend_width = unit(20, 'mm'),

        #Col features
        cluster_columns = TRUE,
        show_column_dend = TRUE,
        column_title = 'Time points',
        column_title_side = 'top',
        column_title_gp = gpar(fontsize = 11, fontface = 'bold'),
        column_title_rot = 0,
        show_column_names = TRUE,
        column_names_gp = gpar(fontsize = 6),
        column_dend_height = unit(5, 'mm'),

        #Legend + dimensions
        heatmap_legend_param = list(direction = "horizontal"),
        width = unit(3,'cm'),
        height = unit(18,'cm')
       )

#Save file
jpeg(file="GOTerm.jpg", units='cm',
    width=28,
    height=28,
    res=1000)

# Close the JPEG device even if drawing fails, so a failed run does not leave
# an open device and a half-written file behind. invisible() stops the returned
# heatmap from being printed again after the device has been closed.
invisible(tryCatch(
    draw(GOheatmap,
         heatmap_legend_side = 'bottom',
         merge_legend = TRUE),
    finally = dev.off()
))

In [None]:
# Copies data to GBucket
bucket <- Sys.getenv('WORKSPACE_BUCKET')

# With the variable unset the destination would be just "/", i.e. gsutil would
# try to copy the image to the local filesystem root instead of a bucket.
if (!nzchar(bucket)) {
    stop("WORKSPACE_BUCKET is not set; cannot copy GOTerm.jpg to the workspace bucket",
         call. = FALSE)
}

save.path = "/"
# Quote the destination so the shell receives it as a single argument
command = paste0("gsutil cp GOTerm.jpg ", shQuote(paste0(bucket, save.path)))
system(command, intern=TRUE)

In [None]:
# NOTE(review): `cluster.table` and its `clust` column are not created in any
# of the cells visible here -- confirm where they come from before running.
# This cell also overwrites the `top10` marker table built earlier.
cluster.table$pval <- as.numeric(cluster.table$pval) # p-values as numbers

# 10 rows with the lowest p-value per `clust` group (negative n = smallest values)
top10 <- top_n(group_by(cluster.table, clust), -10, pval)
top10