# Set Library Path

In [1]:
.libPaths("/share/korflab/home/viki/anaconda3/jupyter_nb_R4.3/lib/R/library")

# Load Libraries

In [3]:
library(enrichR)
library(dplyr)
library(rtracklayer)
library(GenomicRanges)
library(openxlsx)
library(readxl)
library(glue)
library(ggplot2)
library(viridis)
library(tidyr)

# Load Data

In [None]:
modules <- readRDS("Modules.rds")

In [None]:
regions <- modules$regions

# Process Data

Select modules of interest based on significance in module trait correlations.

In [7]:
# Determine modules of interest
modules_of_interest <- list("honeydew1", "white", "brown", "brown4", "purple")

In [None]:
# Filter out modules that are not of interest
regions <- regions %>%
  filter(module %in% modules_of_interest)

In [None]:
# View
head(regions)

# Annotate Regions

In [None]:
# Read the GTF annotation file
gtf_file <- "/share/lasallelab/genomes/mm10/mm10.refGene.gtf"
gtf_data <- import(gtf_file)

# View
head(gtf_data)

In [None]:
# Create GRanges object for regions
gr_regions <- GRanges(seqnames = regions$chr,
                      ranges = IRanges(start = regions$start, end = regions$end))

In [None]:
# Extract transcript entries from the GTF data
gtf_transcripts <- gtf_data[gtf_data$type == "transcript"]

In [None]:
# Create GRanges object for gene annotations
gr_genes <- GRanges(seqnames = seqnames(gtf_transcripts),
                    ranges = IRanges(start = start(gtf_transcripts), end = end(gtf_transcripts)),
                    gene_name = mcols(gtf_transcripts)$gene_name)

In [None]:
# Find overlaps between regions and gene annotations
overlaps <- findOverlaps(gr_regions, gr_genes)

In [None]:
# Create a new column for gene names in the regions data frame
regions$gene_name <- NA
regions$gene_name[queryHits(overlaps)] <- gr_genes$gene_name[subjectHits(overlaps)]

In [None]:
# View annotated regions
head(regions)

# Gene Ontology

In [None]:
# Create a list of gene names per module
module_genes <- regions %>%
  group_by(module) %>%
  summarize(unique_genes = list(unique(gene_name[!is.na(gene_name)])), .groups = 'drop')

# View
head(module_genes)

In [None]:
# Count genes per module
module_genes_length <- module_genes %>%
  mutate(num_genes = sapply(unique_genes, length)) %>%
  select(module, num_genes)

# View
print(module_genes_length)

In [None]:
# Iterate over each module
for (module in module_genes$module) {
  # Read in gene list
  gene_list <- module_genes$unique_genes[[which(module_genes$module == module)]]
    
  tryCatch({
    # Perform the enrichR analysis on the gene list for the current module
    enrichr_results <- enrichr(gene_list, c("GO_Biological_Process_2023",
                                                     "GO_Cellular_Component_2023",
                                                     "GO_Molecular_Function_2023",
                                                     "KEGG_2019_Mouse",
                                                     "Panther_2016",
                                                     "Reactome_2016",
                                                     "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"))
    
    # Check if enrichr_results is empty
    if (length(enrichr_results) == 0) {
      cat("No results for module", module, "\n")
      next
    }
    
    # Save Enrichr outputs
    wb <- createWorkbook()
    
    for (i in seq_along(enrichr_results)) {
      # Extract the data frame from the list
      df <- enrichr_results[[i]]
      
      # Check if the data frame is empty
      if (nrow(df) == 0) {
        cat("Empty data frame for", names(enrichr_results)[i], "in module", module, "\n")
        next
      }
      
      # Define the original sheet name
      original_sheet_name <- names(enrichr_results)[i]
      
      # Modify the sheet name if it's specifically "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"
      sheet_name <- if (original_sheet_name == "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO") {
        "RNAseq_DiseaseGene_DrugSigs_GEO"
      } else {
        original_sheet_name
      }
      
      # Add the data frame as a new sheet in the Excel workbook
      addWorksheet(wb, sheet_name)
      writeData(wb, sheet = sheet_name, x = df)
    }
    
    # Save the Excel workbook
    saveWorkbook(wb, paste0(module, "_enrichr_results.xlsx"), overwrite = TRUE)
    
    # Function to plot and save the results
    plot_and_save <- function(df, filename, title) {
      if (nrow(df) == 0) {
        cat("Empty data frame for", title, "in module", module, "\n")
        return()
      }
      pdf(filename, height = 7, width = 15)
      print(plotEnrich(df, showTerms = 25, numChar = 75, y = "Count", orderBy = "P.value") + ggtitle(title))
      dev.off()
    }
    
    # Plot and save Enrichr results
    plot_and_save(enrichr_results$GO_Biological_Process_2023, 
                  paste0(module, "_GO_Biological_Process_2023.pdf"), 
                  paste("GO_Biological_Process_2023 for", module, "module"))
    
    plot_and_save(enrichr_results$GO_Cellular_Component_2023, 
                  paste0(module, "_GO_Cellular_Component_2023.pdf"), 
                  paste("GO_Cellular_Component_2023 for", module, "module"))
    
    plot_and_save(enrichr_results$GO_Molecular_Function_2023, 
                  paste0(module, "_GO_Molecular_Function_2023.pdf"), 
                  paste("GO_Molecular_Function_2023 for", module, "module"))
    
    plot_and_save(enrichr_results$KEGG_2019_Mouse, 
                  paste0(module, "_KEGG_2019_Mouse.pdf"), 
                  paste("KEGG_2019_Mouse for", module, "module"))
    
    plot_and_save(enrichr_results$Panther_2016, 
                  paste0(module, "_Panther_2016.pdf"), 
                  paste("Panther_2016 for", module, "module"))
    
    plot_and_save(enrichr_results$Reactome_2016, 
                  paste0(module, "_Reactome_2016.pdf"), 
                  paste("Reactome_2016 for", module, "module"))
    
    plot_and_save(enrichr_results$`RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO`, 
                  paste0(module, "_RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO.pdf"), 
                  paste("RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO for", module, "module"))
    
  }, error = function(e) {
    cat("Error occurred for module", module, ": ", conditionMessage(e), "\n")
    # Log the error to a file for further inspection
    write(paste("Error occurred for module", module, ": ", conditionMessage(e), "\n"), file = "error_log.txt", append = TRUE)
    # Continue to the next module
    next
  })
}

# Visualize GO Results

## Top 25 Dot Plots (Priotizing Overlap)

In [None]:
# List of GO databases
databases <- c("GO_Biological_Process_2023", "GO_Cellular_Component_2023", "GO_Molecular_Function_2023",
               "KEGG_2019_Mouse", "Panther_2016", "Reactome_2016", "RNAseq_DiseaseGene_DrugSigs_GEO")

In [None]:
# Store GO data into data frame

# Initialize an empty dataframe
all_data <- data.frame()

# Iterate over each module and read in the corresponding Excel file
for (module in module_genes$module) {
  file_path <- glue("{module}_enrichr_results.xlsx")
  
  for (database in databases) {
    try({
      # Read the data from the Excel file
      df <- read_excel(file_path, sheet = database)
      
      if (nrow(df) == 0) {
        next  # Skip to the next database if no data
      }
      
      # Select the required columns and add module and database information
      df <- df %>%
        select(Term, Adjusted.P.value, Odds.Ratio) %>%
        mutate(Module = module, Database = database)
      
      # Append to the dataframe
      all_data <- bind_rows(all_data, df)
      
    }, silent = TRUE)
  }
}

# Check if any data was read
if (nrow(all_data) == 0) {
  stop("No data read from any of the Excel files.")
}

# Display the combined dataframe
head(all_data)

In [None]:
# Filter out rows where the Adjusted.P.value is less than 0.1
all_data <- all_data %>%
  filter(Adjusted.P.value <= 0.05)

# Display the filtered dataframe
head(all_data)

In [None]:
# Count the number of total terms and unique terms for each database
database_term_counts <- all_data %>%
  group_by(Database) %>%
  summarise(
    Total_Terms = n(),
    Unique_Terms = n_distinct(Term)
  )

# Print the table
print(database_term_counts)

In [None]:
# Calculate the number of modules each term appears in for each database
term_module_counts <- all_data %>%
  group_by(Database, Term) %>%
  summarise(ModuleCount = n_distinct(Module), .groups = 'drop')

# Rank the terms within each database by the number of modules they appear in
ranked_terms <- term_module_counts %>%
  arrange(Database, desc(ModuleCount)) %>%
  group_by(Database) %>%
  slice_head(n = 25) %>%
  ungroup()

# Merge with the original data to filter the top 25 terms per database
filtered_data_top_25 <- all_data %>%
  semi_join(ranked_terms, by = c("Database", "Term"))

# Print the filtered data
head(filtered_data_top_25)

In [None]:
# Count the number of total terms and unique terms for each database
filtered_database_term_counts <- filtered_data_top_25 %>%
  group_by(Database) %>%
  summarise(
    Total_Terms = n(),
    Unique_Terms = n_distinct(Term)
  )

# Print the table
print(filtered_database_term_counts)

In [None]:
# Create plots for each database
for (database in unique(filtered_data_top_25$Database)) {
  database_filtered_data <- filtered_data_top_25 %>%
    filter(Database == database)
  
  if (nrow(database_filtered_data) > 0) {
    dot_plot <- ggplot(database_filtered_data, aes(x = Module, y = Term, size = Odds.Ratio, fill = Adjusted.P.value)) +
      geom_point(shape = 21) +
      scale_fill_viridis() +
      xlab('') + ylab('') +
      labs(
        title = 'Top Enrichr Terms Across Modules',
        subtitle = glue('{database}')
      ) +
      theme(
        panel.background = element_rect(fill = "white", color = NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = "black"),
        plot.background = element_rect(fill = "white", color = NA),
        axis.text.x = element_text(angle = 90, hjust = 1)
      )
    
    # Save the dot plot with the database name in the filename
    ggsave(filename = glue("top_25_dot_plot_{database}.pdf"), plot = dot_plot, height = 7, width = 15)
  } else {
    cat(glue("No data available for {database}. Skipping...\n"))
  }
}

## Top 25 Per Module Dot Plot

In [8]:
# List of GO databases
databases <- c("GO_Biological_Process_2023", "GO_Cellular_Component_2023", "GO_Molecular_Function_2023",
               "KEGG_2019_Mouse", "Panther_2016", "Reactome_2016", "RNAseq_DiseaseGene_DrugSigs_GEO")

In [9]:
# Store GO data into data frame

# Initialize an empty dataframe
all_data <- data.frame()

# Iterate over each module and read in the corresponding Excel file
for (module in modules_of_interest) {
  file_path <- glue("{module}_enrichr_results.xlsx")
  
  for (database in databases) {
    try({
      # Read the data from the Excel file
      df <- read_excel(file_path, sheet = database)
      
      if (nrow(df) == 0) {
        next  # Skip to the next database if no data
      }
      
      # Select the required columns and add module and database information
      df <- df %>%
        select(Term, Adjusted.P.value, Odds.Ratio) %>%
        mutate(Module = module, Database = database)
      
      # Append to the dataframe
      all_data <- bind_rows(all_data, df)
      
    }, silent = TRUE)
  }
}

# Check if any data was read
if (nrow(all_data) == 0) {
  stop("No data read from any of the Excel files.")
}

# Display the combined dataframe
head(all_data)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Regulation Of Neuron Projection Development (GO:0010975),0.001061955,4.369626,honeydew1,GO_Biological_Process_2023
2,Modulation Of Chemical Synaptic Transmission (GO:0050804),0.002139396,4.926606,honeydew1,GO_Biological_Process_2023
3,Peptidyl-Threonine Modification (GO:0018210),0.010963121,6.293506,honeydew1,GO_Biological_Process_2023
4,Regulation Of Trans-Synaptic Signaling (GO:0099177),0.031262087,8.472719,honeydew1,GO_Biological_Process_2023
5,Regulation Of Canonical Wnt Signaling Pathway (GO:0060828),0.032843915,3.171949,honeydew1,GO_Biological_Process_2023
6,Neuron Development (GO:0048666),0.032843915,3.639517,honeydew1,GO_Biological_Process_2023


In [10]:
# Filter out rows where the Adjusted.P.value is less than 0.1
all_data <- all_data %>%
  filter(Adjusted.P.value <= 0.05)

# Display the filtered dataframe
head(all_data)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Regulation Of Neuron Projection Development (GO:0010975),0.001061955,4.369626,honeydew1,GO_Biological_Process_2023
2,Modulation Of Chemical Synaptic Transmission (GO:0050804),0.002139396,4.926606,honeydew1,GO_Biological_Process_2023
3,Peptidyl-Threonine Modification (GO:0018210),0.010963121,6.293506,honeydew1,GO_Biological_Process_2023
4,Regulation Of Trans-Synaptic Signaling (GO:0099177),0.031262087,8.472719,honeydew1,GO_Biological_Process_2023
5,Regulation Of Canonical Wnt Signaling Pathway (GO:0060828),0.032843915,3.171949,honeydew1,GO_Biological_Process_2023
6,Neuron Development (GO:0048666),0.032843915,3.639517,honeydew1,GO_Biological_Process_2023


In [11]:
# Count the number of total terms and unique terms for each database
database_term_counts <- all_data %>%
  group_by(Database) %>%
  summarise(
    Total_Terms = n(),
    Unique_Terms = n_distinct(Term)
  )

# Print the table
print(database_term_counts)

[90m# A tibble: 7 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023              328          187
[90m2[39m GO_Cellular_Component_2023              116           52
[90m3[39m GO_Molecular_Function_2023              154           80
[90m4[39m KEGG_2019_Mouse                         195           83
[90m5[39m Panther_2016                             75           33
[90m6[39m RNAseq_DiseaseGene_DrugSigs_GEO        [4m1[24m177          348
[90m7[39m Reactome_2016                           305          148


In [14]:
# Filter to keep only the top 10 most significant per module and database
top_25 <- all_data %>%
  group_by(Module, Database) %>%
  slice_min(order_by = Adjusted.P.value, n = 25, with_ties = FALSE) %>%
  ungroup()

# View 
head(top_25)

Term,Adjusted.P.value,Odds.Ratio,Module,Database
<chr>,<dbl>,<dbl>,<chr>,<chr>
Regulation Of Neuron Projection Development (GO:0010975),1.494853e-05,2.602594,brown,GO_Biological_Process_2023
Phosphorylation (GO:0016310),7.808549e-05,1.726316,brown,GO_Biological_Process_2023
Regulation Of Small GTPase Mediated Signal Transduction (GO:0051056),8.92844e-05,2.950288,brown,GO_Biological_Process_2023
Regulation Of Intracellular Signal Transduction (GO:1902531),0.0001496323,1.871522,brown,GO_Biological_Process_2023
Protein Modification Process (GO:0036211),0.0001496323,1.491984,brown,GO_Biological_Process_2023
Protein Phosphorylation (GO:0006468),0.0003663245,1.576952,brown,GO_Biological_Process_2023


In [15]:
# Count the number of total terms and unique terms for each database
database_term_counts <- top_25 %>%
  group_by(Database) %>%
  summarise(
    Total_Terms = n(),
    Unique_Terms = n_distinct(Term)
  )

# Print the table
print(database_term_counts)

[90m# A tibble: 7 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023              107           58
[90m2[39m GO_Cellular_Component_2023               92           36
[90m3[39m GO_Molecular_Function_2023              100           55
[90m4[39m KEGG_2019_Mouse                         100           49
[90m5[39m Panther_2016                             74           32
[90m6[39m RNAseq_DiseaseGene_DrugSigs_GEO         125           41
[90m7[39m Reactome_2016                           100           58


In [19]:
# Create plots for each database
for (database in unique(top_25$Database)) {
  # Filter for the current database
  current_data <- top_25 %>%
    filter(Database == database)
  
  if (nrow(current_data) > 0) {
    dot_plot <- ggplot(current_data, aes(x = Module, y = Term, size = Odds.Ratio, fill = Adjusted.P.value)) +
      geom_point(shape = 21) +
      scale_fill_viridis() +
      xlab('') + ylab('') +
      labs(
        title = 'Top 25 Enrichr Terms Per Module',
        subtitle = glue('{database}')
      ) +
      theme(
        panel.background = element_rect(fill = "white", color = NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line = element_line(color = "black"),
        plot.background = element_rect(fill = "white", color = NA),
        axis.text.x = element_text(angle = 90, hjust = 1)
      )
    
    # Save the dot plot with the database name in the filename
    ggsave(filename = glue("top_25_per_module_dot_plot_{database}.pdf"), plot = dot_plot, height = 12, width = 15)
  } else {
    cat(glue("No data available for {database}. Skipping...\n"))
  }
}