## Set Library Path

In [1]:
# Library directory inside the anaconda3 R 4.3 Jupyter environment
r_library_dir <- "/share/korflab/home/viki/anaconda3/jupyter_nb_R4.3/lib/R/library"

# Register that directory as a package library for this session
.libPaths(r_library_dir)

## Load Libraries

In [3]:
library(readxl)
library(ggplot2)
library(viridis)
library(dplyr)
library(glue)
library(tidyr)

## Set Modules and GO Databases

In [9]:
# Tab-delimited module membership table; the first line holds column names
membership_path <- "06_WGCNA/filtered_module_membership.txt"

# Read the table and convert it to a data frame in one step
module_membership <- as.data.frame(
  read.table(membership_path, header = TRUE, sep = "\t")
)

# Show the column names
names(module_membership)

Modules were filtered after viewing the module–trait Pearson correlation heatmap: modules related only to litter effects or sex, and unrelated to differences between conditions, were removed, and the remaining modules were subset to the significant ones.

In [11]:
# Columns that don't correspond to modules
non_module_columns <- c(
  "Probe", "treatment", "Module", "entrez_gene_id", "external_gene_name"
)

# Keep every column whose name is not in the list above; names that are
# absent from the table are simply ignored
module_membership <- module_membership[
  !(names(module_membership) %in% non_module_columns)
]

# The remaining column names are the modules of interest
modules <- names(module_membership)

# Show the module names
print(modules)

 [1] "greenyellow"    "white"          "black"          "turquoise"     
 [5] "plum1"          "skyblue3"       "paleturquoise"  "darkred"       
 [9] "lightyellow"    "orangered4"     "royalblue"      "blue"          
[13] "brown"          "ivory"          "darkorange"     "midnightblue"  
[17] "darkgreen"      "salmon"         "purple"         "darkmagenta"   
[21] "grey60"         "skyblue"        "orange"         "darkolivegreen"
[25] "steelblue"      "violet"         "floralwhite"    "lightgreen"    
[29] "darkturquoise" 


In [12]:
# Databases to pull from each Enrichr results workbook; each name is used
# as the sheet name when the workbook is read
databases <- c(
  "GO_Biological_Process_2023",
  "GO_Cellular_Component_2023",
  "GO_Molecular_Function_2023",
  "KEGG_2019_Mouse",
  "Panther_2016",
  "Reactome_2016",
  "RNAseq_DiseaseGene_DrugSigs_GEO"
)

## Make Dot Plots

In [13]:
# Store the Enrichr results of every module/database pair in one data frame.
#
# Each module is read from 06_WGCNA/{module}_enrichr_results.xlsx, one sheet
# per database. A workbook or sheet that cannot be read is reported with
# message() and skipped, so a single bad file does not abort the run but is
# no longer hidden either.

# Read one sheet and return its Term / Adjusted.P.value / Odds.Ratio columns
# tagged with the module and database, or NULL when the sheet is empty or
# cannot be read.
read_enrichr_sheet <- function(file_path, module, database) {
  tryCatch(
    {
      sheet <- read_excel(file_path, sheet = database)

      if (nrow(sheet) == 0) {
        NULL
      } else {
        sheet %>%
          select(Term, Adjusted.P.value, Odds.Ratio) %>%
          mutate(Module = module, Database = database)
      }
    },
    error = function(e) {
      message(
        "Skipping ", database, " for module ", module, ": ",
        conditionMessage(e)
      )
      NULL
    }
  )
}

# Read every sheet first and bind once at the end, instead of growing the
# data frame inside the loop
enrichr_tables <- lapply(modules, function(module) {
  file_path <- glue("06_WGCNA/{module}_enrichr_results.xlsx")

  # Report a missing workbook once rather than once per database
  if (!file.exists(file_path)) {
    message("Skipping module ", module, ": ", file_path, " not found")
    return(list())
  }

  lapply(databases, function(database) {
    read_enrichr_sheet(file_path, module, database)
  })
})

# Flatten the per-module lists by one level; bind_rows() ignores NULL entries.
# as.data.frame() keeps all_data a plain data frame, as it was when it was
# seeded with data.frame().
all_data <- as.data.frame(
  bind_rows(unlist(enrichr_tables, recursive = FALSE))
)

# Check if any data was read
if (nrow(all_data) == 0) {
  stop("No data read from any of the Excel files.")
}

# Display the combined dataframe
head(all_data)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Cytoplasmic Translation (GO:0002181),8.207676e-07,8.903484,greenyellow,GO_Biological_Process_2023
2,Macromolecule Biosynthetic Process (GO:0009059),1.084755e-05,5.280709,greenyellow,GO_Biological_Process_2023
3,Peptide Biosynthetic Process (GO:0043043),2.029099e-05,5.515487,greenyellow,GO_Biological_Process_2023
4,Translation (GO:0006412),7.317214e-05,4.241632,greenyellow,GO_Biological_Process_2023
5,Gene Expression (GO:0010467),0.0002025855,3.629496,greenyellow,GO_Biological_Process_2023
6,Ribosome Biogenesis (GO:0042254),0.00111523,4.565934,greenyellow,GO_Biological_Process_2023


In [14]:
# Keep only the terms whose adjusted p-value is at or below the cutoff.
# Rows with a missing Adjusted.P.value are dropped too, because filter()
# discards rows where the condition evaluates to NA.
adjusted_p_cutoff <- 0.05

all_data <- all_data %>%
  filter(Adjusted.P.value <= adjusted_p_cutoff)

# Display the filtered dataframe
head(all_data)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Cytoplasmic Translation (GO:0002181),8.207676e-07,8.903484,greenyellow,GO_Biological_Process_2023
2,Macromolecule Biosynthetic Process (GO:0009059),1.084755e-05,5.280709,greenyellow,GO_Biological_Process_2023
3,Peptide Biosynthetic Process (GO:0043043),2.029099e-05,5.515487,greenyellow,GO_Biological_Process_2023
4,Translation (GO:0006412),7.317214e-05,4.241632,greenyellow,GO_Biological_Process_2023
5,Gene Expression (GO:0010467),0.0002025855,3.629496,greenyellow,GO_Biological_Process_2023
6,Ribosome Biogenesis (GO:0042254),0.00111523,4.565934,greenyellow,GO_Biological_Process_2023


In [15]:
# Per database: number of rows (terms counted once per module they occur in)
# and number of distinct terms
database_term_counts <- summarise(
  group_by(all_data, Database),
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary
print(database_term_counts)

[90m# A tibble: 6 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023              376          326
[90m2[39m GO_Cellular_Component_2023              162          119
[90m3[39m GO_Molecular_Function_2023               86           78
[90m4[39m KEGG_2019_Mouse                          73           61
[90m5[39m Panther_2016                             24           23
[90m6[39m RNAseq_DiseaseGene_DrugSigs_GEO        [4m3[24m292         [4m1[24m208


In [16]:
# For every (Database, Term) pair, count the distinct modules that list it
term_module_counts <- all_data %>%
  group_by(Database, Term) %>%
  summarise(ModuleCount = n_distinct(Module), .groups = "drop")

# Within each database, order the terms from most to fewest modules and
# keep the first 25
ranked_terms <- term_module_counts %>%
  group_by(Database) %>%
  arrange(desc(ModuleCount), .by_group = TRUE) %>%
  slice_head(n = 25) %>%
  ungroup()

# Keep the rows of all_data whose (Database, Term) pair is among those terms
filtered_data_top_25 <- semi_join(
  all_data,
  ranked_terms,
  by = c("Database", "Term")
)

# Show the first rows
head(filtered_data_top_25)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Cytoplasmic Translation (GO:0002181),8.207676e-07,8.903484,greenyellow,GO_Biological_Process_2023
2,Macromolecule Biosynthetic Process (GO:0009059),1.084755e-05,5.280709,greenyellow,GO_Biological_Process_2023
3,Peptide Biosynthetic Process (GO:0043043),2.029099e-05,5.515487,greenyellow,GO_Biological_Process_2023
4,Translation (GO:0006412),7.317214e-05,4.241632,greenyellow,GO_Biological_Process_2023
5,Gene Expression (GO:0010467),0.0002025855,3.629496,greenyellow,GO_Biological_Process_2023
6,Ribosome Biogenesis (GO:0042254),0.00111523,4.565934,greenyellow,GO_Biological_Process_2023


In [17]:
# Same per-database summary as before, now on the top-term subset: number
# of rows and number of distinct terms
filtered_database_term_counts <- summarise(
  group_by(filtered_data_top_25, Database),
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary
print(filtered_database_term_counts)

[90m# A tibble: 6 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023               61           25
[90m2[39m GO_Cellular_Component_2023               57           25
[90m3[39m GO_Molecular_Function_2023               33           25
[90m4[39m KEGG_2019_Mouse                          37           25
[90m5[39m Panther_2016                             24           23
[90m6[39m RNAseq_DiseaseGene_DrugSigs_GEO         138           25


In [18]:
# Draw one dot plot per database (modules on x, terms on y, point size =
# odds ratio, fill = adjusted p-value) and save it as a PDF in 06_WGCNA/.
#
# The loop runs over the full `databases` vector rather than over the
# databases present in filtered_data_top_25, so a database that contributed
# no terms is reported instead of being silently absent.
for (database in databases) {
  database_filtered_data <- filtered_data_top_25 %>%
    filter(Database == database)

  # Nothing to plot for this database: say so and move on
  if (nrow(database_filtered_data) == 0) {
    message(glue("No data available for {database}. Skipping..."))
    next
  }

  dot_plot <- ggplot(
    database_filtered_data,
    aes(x = Module, y = Term, size = Odds.Ratio, fill = Adjusted.P.value)
  ) +
    # Shape 21 is a filled circle, so the fill aesthetic colours the point
    geom_point(shape = 21) +
    scale_fill_viridis() +
    xlab('') + ylab('') +
    labs(
      title = 'Top Enrichr Terms Across Modules',
      subtitle = glue('{database}')
    ) +
    theme(
      panel.background = element_rect(fill = "white", color = NA),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = "black"),
      plot.background = element_rect(fill = "white", color = NA),
      # vjust = 0.5 centres each rotated label on its tick mark
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

  # Save the dot plot with the database name in the filename
  ggsave(
    filename = glue("06_WGCNA/top_25_dot_plot_{database}.pdf"),
    plot = dot_plot,
    height = 7,
    width = 15
  )
}