## Set Library Path

In [1]:
# Point R at the package library that lives inside the anaconda3 environment
# "jupyter_nb_R4.3", so the library() calls below load packages from there.
local({
  conda_r_library <- "/share/korflab/home/viki/anaconda3/jupyter_nb_R4.3/lib/R/library"
  .libPaths(conda_r_library)
})

## Load Libraries

In [3]:
library(readxl)
library(ggplot2)
library(viridis)
library(dplyr)
library(glue)
library(tidyr)

## Set Modules and GO Databases

In [4]:
# Load the WGCNA module table and pull out the module names
module_file <- "06_WGCNA/gene_module_distribution.csv"
module_table <- read.csv(module_file)
modules <- module_table$Module

# Show every module name found in the table
print(modules)

 [1] "turquoise"       "blue"            "brown"           "yellow"         
 [5] "green"           "red"             "black"           "pink"           
 [9] "magenta"         "purple"          "grey"            "greenyellow"    
[13] "tan"             "salmon"          "cyan"            "midnightblue"   
[17] "lightcyan"       "grey60"          "lightgreen"      "lightyellow"    
[21] "royalblue"       "darkred"         "darkgreen"       "darkturquoise"  
[25] "darkgrey"        "orange"          "darkorange"      "white"          
[29] "skyblue"         "saddlebrown"     "steelblue"       "paleturquoise"  
[33] "violet"          "darkolivegreen"  "darkmagenta"     "sienna3"        
[37] "yellowgreen"     "skyblue3"        "plum1"           "orangered4"     
[41] "mediumpurple3"   "lightsteelblue1" "lightcyan1"      "darkorange2"    
[45] "floralwhite"     "ivory"          


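Modules were removed after inspecting the module–trait Pearson correlation heatmap. First, modules that were not significant for any condition were dropped. Then, modules that were significant only for litter or sex (and therefore unrelated to differences between conditions) were dropped, leaving only the significant, condition-related modules.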

In [5]:
# First removal pass: modules that were not significant in any condition
to_remove <- c(
  "green", "ivory", "violet", "darkturquoise", "brown",
  "lightcyan1", "orangered4", "paleturquoise", "black", "magenta",
  "royalblue", "lightyellow", "red", "cyan", "grey"
)

# Keep each remaining module name once, dropping everything listed above
modules <- unique(modules[!(modules %in% to_remove)])

print(modules)

 [1] "turquoise"       "blue"            "yellow"          "pink"           
 [5] "purple"          "greenyellow"     "tan"             "salmon"         
 [9] "midnightblue"    "lightcyan"       "grey60"          "lightgreen"     
[13] "darkred"         "darkgreen"       "darkgrey"        "orange"         
[17] "darkorange"      "white"           "skyblue"         "saddlebrown"    
[21] "steelblue"       "darkolivegreen"  "darkmagenta"     "sienna3"        
[25] "yellowgreen"     "skyblue3"        "plum1"           "mediumpurple3"  
[29] "lightsteelblue1" "darkorange2"     "floralwhite"    


In [6]:
# Second removal pass: modules that were significant for litter or sex only
to_remove <- c(
  "purple", "darkgrey", "floralwhite", "greenyellow", "plum1",
  "salmon", "darkolivegreen", "tan", "midnightblue", "darkorange2"
)

# Keep each remaining module name once, dropping everything listed above
modules <- unique(modules[!(modules %in% to_remove)])

print(modules)

 [1] "turquoise"       "blue"            "yellow"          "pink"           
 [5] "lightcyan"       "grey60"          "lightgreen"      "darkred"        
 [9] "darkgreen"       "orange"          "darkorange"      "white"          
[13] "skyblue"         "saddlebrown"     "steelblue"       "darkmagenta"    
[17] "sienna3"         "yellowgreen"     "skyblue3"        "mediumpurple3"  
[21] "lightsteelblue1"


In [7]:
# Enrichr libraries to load for every module. Each name is used as a sheet name
# when the per-module Excel workbooks are read. Despite the "GO" wording used
# elsewhere in this notebook, the list also covers pathway (KEGG, Panther,
# Reactome) and GEO signature libraries.
databases <- c(
  "GO_Biological_Process_2023",
  "GO_Cellular_Component_2023",
  "GO_Molecular_Function_2023",
  "KEGG_2019_Mouse",
  "Panther_2016",
  "Reactome_2016",
  "RNAseq_DiseaseGene_DrugSigs_GEO"
)

## Make Dot Plots

In [8]:
# Collect the Enrichr results of every retained module into one data frame.
#
# For each name in `modules`, the workbook
# 06_WGCNA/<module>_enrichr_results.xlsx is read once per entry of `databases`
# (the database name is the sheet name). From each non-empty sheet the columns
# Term, Adjusted.P.value and Odds.Ratio are kept and tagged with the module and
# database they came from.
#
# A sheet that cannot be read or lacks the expected columns is skipped, as
# before, but the failure is now reported as a warning instead of being
# swallowed silently, so a missing workbook or renamed column is visible.

# One list element per usable (module, database) sheet. Binding once at the end
# avoids re-copying the growing table on every iteration.
enrichr_tables <- list()

for (module in modules) {
  file_path <- glue("06_WGCNA/{module}_enrichr_results.xlsx")

  for (database in databases) {
    df <- tryCatch(
      {
        sheet_data <- read_excel(file_path, sheet = database)

        if (nrow(sheet_data) == 0) {
          # Empty sheet: nothing to contribute for this database
          NULL
        } else {
          sheet_data %>%
            select(Term, Adjusted.P.value, Odds.Ratio) %>%
            mutate(Module = module, Database = database)
        }
      },
      error = function(e) {
        warning(
          "Skipping sheet '", database, "' of ", file_path, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )

    if (!is.null(df)) {
      enrichr_tables[[length(enrichr_tables) + 1L]] <- df
    }
  }
}

# Combine everything in one step. as.data.frame() keeps `all_data` a plain
# data frame, matching the class produced when it was grown from data.frame().
all_data <- as.data.frame(bind_rows(enrichr_tables))

# Abort early if nothing could be loaded; every later cell needs rows here
if (nrow(all_data) == 0) {
  stop("No data read from any of the Excel files.")
}

# Display the combined dataframe
head(all_data)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Translation (GO:0006412),4.994624e-61,10.173798,turquoise,GO_Biological_Process_2023
2,Cytoplasmic Translation (GO:0002181),6.14281e-49,40.670984,turquoise,GO_Biological_Process_2023
3,Peptide Biosynthetic Process (GO:0043043),7.78975e-39,9.471098,turquoise,GO_Biological_Process_2023
4,Macromolecule Biosynthetic Process (GO:0009059),6.709707e-35,7.245637,turquoise,GO_Biological_Process_2023
5,Mitochondrial Gene Expression (GO:0140053),2.149675e-32,13.080014,turquoise,GO_Biological_Process_2023
6,Gene Expression (GO:0010467),1.065766e-31,4.558653,turquoise,GO_Biological_Process_2023


In [9]:
# Keep only significantly enriched terms: rows whose Adjusted.P.value is at or
# below the cutoff are retained, all other rows (including NA) are dropped.
# NOTE(review): the previous comment mentioned 0.1, but the code has always
# applied 0.05; the comment now matches the code. Confirm 0.05 is intended.
adj_p_cutoff <- 0.05

all_data <- all_data %>%
  filter(Adjusted.P.value <= adj_p_cutoff)

# Display the filtered dataframe
head(all_data)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Translation (GO:0006412),4.994624e-61,10.173798,turquoise,GO_Biological_Process_2023
2,Cytoplasmic Translation (GO:0002181),6.14281e-49,40.670984,turquoise,GO_Biological_Process_2023
3,Peptide Biosynthetic Process (GO:0043043),7.78975e-39,9.471098,turquoise,GO_Biological_Process_2023
4,Macromolecule Biosynthetic Process (GO:0009059),6.709707e-35,7.245637,turquoise,GO_Biological_Process_2023
5,Mitochondrial Gene Expression (GO:0140053),2.149675e-32,13.080014,turquoise,GO_Biological_Process_2023
6,Gene Expression (GO:0010467),1.065766e-31,4.558653,turquoise,GO_Biological_Process_2023


In [10]:
# Per database: how many significant rows there are in total (a term is counted
# once for every module it occurs in) and how many distinct terms they cover
significant_by_database <- group_by(all_data, Database)
database_term_counts <- summarise(
  significant_by_database,
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary table
print(database_term_counts)

# A tibble: 7 × 3
  Database                        Total_Terms Unique_Terms
  <chr>                                 <int>        <int>
1 GO_Biological_Process_2023              266          258
2 GO_Cellular_Component_2023              111          105
3 GO_Molecular_Function_2023               60           55
4 KEGG_2019_Mouse                          53           52
5 Panther_2016                             14           14
6 RNAseq_DiseaseGene_DrugSigs_GEO        2273         1144
7 Reactome_2016                           473          409


In [11]:
# For every (Database, Term) pair, count the distinct modules in which the term
# is significantly enriched
terms_grouped <- group_by(all_data, Database, Term)
term_module_counts <- summarise(
  terms_grouped,
  ModuleCount = n_distinct(Module),
  .groups = "drop"
)

# Within each database, order terms from most to fewest modules and keep the
# first 25. Ties keep the order they already had in term_module_counts.
counts_ordered <- arrange(term_module_counts, Database, desc(ModuleCount))
ranked_terms <- ungroup(
  slice_head(group_by(counts_ordered, Database), n = 25)
)

# Restrict the full enrichment table to those top terms. semi_join() keeps the
# matching rows of all_data without adding any columns from ranked_terms.
filtered_data_top_25 <- semi_join(
  all_data,
  ranked_terms,
  by = c("Database", "Term")
)

# Preview the result
head(filtered_data_top_25)

Unnamed: 0_level_0,Term,Adjusted.P.value,Odds.Ratio,Module,Database
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>,<chr>
1,Translation (GO:0006412),4.994624e-61,10.173798,turquoise,GO_Biological_Process_2023
2,Cytoplasmic Translation (GO:0002181),6.14281e-49,40.670984,turquoise,GO_Biological_Process_2023
3,Aerobic Electron Transport Chain (GO:0019646),1.080075e-20,12.504262,turquoise,GO_Biological_Process_2023
4,Cellular Respiration (GO:0045333),4.839078e-18,8.033591,turquoise,GO_Biological_Process_2023
5,Aerobic Respiration (GO:0009060),2.094781e-14,9.426521,turquoise,GO_Biological_Process_2023
6,ATP Biosynthetic Process (GO:0006754),2.448194e-05,8.54222,turquoise,GO_Biological_Process_2023


In [12]:
# Same summary as for the full table, now on the top-25 subset: total rows and
# distinct terms per database
top_terms_by_database <- group_by(filtered_data_top_25, Database)
filtered_database_term_counts <- summarise(
  top_terms_by_database,
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary table
print(filtered_database_term_counts)

# A tibble: 7 × 3
  Database                        Total_Terms Unique_Terms
  <chr>                                 <int>        <int>
1 GO_Biological_Process_2023               33           25
2 GO_Cellular_Component_2023               31           25
3 GO_Molecular_Function_2023               30           25
4 KEGG_2019_Mouse                          26           25
5 Panther_2016                             14           14
6 RNAseq_DiseaseGene_DrugSigs_GEO         109           25
7 Reactome_2016                            57           25


In [13]:
# Draw one dot plot per Enrichr database from the top-25 term table and save it
# as 06_WGCNA/top_25_dot_plot_<database>.pdf.
#
# Each point is a (module, term) pair: modules on the x axis, terms on the
# y axis, point size mapped to Odds.Ratio and fill colour to Adjusted.P.value.
#
# The loop walks the configured `databases` vector instead of the database
# names present in the table. Previously the "no data" branch could never run,
# because every name taken from the table has at least one row by construction.
# Now a database whose terms were all filtered out is reported.
for (database in databases) {
  database_filtered_data <- filtered_data_top_25 %>%
    filter(Database == database)

  # Nothing to plot for this database: report it and move on
  if (nrow(database_filtered_data) == 0) {
    message(glue("No data available for {database}. Skipping..."))
    next
  }

  dot_plot <- ggplot(
    database_filtered_data,
    aes(x = Module, y = Term, size = Odds.Ratio, fill = Adjusted.P.value)
  ) +
    # Shape 21 is a filled circle, so the `fill` aesthetic colours the points
    geom_point(shape = 21) +
    scale_fill_viridis() +
    xlab('') + ylab('') +
    labs(
      title = 'Top Enrichr Terms Across Modules',
      subtitle = glue('{database}')
    ) +
    theme(
      panel.background = element_rect(fill = "white", color = NA),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = "black"),
      plot.background = element_rect(fill = "white", color = NA),
      axis.text.x = element_text(angle = 90, hjust = 1)
    )

  # Save the dot plot with the database name in the filename
  ggsave(
    filename = glue("06_WGCNA/top_25_dot_plot_{database}.pdf"),
    plot = dot_plot,
    height = 7,
    width = 15
  )
}