Resources for analysis:

* https://ucdavis-bioinformatics-training.github.io/2022-June-RNA-Seq-Analysis/data_analysis/DE_Analysis_mm_with_quizzes_fixed
* https://ucdavis-bioinformatics-training.github.io/2022-June-RNA-Seq-Analysis/data_reduction/03-counts_mm
* https://github.com/ben-laufer/RNA-seq/blob/main/04-limma-voom.R
* https://raw.githubusercontent.com/ucdavis-bioinformatics-training/2020-mRNA_Seq_Workshop/master/data_analysis/enrichment_mm.Rmd
* https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
* https://cran.r-project.org/web/packages/enrichR/vignettes/enrichR.html

## Set Library Path

In [1]:
.libPaths("/share/korflab/home/viki/anaconda3/jupyter_nb_R4.3/lib/R/library")

## Load Libraries

In [3]:
library(enrichR)
library(dplyr)
library(biomaRt)
library(openxlsx)
library(ggplot2)
library(readxl)

## Load Data

In [4]:
# List all DEG files
files <- list.files(path = "05_DEGs", pattern = "\\_results.csv$", full.names = TRUE)

# View
print(files)

[1] "05_DEGs/control_male_vs_control_female_results.csv"   
[2] "05_DEGs/faexcess_female_vs_control_female_results.csv"
[3] "05_DEGs/faexcess_female_vs_control_male_results.csv"  
[4] "05_DEGs/faexcess_male_vs_control_female_results.csv"  
[5] "05_DEGs/faexcess_male_vs_control_male_results.csv"    
[6] "05_DEGs/faexcess_male_vs_faexcess_female_results.csv" 
[7] "05_DEGs/faexcess_vs_control_results.csv"              


In [12]:
# Select Ensembl database
ensembl <- useEnsembl(biomart = "genes", dataset = "mmusculus_gene_ensembl", mirror = "useast")

## Convert to Gene Names

In [13]:
# Loop through each DEG file
for (file in files) {
  
  # Read the current file
  df <- read.csv(file)
  
  # Split the gene identifiers and extract gene names (assuming the gene IDs are in the "X" column)
  split_genes <- strsplit(df$X, split = "\\.")
  gene_names <- sapply(split_genes, "[[", 1L)
  df$gene_names <- gene_names
  
  # Ensure geneList is a named vector for P.Values
  geneList <- setNames(df$P.Value, df$gene_names)
  
  # Lookup Entrez Gene IDs for Ensembl Gene IDs (first lookup)
  lookup_entrez <- getBM(
    mart = ensembl,
    attributes = c('ensembl_gene_id', 'entrezgene_id'),
    filter = 'ensembl_gene_id',
    values = df$gene_names,
    uniqueRows = TRUE
  )
  
  # Merge the lookup result for Entrez Gene IDs with the dataframe
  df <- merge(df, lookup_entrez, by.x = "gene_names", by.y = "ensembl_gene_id", all.x = TRUE)
  names(df)[names(df) == "entrezgene_id"] <- "entrez_gene_id"
  
  # Lookup external gene names for Ensembl Gene IDs (second lookup)
  lookup_external <- getBM(
    mart = ensembl,
    attributes = c('ensembl_gene_id', 'external_gene_name'),
    filter = 'ensembl_gene_id',
    values = df$gene_names,
    uniqueRows = TRUE
  )
  names(lookup_external)[names(lookup_external) == "ensembl_gene_id"] <- "gene_names"
  
  # Add external gene names to the dataframe
  df <- df %>%
    left_join(lookup_external, by = "gene_names") %>%
    mutate(external_gene_name = ifelse(is.na(external_gene_name), "NA", external_gene_name))
  
  # Reorder columns for consistency
  df <- df[, c("gene_names", "external_gene_name", "X", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", "entrez_gene_id")]
  
  # Filter to only unique rows (remove duplicate gene names)
  df <- df %>%
    filter(!is.na(external_gene_name) & external_gene_name != "") %>%
    distinct(external_gene_name, .keep_all = TRUE)
  
  # Save the results to a new file
  output_file <- sub("_results.csv$", "_genes.csv", basename(file))
  write.csv(df, file.path(dirname(file), output_file), row.names = FALSE)
  
  # Print a message indicating the completion for this file (optional)
  message(paste("Processed and saved results for", file, "to", output_file))
}

Processed and saved results for 05_DEGs/control_male_vs_control_female_results.csv to control_male_vs_control_female_genes.csv

Processed and saved results for 05_DEGs/faexcess_female_vs_control_female_results.csv to faexcess_female_vs_control_female_genes.csv

Processed and saved results for 05_DEGs/faexcess_female_vs_control_male_results.csv to faexcess_female_vs_control_male_genes.csv

Processed and saved results for 05_DEGs/faexcess_male_vs_control_female_results.csv to faexcess_male_vs_control_female_genes.csv

Processed and saved results for 05_DEGs/faexcess_male_vs_control_male_results.csv to faexcess_male_vs_control_male_genes.csv

Processed and saved results for 05_DEGs/faexcess_male_vs_faexcess_female_results.csv to faexcess_male_vs_faexcess_female_genes.csv




                                                                      

Processed and saved results for 05_DEGs/faexcess_vs_control_results.csv to faexcess_vs_control_genes.csv



## Gene Ontology

In [14]:
# List all DEG files
files <- list.files(path = "05_DEGs", pattern = "\\_genes.csv$", full.names = TRUE)

# View
print(files)

[1] "05_DEGs/control_male_vs_control_female_genes.csv"   
[2] "05_DEGs/faexcess_female_vs_control_female_genes.csv"
[3] "05_DEGs/faexcess_female_vs_control_male_genes.csv"  
[4] "05_DEGs/faexcess_male_vs_control_female_genes.csv"  
[5] "05_DEGs/faexcess_male_vs_control_male_genes.csv"    
[6] "05_DEGs/faexcess_male_vs_faexcess_female_genes.csv" 
[7] "05_DEGs/faexcess_vs_control_genes.csv"              


In [15]:
# Filter to significant DEGs only

# Initialize a vector to store files with significant DEGs
significant_DEGs_files <- c()

# Loop through each file
for (file in files) {
  # Read the CSV file
  data <- read.csv(file)
  
  # Filter for significant DEGs
  significant_DEGs <- data[data$adj.P.Val < 0.05, ]
  
  # Print the file name and number of significant DEGs
  cat("File:", basename(file), "- Number of significant DEGs:", nrow(significant_DEGs), "\n")
  
  # Check if there are significant DEGs
  if (nrow(significant_DEGs) > 0) {
    significant_DEGs_files <- c(significant_DEGs_files, file)
  }
}

# Overwrite to reflect only significant DEGs
files <- significant_DEGs_files

# View files with significant DEGs
cat("\nFiles with significant DEGs:\n")
print(files)

File: control_male_vs_control_female_genes.csv - Number of significant DEGs: 0 
File: faexcess_female_vs_control_female_genes.csv - Number of significant DEGs: 161 
File: faexcess_female_vs_control_male_genes.csv - Number of significant DEGs: 0 
File: faexcess_male_vs_control_female_genes.csv - Number of significant DEGs: 0 
File: faexcess_male_vs_control_male_genes.csv - Number of significant DEGs: 0 
File: faexcess_male_vs_faexcess_female_genes.csv - Number of significant DEGs: 0 
File: faexcess_vs_control_genes.csv - Number of significant DEGs: 646 

Files with significant DEGs:
[1] "05_DEGs/faexcess_female_vs_control_female_genes.csv"
[2] "05_DEGs/faexcess_vs_control_genes.csv"              


In [16]:
# Loop through each file
for (file in files) {
  
  # Read the current file
  df <- read.csv(file)

  # Filter to only significant DEGs
 df <- df[df$adj.P.Val < 0.05, ]
    
  # Perform the enrichR analysis on the gene list
  enrichr_results <- enrichr(df$external_gene_name, c("GO_Biological_Process_2023",
                                                      "GO_Cellular_Component_2023",
                                                      "GO_Molecular_Function_2023",
                                                      "KEGG_2019_Mouse",
                                                      "Panther_2016",
                                                      "Reactome_2016",
                                                      "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"))
  
  # Create a new Excel workbook for the current file
  wb <- createWorkbook()
  
  # Loop through each table in enrichr_results and save it as a separate sheet in the Excel workbook
  for (i in seq_along(enrichr_results)) {
    # Extract the data frame from the list
    enrichr_df <- enrichr_results[[i]]
    
    # Define the original sheet name
    original_sheet_name <- names(enrichr_results)[i]
    
    # Modify the sheet name if it's specifically "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"
    sheet_name <- if (original_sheet_name == "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO") {
      "RNAseq_DiseaseGene_DrugSigs_GEO"
    } else {
      original_sheet_name
    }
    
    # Add the data frame as a new sheet in the Excel workbook
    addWorksheet(wb, sheet_name)
    writeData(wb, sheet = sheet_name, x = enrichr_df)
  }
  
  # Define the output Excel filename based on the input file name
  output_filename <- paste0("05_DEGs/", tools::file_path_sans_ext(basename(file)), "_enrichr_results.xlsx")
  
  # Save the Excel workbook for this file
  saveWorkbook(wb, output_filename, overwrite = TRUE)
  
  # Print message indicating successful save
  cat("Enrichr results saved for:", file, "->", output_filename, "\n")
}

Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
  Querying KEGG_2019_Mouse... Done.
  Querying Panther_2016... Done.
  Querying Reactome_2016... Done.
  Querying RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO... Done.
Parsing results... Done.
Enrichr results saved for: 05_DEGs/faexcess_female_vs_control_female_genes.csv -> 05_DEGs/faexcess_female_vs_control_female_genes_enrichr_results.xlsx 
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
  Querying KEGG_2019_Mouse... Done.
  Querying Panther_2016... Done.
  Querying Reactome_2016... Done.
  Querying RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO... Done.
Parsing results... Done.
Enrichr results saved for: 05_DEGs/faexcess_vs_control_genes.csv -> 05_DEGs/faexcess_vs_control_gene

## Plot All Genes

In [34]:
# List all Enrichr files
files <- list.files(path = "05_DEGs", pattern = "\\.xlsx$", full.names = TRUE)

# View
print(files)

[1] "05_DEGs/faexcess_female_vs_control_female_genes_enrichr_results.xlsx"
[2] "05_DEGs/faexcess_vs_control_genes_enrichr_results.xlsx"              


In [35]:
# Define functions to calculate -log10 of P-value and filter top 10 terms
calculate_neg_log_p <- function(df) {
  df %>%
    mutate(neg_log_p = ifelse(P.value > 0, -log10(P.value), Inf))
}

filter_top_10 <- function(df) {
  return(df %>% arrange(P.value) %>% head(10))
}

In [36]:
# Loop through each Excel file and make graphs of GO databases
for (file in files) {
  # Extract the base filename without extension
  filename <- tools::file_path_sans_ext(basename(file))
  
  # Split the filename into parts based on '_vs_'
  split_name <- unlist(strsplit(filename, "_vs_"))
  
  # Handle different naming conventions and assign group1 and group2
  if (length(split_name) == 2) {
    group1 <- tolower(split_name[1])  
    group2 <- tolower(split_name[2])
    # Remove "_genes" from group2 
    group2 <- gsub("_genes_enrichr_results", "", group2)
    # Adjust group names based on specific cases
    if (grepl("faexcess_female", group1)) {
      group1 <- "faexcess_female"
    } else if (grepl("control_female", group2)) {
      group2 <- "control_female"
    }
  } else if (length(split_name) == 1) {
    group1 <- tolower(split_name[1])
    group2 <- "control"
  }

  # Confirm grouping
  cat("Generating plot for", group1, "vs", group2, "...\n")
 
  # Read Enrichr results from Excel sheets
  enrichr_results <- list(
    GO_Biological_Process_2023 = read_excel(file, sheet = "GO_Biological_Process_2023"),
    GO_Cellular_Component_2023 = read_excel(file, sheet = "GO_Cellular_Component_2023"),
    GO_Molecular_Function_2023 = read_excel(file, sheet = "GO_Molecular_Function_2023")
  )
    
  # Plot Enrichr results
  GO_BP <- enrichr_results$GO_Biological_Process_2023
  GO_CC <- enrichr_results$GO_Cellular_Component_2023
  GO_MF <- enrichr_results$GO_Molecular_Function_2023
  
  # Apply functions to each data frame
  GO_BP <- GO_BP %>%
    filter_top_10() %>%
    calculate_neg_log_p()
  
  GO_CC <- GO_CC %>%
    filter_top_10() %>%
    calculate_neg_log_p()
  
  GO_MF <- GO_MF %>%
    filter_top_10() %>%
    calculate_neg_log_p()

  # Combine data frames for plotting
  combined_df <- rbind(mutate(GO_BP, Category = "Biological Process"),
                       mutate(GO_CC, Category = "Cellular Component"),
                       mutate(GO_MF, Category = "Molecular Function"))
    
  # Filter out rows with NA in neg_log_p
  combined_df <- combined_df %>% filter(!is.na(neg_log_p))

  # Calculate the maximum limit for the x-axis
  x_max <- ceiling(max(combined_df$neg_log_p, na.rm = TRUE))
    
  # Plot
  pdf(paste0("05_DEGs/", group1, "_vs_", group2, "_enrichr_results_barplot_all.pdf"), height = 5, width = 12)

  # Rename groups for plotting  
  group1 <- ifelse(group1 == "faexcess_female", "FAE Female", "FAE")
  group2 <- ifelse(group2 == "control_female", "Control Female", "Control")
    
  p <- ggplot(combined_df, aes(x = neg_log_p, y = reorder(Term, neg_log_p), fill = Category)) +
    geom_col(position = position_dodge(width = 0.9)) +
    scale_fill_manual(values = c("Biological Process" = "#D1D2F9", 
                                  "Cellular Component" = "#8FD3FE", 
                                  "Molecular Function" = "#BEE3BA")) +
    labs(x = "-log10(P-value)", y = NULL, title = paste("Enrichr Results for", group1, "vs.", group2, "for All DEGs")) +
    scale_x_continuous(limits = c(0, x_max)) +
    facet_grid(Category ~ ., scales = "free_y", space = "free_y") +
    theme_minimal() +
    theme(legend.position = "right",
          strip.text.y = element_blank(),  # Remove y-axis labels
          panel.grid.major = element_blank(),  # Remove major grid lines
          panel.grid.minor = element_blank())  # Remove minor grid lines

  print(p)
    
  dev.off()

  # Print progress
  cat("Plot generated for", group1, "vs", group2, "\n")
}

Generating plot for faexcess_female vs control_female ...
Plot generated for FAE Female vs Control Female 
Generating plot for faexcess vs control ...
Plot generated for FAE vs Control 


In [22]:
# Mapping of original sheet names to database names
sheet_mapping <- c(
  "GO_Biological_Process_2023" = "GO_Biological_Process_2023",
  "GO_Cellular_Component_2023" = "GO_Cellular_Component_2023",
  "GO_Molecular_Function_2023" = "GO_Molecular_Function_2023",
  "KEGG_2019_Mouse" = "KEGG_2019_Mouse",
  "Panther_2016" = "Panther_2016",
  "Reactome_2016" = "Reactome_2016",
  "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO" = "RNAseq_DiseaseGene_DrugSigs_GEO"
)

# Loop through each Excel file and plot Enrichr results per database
for (file in files) {
  # Extract the base filename without extension
  filename <- tools::file_path_sans_ext(basename(file))
  
  # Split the filename into parts based on '_vs_'
  split_name <- unlist(strsplit(filename, "_vs_"))
  
  # Handle different naming conventions and assign group1 and group2
  if (length(split_name) == 2) {
    # Case for filename "control_female_vs_faexcess_female"
    group1 <- tolower(split_name[1])  
    group2 <- tolower(gsub("_genes_enrichr_results", "", split_name[2]))  
  } else if (length(split_name) == 1) {
    # Case for filename "control_vs_faexcess"
    group1 <- tolower(split_name[1])
    group2 <- "faexcess"
  }
  
  # Initialize a flag to track if valid data was found and processed
  valid_data_found <- FALSE
  
  # List of GO term sheets
  sheets <- names(sheet_mapping)  # Use the names of the mapping as sheets
  
  # Loop through each sheet and process
  for (sheet in sheets) {
    tryCatch({
      # Read the sheet using the mapping
      GO_terms <- read_excel(file, sheet = sheet_mapping[sheet])  # Read each sheet

      # Only proceed if data is non-null and has rows
      if (!is.null(GO_terms) && nrow(GO_terms) > 0) {
        # Mark that valid data was found
        valid_data_found <- TRUE
        
        # Define the PDF filename
        pdf_filename <- paste0("05_DEGs/", group1, "_vs_", group2, "_", sheet_mapping[sheet], "_all.pdf")
        
        # Create the plot and save it
        pdf(pdf_filename, height = 7, width = 15)
        p <- plotEnrich(GO_terms, showTerms = 25, numChar = 75, y = "Count", orderBy = "P.value") +
          ggtitle(paste(sheet_mapping[sheet], "for all DEGs in", group1, "vs.", group2))
        
        print(p)
        dev.off()
        
        # Print a message indicating the plot was saved
        cat("Plot saved for:", group1, "vs", group2, "->", pdf_filename, "\n")
      } else {
        # If no valid data is found, skip plotting for this sheet
        cat("No data found for", sheet, "in:", file, "\n")
      }
      
    }, error = function(e) {
      # Handle any read errors
      cat("Error reading", sheet, "in:", file, "\n")
    })
  }
  
  # If no valid data was found for the entire file, print that it was skipped
  if (!valid_data_found) {
    cat("No data found for:", group1, "vs", group2, ". Skipping plots.\n")
  }
}

Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_GO_Biological_Process_2023_all.pdf 
Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_GO_Cellular_Component_2023_all.pdf 
Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_GO_Molecular_Function_2023_all.pdf 
Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_KEGG_2019_Mouse_all.pdf 
Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_Panther_2016_all.pdf 
Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_Reactome_2016_all.pdf 
Plot saved for: faexcess_female vs control_female -> 05_DEGs/faexcess_female_vs_control_female_RNAseq_DiseaseGene_DrugSigs_GEO_all.pdf 
Plot saved for: faexcess vs control -> 05_DEGs/faexcess_vs_control_GO_Biological_Process_2023_all.pdf 
Plot saved fo

## Plot Upregulated Genes

In [37]:
# List of result files
files <- list.files("05_DEGs", pattern = "_genes\\.csv$", full.names = TRUE)

# Filter to significant DEGs only
significant_DEGs_files <- c()

# Loop through each file
for (file in files) {
  # Read the CSV file
  data <- read.csv(file)
  
  # Filter for significant DEGs
  significant_DEGs <- data[data$adj.P.Val < 0.05, ]
  
  # Print the file name and number of significant DEGs
  cat("File:", basename(file), "- Number of significant DEGs:", nrow(significant_DEGs), "\n")
  
  # Check if there are significant DEGs
  if (nrow(significant_DEGs) > 0) {
    significant_DEGs_files <- c(significant_DEGs_files, file)
  }
}

# Overwrite to reflect only significant DEGs
files <- significant_DEGs_files

# View files with significant DEGs
cat("\nFiles with significant DEGs:\n")
print(files)

File: faexcess_female_vs_control_female_genes.csv - Number of significant DEGs: 161 
File: faexcess_vs_control_genes.csv - Number of significant DEGs: 646 

Files with significant DEGs:
[1] "05_DEGs/faexcess_female_vs_control_female_genes.csv"
[2] "05_DEGs/faexcess_vs_control_genes.csv"              


In [38]:
# Loop through each result file
for (file in files) {
  # Extract the base filename without extension
  filename <- tools::file_path_sans_ext(basename(file))
  
  # Split the filename into parts based on '_vs_'
  split_name <- unlist(strsplit(filename, "_vs_"))
  
  # Handle different naming conventions and assign group1 and group2
  if (length(split_name) == 2) {
    group1 <- tolower(split_name[1])  
    group2 <- tolower(split_name[2])
    # Remove "_genes" from group2 
    group2 <- gsub("_genes", "", group2)
    # Adjust group names based on specific cases
    if (grepl("faexcess_female", group1)) {
      group1 <- "faexcess_female"
    } else if (grepl("control_female", group2)) {
      group2 <- "control_female"
    }
  } else if (length(split_name) == 1) {
    group1 <- tolower(split_name[1])
    group2 <- "control"
  }

  # Confirm grouping
  cat("Generating plot for", group1, "vs", group2, "...\n")
    
  # Read in data
  df <- read.csv(file)
    
  # Filter to only significant DEGs
  df <- df[df$adj.P.Val < 0.05, ]
  
  # Filter for only upregulated genes
  tmp <- df[df$logFC > 0, ]
  
  # Check if there are any upregulated genes
  if (nrow(tmp) == 0) {
    cat("No significant upregulated genes found for:", group1, "vs", group2, "\n")
    next
  }
  
  # Perform the enrichR analysis on the gene list
  enrichr_results <- enrichr(tmp$external_gene_name, c("GO_Biological_Process_2023",
                                                       "GO_Cellular_Component_2023",
                                                       "GO_Molecular_Function_2023"))
    
  # Plot Enrichr results
  GO_BP <- enrichr_results$GO_Biological_Process_2023
  GO_CC <- enrichr_results$GO_Cellular_Component_2023
  GO_MF <- enrichr_results$GO_Molecular_Function_2023
  
  # Apply functions to each data frame
  GO_BP <- GO_BP %>%
    filter_top_10() %>%
    calculate_neg_log_p()
  
  GO_CC <- GO_CC %>%
    filter_top_10() %>%
    calculate_neg_log_p()
  
  GO_MF <- GO_MF %>%
    filter_top_10() %>%
    calculate_neg_log_p()

  # Combine data frames for plotting
  combined_df <- rbind(mutate(GO_BP, Category = "Biological Process"),
                       mutate(GO_CC, Category = "Cellular Component"),
                       mutate(GO_MF, Category = "Molecular Function"))
    
  # Filter out rows with NA in neg_log_p
  combined_df <- combined_df %>% filter(!is.na(neg_log_p))

  # Calculate the maximum limit for the x-axis
  x_max <- ceiling(max(combined_df$neg_log_p, na.rm = TRUE))
    
  # Plot
  pdf(paste0("05_DEGs/", group1, "_vs_", group2, "_enrichr_results_barplot_up.pdf"), height = 5, width = 12)

  # Rename groups for plotting  
  group1 <- ifelse(group1 == "faexcess_female", "FAE Female", "FAE")
  group2 <- ifelse(group2 == "control_female", "Control Female", "Control")
    
  p <- ggplot(combined_df, aes(x = neg_log_p, y = reorder(Term, neg_log_p), fill = Category)) +
    geom_col(position = position_dodge(width = 0.9)) +
    scale_fill_manual(values = c("Biological Process" = "#D1D2F9", 
                                  "Cellular Component" = "#8FD3FE", 
                                  "Molecular Function" = "#BEE3BA")) +
    labs(x = "-log10(P-value)", y = NULL, title = paste("Enrichr Results for", group1, "vs.", group2, "for Upregulated DEGs")) +
    scale_x_continuous(limits = c(0, x_max)) +
    facet_grid(Category ~ ., scales = "free_y", space = "free_y") +
    theme_minimal() +
    theme(legend.position = "right",
          strip.text.y = element_blank(),  # Remove y-axis labels
          panel.grid.major = element_blank(),  # Remove major grid lines
          panel.grid.minor = element_blank())  # Remove minor grid lines

  print(p)
    
  dev.off()

  # Print progress
  cat("Plot generated for", group1, "vs", group2, "\n")
}

Generating plot for faexcess_female vs control_female ...
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
Plot generated for FAE Female vs Control Female 
Generating plot for faexcess vs control ...
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
Plot generated for FAE vs Control 


In [24]:
# Loop through each result file
for (file in files) {
  # Extract the base filename without extension
  filename <- tools::file_path_sans_ext(basename(file))
  
  # Split the filename into parts based on '_vs_'
  split_name <- unlist(strsplit(filename, "_vs_"))
  
  # Handle different naming conventions and assign group1 and group2
  if (length(split_name) == 2) {
    # Case for filename "control_female_vs_faexcess_female"
    group1 <- tolower(split_name[1])  
    group2 <- tolower(gsub("_genes_enrichr_results", "", split_name[2]))  
  } else if (length(split_name) == 1) {
    # Case for filename "control_vs_faexcess"
    group1 <- tolower(split_name[1])
    group2 <- "faexcess"
  }

  # Read in data
  df <- read.csv(file)
    
  # Filter to only significant DEGs
  df <- df[df$adj.P.Val < 0.05, ]
  
  # Filter for only upregulated genes
  tmp <- df[df$logFC > 0, ]
  
  # Check if there are any upregulated genes
  if (nrow(tmp) == 0) {
    cat("No significant upregulated genes found for:", group1, "vs", group2, "\n")
    next
  }
  
  # Perform the enrichR analysis on the gene list
  enrichr_results <- enrichr(tmp$external_gene_name, c("GO_Biological_Process_2023",
                                                       "GO_Cellular_Component_2023",
                                                       "GO_Molecular_Function_2023",
                                                       "KEGG_2019_Mouse",
                                                       "Panther_2016",
                                                       "Reactome_2016",
                                                       "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"))

  # Define the output file paths for each enrichment plot
  output_files <- list(
    GO_Biological_Process_2023 = paste0("05_DEGs/", group1, "_vs_", group2, "_GO_Biological_Process_2023_up.pdf"),
    GO_Cellular_Component_2023 = paste0("05_DEGs/", group1, "_vs_", group2, "_GO_Cellular_Component_2023_up.pdf"),
    GO_Molecular_Function_2023 = paste0("05_DEGs/", group1, "_vs_", group2, "_GO_Molecular_Function_2023_up.pdf"),
    KEGG_2021_Human = paste0("05_DEGs/", group1, "_vs_", group2, "_KEGG_2019_Mouse_up.pdf"),
    Panther_2016 = paste0("05_DEGs/", group1, "_vs_", group2, "_Panther_2016_up.pdf"),
    Reactome_2016 = paste0("05_DEGs/", group1, "_vs_", group2, "_Reactome_2016_up.pdf"),
    RNA_Seq_Disease_Gene_and_Drug_Signatures_from_GEO = paste0("05_DEGs/", group1, "_vs_", group2, "_RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO_up.pdf")
  )

  # Loop through each output file and plot the results
  for (i in seq_along(enrichr_results)) {
    if (!is.null(enrichr_results[[i]]) && nrow(enrichr_results[[i]]) > 0) {
      pdf(output_files[[i]], height = 7, width = 15)
      p <- plotEnrich(enrichr_results[[i]], showTerms = 25, numChar = 75, y = "Count", orderBy = "P.value") +
        ggtitle(paste(names(output_files)[i], "for DEGs Upregulated in", group1, "vs", group2))
      print(p)
      dev.off()
      
      # Print message indicating the plot was saved
      cat("Plot saved for:", names(output_files)[i], "for", group1, "vs", group2, "->", output_files[[i]], "\n")
    } else {
      # Print a message indicating that there were no results to plot
      cat("No results to plot for:", names(output_files)[i], "for", group1, "vs", group2, "\n")
    }
  }
}

Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
  Querying KEGG_2019_Mouse... Done.
  Querying Panther_2016... Done.
  Querying Reactome_2016... Done.
  Querying RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO... Done.
Parsing results... Done.
Plot saved for: GO_Biological_Process_2023 for faexcess_female vs control_female_genes -> 05_DEGs/faexcess_female_vs_control_female_genes_GO_Biological_Process_2023_up.pdf 
Plot saved for: GO_Cellular_Component_2023 for faexcess_female vs control_female_genes -> 05_DEGs/faexcess_female_vs_control_female_genes_GO_Cellular_Component_2023_up.pdf 
Plot saved for: GO_Molecular_Function_2023 for faexcess_female vs control_female_genes -> 05_DEGs/faexcess_female_vs_control_female_genes_GO_Molecular_Function_2023_up.pdf 
Plot saved for: KEGG_2021_Human for faexcess_female vs control_female_genes -> 05_DEGs/faexcess_female_vs_c

## Plot Downregulated Genes

In [39]:
# List of result files
files <- list.files("05_DEGs", pattern = "_genes\\.csv$", full.names = TRUE)

# Filter to significant DEGs only
significant_DEGs_files <- c()

# Loop through each file
for (file in files) {
  # Read the CSV file
  data <- read.csv(file)
  
  # Filter for significant DEGs
  significant_DEGs <- data[data$adj.P.Val < 0.05, ]
  
  # Print the file name and number of significant DEGs
  cat("File:", basename(file), "- Number of significant DEGs:", nrow(significant_DEGs), "\n")
  
  # Check if there are significant DEGs
  if (nrow(significant_DEGs) > 0) {
    significant_DEGs_files <- c(significant_DEGs_files, file)
  }
}

# Overwrite to reflect only significant DEGs
files <- significant_DEGs_files

# View files with significant DEGs
cat("\nFiles with significant DEGs:\n")
print(files)

File: faexcess_female_vs_control_female_genes.csv - Number of significant DEGs: 161 
File: faexcess_vs_control_genes.csv - Number of significant DEGs: 646 

Files with significant DEGs:
[1] "05_DEGs/faexcess_female_vs_control_female_genes.csv"
[2] "05_DEGs/faexcess_vs_control_genes.csv"              


In [40]:
# Loop through each result file
for (file in files) {
  # Extract the base filename without extension
  filename <- tools::file_path_sans_ext(basename(file))
  
  # Split the filename into parts based on '_vs_'
  split_name <- unlist(strsplit(filename, "_vs_"))
  
  # Handle different naming conventions and assign group1 and group2
  if (length(split_name) == 2) {
    group1 <- tolower(split_name[1])  
    group2 <- tolower(split_name[2])
    # Remove "_genes" from group2 
    group2 <- gsub("_genes", "", group2)
    # Adjust group names based on specific cases
    if (grepl("faexcess_female", group1)) {
      group1 <- "faexcess_female"
    } else if (grepl("control_female", group2)) {
      group2 <- "control_female"
    }
  } else if (length(split_name) == 1) {
    group1 <- tolower(split_name[1])
    group2 <- "control"
  }

  # Confirm grouping
  cat("Generating plot for", group1, "vs", group2, "...\n")
    
  # Read in data
  df <- read.csv(file)
    
  # Filter to only significant DEGs
  df <- df[df$adj.P.Val < 0.05, ]
  
  # Filter for only downregulated genes
  tmp <- df[df$logFC < 0, ]
  
  # Check if there are any downregulated genes
  if (nrow(tmp) == 0) {
    cat("No significant downregulated genes found for:", group1, "vs", group2, "\n")
    next
  }
  
  # Perform the enrichR analysis on the gene list
  enrichr_results <- enrichr(tmp$external_gene_name, c("GO_Biological_Process_2023",
                                                       "GO_Cellular_Component_2023",
                                                       "GO_Molecular_Function_2023"))
    
  # Plot Enrichr results
  GO_BP <- enrichr_results$GO_Biological_Process_2023
  GO_CC <- enrichr_results$GO_Cellular_Component_2023
  GO_MF <- enrichr_results$GO_Molecular_Function_2023
  
  # Apply functions to each data frame
  GO_BP <- GO_BP %>%
    filter_top_10() %>%
    calculate_neg_log_p()
  
  GO_CC <- GO_CC %>%
    filter_top_10() %>%
    calculate_neg_log_p()
  
  GO_MF <- GO_MF %>%
    filter_top_10() %>%
    calculate_neg_log_p()

  # Combine data frames for plotting
  combined_df <- rbind(mutate(GO_BP, Category = "Biological Process"),
                       mutate(GO_CC, Category = "Cellular Component"),
                       mutate(GO_MF, Category = "Molecular Function"))
    
  # Filter out rows with NA in neg_log_p
  combined_df <- combined_df %>% filter(!is.na(neg_log_p))

  # Calculate the maximum limit for the x-axis
  x_max <- ceiling(max(combined_df$neg_log_p, na.rm = TRUE))
    
  # Plot
  pdf(paste0("05_DEGs/", group1, "_vs_", group2, "_enrichr_results_barplot_down.pdf"), height = 5, width = 12)

  # Rename groups for plotting  
  group1 <- ifelse(group1 == "faexcess_female", "FAE Female", "FAE")
  group2 <- ifelse(group2 == "control_female", "Control Female", "Control")
    
  p <- ggplot(combined_df, aes(x = neg_log_p, y = reorder(Term, neg_log_p), fill = Category)) +
    geom_col(position = position_dodge(width = 0.9)) +
    scale_fill_manual(values = c("Biological Process" = "#D1D2F9", 
                                  "Cellular Component" = "#8FD3FE", 
                                  "Molecular Function" = "#BEE3BA")) +
    labs(x = "-log10(P-value)", y = NULL, title = paste("Enrichr Results for", group1, "vs.", group2, "for Downregulated DEGs")) +
    scale_x_continuous(limits = c(0, x_max)) +
    facet_grid(Category ~ ., scales = "free_y", space = "free_y") +
    theme_minimal() +
    theme(legend.position = "right",
          strip.text.y = element_blank(),  # Remove y-axis labels
          panel.grid.major = element_blank(),  # Remove major grid lines
          panel.grid.minor = element_blank())  # Remove minor grid lines

  print(p)
    
  dev.off()

  # Print progress
  cat("Plot generated for", group1, "vs", group2, "\n")
}

Generating plot for faexcess_female vs control_female ...
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
Plot generated for FAE Female vs Control Female 
Generating plot for faexcess vs control ...
Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
Plot generated for FAE vs Control 


In [26]:
# Loop through each result file
for (file in files) {
  # Extract the base filename without extension
  filename <- tools::file_path_sans_ext(basename(file))
  
  # Split the filename into parts based on '_vs_'
  split_name <- unlist(strsplit(filename, "_vs_"))
  
  # Handle different naming conventions and assign group1 and group2
  if (length(split_name) == 2) {
    # Case for filename "control_female_vs_faexcess_female"
    group1 <- tolower(split_name[1])  
    group2 <- tolower(gsub("_genes_enrichr_results", "", split_name[2]))  
  } else if (length(split_name) == 1) {
    # Case for filename "control_vs_faexcess"
    group1 <- tolower(split_name[1])
    group2 <- "faexcess"
  }

  # Read in data
  df <- read.csv(file)
    
  # Filter to only significant DEGs
  df <- df[df$adj.P.Val < 0.05, ]
    
  # Filter for only downregulated genes
  tmp <- df[df$logFC < 0, ]
  
  # Check if there are any downregulated genes
  if (nrow(tmp) == 0) {
    cat("No significant downregulated genes found for:", group1, "vs", group2, "\n")
    next
  }
  
  # Perform the enrichR analysis on the gene list
  enrichr_results <- enrichr(tmp$external_gene_name, c("GO_Biological_Process_2023",
                                                       "GO_Cellular_Component_2023",
                                                       "GO_Molecular_Function_2023",
                                                       "KEGG_2019_Mouse",
                                                       "Panther_2016",
                                                       "Reactome_2016",
                                                       "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"))

  # Log the number of enrichment results
  for (i in seq_along(enrichr_results)) {
    if (!is.null(enrichr_results[[i]])) {
      cat("Enrichment results for:", names(enrichr_results)[i], "- Rows:", nrow(enrichr_results[[i]]), "\n")
    } else {
      cat("No results for:", names(enrichr_results)[i], "\n")
    }
  }

  # Define the output file paths for each enrichment plot
  output_files <- list(
    GO_Biological_Process_2023 = paste0("05_DEGs/", group1, "_vs_", group2, "_GO_Biological_Process_2023_down.pdf"),
    GO_Cellular_Component_2023 = paste0("05_DEGs/", group1, "_vs_", group2, "_GO_Cellular_Component_2023_down.pdf"),
    GO_Molecular_Function_2023 = paste0("05_DEGs/", group1, "_vs_", group2, "_GO_Molecular_Function_2023_down.pdf"),
    KEGG_2021_Human = paste0("05_DEGs/", group1, "_vs_", group2, "_KEGG_2019_Mouse_down.pdf"),
    Panther_2016 = paste0("05_DEGs/", group1, "_vs_", group2, "_Panther_2016_down.pdf"),
    Reactome_2016 = paste0("05_DEGs/", group1, "_vs_", group2, "_Reactome_2016_down.pdf"),
    RNA_Seq_Disease_Gene_and_Drug_Signatures_from_GEO = paste0("05_DEGs/", group1, "_vs_", group2, "_RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO_down.pdf")
  )

  # Loop through each output file and plot the results
  for (i in seq_along(enrichr_results)) {
    if (!is.null(enrichr_results[[i]]) && nrow(enrichr_results[[i]]) > 0) {
      pdf(output_files[[i]], height = 7, width = 15)
      p <- plotEnrich(enrichr_results[[i]], showTerms = 25, numChar = 75, y = "Count", orderBy = "P.value") +
        ggtitle(paste(names(output_files)[i], "for DEGs Downregulated in", group1, "vs", group2))
      print(p)
      dev.off()
      
      # Print message indicating the plot was saved
      cat("Plot saved for:", names(output_files)[i], "for", group1, "vs", group2, "->", output_files[[i]], "\n")
    } else {
      cat("No results to plot for:", names(output_files)[i], "for", group1, "vs", group2, "\n")
    }
  }
}

Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
  Querying KEGG_2019_Mouse... Done.
  Querying Panther_2016... Done.
  Querying Reactome_2016... Done.
  Querying RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO... Done.
Parsing results... Done.
Enrichment results for: GO_Biological_Process_2023 - Rows: 368 
Enrichment results for: GO_Cellular_Component_2023 - Rows: 93 
Enrichment results for: GO_Molecular_Function_2023 - Rows: 101 
Enrichment results for: KEGG_2019_Mouse - Rows: 41 
Enrichment results for: Panther_2016 - Rows: 4 
Enrichment results for: Reactome_2016 - Rows: 460 
Enrichment results for: RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO - Rows: 1162 
Plot saved for: GO_Biological_Process_2023 for faexcess_female vs control_female_genes -> 05_DEGs/faexcess_female_vs_control_female_genes_GO_Biological_Process_2023_down.pdf 
Plot saved for: GO_Ce