# Set Library Path

In [1]:
.libPaths("/share/korflab/home/viki/anaconda3/jupyter_nb/lib/R/library")

# Load Libraries

In [3]:
library(dplyr)
library(rtracklayer)
library(GenomicRanges)
library(tibble)
library(readr)
library(tidyr)
library(ggplot2)
library(readxl)
library(enrichR)
library(biomaRt)
library(openxlsx)

# Process RNA-seq Data

## Load Data

In [None]:
# Read in normalized gene counts from DEG analysis
counts <- read.delim("rnaseq/05_gene_counts/normalized_counts.txt")

# View
head(counts)

In [None]:
# Prepare to assign gene names 
names <- rownames(counts) %>% as.data.frame()
colnames(names) <- "gene_names"
rownames(counts) <- NULL

# View
head(counts)

In [None]:
# Load the DEG table and keep only the gene ID -> gene symbol mapping columns.
# distinct() guarantees at most one row per `gene_names` key, so the left join
# below cannot multiply rows; the next cell cbind()s `new_names` onto `counts`
# and therefore needs both to have the same number of rows in the same order.
# NOTE(review): later cells read "faexcess_vs_control_genes.csv"; confirm that
# this differently named file is the intended source of the mapping.
rnaseq <- read.csv("rnaseq/05_DEGs/control_vs_faexcess_genes.csv", header = TRUE) %>%
  dplyr::select("gene_names", "external_gene_name") %>%
  dplyr::distinct(gene_names, .keep_all = TRUE)

# Attach the external gene name to every gene ID from the count table
new_names <- names %>% dplyr::left_join(rnaseq, by = "gene_names")

# Fail early if the join changed the number of rows
stopifnot(nrow(new_names) == nrow(names))

# View
head(new_names)

In [None]:
# Combine data to get external_gene_name
counts <- cbind(new_names, counts)

# Remove rows where external_gene_name is NA
counts <- counts[!is.na(counts$external_gene_name), ]

# View
head(counts)

In [None]:
# Assign gene names as row names
rownames(counts) <- counts$external_gene_name

# Remove the gene_names and external_gene_name columns
counts <- counts[, !(names(counts) %in% c("gene_names", "external_gene_name"))]

# View the updated counts data frame
head(counts)

## Convert Gene Lengths to Kilobases for TPM Calculation

In [None]:
# Read the GTF annotation file
gtf_file <- "/share/lasallelab/genomes/mm10/mm10.refGene.gtf"
gtf_data <- import(gtf_file)

# View
head(gtf_data)

In [None]:
# Remove rows from counts if genes are not found in the annotation file

# Assign unique genes for easy searching
unique_genes <- unique(gtf_data$gene_name)

# Filter counts_data to keep only rows where the row names are in unique_genes
counts <- counts[rownames(counts) %in% unique_genes, ]

# View
head(counts)
print(length(rownames(counts)))

In [None]:
# Calculate gene lengths

# Filter for exon features
exon_data <- gtf_data[gtf_data$type == "exon"]

# Group exons by gene and collapse overlapping/adjacent exons into
# non-redundant ranges. Summing raw exon widths counts an exon once per
# transcript that contains it, which inflates the length of multi-isoform
# genes; reducing first gives the union exonic length instead.
exons_by_gene <- split(exon_data, exon_data$gene_id)
merged_exons <- GenomicRanges::reduce(exons_by_gene)

# Sum the merged exon widths for each gene (named vector, one entry per gene_id)
gene_widths <- sum(width(merged_exons))

# Store as a data frame with the same columns the downstream cells expect
gene_length_summary <- data.frame(
  gene_id = names(gene_widths),
  length = as.numeric(gene_widths),
  stringsAsFactors = FALSE
)

# Print the gene lengths
head(gene_length_summary)
print(nrow(gene_length_summary))

In [None]:
# Convert gene length to kilobases
gene_length_summary <- gene_length_summary %>%
  mutate(length_kb = length / 1000)

head(gene_length_summary)

## Calculate Reads Per Kilobase

In [None]:
# Convert counts to a data frame
counts <- as.data.frame(counts)

# Add gene_id as a column to counts (row names hold the gene symbol)
counts$gene_id <- rownames(counts)

# Merge counts with gene_length_summary
merged_data <- merge(counts, gene_length_summary, by = "gene_id", all.x = TRUE)

# Genes without an annotated length cannot be normalized; their NA RPK values
# would otherwise propagate into the per-sample totals, so drop them and say so.
missing_length <- is.na(merged_data$length_kb)
if (any(missing_length)) {
  message(sum(missing_length),
          " gene(s) have no annotated exon length and were dropped before RPK calculation")
  merged_data <- merged_data[!missing_length, , drop = FALSE]
}

# Sample (count) columns are everything except the key and the two length
# columns; `length` must be excluded too so it is not treated as a sample.
count_columns <- setdiff(names(merged_data), c("gene_id", "length", "length_kb"))

# Calculate RPK for each gene (RPK = counts / length in kilobases)
merged_data[count_columns] <- merged_data[count_columns] / merged_data$length_kb

# Set the gene_id back as row names
rownames(merged_data) <- merged_data$gene_id

# Keep only the per-sample RPK columns
merged_data <- merged_data[, count_columns, drop = FALSE]

# View the RPK data
head(merged_data)

In [None]:
# Verify proper RPK calculations by recomputing one gene by hand
check_gene <- "0610005C13Rik"

# Use only the sample columns; `gene_id` is character and would otherwise be
# coerced to NA (with a warning) by as.numeric()
sample_columns <- setdiff(names(counts), "gene_id")
specific_row <- counts[check_gene, sample_columns, drop = FALSE]

# Convert the row to a numeric vector
specific_row_vector <- as.numeric(unlist(specific_row))

# Look up the gene length (in kb) instead of hard-coding it, so the check stays
# valid if the annotation or the length calculation changes
check_length_kb <- gene_length_summary$length_kb[gene_length_summary$gene_id == check_gene]
stopifnot(length(check_length_kb) == 1)

# Divide every value in the row by the gene length (in kb)
expected_counts <- specific_row_vector / check_length_kb

# Print the output (compare against the same gene's row in the RPK table)
print(expected_counts)

## Calculate Total Reads Per Kilobase

In [None]:
# Calculate the total RPK for each sample.
# na.rm = TRUE keeps a single gene with a missing RPK from turning the whole
# sample total (and therefore every TPM in that sample) into NA.
total_rpk <- colSums(merged_data, na.rm = TRUE)

## Calculate TPM per Sample

In [None]:
# TPM = (RPK / total RPK of the sample) * 1,000,000
# Expand the per-sample totals to the shape of the RPK table (one total per
# column, repeated down the rows) and divide element-wise.
rpk_totals <- matrix(total_rpk,
                     nrow = nrow(merged_data),
                     ncol = ncol(merged_data),
                     byrow = TRUE)
tpm <- (merged_data / rpk_totals) * 1e6

# Keep an explicit data frame copy for saving
tpm_df <- as.data.frame(tpm)

# View
head(tpm_df)

In [None]:
# Save TPM data to a CSV
write.csv(tpm_df, file = "RNAseq_TPM_Values.csv", row.names = TRUE)

In [None]:
colnames(tpm_df)

# Process WGBS Data

## Annotate Genes

In [None]:
# Load data
files <- list.files(path = "wgbs/08_cytosine_reports", 
                    pattern = "\\.deduplicated\\.bismark\\.cov\\.gz\\.CpG_report\\.merged_CpG_evidence\\.cov\\.gz$", 
                    full.names = TRUE)

# View the list of files
print(files)

In [None]:
# Read the GTF annotation file
gtf_file <- "/share/lasallelab/genomes/mm10/mm10.refGene.gtf"
gtf_data <- import(gtf_file)

# View
head(gtf_data)

In [None]:
gtf_transcripts <- gtf_data[gtf_data$type == "transcript"]

# Create GRanges object for gene annotations
gr_genes <- GRanges(seqnames = seqnames(gtf_transcripts),
                    ranges = IRanges(start = start(gtf_transcripts), end = end(gtf_transcripts)),
                    gene_name = mcols(gtf_transcripts)$gene_name)

# View 
head(gr_genes)

In [None]:
# Loop through each file in the files list.
# For every coverage file: read it, overlap its regions with the gene
# annotation in `gr_genes`, keep the regions that received a gene name, and
# write them to "<sample>_annotated_regions.csv" in the cytosine report folder.
for (file in files) {
  # Read the gzipped file (no header, so columns are named V1, V2, ...)
  regions <- read.table(gzfile(file), header = FALSE, stringsAsFactors = FALSE)
  
  # Create GRanges object for regions (V1 = sequence name, V2 = start, V3 = end)
  gr_regions <- GRanges(seqnames = regions$V1,  
                        ranges = IRanges(start = regions$V2, end = regions$V3))  
  
  # Find overlaps between regions and gene annotations
  overlaps <- findOverlaps(gr_regions, gr_genes)
  
  # Create a new column for gene names in the regions data frame.
  # A region that overlaps several annotation entries appears several times in
  # queryHits(); the indexed assignment keeps one gene name per region, with a
  # later hit overwriting an earlier one.
  regions$gene_name <- NA 
  regions$gene_name[queryHits(overlaps)] <- gr_genes$gene_name[subjectHits(overlaps)]

  # Do not save rows where the gene name is NA (regions with no overlap)
  regions <- regions %>% filter(!is.na(gene_name))
  
  # Create a sample basename for saving the results (text before the first ".")
  sample_basename <- sub("\\..*$", "", basename(file))
  
  # Save the annotated regions to a CSV file
  output_directory <- "wgbs/08_cytosine_reports/"
  write.csv(regions, file = paste0(output_directory, sample_basename, "_annotated_regions.csv"), row.names = FALSE)

  # Print progress
  cat(sprintf("Regions have been assigned for %s...\n", sample_basename))
}

## Calculate Percent Methylation Per Gene

In [None]:
# Load data: only the per-sample annotated-region tables.
# Matching every "*.csv" would also pick up the "*_percent_methylated.csv"
# files that the next cell writes into this same directory, so re-running the
# notebook would try to aggregate files that have no V5/V6 columns.
files <- list.files(path = "wgbs/08_cytosine_reports",
                    pattern = "_annotated_regions\\.csv$",
                    full.names = TRUE)

# View the list of files
print(files)

In [None]:
# Directory that receives the per-gene summaries
output_directory <- "wgbs/08_cytosine_reports/"

# For each annotated-region table: total the methylated (V5) and unmethylated
# (V6) counts per gene, derive the percent methylation, and write one
# "<sample>_percent_methylated.csv" per input file.
for (file in files) {
  data <- read.csv(file, stringsAsFactors = FALSE)

  # Sample label = text of the file name (extension removed) before the first "_"
  base_name <- tools::file_path_sans_ext(basename(file))
  first_part <- strsplit(base_name, "_")[[1]][1]

  # Per-gene totals of methylated and unmethylated counts
  aggregated_data <- summarise(
    group_by(data, gene_name),
    methylated_cytosines = sum(V5, na.rm = TRUE),
    unmethylated_cytosines = sum(V6, na.rm = TRUE),
    .groups = 'drop'
  )

  # Percent methylated = methylated / (methylated + unmethylated) * 100
  aggregated_data$percent_methylated <- with(
    aggregated_data,
    (methylated_cytosines / (methylated_cytosines + unmethylated_cytosines)) * 100
  )

  # Write the summary next to the inputs
  write.csv(aggregated_data, file = paste0(output_directory, first_part, "_percent_methylated.csv"), row.names = FALSE)
}

## Create Percent Methylation Table Comparable to TPM Table

In [None]:
# Load data
files <- list.files(path = "wgbs/08_cytosine_reports", 
                    pattern = "FA\\d+_percent_methylated\\.csv", 
                    full.names = TRUE)

# View the list of files
print(files)

In [None]:
# Accumulator for the gene x sample percent-methylation table
df <- data.frame()

# Count columns that are not carried into the combined table
drop_columns <- c("methylated_cytosines", "unmethylated_cytosines")

# Add one column per sample, full-joining on gene_name
for (file in files) {
  # Sample ID (e.g. "FA" followed by digits) taken from the file name
  sample_name <- gsub(".*?(FA\\d+)_percent_methylated\\.csv", "\\1", file)

  # Read the per-gene summary and keep gene_name + percent_methylated
  data <- read.csv(file)
  data <- data[, !(colnames(data) %in% drop_columns), drop = FALSE]

  # Label the value column with the sample it came from
  colnames(data)[colnames(data) == "percent_methylated"] <- sample_name

  # First table seeds the result; later tables are merged keeping all genes
  df <- if (nrow(df) == 0) {
    data
  } else {
    merge(df, data, by = "gene_name", all = TRUE)
  }
}

# View
head(df)

In [None]:
# Assign gene names as row names to match formatting of RNA-seq TPM counts
rownames(df) <- df$gene_name
df$gene_name <- NULL

# View
head(df)

In [None]:
# Save percent methylation data to a CSV
write.csv(df, file = "WGBS_Percent_Methylation_Values.csv", row.names = TRUE)

# Integrate Data Frames

In [None]:
# Load RNA-seq data
rnaseq <- read.csv("RNAseq_TPM_Values.csv")

# Assign row names
rownames(rnaseq) <- rnaseq$X
rnaseq$X <- NULL

# View
head(rnaseq)

In [None]:
# Load WGBS data
wgbs <- read.csv("WGBS_Percent_Methylation_Values.csv")

# Assign row names
rownames(wgbs) <- wgbs$X
wgbs$X <- NULL

# View
head(wgbs)

In [None]:
# Sample mapping from WGBS to RNA-seq
sample_map <- c(
  "G1_1_1_4" = "FA114",
  "G1_1_1_5" = "FA115",
  "G1_1_1_6" = "FA116",
  "G1_1_1_7" = "FA117",
  "G1_1_2_4" = "FA124",
  "G1_1_2_5" = "FA125",
  "G1_1_2_6" = "FA126",
  "G2_2_6_5" = "FA265",
  "G2_2_6_6" = "FA266",
  "G2_2_7_10" = "FA2710",
  "G2_2_7_11" = "FA2711",
  "G2_2_7_7" = "FA277"
)

In [None]:
# Create a new row with the column names.
# This stores the current (RNA-seq) sample names as the first data row so that
# the old and new names can be compared by eye after the columns are renamed.
new_row <- as.data.frame(t(colnames(rnaseq)))

# Set the column names for the new row
colnames(new_row) <- colnames(rnaseq)

# Add the new row to the top of the dataframe.
# NOTE: the new row is character, so rbind() turns every column of `rnaseq`
# into character; the values are converted back with as.numeric() later on.
rnaseq <- rbind(new_row, rnaseq)

# View
head(rnaseq)

In [None]:
# Match the RNA-seq column names to the sample map (visually confirm mapping of names).
# Looking up each current column name in `sample_map` returns the WGBS-style ID;
# any column whose name is not a key of `sample_map` gets NA as its new name.
colnames(rnaseq) <- sample_map[colnames(rnaseq)]

# View
head(rnaseq)

In [None]:
# Remove row containing original sample names.
# NOTE(review): this relies on the prepended header row having the row name "1"
# and on no gene being named "1" — confirm with head(rnaseq) before running.
rnaseq <- rnaseq[rownames(rnaseq) != "1", ]

# Remove columns where NA is the column name (samples absent from sample_map)
rnaseq <- rnaseq[, !is.na(colnames(rnaseq))]

# View
head(rnaseq)

In [None]:
# Determine length of each dataframe
print(length(rownames(rnaseq)))
print(length(rownames(wgbs)))

In [None]:
# Keep only the rows overlapping between rnaseq and wgbs
wgbs <- wgbs[rownames(wgbs) %in% rownames(rnaseq), ]

# Confirm length
print(length(rownames(wgbs)))

In [None]:
# Keep only the rows overlapping between rnaseq and wgbs
rnaseq <- rnaseq[rownames(rnaseq) %in% rownames(wgbs), ]

# Confirm length
print(length(rownames(rnaseq)))

In [None]:
# Confirm that all row names match to ensure paired data
print(length(rownames(rnaseq)))
print(length(rownames(wgbs)))

print(all(rownames(rnaseq) %in% rownames(wgbs)))
print(all(rownames(wgbs) %in% rownames(rnaseq)))

In [None]:
# View both dataframes
head(rnaseq)
head(wgbs)

In [None]:
# Move the gene symbols out of the row names so both tables share a join key
rnaseq_by_gene <- rownames_to_column(rnaseq, var = "Gene")
wgbs_by_gene <- rownames_to_column(wgbs, var = "Gene")

# Keep genes present in both tables; sample columns that exist in both inputs
# are disambiguated by the join with ".x" (RNA-seq, left table) and ".y"
# (WGBS, right table) suffixes
merged_data <- inner_join(rnaseq_by_gene, wgbs_by_gene, by = "Gene")

# View
head(merged_data)

In [None]:
# Convert columns containing RNA-seq data (".x" suffix) to numeric and log2
# transform them.
# The filter keeps rows whose RNA-seq values are all non-missing and >= 0.
# A pseudocount of 1 is added before taking the log: zero TPMs pass the filter,
# and log2(0) is -Inf, which makes sd() return NaN in the correlation step (so
# those genes silently get NA correlations) and pins points to the plot edge.
merged_data <- merged_data %>%
  filter(if_all(ends_with(".x"), ~ as.numeric(.) >= 0)) %>%
  mutate(across(ends_with(".x"), ~ log2(as.numeric(.) + 1)))

# Reformat: gene symbols become the row names again
rownames(merged_data) <- merged_data$Gene
merged_data$Gene <- NULL

# View
head(merged_data)

# Calculate Correlations

In [None]:
# Create a named vector for the mapping with updated group name
mapping <- c(
  FA114 = "Control",
  FA115 = "Control",
  FA116 = "Control",
  FA117 = "Control",
  FA124 = "Control",
  FA125 = "Control",
  FA126 = "Control",
  FA265 = "FAE",
  FA266 = "FAE",
  FA2710 = "FAE",
  FA2711 = "FAE",
  FA277 = "FAE"
)

In [None]:
# Create a data frame for the mapping
group_info <- data.frame(Sample = names(mapping), Group = mapping, stringsAsFactors = FALSE)

# View
print(group_info)

In [None]:
# Put the gene symbols (row names) in front as a regular column
merged_data <- cbind(Gene = rownames(merged_data), merged_data)

# One row per gene/sample/assay: a column such as "<sample>.x" is split at the
# dot into Sample ("<sample>") and Type ("x" or "y")
long_data <- pivot_longer(
  merged_data,
  cols = -Gene,
  names_to = c("Sample", "Type"),
  names_sep = "\\.",
  values_to = "Value"
)

# Attach the experimental group of each sample
long_data <- left_join(long_data, group_info, by = "Sample")

# Discard samples that are not listed in group_info
long_data <- filter(long_data, !is.na(Group))

# View
head(long_data)
length(unique(long_data$Gene))
unique(long_data$Sample)
unique(long_data$Type)
unique(long_data$Group)

In [None]:
# Spearman correlation and p-value for one gene within one group.
#   x, y: numeric vectors of equal length (paired by sample).
# Returns a one-row data frame with `correlation` and `p_value`; both are NA
# when fewer than two complete pairs remain or when either variable has no
# (or an undefined) standard deviation, instead of raising an error.
spearman_stats <- function(x, y) {
  paired <- complete.cases(x, y)
  x <- x[paired]
  y <- y[paired]

  testable <- length(x) >= 2 && isTRUE(sd(x) > 0) && isTRUE(sd(y) > 0)
  if (!testable) {
    return(data.frame(correlation = NA_real_, p_value = NA_real_))
  }

  # One cor.test() call provides both rho and its p-value. cor.test() has no
  # `use` argument, so incomplete pairs are removed explicitly above.
  test <- cor.test(x, y, method = "spearman")
  data.frame(correlation = unname(test$estimate), p_value = test$p.value)
}

# Calculate Spearman correlation coefficients and their significance for each gene by group
correlation_results <- long_data %>%
  filter(Type %in% c("x", "y")) %>%
  # One row per gene/sample with the two assays side by side (Type_x, Type_y)
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  group_by(Gene, Group) %>%
  # The unnamed data frame result is unpacked into correlation / p_value columns
  summarise(spearman_stats(Type_x, Type_y), .groups = 'drop') %>%
  # One row per gene; columns become e.g. correlation_Correlation_Control and
  # p_value_Correlation_Control
  pivot_wider(names_from = Group, values_from = c(correlation, p_value), names_prefix = "Correlation_")

# View
head(correlation_results)

In [None]:
# Save the correlation results to a CSV file
write.csv(correlation_results, file = "spearman_correlation_results.csv", row.names = FALSE)

In [None]:
# See significant correlations
significant_results <- correlation_results %>%
  filter(
    p_value_Correlation_Control < 0.05 | 
    p_value_Correlation_FAE < 0.05
  )

# View
head(significant_results)
print(length(significant_results$Gene))

In [None]:
# Save only the significant correlation results to a CSV file
# (the full table is written separately to spearman_correlation_results.csv)
write.csv(significant_results, file = "spearman_correlation_significant_results.csv", row.names = FALSE)

# Visualize Correlations

## Load Data

In [None]:
# Load data
degs <- read.csv("rnaseq/05_DEGs/faexcess_vs_control_genes.csv", header = TRUE)

# View
head(degs)

In [None]:
# Load data
dmrs <- read_excel("wgbs/08_cytosine_reports/DMRs/DMRs_annotated.xlsx")

# View
head(dmrs)

## Significant DEGs Only

In [None]:
# Subset to significant DEGs only
significant_DEGs <- degs[degs$adj.P.Val < 0.05, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

In [None]:
# Subset long data to have only significant DEGs
long_data_sigDEGs <- long_data[long_data$Gene %in% significant_DEGs$external_gene_name, ]

# View the first few rows of the new object
head(long_data_sigDEGs)
length(unique(long_data_sigDEGs$Gene))
unique(long_data_sigDEGs$Sample)
unique(long_data_sigDEGs$Type)
unique(long_data_sigDEGs$Group)

In [None]:
# Prepare the data for plotting
plot_data <- long_data_sigDEGs %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  filter(!is.na(Type_x) & !is.na(Type_y)) 

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group. Type_x holds the log2-transformed RNA-seq values and
# Type_y the WGBS percent methylation, so the labels state the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

## Significant DEGs Positively Correlated Only

In [None]:
# Subset to significant DEGs only
significant_DEGs <- degs[degs$adj.P.Val < 0.05, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

In [None]:
# Subset for positive correlations only (positive in both groups).
# The correlation columns are named correlation_Correlation_<Group>; the
# previously used names Correlation_Control / Correlation_FAE do not exist in
# correlation_results. filter() also drops genes whose correlation is NA.
positive_correlations <- correlation_results %>%
  dplyr::filter(correlation_Correlation_Control > 0,
                correlation_Correlation_FAE > 0)

# View the subsetted results
head(positive_correlations)

In [None]:
# Subset significant_DEGs for only positively correlated ones
significant_DEGs <- significant_DEGs[significant_DEGs$external_gene_name %in% positive_correlations$Gene, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

In [None]:
# Subset long data to have only significant DEGs
long_data_sigDEGs <- long_data[long_data$Gene %in% significant_DEGs$external_gene_name, ]

# View the first few rows of the new object
head(long_data_sigDEGs)
length(unique(long_data_sigDEGs$Gene))
unique(long_data_sigDEGs$Sample)
unique(long_data_sigDEGs$Type)
unique(long_data_sigDEGs$Group)

In [None]:
# Prepare the data for plotting
plot_data <- long_data_sigDEGs %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  filter(!is.na(Type_x) & !is.na(Type_y)) 

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_positively_correlated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group. Type_x holds the log2-transformed RNA-seq values and
# Type_y the WGBS percent methylation, so the labels state the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

## Significant DEGs Negatively Correlated Only

In [None]:
# Subset to significant DEGs only
significant_DEGs <- degs[degs$adj.P.Val < 0.05, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

In [None]:
# Subset for negative correlations only (negative in both groups).
# The correlation columns are named correlation_Correlation_<Group>; the
# previously used names Correlation_Control / Correlation_FAE do not exist in
# correlation_results. filter() also drops genes whose correlation is NA.
negative_correlations <- correlation_results %>%
  dplyr::filter(correlation_Correlation_Control < 0,
                correlation_Correlation_FAE < 0)

# View the subsetted results
head(negative_correlations)

In [None]:
# Subset significant_DEGs for only negatively correlated ones
significant_DEGs <- significant_DEGs[significant_DEGs$external_gene_name %in% negative_correlations$Gene, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

In [None]:
# Subset long data to have only significant DEGs
long_data_sigDEGs <- long_data[long_data$Gene %in% significant_DEGs$external_gene_name, ]

# View the first few rows of the new object
head(long_data_sigDEGs)
length(unique(long_data_sigDEGs$Gene))
unique(long_data_sigDEGs$Sample)
unique(long_data_sigDEGs$Type)
unique(long_data_sigDEGs$Group)

In [None]:
# Prepare the data for plotting
plot_data <- long_data_sigDEGs %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  filter(!is.na(Type_x) & !is.na(Type_y)) 

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_negatively_correlated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group. Type_x holds the log2-transformed RNA-seq values and
# Type_y the WGBS percent methylation, so the labels state the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

## Significant Upregulated DEGs Only

In [None]:
# Subset to significant DEGs only
significant__upregulated_DEGs <- degs[degs$adj.P.Val < 0.05 & degs$logFC > 0, ]

# View
head(significant__upregulated_DEGs)
length(significant__upregulated_DEGs$external_gene_name)

In [None]:
# Subset long data to have only significantly upregulated DEGs
long_data_sigUpDEGs <- long_data[long_data$Gene %in% significant__upregulated_DEGs$external_gene_name, ]

# View the first few rows of the new object
head(long_data_sigUpDEGs)
length(unique(long_data_sigUpDEGs$Gene))
unique(long_data_sigUpDEGs$Sample)
unique(long_data_sigUpDEGs$Type)
unique(long_data_sigUpDEGs$Group)

In [None]:
# Prepare the data for plotting
plot_data <- long_data_sigUpDEGs %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  filter(!is.na(Type_x) & !is.na(Type_y)) 

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_upregulated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group. Type_x holds the log2-transformed RNA-seq values and
# Type_y the WGBS percent methylation, so the labels state the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

## Significant Downregulated DEGs Only

In [None]:
# Subset to significant downregulated DEGs only
significant_downregulated_DEGs <- degs[degs$adj.P.Val < 0.05 & degs$logFC < 0, ]

# View
head(significant_downregulated_DEGs)
length(significant_downregulated_DEGs$external_gene_name)

In [None]:
# Subset long data to have only significantly downregulated DEGs
long_data_sigDownDEGs <- long_data[long_data$Gene %in% significant_downregulated_DEGs$external_gene_name, ]

# View the first few rows of the new object
head(long_data_sigDownDEGs)
length(unique(long_data_sigDownDEGs$Gene))
unique(long_data_sigDownDEGs$Sample)
unique(long_data_sigDownDEGs$Type)
unique(long_data_sigDownDEGs$Group)

In [None]:
# Prepare the data for plotting
plot_data <- long_data_sigDownDEGs %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  filter(!is.na(Type_x) & !is.na(Type_y)) 

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_downregulated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group. Type_x holds the log2-transformed RNA-seq values and
# Type_y the WGBS percent methylation, so the labels state the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

## Overlapping Significant DEGs and DMRs

In [None]:
# Subset to significant DEGs only
significant_DEGs <- degs[degs$adj.P.Val < 0.05, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

In [None]:
# Subset to significant DMRs only
significant_DMRs <- dmrs[dmrs$p.value < 0.05, ]

# View
head(significant_DMRs)
length(significant_DMRs$geneSymbol)

In [None]:
# Subset significant_DEGs for ones found in significant DMRs.
# The lookup uses `significant_DMRs` (p.value < 0.05); matching against the
# unfiltered `dmrs` table would also keep DEGs whose DMR is not significant.
significant_DEGs <- significant_DEGs[significant_DEGs$external_gene_name %in% significant_DMRs$geneSymbol, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

# See overlapping genes
print(unique(significant_DEGs$external_gene_name))

In [None]:
# Subset long data to have only significant DEGs
long_data_sigDEGs <- long_data[long_data$Gene %in% significant_DEGs$external_gene_name, ]

# View the first few rows of the new object
head(long_data_sigDEGs)
length(unique(long_data_sigDEGs$Gene))
unique(long_data_sigDEGs$Sample)
unique(long_data_sigDEGs$Type)
unique(long_data_sigDEGs$Group)

In [None]:
# Prepare the data for plotting
plot_data <- long_data_sigDEGs %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  filter(!is.na(Type_x) & !is.na(Type_y)) 

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_overlapping_DEGs_and_DMRs_labeled.pdf", width = 20, height = 12)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group, with every point labeled by its gene symbol. Type_x holds
# the log2-transformed RNA-seq values and Type_y the WGBS percent methylation,
# so the labels state the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  geom_text(aes(label = Gene), vjust = -1, size = 2.5) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_overlapping_DEGs_and_DMRs.pdf", width = 8, height = 6)

# Create a combined scatter plot of expression (y) against methylation (x),
# colored by group (unlabeled version). Type_x holds the log2-transformed
# RNA-seq values and Type_y the WGBS percent methylation, so the labels state
# the log2 scale.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq log2-transformed TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq log2-transformed TPMs") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# Draw the plot into the open PDF device
print(p)

# Close the PDF device
dev.off()

# Assess Significance of DEG and DMR Overlap

In [None]:
# Load data
degs <- read.csv("rnaseq/05_DEGs/faexcess_vs_control_genes.csv", header = TRUE)

# Subset to significant DEGs only
degs <- degs[degs$adj.P.Val < 0.05, ]

# View
head(degs)

In [None]:
# Load data
dmrs <- read_excel("wgbs/08_cytosine_reports/DMRs/DMRs_annotated.xlsx")

# Subset to significant DMRs only
dmrs <- dmrs[dmrs$p.value < 0.05, ]

# View
head(dmrs)

## Make Contingency Table for Fisher's Exact Test

In [None]:
# Overlap
overlap_count <- length(intersect(degs$external_gene_name, dmrs$geneSymbol))

In [None]:
# DEGs not in DMRs
b <- length(setdiff(degs$external_gene_name, dmrs$geneSymbol))

In [None]:
# DMRs not in DEGs
c <- length(setdiff(dmrs$geneSymbol, degs$external_gene_name))

In [None]:
# Total genes (background universe).
# The universe must include genes that are neither DEG nor DMR. Using only the
# union of the two significant lists makes d = total - (overlap + b + c)
# always 0, which invalidates the test. Here the universe is every gene in the
# full (unfiltered) DEG table plus any DMR gene symbol missing from it.
# NOTE(review): confirm that this table lists all tested genes and that it is
# the appropriate background for the DMR side as well.
all_tested <- read.csv("rnaseq/05_DEGs/faexcess_vs_control_genes.csv", header = TRUE)
total_genes <- length(unique(c(all_tested$external_gene_name, dmrs$geneSymbol)))

# Genes that are neither a significant DEG nor associated with a significant DMR
d <- total_genes - (overlap_count + b + c)

In [None]:
# Create the contingency table.
# matrix() fills column by column, so the layout is:
#                 DEG             Not DEG
#   In DMRs       overlap_count   c
#   Not in DMRs   b               d
# The dimnames below describe exactly that layout.
contingency_table <- matrix(
  c(overlap_count, b, c, d),
  nrow = 2,
  dimnames = list(
    c("In DMRs", "Not in DMRs"),
    c("DEG", "Not DEG")
  )
)

# View
print(contingency_table)

In [None]:
# Perform Fisher's Exact Test
fisher_result <- fisher.test(contingency_table)

# View
print(fisher_result)

## Assess Significance of Correlations

In [None]:
# Load data
correlation_results <- read.csv("spearman_correlation_results.csv")

# View
head(correlation_results)

In [None]:
# Identify significant correlations for each condition.
# A missing p-value is treated as "not significant". Subsetting a data frame
# with a logical vector that contains NA returns an all-NA row for every NA,
# which would inflate each count printed below.
p_control <- correlation_results$p_value_Correlation_Control
p_fae <- correlation_results$p_value_Correlation_FAE
sig_control <- !is.na(p_control) & p_control < 0.05
sig_fae <- !is.na(p_fae) & p_fae < 0.05

control_total <- correlation_results[sig_control, ]
control_only <- correlation_results[sig_control & !sig_fae, ]
FAE_total <- correlation_results[sig_fae, ]
FAE_only <- correlation_results[sig_fae & !sig_control, ]
both <- correlation_results[sig_control & sig_fae, ]
either <- correlation_results[sig_control | sig_fae, ]

# Identify significant positive correlations for each condition
# (which() drops rows whose correlation coefficient is NA)
control_positive <- control_total[which(control_total$correlation_Correlation_Control > 0), ]
FAE_positive <- FAE_total[which(FAE_total$correlation_Correlation_FAE > 0), ]

# Identify significant negative correlations for each condition
control_negative <- control_total[which(control_total$correlation_Correlation_Control < 0), ]
FAE_negative <- FAE_total[which(FAE_total$correlation_Correlation_FAE < 0), ]

# Print the number of significant genes for each condition
print(paste("Number of significant genes in Control (total):", nrow(control_total)))
print(paste("Number of significant genes in Control (only):", nrow(control_only)))
print(paste("Number of significant genes in FAE (total):", nrow(FAE_total)))
print(paste("Number of significant genes in FAE (only):", nrow(FAE_only)))
print(paste("Number of significant genes in both conditions:", nrow(both)))
print(paste("Number of significant genes in either condition:", nrow(either)))

# Print the number of significant positive and negative genes for each condition
print(paste("Number of significant positive correlations in Control:", nrow(control_positive)))
print(paste("Number of significant negative correlations in Control:", nrow(control_negative)))
print(paste("Number of significant positive correlations in FAE:", nrow(FAE_positive)))
print(paste("Number of significant negative correlations in FAE:", nrow(FAE_negative)))

In [None]:
# List of genes to check
genes_to_check <- c("Naa20", "Med10", "Epb41l4a", "Katnal2", "D3Ertd751e", 
                     "Ccdc93", "Itga6", "Tacr3", "Cald1", "Ccl17", 
                     "Atp2c1", "Asprv1", "Kcnk10", "Fau", "Egr1", 
                     "Mest", "Fbh1", "Syt10", "Urm1", "Arrdc3")

# Check if these genes are in the either category
genes_in_either <- genes_to_check[genes_to_check %in% either$Gene]

# Print the genes that are found in the either category
print(genes_in_either)

# Test Correlation Significance with Chi-Square Test

In [64]:
# Create the contingency table: rows are correlation categories, columns are
# the two groups (filled row by row because byrow = TRUE).
# NOTE(review): the counts are hard-coded — presumably copied from the counts
# printed by an earlier cell; re-derive them if the upstream results change.
# NOTE(review): a chi-square test assumes each gene is counted in exactly one
# cell; confirm that these three categories are mutually exclusive.
correlation_data <- matrix(c(238, 110, 
                              345, 178, 
                              553, 256), 
                            nrow = 3, 
                            byrow = TRUE)

# Assign row and column names
rownames(correlation_data) <- c("Significant Positive Correlations", 
                                 "Significant Negative Correlations", 
                                 "Unique Significant Correlations")
colnames(correlation_data) <- c("Control", "FAE")

# Print the contingency table
print("Contingency Table:")
head(correlation_data)

# Perform the Chi-Square test (tests whether the category distribution is
# independent of group)
chi_square_test <- chisq.test(correlation_data)

# Print the results of the Chi-Square test
print("Chi-Square Test Results:")
print(chi_square_test)

[1] "Contingency Table:"


Unnamed: 0,Control,FAE
Significant Positive Correlations,238,110
Significant Negative Correlations,345,178
Unique Significant Correlations,553,256


[1] "Chi-Square Test Results:"

	Pearson's Chi-squared test

data:  correlation_data
X-squared = 0.94835, df = 2, p-value = 0.6224



In [65]:
# Perform Fisher's Exact Test
fisher_test <- fisher.test(correlation_data)

# Print the results of Fisher's Exact Test
print("Fisher's Exact Test Results:")
print(fisher_test)

[1] "Fisher's Exact Test Results:"

	Fisher's Exact Test for Count Data

data:  correlation_data
p-value = 0.6276
alternative hypothesis: two.sided



# Add Information to Correlations Output

In [None]:
# Load data
correlation_results <- read.csv("spearman_correlation_results.csv")

# View
head(correlation_results)

In [None]:
# Annotate each gene in correlation_results with fields from the `dmrs` table.
# Every lookup builds a vector of DMR values named by gene symbol and indexes it
# with correlation_results$Gene: genes without a DMR get NA, and for a gene
# symbol that occurs on several DMR rows only the first matching row is used.
# NOTE(review): `dmrs` is whatever an earlier cell left in the session
# (possibly already filtered to p.value < 0.05) — confirm that is intended.

# Add distanceToTSS
distance_vector <- setNames(dmrs$distanceToTSS, dmrs$geneSymbol)
correlation_results$DMRdistanceToTSS <- distance_vector[correlation_results$Gene]

# Add gene annotation
annotation_vector <- setNames(dmrs$annotation, dmrs$geneSymbol)
correlation_results$WGBS_annotation <- annotation_vector[correlation_results$Gene]

# Add gene description
gene_vector <- setNames(dmrs$gene, dmrs$geneSymbol)
correlation_results$gene_description <- gene_vector[correlation_results$Gene]

# Add methylation direction
methdir <- setNames(dmrs$direction, dmrs$geneSymbol)
correlation_results$Methylation_Direction <- methdir[correlation_results$Gene]

# Add p-value
dmr_pval <- setNames(dmrs$p.value, dmrs$geneSymbol)
correlation_results$DMR_PVal <- dmr_pval[correlation_results$Gene]

# View
head(correlation_results)

In [None]:
# Reload the full differential-expression table so that logFC and p-values
# can be attached for every gene, regardless of significance
degs <- read.csv("rnaseq/05_DEGs/faexcess_vs_control_genes.csv", header = TRUE)

# Look up each correlation row's gene in the DEG table once and reuse the
# index for both columns. The first occurrence wins when a gene name is
# repeated; unmatched, missing or empty names yield NA. as.character()
# guards against factor columns being matched by integer code.
deg_row <- match(
  as.character(correlation_results$Gene),
  as.character(degs$external_gene_name),
  incomparables = c(NA_character_, "")
)

# Add RNA-seq Log-fold change
correlation_results$RNAseq_LogFC <- degs$logFC[deg_row]

# Add RNA-seq p-value (this is the adjusted p-value column, adj.P.Val)
correlation_results$RNAseq_PVal <- degs$adj.P.Val[deg_row]

# View
head(correlation_results)

In [None]:
# Write the annotated correlation table to CSV without row names.
# NOTE: this is the same path the table was read from, so the original
# input file is overwritten by the annotated version.
write.csv(
  x = correlation_results,
  file = "spearman_correlation_results.csv",
  row.names = FALSE
)

# Compare Enrichment Terms for DEGs and DMRs

In [5]:
# Load the annotated DMR table
dmrs <- read_excel("wgbs/08_cytosine_reports/DMRs/DMRs_annotated.xlsx")

# Subset to significant DMRs only (p.value < 0.05).
# dplyr::filter() drops rows whose p.value is NA; logical `[` indexing would
# instead return an all-NA row for each of them.
dmrs <- dplyr::filter(dmrs, p.value < 0.05)

# View
head(dmrs)

chr,start,end,width,CpGs,betaCoefficient,statistic,p.value,q.value,direction,⋯,CpG.Island,CpG.Shore,CpG.Shelf,Open.Sea,annotation,geneId,distanceToTSS,ENSEMBL,geneSymbol,gene
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr2,87525906,87526271,366,27,-0.7075803,-8.439746,0.0001484955,0.8146332,Hypomethylated,⋯,Yes,Yes,No,No,Exon,110511,-5525,ENSMUSG00000061520,Or5w22,olfactory receptor family 5 subfamily W member 22
chr5,131236777,131237235,459,9,0.6906494,7.767359,0.0004611176,0.8146332,Hypermethylated,⋯,No,No,No,Yes,Intron,212996,71262,ENSMUSG00000034040,Galnt17,polypeptide N-acetylgalactosaminyltransferase 17
chr7,92448940,92449667,728,8,0.6094742,7.635594,0.0005392732,0.8146332,Hypermethylated,⋯,No,No,No,Yes,3' UTR,23859,1971709,ENSMUSG00000052572,Dlg2,discs large MAGUK scaffold protein 2
chr4,24077950,24078197,248,5,0.7599784,7.631384,0.0005392732,0.8146332,Hypermethylated,⋯,No,No,No,Yes,Distal Intergenic,212377,-418254,ENSMUSG00000045751,Mms22l,"MMS22-like, DNA repair protein"
chr9,118085608,118085943,336,7,0.8877527,7.520019,0.0006017976,0.8146332,Hypermethylated,⋯,No,No,No,Yes,Intron,67899,64234,ENSMUSG00000039163,Cmc1,COX assembly mitochondrial protein 1
chr17,74424279,74424751,473,13,-0.6255215,-7.46747,0.0006799531,0.8146332,Hypomethylated,⋯,No,No,No,Yes,Downstream,210148,14453,ENSMUSG00000024069,Slc30a6,"solute carrier family 30 (zinc transporter), member 6"


In [6]:
# Query Enrichr with the gene symbols of the DMR table against each of the
# libraries below; the result is a list holding one table per library
enrichr_results <- enrichr(
  dmrs$geneSymbol,
  c(
    "GO_Biological_Process_2023",
    "GO_Cellular_Component_2023",
    "GO_Molecular_Function_2023",
    "KEGG_2019_Mouse",
    "Panther_2016",
    "Reactome_2016",
    "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO"
  )
)

# Workbook that will receive one sheet per Enrichr library
wb <- createWorkbook()

# Write every result table to its own sheet, named after the library
for (library_name in names(enrichr_results)) {
  # The GEO signatures library gets a shortened sheet name (presumably because
  # its full name is too long for an Excel sheet name -- confirm); every other
  # library keeps its own name
  sheet_name <- switch(
    library_name,
    "RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO" =
      "RNAseq_DiseaseGene_DrugSigs_GEO",
    library_name
  )

  addWorksheet(wb, sheet_name)
  writeData(wb, sheet = sheet_name, x = enrichr_results[[library_name]])
}

# Fixed output location for the DMR Enrichr workbook
output_filename <- "wgbs/08_cytosine_reports/DMRs/DMR_enrichr_results.xlsx"

# Write the workbook to disk, replacing any existing file at that path
saveWorkbook(wb, file = output_filename, overwrite = TRUE)

# Confirm where the results were written
cat("Enrichr results saved: wgbs/08_cytosine_reports/DMRs/DMR_enrichr_results.xlsx")

Uploading data to Enrichr... Done.
  Querying GO_Biological_Process_2023... Done.
  Querying GO_Cellular_Component_2023... Done.
  Querying GO_Molecular_Function_2023... Done.
  Querying KEGG_2019_Mouse... Done.
  Querying Panther_2016... Done.
  Querying Reactome_2016... Done.
  Querying RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO... Done.
Parsing results... Done.
Enrichr results saved: wgbs/08_cytosine_reports/DMRs/DMR_enrichr_results.xlsx

In [24]:
# Paths of the two Enrichr result workbooks to compare:
# the DMR-based results and the DEG-based results
files <- c(
  "wgbs/08_cytosine_reports/DMRs/DMR_enrichr_results.xlsx",
  "rnaseq/05_DEGs/control_vs_faexcess_genes_enrichr_results.xlsx"
)

# Show the paths
print(files)

[1] "wgbs/08_cytosine_reports/DMRs/DMR_enrichr_results.xlsx"       
[2] "rnaseq/05_DEGs/control_vs_faexcess_genes_enrichr_results.xlsx"


In [25]:
# Names of the Enrichr library sheets of interest (GO, KEGG, Panther,
# Reactome and the shortened GEO signatures sheet name)
databases <- c(
  "GO_Biological_Process_2023",
  "GO_Cellular_Component_2023",
  "GO_Molecular_Function_2023",
  "KEGG_2019_Mouse",
  "Panther_2016",
  "Reactome_2016",
  "RNAseq_DiseaseGene_DrugSigs_GEO"
)

In [36]:
# Read every sheet of interest from each Enrichr workbook and stack them into
# one data frame, tagging each row with the workbook it came from
# (Sample_Name) and the sheet it was read from (Database).

# Per-sheet tables are collected here and bound once after the loops, which
# avoids re-copying the accumulated data frame on every iteration
sheet_tables <- list()

for (file in files) {
  # Determine the sample name from the file name. fixed = TRUE makes the "."
  # in ".xlsx" match literally instead of acting as a regex wildcard.
  if (grepl("DMR_enrichr_results.xlsx", file, fixed = TRUE)) {
    sample_name <- "DMRs"
  } else if (grepl("control_vs_faexcess_genes_enrichr_results.xlsx", file,
                   fixed = TRUE)) {
    sample_name <- "DEGs"
  } else {
    # Unrecognised workbook: skip it
    next
  }

  # Sheets actually present in this workbook
  sheets <- excel_sheets(file)

  # Only read the requested databases that exist as sheets in this file
  for (database in intersect(databases, sheets)) {
    sheet_data <- read_excel(file, sheet = database)

    # Add the Sample_Name and Database columns
    sheet_data <- mutate(
      sheet_data,
      Sample_Name = sample_name,
      Database = database
    )

    sheet_tables[[length(sheet_tables) + 1L]] <- sheet_data
  }
}

# Combine everything into the master data frame; as.data.frame() keeps the
# result a plain data.frame, as it was when rows were appended to data.frame()
all_data <- as.data.frame(bind_rows(sheet_tables))

# View
head(all_data)

Unnamed: 0_level_0,Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,Sample_Name,Database
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
1,Neuron Development (GO:0048666),18/150,4.335769e-05,0.04692262,0,0,3.242935,32.57861,ROBO2;APP;NTRK2;TENM2;TENM3;PBX3;DTNBP1;BICDL1;PTPRM;ETV1;HMGB1;HS6ST1;POU4F1;PBX1;ADCYAP1;BLOC1S5;NPY;TMEM108,DMRs,GO_Biological_Process_2023
2,Actin Cytoskeleton Reorganization (GO:0031532),10/53,4.943108e-05,0.04692262,0,0,5.50148,54.5468,FGF7;RAP2A;SIPA1L1;THSD7B;AUTS2;S1PR1;DTNBP1;TNIK;ATP2C1;FGF10,DMRs,GO_Biological_Process_2023
3,Regulation Of Potassium Ion Transmembrane Transporter Activity (GO:1901016),7/26,6.253209e-05,0.04692262,0,0,8.694192,84.15831,KCNG1;KCNE1;NETO1;KCNS1;AKAP9;STK39;CACNA1D,DMRs,GO_Biological_Process_2023
4,Glutamate Receptor Signaling Pathway (GO:0007215),8/35,6.720032e-05,0.04692262,0,0,6.997854,67.23421,GRIN2A;GRM7;GNAQ;GRM8;GRIK3;PTK2B;GRIK1;GRIN2B,DMRs,GO_Biological_Process_2023
5,Regulation Of Neuron Projection Development (GO:0010975),19/174,9.827547e-05,0.05489668,0,0,2.915274,26.90138,FZD1;EPHA4;MYLIP;NTRK2;TENM3;BDNF;NTRK3;ULK4;PTPRO;MBOAT1;SIPA1L1;PTK2B;SPOCK1;ITGA6;PRKD1;ROR2;SERPINI1;LRRC4C;EPHA3,DMRs,GO_Biological_Process_2023
6,Negative Regulation Of Response To Stimulus (GO:0048585),9/52,0.0002340946,0.10269912,0,0,4.94522,41.34097,EPHA4;COL3A1;DDAH1;IGFBP3;VPS13C;KLF4;SIRT1;ADRA2A;MAP2K5,DMRs,GO_Biological_Process_2023


In [37]:
# Keep only the enrichment terms whose adjusted p-value is at most 0.05
all_data <- dplyr::filter(all_data, Adjusted.P.value <= 0.05)

# Preview the retained rows
head(all_data)

Unnamed: 0_level_0,Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,Sample_Name,Database
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
1,Neuron Development (GO:0048666),18/150,4.335769e-05,0.0469226249,0,0,3.242935,32.57861,ROBO2;APP;NTRK2;TENM2;TENM3;PBX3;DTNBP1;BICDL1;PTPRM;ETV1;HMGB1;HS6ST1;POU4F1;PBX1;ADCYAP1;BLOC1S5;NPY;TMEM108,DMRs,GO_Biological_Process_2023
2,Actin Cytoskeleton Reorganization (GO:0031532),10/53,4.943108e-05,0.0469226249,0,0,5.50148,54.5468,FGF7;RAP2A;SIPA1L1;THSD7B;AUTS2;S1PR1;DTNBP1;TNIK;ATP2C1;FGF10,DMRs,GO_Biological_Process_2023
3,Regulation Of Potassium Ion Transmembrane Transporter Activity (GO:1901016),7/26,6.253209e-05,0.0469226249,0,0,8.694192,84.15831,KCNG1;KCNE1;NETO1;KCNS1;AKAP9;STK39;CACNA1D,DMRs,GO_Biological_Process_2023
4,Glutamate Receptor Signaling Pathway (GO:0007215),8/35,6.720032e-05,0.0469226249,0,0,6.997854,67.23421,GRIN2A;GRM7;GNAQ;GRM8;GRIK3;PTK2B;GRIK1;GRIN2B,DMRs,GO_Biological_Process_2023
5,Postsynaptic Density (GO:0014069),20/151,3.704816e-06,0.0009150897,0,0,3.640046,45.52197,NSF;GRIA1;NTRK2;GRIK3;DTNBP1;GRIK1;SHISA9;GRIN2B;SHISA6;GRIN2A;GAP43;DLG2;SIPA1L1;NETO1;TMEM108;PTK2B;SPOCK1;RPL38;FYN;CACNG2,DMRs,GO_Cellular_Component_2023
6,Asymmetric Synapse (GO:0032279),18/133,8.268768e-06,0.0010211928,0,0,3.725647,43.60134,NSF;GRIA1;NTRK2;DTNBP1;SHISA9;GRIN2B;SHISA6;GRIN2A;GAP43;DLG2;SIPA1L1;GRM7;NETO1;TMEM108;PTK2B;SPOCK1;RPL38;FYN,DMRs,GO_Cellular_Component_2023


In [38]:
# Per database: how many rows (Total_Terms) and how many distinct term names
# (Unique_Terms) remain. all_data holds both DMR and DEG rows, so a term
# present for both contributes two rows but only one unique term.
database_term_counts <- summarise(
  group_by(all_data, Database),
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary table
print(database_term_counts)

[90m# A tibble: 7 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023               44           44
[90m2[39m GO_Cellular_Component_2023               55           54
[90m3[39m GO_Molecular_Function_2023               10           10
[90m4[39m KEGG_2019_Mouse                          12           12
[90m5[39m Panther_2016                              7            7
[90m6[39m RNAseq_DiseaseGene_DrugSigs_GEO         875          842
[90m7[39m Reactome_2016                           223          223


In [54]:
# Keep only the enrichment rows tagged as coming from the DMR workbook
is_dmr_row <- all_data$Sample_Name == "DMRs"
DMRs <- all_data[is_dmr_row, ]

# Preview
head(DMRs)

Unnamed: 0_level_0,Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,Sample_Name,Database
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
1,Neuron Development (GO:0048666),18/150,4.335769e-05,0.0469226249,0,0,3.242935,32.57861,ROBO2;APP;NTRK2;TENM2;TENM3;PBX3;DTNBP1;BICDL1;PTPRM;ETV1;HMGB1;HS6ST1;POU4F1;PBX1;ADCYAP1;BLOC1S5;NPY;TMEM108,DMRs,GO_Biological_Process_2023
2,Actin Cytoskeleton Reorganization (GO:0031532),10/53,4.943108e-05,0.0469226249,0,0,5.50148,54.5468,FGF7;RAP2A;SIPA1L1;THSD7B;AUTS2;S1PR1;DTNBP1;TNIK;ATP2C1;FGF10,DMRs,GO_Biological_Process_2023
3,Regulation Of Potassium Ion Transmembrane Transporter Activity (GO:1901016),7/26,6.253209e-05,0.0469226249,0,0,8.694192,84.15831,KCNG1;KCNE1;NETO1;KCNS1;AKAP9;STK39;CACNA1D,DMRs,GO_Biological_Process_2023
4,Glutamate Receptor Signaling Pathway (GO:0007215),8/35,6.720032e-05,0.0469226249,0,0,6.997854,67.23421,GRIN2A;GRM7;GNAQ;GRM8;GRIK3;PTK2B;GRIK1;GRIN2B,DMRs,GO_Biological_Process_2023
5,Postsynaptic Density (GO:0014069),20/151,3.704816e-06,0.0009150897,0,0,3.640046,45.52197,NSF;GRIA1;NTRK2;GRIK3;DTNBP1;GRIK1;SHISA9;GRIN2B;SHISA6;GRIN2A;GAP43;DLG2;SIPA1L1;NETO1;TMEM108;PTK2B;SPOCK1;RPL38;FYN;CACNG2,DMRs,GO_Cellular_Component_2023
6,Asymmetric Synapse (GO:0032279),18/133,8.268768e-06,0.0010211928,0,0,3.725647,43.60134,NSF;GRIA1;NTRK2;DTNBP1;SHISA9;GRIN2B;SHISA6;GRIN2A;GAP43;DLG2;SIPA1L1;GRM7;NETO1;TMEM108;PTK2B;SPOCK1;RPL38;FYN,DMRs,GO_Cellular_Component_2023


In [55]:
# Per database, for the DMR rows only: number of rows (Total_Terms) and
# number of distinct term names (Unique_Terms)
database_term_counts <- summarise(
  group_by(DMRs, Database),
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary table
print(database_term_counts)

[90m# A tibble: 6 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023                4            4
[90m2[39m GO_Cellular_Component_2023               13           13
[90m3[39m KEGG_2019_Mouse                           3            3
[90m4[39m Panther_2016                              7            7
[90m5[39m RNAseq_DiseaseGene_DrugSigs_GEO         147          147
[90m6[39m Reactome_2016                            62           62


In [56]:
# Keep only the enrichment rows tagged as coming from the DEG workbook
is_deg_row <- all_data$Sample_Name == "DEGs"
DEGs <- all_data[is_deg_row, ]

# Preview
head(DEGs)

Unnamed: 0_level_0,Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,Sample_Name,Database
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
237,Cytoplasmic Translation (GO:0002181),44/93,2.015845e-35,5.225072e-32,0,0,21.116417,1686.9787,RPL30;RPL32;RPL34;RPLP1;RPL12;RPLP0;RPL11;RPL10A;RPS15;RPS14;RPS16;RPS15A;RPL18A;RPS19;RPS18;RPL36;RPL14;RPS3;RPL35;RPL13;RPL15;RPL18;RPS27A;RPL39;RPL17;RPS13;RWDD1;RPS9;RPL41;RPS7;RPL22;RPS6;RPL35A;RPS25;EIF3M;RPL27A;RPL37A;RPL27;RPS20;RPL26;FAU;RPS21;RPS24;RPS23,DEGs,GO_Biological_Process_2023
238,Translation (GO:0006412),60/234,3.934749e-30,5.099435e-27,0,0,8.217813,556.4093,RPL30;RPL32;RPL34;RPLP1;RPLP0;RPL10A;RPS15;MRPL41;EEF1B2;RPS14;MRPL2;RPS16;MRPL1;RPL18A;RPS19;RPS18;RPL36;RPL35;RPL39;CHCHD1;RPS13;MRPS28;RPS9;MRPS24;RPS7;RPL22;RPS6;MRPS18C;TUFM;EEF1A1;RPL37A;RPL27;RPL26;RPL12;RPL11;SRBD1;MRPL12;MRPL57;MRPL11;MRPL20;RPS15A;RPL14;RPS3;RPL13;RPL15;RPL18;RPS27A;RPL17;RWDD1;RPL41;NDUFA7;RPL35A;MRPL22;RPS25;RPL27A;RPS20;FAU;RPS21;RPS24;RPS23,DEGs,GO_Biological_Process_2023
239,Macromolecule Biosynthetic Process (GO:0009059),52/183,1.331542e-28,1.150452e-25,0,0,9.386996,602.5142,RPL30;RPL32;RPL34;RPLP1;RPLP0;RPL10A;EEF1B2;RPS15;MRPL41;RPS14;RPS16;RPL18A;RPS19;RPS18;RPL36;RPL35;RPL39;RPS13;RPS9;RPS7;RPL22;RPS6;MRPS18C;TUFM;EEF1A1;RPL37A;RPL27;RPL26;RPL12;RPL11;SRBD1;MRPL12;MRPL11;RPS15A;TERT;POLD1;RPL14;RPS3;RPL13;RPL15;RPL18;RPS27A;RPL17;RPL41;RPL35A;RPS25;RPL27A;RPS20;FAU;RPS21;RPS24;RPS23,DEGs,GO_Biological_Process_2023
240,Peptide Biosynthetic Process (GO:0043043),48/158,6.106997e-28,3.957334e-25,0,0,10.279388,644.1367,RPL30;RPL32;RPL34;RPLP1;RPL12;RPLP0;RPL11;SRBD1;RPL10A;MRPL12;MRPL11;RPS15;MRPL41;RPS14;RPS16;RPS15A;RPL18A;RPS19;RPS18;RPL36;RPL14;RPS3;RPL35;RPL13;RPL15;RPL18;RPS27A;RPL39;RPL17;RPS13;RPS9;RPL41;RPS7;RPL22;RPS6;RPL35A;MRPS18C;EEF1A1;RPS25;RPL27A;RPL37A;RPL27;RPS20;RPL26;FAU;RPS21;RPS24;RPS23,DEGs,GO_Biological_Process_2023
241,Gene Expression (GO:0010467),53/296,5.066564e-19,2.626507e-16,0,0,5.133799,216.2687,RPL30;RPL32;RPL34;RPLP1;RPLP0;RPL10A;RPS15;MRPL41;RBM3;RPS14;HHEX;RPS16;RPL18A;RPS19;RPS18;MAGOH;RPL36;RPL35;RPL39;RPS13;DDX17;RPS9;RPS7;RPL22;RPS6;MRPS18C;EEF1A1;RPL37A;RPL27;RPL26;RPL12;RPL11;SRBD1;MRPL12;MRPL11;RPS15A;RPL14;RPS3;RPL13;RPL15;RPL18;RPS27A;RPL17;RPL41;RPL35A;RPS25;RPL27A;HNRNPA2B1;RPS20;FAU;RPS21;RPS24;RPS23,DEGs,GO_Biological_Process_2023
242,Aerobic Electron Transport Chain (GO:0019646),18/68,3.291575e-10,1.42196e-07,0,0,8.202673,179.1011,NDUFB9;COX7B;NDUFA7;NDUFB6;NDUFA5;NDUFA4;COX4I1;NDUFB4;NDUFB1;UQCR10;COX6A1;COX6C;COX6A2;UQCRH;NDUFS7;UQCRQ;NDUFS4;NDUFV2,DEGs,GO_Biological_Process_2023


In [57]:
# Per database, for the DEG rows only: number of rows (Total_Terms) and
# number of distinct term names (Unique_Terms)
database_term_counts <- summarise(
  group_by(DEGs, Database),
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary table
print(database_term_counts)

[90m# A tibble: 6 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Biological_Process_2023               40           40
[90m2[39m GO_Cellular_Component_2023               42           42
[90m3[39m GO_Molecular_Function_2023               10           10
[90m4[39m KEGG_2019_Mouse                           9            9
[90m5[39m RNAseq_DiseaseGene_DrugSigs_GEO         728          728
[90m6[39m Reactome_2016                           161          161


In [62]:
# Term names present in each result set
dmr_terms <- DMRs[["Term"]]
deg_terms <- DEGs[["Term"]]

# Terms that appear in both the DMR and the DEG results
overlapping_terms <- intersect(dmr_terms, deg_terms)

# Full DMR rows for the shared terms (statistics shown are the DMR-side ones)
shared_term_mask <- dmr_terms %in% overlapping_terms
overlapping_dmrs <- DMRs[shared_term_mask, ]

# Preview the shared rows and report how many there are
head(overlapping_dmrs)
print(nrow(overlapping_dmrs))

Unnamed: 0_level_0,Term,Overlap,P.value,Adjusted.P.value,Old.P.value,Old.Adjusted.P.value,Odds.Ratio,Combined.Score,Genes,Sample_Name,Database
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
6,Asymmetric Synapse (GO:0032279),18/133,8.268768e-06,0.001021193,0,0,3.725647,43.60134,NSF;GRIA1;NTRK2;DTNBP1;SHISA9;GRIN2B;SHISA6;GRIN2A;GAP43;DLG2;SIPA1L1;GRM7;NETO1;TMEM108;PTK2B;SPOCK1;RPL38;FYN,DMRs,GO_Cellular_Component_2023
95,ZNF2 NPC Knockdown GSE54112 down,62/496,4.242785e-15,9.199772e-13,0,0,3.537837,117.07962,ROBO2;SPON1;ATP8A1;RPLP1;CELF2;CELF4;SLC6A1;RND3;ELAVL2;AKAP12;SALL1;KIF5C;PSD3;PDE4B;RPL38;MAGI1;EBF2;GFRA1;PPP4R3B;ZEB1;DDAH1;SETBP1;RASA4;AKAP9;TMEM106B;TERF2IP;TRIB2;XPR1;ASTN1;ASAH1;CAMK2D;TNKS;ITPR2;FIGN;PIK3R1;SLIT1;SPOCK1;MEST;NTRK2;CADM1;AUTS2;GOLM1;SORT1;PBX3;FN1;LSAMP;SMAD9;SORBS1;MEIS2;PBX1;WDR11;COL3A1;MEIS1;CDK6;NFIA;NBEA;PTPRA;NOVA1;KLHDC8A;GNAQ;TCF4;ADGRL3,DMRs,RNAseq_DiseaseGene_DrugSigs_GEO
99,Ezh2 Embryonic cerebellum Knockout GSE80222 up,53/496,1.842375e-10,2.39693e-08,0,0,2.926622,65.59963,APP;ZFP521;TENM3;ATP8A1;RPLP1;CELF4;GRIK3;TTC28;SALL1;TRIM28;TRIM2;KHSRP;TLE3;RBFOX1;DST;PRKCE;GFRA1;FRMD4B;AXIN2;NAV2;TMEFF1;MAF;SETBP1;CDH13;PDE1C;ZFP703;SHOX2;AGAP1;PIK3R1;ATP1A1;APCDD1;ABLIM1;ERBB4;SLIT1;MEST;FZD1;TMEM132C;LINGO1;GOLM1;PTCH1;PBX3;GRIN2B;POU4F1;MEIS2;NELL2;MEIS1;LHX2;NOVA1;EIF3H;CTNNB1;FAT3;FAT4;SSBP3,DMRs,RNAseq_DiseaseGene_DrugSigs_GEO
113,Nova1 Brain - Cortex Knockout GSE69711 up,47/495,8.124164e-08,4.227815e-06,0,0,2.545716,41.56094,ROBO2;TENM2;ATP8A1;TNKS;CTTNBP2;DOCK9;PIK3R1;SLC7A11;TTC28;AKAP12;TRIM28;CALD1;ERBB4;TRIM2;NOCT;PAPOLA;MEST;CDK5RAP2;ZFP462;SOX5;NTRK2;EPHA4;SLC38A1;AGTPBP1;DST;ATRNL1;NTRK3;ARRDC3;FN1;FRMD4A;FRMD4B;MEIS2;PBX1;COL3A1;NBEA;MPPED1;KITL;HNRNPH1;AKAP9;EXOC4;G2E3;CTNNB1;FAT3;TCF4;FAT4;EPHA3;XPR1,DMRs,RNAseq_DiseaseGene_DrugSigs_GEO
114,Olig2 Brain tumor conditional Knockout GSE71493 up,47/495,8.124164e-08,4.227815e-06,0,0,2.545716,41.56094,CLIC5;ATP8A1;TNFAIP6;EPAS1;RPLP1;ITPR1;SLC6A1;NR3C1;CAMKK2;SALL1;SIPA1L1;KIF5C;CALD1;TRIM2;PSD3;TSPAN5;NRCAM;MEST;CDK5RAP2;FZD1;BCHE;NTRK2;DST;GOLM1;SORT1;PTCH1;SORCS1;ETV1;OLIG1;GRIN2B;SORCS2;DUSP6;ETV5;NELL2;ZEB1;DLG2;NFIA;DDAH1;ADAM12;ALDH1A1;TMEM106B;SPATS2L;CTNNB1;ITGA6;LRRN1;TCF4;RAPGEF4,DMRs,RNAseq_DiseaseGene_DrugSigs_GEO
130,Casz1 Heart Knockout GSE55394 up,43/497,3.417367e-06,0.000105857,0,0,2.285699,28.76927,HCN4;SPON1;NRP2;TENM3;TNKS;SHOX2;ITPR1;LPL;CACNA1D;HMGB1;RBPJ;HS6ST1;PCDH18;CALD1;TBX20;PAPOLA;BEND4;CORIN;ZFP462;FZD1;TLE3;CADM1;SPHKAP;MMD;SORT1;PTCH1;FN1;PAWR;FRMD4B;TBX5;PBX1;COL3A1;DIAPH3;DDAH1;KITL;HNRNPH1;CTNNB1;ITGA6;SIK1;BCAT1;SSBP3;ADGRL2;XPR1,DMRs,RNAseq_DiseaseGene_DrugSigs_GEO


[1] 34


In [63]:
# Per database, for the DMR rows whose term is also in the DEG results:
# number of rows (Total_Terms) and number of distinct term names
# (Unique_Terms)
database_term_counts <- summarise(
  group_by(overlapping_dmrs, Database),
  Total_Terms = n(),
  Unique_Terms = n_distinct(Term)
)

# Show the summary table
print(database_term_counts)

[90m# A tibble: 2 × 3[39m
  Database                        Total_Terms Unique_Terms
  [3m[90m<chr>[39m[23m                                 [3m[90m<int>[39m[23m        [3m[90m<int>[39m[23m
[90m1[39m GO_Cellular_Component_2023                1            1
[90m2[39m RNAseq_DiseaseGene_DrugSigs_GEO          33           33
