# Set Library Path

In [None]:
# Use the conda-managed R package library for this notebook session
conda_r_library <- "/share/korflab/home/viki/anaconda3/jupyter_nb/lib/R/library"
.libPaths(conda_r_library)

# Load Libraries

In [None]:
library(dplyr)
library(rtracklayer)
library(GenomicRanges)
library(tibble)
library(readr)
library(tidyr)
library(ggplot2)
library(readxl)

# Process RNA-seq Data

## Load Data

In [None]:
# Import the normalized per-gene counts produced by the DEG analysis
normalized_counts_path <- "rnaseq/05_gene_counts/normalized_counts.txt"
counts <- read.delim(normalized_counts_path)

# Preview the first six rows
head(counts, n = 6L)

In [None]:
# Copy the gene identifiers out of the row names into a one-column lookup
# table ("gene_names") so they can be joined to gene symbols
names <- setNames(as.data.frame(rownames(counts)), "gene_names")

# Reset the row names of counts to the default integer sequence
row.names(counts) <- NULL

# Preview
head(counts, n = 6L)

In [None]:
# Load the DEG table, keeping only the identifier and gene symbol columns.
# Duplicate identifiers are collapsed to their first entry so the join below
# cannot return more rows than `names`: `new_names` is later bound to `counts`
# by position, which is only valid for a one-to-one row match.
rnaseq <- read.csv("rnaseq/05_DEGs/control_vs_faexcess_genes.csv", header = TRUE) %>%
  dplyr::select("gene_names", "external_gene_name") %>%
  dplyr::distinct(gene_names, .keep_all = TRUE)

# Attach the gene symbol to every identifier, preserving the order of `names`
new_names <- names %>% dplyr::left_join(rnaseq, by = "gene_names")

# Fail early if the join changed the number of rows
stopifnot(nrow(new_names) == nrow(names))

# View
head(new_names)

In [None]:
# Prepend the identifier and gene symbol columns to the count table
counts <- cbind(new_names, counts)

# Keep only genes that were assigned an external_gene_name
has_symbol <- !is.na(counts$external_gene_name)
counts <- counts[has_symbol, ]

# Preview
head(counts, n = 6L)

In [None]:
# Use the gene symbol as the row name of each count row
row.names(counts) <- counts[["external_gene_name"]]

# Drop the two annotation columns, leaving only the per-sample counts
annotation_mask <- names(counts) %in% c("gene_names", "external_gene_name")
counts <- counts[, !annotation_mask]

# Preview the updated counts data frame
head(counts, n = 6L)

## Convert Gene Lengths to Kilobases for TPM Calculation

In [None]:
# Import the mm10 refGene annotation (GTF) with rtracklayer
gtf_file <- "/share/lasallelab/genomes/mm10/mm10.refGene.gtf"
gtf_data <- import(con = gtf_file)

# Preview
head(gtf_data, n = 6L)

In [None]:
# Restrict counts to genes that are present in the annotation file

# Distinct gene names found in the GTF
unique_genes <- unique(gtf_data$gene_name)

# Flag the count rows whose row name is an annotated gene, then subset
in_annotation <- rownames(counts) %in% unique_genes
counts <- counts[in_annotation, ]

# Preview and report how many genes remain
head(counts, n = 6L)
print(length(row.names(counts)))

In [None]:
# Calculate gene lengths as the number of exonic bases per gene

# Filter for exon features
exon_data <- gtf_data[gtf_data$type == "exon"]

# Group exons by gene and merge overlapping exons within each gene.
# Summing the raw exon widths counts a base once for every transcript whose
# exon covers it, which inflates the length of genes with several transcripts;
# reducing first counts every exonic base of a gene once.
exons_by_gene <- split(exon_data, exon_data$gene_id)
merged_exons <- reduce(exons_by_gene)

# Total merged exon width per gene (named by gene_id)
gene_exon_widths <- sum(width(merged_exons))

# One row per gene: gene_id and its exonic length in bases
gene_length_summary <- data.frame(
  gene_id = names(gene_exon_widths),
  length = as.integer(gene_exon_widths),
  stringsAsFactors = FALSE
)

# Print the gene lengths
head(gene_length_summary)
print(length(gene_length_summary$gene_id))

In [None]:
# Express each gene length in kilobases (bases / 1000) in a new column
gene_length_summary$length_kb <- gene_length_summary$length / 1000

head(gene_length_summary, n = 6L)

## Calculate Reads Per Kilobase

In [None]:
# Work on a plain data frame and expose the gene symbol as a join key
counts <- as.data.frame(counts)
counts$gene_id <- rownames(counts)

# Left-join the gene lengths onto the counts; all.x = TRUE keeps genes that
# have no matching length
merged_data <- merge(counts, gene_length_summary, by = "gene_id", all.x = TRUE)

# Every column except the key and the kilobase length is scaled:
# RPK = value / gene length in kilobases
count_columns <- setdiff(names(merged_data), c("gene_id", "length_kb"))
merged_data[count_columns] <- merged_data[count_columns] / merged_data$length_kb

# Restore gene symbols as row names, then drop the key and length columns so
# that only the per-sample RPK values remain
rownames(merged_data) <- merged_data$gene_id
helper_columns <- c("gene_id", "length_kb", "length")
merged_data <- merged_data[setdiff(names(merged_data), helper_columns)]

# Preview the RPK data
head(merged_data, n = 6L)

In [None]:
# Verify the RPK calculation for one gene by recomputing it from the inputs
gene_to_check <- "0610005C13Rik"

# Take only the sample columns: `counts` also carries the character gene_id
# key at this point, which would become NA under numeric conversion
sample_columns <- setdiff(names(counts), "gene_id")
specific_row <- counts[gene_to_check, sample_columns, drop = FALSE]

# Convert the row to a numeric vector
specific_row_vector <- as.numeric(unlist(specific_row))

# Look up this gene's length in kilobases instead of hard-coding it, so the
# check stays valid if the annotation or the length calculation changes
gene_length_kb <- gene_length_summary$length_kb[
  match(gene_to_check, gene_length_summary$gene_id)
]

# Divide every value in the row by the gene length (in kb)
expected_counts <- specific_row_vector / gene_length_kb

# Print the recomputed values and compare them with the stored RPK row
print(expected_counts)
print(all.equal(expected_counts, as.numeric(unlist(merged_data[gene_to_check, ]))))

## Calculate Total Reads Per Kilobase

In [None]:
# Calculate the total RPK for each sample.
# The gene-length merge keeps genes without a matching length (all.x = TRUE),
# and those rows hold NA RPK values. na.rm = TRUE stops a single such gene
# from turning a sample's total, and with it every TPM of that sample, into NA.
total_rpk <- colSums(merged_data, na.rm = TRUE)

## Calculate TPM per Sample

In [None]:
# TPM = (RPK / total RPK of the sample) * 1,000,000
# Step 1: divide each sample column by that sample's RPK total
rpk_fraction <- sweep(merged_data, MARGIN = 2, STATS = total_rpk, FUN = "/")

# Step 2: scale the fractions to "per million"
tpm <- rpk_fraction * 1e6

# Store the result as a data frame
tpm_df <- as.data.frame(tpm)

# Preview
head(tpm_df, n = 6L)

In [None]:
# Write the TPM table to disk, keeping the gene symbols (row names) as the
# first column
tpm_output_path <- "RNAseq_TPM_Values.csv"
write.csv(x = tpm_df, file = tpm_output_path, row.names = TRUE)

In [None]:
# List the sample (column) names of the TPM table
names(tpm_df)

# Process WGBS Data

## Annotate Genes

In [None]:
# Collect the merged CpG evidence coverage files (one per sample)
coverage_pattern <- "\\.deduplicated\\.bismark\\.cov\\.gz\\.CpG_report\\.merged_CpG_evidence\\.cov\\.gz$"
files <- list.files(
  path = "wgbs/08_cytosine_reports",
  pattern = coverage_pattern,
  full.names = TRUE
)

# Show which files were found
print(files)

In [None]:
# Import the mm10 refGene annotation (GTF) with rtracklayer
gtf_file <- "/share/lasallelab/genomes/mm10/mm10.refGene.gtf"
gtf_data <- import(con = gtf_file)

# Preview
head(gtf_data, n = 6L)

In [None]:
# Keep only the transcript-level records of the annotation
gtf_transcripts <- gtf_data[gtf_data$type == "transcript"]

# Build a GRanges object holding each transcript's coordinates together with
# its gene name (no strand is carried over)
transcript_ranges <- IRanges(
  start = start(gtf_transcripts),
  end = end(gtf_transcripts)
)
gr_genes <- GRanges(
  seqnames = seqnames(gtf_transcripts),
  ranges = transcript_ranges,
  gene_name = mcols(gtf_transcripts)$gene_name
)

# Preview
head(gr_genes, n = 6L)

In [None]:
# Annotate every coverage file with the gene its positions fall in and write
# one "<sample>_annotated_regions.csv" per input file
for (file in files) {
  # Load the gzipped, headerless coverage table (columns V1, V2, V3, ...)
  regions <- read.table(gzfile(file), header = FALSE, stringsAsFactors = FALSE)

  # Represent each row as a genomic interval:
  # V1 = sequence name, V2 = start, V3 = end
  region_ranges <- IRanges(start = regions$V2, end = regions$V3)
  gr_regions <- GRanges(seqnames = regions$V1, ranges = region_ranges)

  # Pair every region with the transcripts it overlaps
  overlaps <- findOverlaps(gr_regions, gr_genes)
  hit_rows <- queryHits(overlaps)
  hit_genes <- gr_genes$gene_name[subjectHits(overlaps)]

  # Start with no gene assigned, then fill in the overlapping gene names.
  # A region with several hits keeps the gene of its last hit, because later
  # assignments to the same row overwrite earlier ones.
  regions$gene_name <- NA
  regions$gene_name[hit_rows] <- hit_genes

  # Discard regions that did not overlap any transcript
  regions <- filter(regions, !is.na(gene_name))

  # Sample name = file name up to its first dot
  sample_basename <- sub("\\..*$", "", basename(file))

  # Write the annotated regions next to the input files
  output_directory <- "wgbs/08_cytosine_reports/"
  output_path <- paste0(output_directory, sample_basename, "_annotated_regions.csv")
  write.csv(regions, file = output_path, row.names = FALSE)

  # Report progress
  cat(sprintf("Regions have been assigned for %s...\n", sample_basename))
}

## Calculate Percent Methylation Per Gene

In [None]:
# Load the per-sample annotated region tables written by the annotation step.
# The pattern is restricted to *_annotated_regions.csv: the per-gene
# *_percent_methylated.csv summaries are written into this same directory and
# do not contain the V5/V6 count columns that the aggregation reads, so a
# plain "\\.csv$" pattern breaks as soon as this section is run a second time.
files <- list.files(path = "wgbs/08_cytosine_reports",
                    pattern = "_annotated_regions\\.csv$",
                    full.names = TRUE)

# View the list of files
print(files)

In [None]:
# Summarise every annotated region table to one row per gene and write it as
# "<sample>_percent_methylated.csv"
for (file in files) {
  # Load the annotated regions of one sample
  data <- read.csv(file, stringsAsFactors = FALSE)

  # Sum the V5 and V6 count columns within each gene (NA counts are ignored);
  # the grouping is dropped after summarising
  per_gene <- group_by(data, gene_name)
  gene_totals <- summarise(
    per_gene,
    methylated_cytosines = sum(V5, na.rm = TRUE),
    unmethylated_cytosines = sum(V6, na.rm = TRUE),
    .groups = "drop"
  )

  # Percent methylated = methylated / (methylated + unmethylated) * 100
  aggregated_data <- mutate(
    gene_totals,
    percent_methylated = (methylated_cytosines / (methylated_cytosines + unmethylated_cytosines)) * 100
  )

  # Output location
  output_directory <- "wgbs/08_cytosine_reports/"

  # Sample name = file name (without extension) up to its first underscore
  base_name <- tools::file_path_sans_ext(basename(file))
  name_parts <- strsplit(base_name, "_")[[1]]
  first_part <- name_parts[1]

  # Write the per-gene summary
  summary_path <- paste0(output_directory, first_part, "_percent_methylated.csv")
  write.csv(aggregated_data, file = summary_path, row.names = FALSE)
}

## Create Percent Methylation Table Comparable to TPM Table

In [None]:
# Collect the per-gene percent methylation summaries of the FA samples
summary_pattern <- "FA\\d+_percent_methylated\\.csv"
files <- list.files(
  path = "wgbs/08_cytosine_reports",
  pattern = summary_pattern,
  full.names = TRUE
)

# Show which files were found
print(files)

In [None]:
# Build one gene x sample table of percent methylation, starting empty
df <- data.frame()

for (file in files) {
  # Load one sample's per-gene summary
  data <- read.csv(file)

  # Drop the raw count columns; only gene_name and percent_methylated are needed
  count_columns <- c("methylated_cytosines", "unmethylated_cytosines")
  data <- data[setdiff(names(data), count_columns)]

  # Pull the sample name (FA followed by digits) out of the file path
  sample_name <- gsub(".*?(FA\\d+)_percent_methylated\\.csv", "\\1", file)

  # Label the percent_methylated column with the sample name
  names(data)[names(data) == "percent_methylated"] <- sample_name

  # The first table seeds the result; later tables are full-outer-joined on
  # gene_name so genes missing from a sample get NA
  df <- if (nrow(df) == 0) {
    data
  } else {
    merge(df, data, by = "gene_name", all = TRUE)
  }
}

# Preview
head(df, n = 6L)

In [None]:
# Use gene names as row names so the table has the same layout as the
# RNA-seq TPM table, then drop the now-redundant gene_name column
row.names(df) <- df[["gene_name"]]
df <- df[setdiff(names(df), "gene_name")]

# Preview
head(df, n = 6L)

In [None]:
# Write the percent methylation table to disk, keeping the gene names
# (row names) as the first column
methylation_output_path <- "WGBS_Percent_Methylation_Values.csv"
write.csv(x = df, file = methylation_output_path, row.names = TRUE)

# Integrate Data Frames

In [None]:
# Read the saved TPM table back in
rnaseq <- read.csv(file = "RNAseq_TPM_Values.csv")

# The gene names are in column "X"; move them into the row names
row.names(rnaseq) <- rnaseq[["X"]]
rnaseq[["X"]] <- NULL

# Preview
head(rnaseq, n = 6L)

In [None]:
# Read the saved percent methylation table back in
wgbs <- read.csv(file = "WGBS_Percent_Methylation_Values.csv")

# The gene names are in column "X"; move them into the row names
row.names(wgbs) <- wgbs[["X"]]
wgbs[["X"]] <- NULL

# Preview
head(wgbs, n = 6L)

In [None]:
# Lookup from RNA-seq sample names (vector names) to the matching WGBS
# sample names (vector values)
rnaseq_sample_ids <- c(
  "G1_1_1_4", "G1_1_1_5", "G1_1_1_6", "G1_1_1_7",
  "G1_1_2_4", "G1_1_2_5", "G1_1_2_6",
  "G2_2_6_5", "G2_2_6_6", "G2_2_7_10", "G2_2_7_11", "G2_2_7_7"
)
wgbs_sample_ids <- c(
  "FA114", "FA115", "FA116", "FA117",
  "FA124", "FA125", "FA126",
  "FA265", "FA266", "FA2710", "FA2711", "FA277"
)
sample_map <- setNames(wgbs_sample_ids, rnaseq_sample_ids)

In [None]:
# Record the current (RNA-seq) column names in a one-row table
new_row <- as.data.frame(t(colnames(rnaseq)))

# Set the column names for the new row
colnames(new_row) <- colnames(rnaseq)

# Show each RNA-seq sample name next to the WGBS name it maps to.
# The names are displayed in a separate table instead of being rbind-ed onto
# `rnaseq`: binding a character row onto the TPM table converts every TPM
# column to character.
print(data.frame(
  rnaseq_name = colnames(rnaseq),
  wgbs_name = unname(sample_map[colnames(rnaseq)]),
  stringsAsFactors = FALSE
))

# View
head(rnaseq)

In [None]:
# Translate the RNA-seq column names through the sample map; columns without
# an entry in the map get NA as their name (check the result visually)
mapped_names <- sample_map[colnames(rnaseq)]
colnames(rnaseq) <- mapped_names

# Preview
head(rnaseq, n = 6L)

In [None]:
# Drop the row named "1", which holds the original sample names
is_data_row <- rownames(rnaseq) != "1"
rnaseq <- rnaseq[is_data_row, ]

# Drop the columns whose name is NA (samples absent from the sample map)
has_name <- !is.na(colnames(rnaseq))
rnaseq <- rnaseq[, has_name]

# Preview
head(rnaseq, n = 6L)

In [None]:
# Report the number of genes (rows) in each table
print(length(row.names(rnaseq)))
print(length(row.names(wgbs)))

In [None]:
# Restrict wgbs to the genes that are also present in rnaseq
wgbs_in_rnaseq <- rownames(wgbs) %in% rownames(rnaseq)
wgbs <- wgbs[wgbs_in_rnaseq, ]

# Report how many genes remain
print(length(row.names(wgbs)))

In [None]:
# Restrict rnaseq to the genes that are also present in wgbs
rnaseq_in_wgbs <- rownames(rnaseq) %in% rownames(wgbs)
rnaseq <- rnaseq[rnaseq_in_wgbs, ]

# Report how many genes remain
print(length(row.names(rnaseq)))

In [None]:
# Check that both tables now describe the same genes: equal row counts ...
print(length(row.names(rnaseq)))
print(length(row.names(wgbs)))

# ... and every gene of one table present in the other
rnaseq_covered <- rownames(rnaseq) %in% rownames(wgbs)
wgbs_covered <- rownames(wgbs) %in% rownames(rnaseq)
print(all(rnaseq_covered))
print(all(wgbs_covered))

In [None]:
# Preview the first six rows of each table
head(rnaseq, n = 6L)
head(wgbs, n = 6L)

In [None]:
# Turn the gene row names of each table into a "Gene" key column
rnaseq_keyed <- rownames_to_column(rnaseq, var = "Gene")
wgbs_keyed <- rownames_to_column(wgbs, var = "Gene")

# Join the two tables on Gene, keeping genes present in both. Sample columns
# that share a name get the suffix .x (RNA-seq) or .y (WGBS).
merged_data <- inner_join(rnaseq_keyed, wgbs_keyed, by = "Gene")

# Preview
head(merged_data, n = 6L)

In [None]:
# Convert the RNA-seq (".x") columns to numeric and log2-transform them.
# A pseudocount of 1 is added before taking the log: the filter below keeps
# TPM values of exactly 0, and log2(0) is -Inf, which would then flow into the
# correlation and plotting steps. log2(x + 1) is finite for every value the
# filter keeps and preserves the rank order of the TPMs.
merged_data <- merged_data %>%
  mutate(across(ends_with(".x"), as.numeric)) %>%
  # Keep genes whose RNA-seq values are all present and non-negative
  filter(if_all(ends_with(".x"), ~ !is.na(.) & . >= 0)) %>%
  mutate(across(ends_with(".x"), ~ log2(. + 1)))

# Reformat: gene names become row names again
rownames(merged_data) <- merged_data$Gene
merged_data$Gene <- NULL

# View
head(merged_data)

# Calculate Correlations

In [None]:
# Named vector assigning every WGBS sample name to its experimental group
control_samples <- c("FA114", "FA115", "FA116", "FA117", "FA124", "FA125", "FA126")
fae_samples <- c("FA265", "FA266", "FA2710", "FA2711", "FA277")

mapping <- setNames(
  rep(c("Control", "FAE"), times = c(length(control_samples), length(fae_samples))),
  c(control_samples, fae_samples)
)

In [None]:
# Two-column lookup table: sample name and the group it belongs to
group_info <- data.frame(
  Sample = names(mapping),
  Group = mapping,
  stringsAsFactors = FALSE
)

# Show the full table
print(group_info)

In [None]:
# Put the gene names (row names) into a leading "Gene" column
merged_data <- cbind(Gene = rownames(merged_data), merged_data)

# One row per gene, sample and data type: column names such as "FA114.x" are
# split at the dot into Sample ("FA114") and Type ("x" or "y")
sample_values <- pivot_longer(
  merged_data,
  cols = -Gene,
  names_to = c("Sample", "Type"),
  names_sep = "\\.",
  values_to = "Value"
)

# Add each sample's group and drop samples that have no group assigned
grouped_values <- left_join(sample_values, group_info, by = "Sample")
long_data <- filter(grouped_values, !is.na(Group))

# Preview the table and list the distinct genes, samples, types and groups
head(long_data, n = 6L)
length(unique(long_data[["Gene"]]))
unique(long_data[["Sample"]])
unique(long_data[["Type"]])
unique(long_data[["Group"]])

In [None]:
# Spearman correlation and its p-value for one gene within one group.
# x, y: numeric vectors of equal length, paired by sample.
# Returns a one-row data frame with columns `correlation` and `p_value`; both
# are NA when fewer than two complete pairs remain or when either variable is
# constant (or has an undefined standard deviation) across the complete pairs.
spearman_by_group <- function(x, y) {
  # Test on complete pairs only, and apply the variance guard to exactly the
  # values that are tested
  paired <- !is.na(x) & !is.na(y)
  x <- x[paired]
  y <- y[paired]

  testable <- length(x) >= 2 && isTRUE(sd(x) > 0) && isTRUE(sd(y) > 0)
  if (!testable) {
    return(data.frame(correlation = NA_real_, p_value = NA_real_))
  }

  # cor.test() has no `use` argument; missing pairs were removed above
  result <- cor.test(x, y, method = "spearman")
  data.frame(correlation = unname(result$estimate), p_value = result$p.value)
}

# Calculate Spearman correlation coefficients and their significance for each
# gene by group, then spread the groups into columns named
# correlation_Correlation_<Group> and p_value_Correlation_<Group>
correlation_results <- long_data %>%
  filter(Type %in% c("x", "y")) %>%
  pivot_wider(names_from = Type, values_from = Value, names_prefix = "Type_") %>%
  group_by(Gene, Group) %>%
  summarise(spearman_by_group(Type_x, Type_y), .groups = "drop") %>%
  pivot_wider(names_from = Group, values_from = c(correlation, p_value), names_prefix = "Correlation_")

# View
head(correlation_results)

In [None]:
# Write the per-gene correlation table to disk (no row names column)
correlation_output_path <- "spearman_correlation_results.csv"
write.csv(x = correlation_results, file = correlation_output_path, row.names = FALSE)

In [None]:
# Keep genes whose correlation is significant (p < 0.05) in at least one group
significant_results <- filter(
  correlation_results,
  p_value_Correlation_Control < 0.05 | p_value_Correlation_FAE < 0.05
)

# Preview and count the significant genes
head(significant_results, n = 6L)
print(length(significant_results[["Gene"]]))

In [None]:
# Save the significant correlation results to a CSV file.
# This writes `significant_results` (the filtered table); writing
# `correlation_results` here would duplicate the unfiltered table under the
# "significant" file name.
write.csv(significant_results, file = "spearman_correlation_significant_results.csv", row.names = FALSE)

# Visualize Correlations

## Load Data

In [None]:
# Read the differential expression results (first line is the header)
degs_path <- "rnaseq/05_DEGs/faexcess_vs_control_genes.csv"
degs <- read.csv(degs_path, header = TRUE)

# Preview
head(degs, n = 6L)

In [None]:
# Read the annotated DMR table from the Excel workbook
dmrs_path <- "wgbs/08_cytosine_reports/DMRs/DMRs_annotated.xlsx"
dmrs <- read_excel(dmrs_path)

# Preview
head(dmrs, n = 6L)

## Significant DEGs Only

In [None]:
# Keep the DEGs with an adjusted p-value below 0.05
is_significant <- degs$adj.P.Val < 0.05
significant_DEGs <- degs[is_significant, ]

# Preview and count
head(significant_DEGs, n = 6L)
length(significant_DEGs[["external_gene_name"]])

In [None]:
# Keep the long-format rows that belong to significant DEGs
is_sig_gene <- long_data$Gene %in% significant_DEGs$external_gene_name
long_data_sigDEGs <- long_data[is_sig_gene, ]

# Preview the subset and list its distinct genes, samples, types and groups
head(long_data_sigDEGs, n = 6L)
length(unique(long_data_sigDEGs[["Gene"]]))
unique(long_data_sigDEGs[["Sample"]])
unique(long_data_sigDEGs[["Type"]])
unique(long_data_sigDEGs[["Group"]])

In [None]:
# Reshape for plotting: one row per gene and sample, with the RNA-seq value
# in Type_x and the WGBS value in Type_y; rows missing either value are dropped
paired_types <- filter(long_data_sigDEGs, Type %in% c("x", "y"))
wide_values <- pivot_wider(
  paired_types,
  names_from = Type,
  values_from = Value,
  names_prefix = "Type_"
)
plot_data <- filter(wide_values, !is.na(Type_x) & !is.na(Type_y))

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot, coloured by group.
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

## Significant DEGs Positively Correlated Only

In [None]:
# Keep the DEGs with an adjusted p-value below 0.05
is_significant <- degs$adj.P.Val < 0.05
significant_DEGs <- degs[is_significant, ]

# Preview and count
head(significant_DEGs, n = 6L)
length(significant_DEGs[["external_gene_name"]])

In [None]:
# Subset for genes whose expression/methylation correlation is positive in
# both groups. The coefficient columns created by the correlation step are
# named correlation_Correlation_<Group>; the shorter Correlation_<Group>
# names do not exist, so subsetting on them always returns zero rows.
# which() drops genes whose coefficient is NA in either group instead of
# returning all-NA rows for them.
rho_control <- correlation_results$correlation_Correlation_Control
rho_fae <- correlation_results$correlation_Correlation_FAE
positive_correlations <- correlation_results[which(rho_control > 0 & rho_fae > 0), ]

# View the subsetted results
head(positive_correlations)

In [None]:
# Keep the significant DEGs that are among the positively correlated genes
is_positive_gene <- significant_DEGs$external_gene_name %in% positive_correlations$Gene
significant_DEGs <- significant_DEGs[is_positive_gene, ]

# Preview and count
head(significant_DEGs, n = 6L)
length(significant_DEGs[["external_gene_name"]])

In [None]:
# Keep the long-format rows that belong to the selected significant DEGs
is_sig_gene <- long_data$Gene %in% significant_DEGs$external_gene_name
long_data_sigDEGs <- long_data[is_sig_gene, ]

# Preview the subset and list its distinct genes, samples, types and groups
head(long_data_sigDEGs, n = 6L)
length(unique(long_data_sigDEGs[["Gene"]]))
unique(long_data_sigDEGs[["Sample"]])
unique(long_data_sigDEGs[["Type"]])
unique(long_data_sigDEGs[["Group"]])

In [None]:
# Reshape for plotting: one row per gene and sample, with the RNA-seq value
# in Type_x and the WGBS value in Type_y; rows missing either value are dropped
paired_types <- filter(long_data_sigDEGs, Type %in% c("x", "y"))
wide_values <- pivot_wider(
  paired_types,
  names_from = Type,
  values_from = Value,
  names_prefix = "Type_"
)
plot_data <- filter(wide_values, !is.na(Type_x) & !is.na(Type_y))

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_positively_correlated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot, coloured by group.
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

## Significant DEGs Negatively Correlated Only

In [None]:
# Keep the DEGs with an adjusted p-value below 0.05
is_significant <- degs$adj.P.Val < 0.05
significant_DEGs <- degs[is_significant, ]

# Preview and count
head(significant_DEGs, n = 6L)
length(significant_DEGs[["external_gene_name"]])

In [None]:
# Subset for genes whose expression/methylation correlation is negative in
# both groups. The coefficient columns created by the correlation step are
# named correlation_Correlation_<Group>; the shorter Correlation_<Group>
# names do not exist, so subsetting on them always returns zero rows.
# which() drops genes whose coefficient is NA in either group instead of
# returning all-NA rows for them.
rho_control <- correlation_results$correlation_Correlation_Control
rho_fae <- correlation_results$correlation_Correlation_FAE
negative_correlations <- correlation_results[which(rho_control < 0 & rho_fae < 0), ]

# View the subsetted results
head(negative_correlations)

In [None]:
# Keep the significant DEGs that are among the negatively correlated genes
is_negative_gene <- significant_DEGs$external_gene_name %in% negative_correlations$Gene
significant_DEGs <- significant_DEGs[is_negative_gene, ]

# Preview and count
head(significant_DEGs, n = 6L)
length(significant_DEGs[["external_gene_name"]])

In [None]:
# Keep the long-format rows that belong to the selected significant DEGs
is_sig_gene <- long_data$Gene %in% significant_DEGs$external_gene_name
long_data_sigDEGs <- long_data[is_sig_gene, ]

# Preview the subset and list its distinct genes, samples, types and groups
head(long_data_sigDEGs, n = 6L)
length(unique(long_data_sigDEGs[["Gene"]]))
unique(long_data_sigDEGs[["Sample"]])
unique(long_data_sigDEGs[["Type"]])
unique(long_data_sigDEGs[["Group"]])

In [None]:
# Reshape for plotting: one row per gene and sample, with the RNA-seq value
# in Type_x and the WGBS value in Type_y; rows missing either value are dropped
paired_types <- filter(long_data_sigDEGs, Type %in% c("x", "y"))
wide_values <- pivot_wider(
  paired_types,
  names_from = Type,
  values_from = Value,
  names_prefix = "Type_"
)
plot_data <- filter(wide_values, !is.na(Type_x) & !is.na(Type_y))

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_negatively_correlated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot, coloured by group.
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

## Significant Upregulated DEGs Only

In [None]:
# Keep the significant DEGs (adjusted p < 0.05) that are upregulated (logFC > 0)
is_sig_up <- degs$adj.P.Val < 0.05 & degs$logFC > 0
significant__upregulated_DEGs <- degs[is_sig_up, ]

# Preview and count
head(significant__upregulated_DEGs, n = 6L)
length(significant__upregulated_DEGs[["external_gene_name"]])

In [None]:
# Keep the long-format rows that belong to significantly upregulated DEGs
is_up_gene <- long_data$Gene %in% significant__upregulated_DEGs$external_gene_name
long_data_sigUpDEGs <- long_data[is_up_gene, ]

# Preview the subset and list its distinct genes, samples, types and groups
head(long_data_sigUpDEGs, n = 6L)
length(unique(long_data_sigUpDEGs[["Gene"]]))
unique(long_data_sigUpDEGs[["Sample"]])
unique(long_data_sigUpDEGs[["Type"]])
unique(long_data_sigUpDEGs[["Group"]])

In [None]:
# Reshape for plotting: one row per gene and sample, with the RNA-seq value
# in Type_x and the WGBS value in Type_y; rows missing either value are dropped
paired_types <- filter(long_data_sigUpDEGs, Type %in% c("x", "y"))
wide_values <- pivot_wider(
  paired_types,
  names_from = Type,
  values_from = Value,
  names_prefix = "Type_"
)
plot_data <- filter(wide_values, !is.na(Type_x) & !is.na(Type_y))

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_upregulated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot, coloured by group.
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

## Significant Downregulated DEGs Only

In [None]:
# Keep the significant DEGs (adjusted p < 0.05) that are downregulated (logFC < 0)
is_sig_down <- degs$adj.P.Val < 0.05 & degs$logFC < 0
significant_downregulated_DEGs <- degs[is_sig_down, ]

# Preview and count
head(significant_downregulated_DEGs, n = 6L)
length(significant_downregulated_DEGs[["external_gene_name"]])

In [None]:
# Keep the long-format rows that belong to significantly downregulated DEGs
is_down_gene <- long_data$Gene %in% significant_downregulated_DEGs$external_gene_name
long_data_sigDownDEGs <- long_data[is_down_gene, ]

# Preview the subset and list its distinct genes, samples, types and groups
head(long_data_sigDownDEGs, n = 6L)
length(unique(long_data_sigDownDEGs[["Gene"]]))
unique(long_data_sigDownDEGs[["Sample"]])
unique(long_data_sigDownDEGs[["Type"]])
unique(long_data_sigDownDEGs[["Group"]])

In [None]:
# Reshape for plotting: one row per gene and sample, with the RNA-seq value
# in Type_x and the WGBS value in Type_y; rows missing either value are dropped
paired_types <- filter(long_data_sigDownDEGs, Type %in% c("x", "y"))
wide_values <- pivot_wider(
  paired_types,
  names_from = Type,
  values_from = Value,
  names_prefix = "Type_"
)
plot_data <- filter(wide_values, !is.na(Type_x) & !is.na(Type_y))

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_significant_downregulated_DEGs.pdf", width = 16, height = 12)

# Create a combined scatter plot, coloured by group.
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

## Overlapping Significant DEGs and DMRs

In [None]:
# Keep the DEGs with an adjusted p-value below 0.05
is_significant <- degs$adj.P.Val < 0.05
significant_DEGs <- degs[is_significant, ]

# Preview and count
head(significant_DEGs, n = 6L)
length(significant_DEGs[["external_gene_name"]])

In [None]:
# Keep the DMRs with a p-value below 0.05
is_significant_dmr <- dmrs$p.value < 0.05
significant_DMRs <- dmrs[is_significant_dmr, ]

# Preview and count
head(significant_DMRs, n = 6L)
length(significant_DMRs[["geneSymbol"]])

In [None]:
# Subset significant_DEGs to the genes that also carry a significant DMR.
# The comparison uses `significant_DMRs` (p.value < 0.05); matching against
# the full `dmrs` table would also keep DEGs whose only DMR is not significant.
significant_DEGs <- significant_DEGs[significant_DEGs$external_gene_name %in% significant_DMRs$geneSymbol, ]

# View
head(significant_DEGs)
length(significant_DEGs$external_gene_name)

# See overlapping genes
print(unique(significant_DEGs$external_gene_name))

In [None]:
# Keep the long-format rows that belong to the selected significant DEGs
is_sig_gene <- long_data$Gene %in% significant_DEGs$external_gene_name
long_data_sigDEGs <- long_data[is_sig_gene, ]

# Preview the subset and list its distinct genes, samples, types and groups
head(long_data_sigDEGs, n = 6L)
length(unique(long_data_sigDEGs[["Gene"]]))
unique(long_data_sigDEGs[["Sample"]])
unique(long_data_sigDEGs[["Type"]])
unique(long_data_sigDEGs[["Group"]])

In [None]:
# Reshape for plotting: one row per gene and sample, with the RNA-seq value
# in Type_x and the WGBS value in Type_y; rows missing either value are dropped
paired_types <- filter(long_data_sigDEGs, Type %in% c("x", "y"))
wide_values <- pivot_wider(
  paired_types,
  names_from = Type,
  values_from = Value,
  names_prefix = "Type_"
)
plot_data <- filter(wide_values, !is.na(Type_x) & !is.na(Type_y))

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_overlapping_DEGs_and_DMRs_labeled.pdf", width = 20, height = 12)

# Create a combined scatter plot, coloured by group, with every point
# labelled by its gene name.
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  geom_text(aes(label = Gene), vjust = -1, size = 2.5) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

In [None]:
# Initialize PDF
pdf("scatter_plot_RNAseq_vs_WGBS_overlapping_DEGs_and_DMRs.pdf", width = 8, height = 6)

# Create a combined scatter plot, coloured by group (no gene labels).
# Type_y is the WGBS percent methylation and Type_x the RNA-seq value, which
# was log2-transformed when the two tables were merged, so the y-axis is
# labelled as log2-transformed rather than as raw TPM.
p <- ggplot(plot_data, aes(x = Type_y, y = Type_x, color = Group)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot of RNA-seq TPMs vs. WGBS Percent Methylation",
       x = "WGBS Percent Methylation",
       y = "RNA-seq TPMs (log2-transformed)") +
  scale_color_manual(values = c("Control" = "blue", "FAE" = "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

# View
print(p)

# Close the PDF device
dev.off()

# Assess Significance of DEG and DMR Overlap

In [None]:
# Read the differential expression results (first line is the header)
degs_path <- "rnaseq/05_DEGs/faexcess_vs_control_genes.csv"
degs <- read.csv(degs_path, header = TRUE)

# Keep the DEGs with an adjusted p-value below 0.05
is_significant <- degs$adj.P.Val < 0.05
degs <- degs[is_significant, ]

# Preview
head(degs, n = 6L)

In [None]:
# Read the annotated DMR table from the Excel workbook
dmrs_path <- "wgbs/08_cytosine_reports/DMRs/DMRs_annotated.xlsx"
dmrs <- read_excel(dmrs_path)

# Keep the DMRs with a p-value below 0.05
is_significant_dmr <- dmrs$p.value < 0.05
dmrs <- dmrs[is_significant_dmr, ]

# Preview
head(dmrs, n = 6L)

## Make Contingency Table for Fisher's Exact Test

In [None]:
# Number of genes that are both a significant DEG and in a significant DMR
shared_genes <- intersect(degs$external_gene_name, dmrs$geneSymbol)
overlap_count <- length(shared_genes)

In [None]:
# Number of significant DEGs that are not in a significant DMR
deg_only_genes <- setdiff(degs$external_gene_name, dmrs$geneSymbol)
b <- length(deg_only_genes)

In [None]:
# Number of significant-DMR genes that are not a significant DEG
dmr_only_genes <- setdiff(dmrs$geneSymbol, degs$external_gene_name)
c <- length(dmr_only_genes)

In [None]:
# Background: all distinct gene names in the annotation
annotated_genes <- unique(gtf_data$gene_name)
total_genes <- length(annotated_genes)

# Genes that are neither a significant DEG nor in a significant DMR
d <- total_genes - overlap_count - b - c

In [None]:
# Create the 2 x 2 contingency table for Fisher's exact test.
# matrix() fills column by column, so the counts land as:
#   column 1 = significant DEGs      (overlap_count, b)
#   column 2 = not significant DEGs  (c, d)
#   row 1    = in a significant DMR  (overlap_count, c)
#   row 2    = not in a DMR          (b, d)
# The dimension labels follow that layout; the cell counts are unchanged.
contingency_table <- matrix(
  c(overlap_count, b, c, d),
  nrow = 2,
  dimnames = list(
    DMR = c("In DMRs", "Not in DMRs"),
    DEG = c("In DEGs", "Not in DEGs")
  )
)

# View
print(contingency_table)

In [None]:
# Test the DEG / DMR overlap with Fisher's exact test on the 2 x 2 table
fisher_result <- fisher.test(x = contingency_table)

# Show the test result
print(fisher_result)

## Assess Significance of Correlations

In [None]:
# Read the saved per-gene Spearman correlation table back in
correlation_results <- read.csv(file = "spearman_correlation_results.csv")

# Preview
head(correlation_results, n = 6L)

In [None]:
# Identify significant correlations (p < 0.05) for each condition.
# A missing p-value is treated as "not significant". Subsetting rows directly
# with `p < 0.05` returns an all-NA row for every NA p-value, which would
# inflate every count printed below.
p_control <- correlation_results$p_value_Correlation_Control
p_fae <- correlation_results$p_value_Correlation_FAE
sig_control <- !is.na(p_control) & p_control < 0.05
sig_fae <- !is.na(p_fae) & p_fae < 0.05

control_total <- correlation_results[sig_control, ]
control_only <- correlation_results[sig_control & !sig_fae, ]
FAE_total <- correlation_results[sig_fae, ]
FAE_only <- correlation_results[sig_fae & !sig_control, ]
both <- correlation_results[sig_control & sig_fae, ]
either <- correlation_results[sig_control | sig_fae, ]

# Print the number of significant genes for each condition
print(length(control_total$Gene))
print(length(control_only$Gene))
print(length(FAE_total$Gene))
print(length(FAE_only$Gene))
print(length(both$Gene))
print(length(either$Gene))

In [None]:
# Genes of interest to look up among the significant correlations
genes_to_check <- c(
  "Naa20", "Med10", "Epb41l4a", "Katnal2", "D3Ertd751e",
  "Ccdc93", "Itga6", "Tacr3", "Cald1", "Ccl17",
  "Atp2c1", "Asprv1", "Kcnk10", "Fau", "Egr1",
  "Mest", "Fbh1", "Syt10", "Urm1", "Arrdc3"
)

# Keep the genes of interest that are significant in at least one group
is_in_either <- genes_to_check %in% either$Gene
genes_in_either <- genes_to_check[is_in_either]

# Show the genes that were found
print(genes_in_either)