## Set Library Path

In [1]:
# Put the conda-managed R 4.3 package library first on the search path so the
# library() calls below resolve against it.
.libPaths(
  new = "/share/korflab/home/viki/anaconda3/jupyter_nb_R4.3/lib/R/library"
)

## Load Libraries

In [3]:
library(openxlsx)
library(readxl)
library(dplyr)
library(glue)

## Read in Data

In [4]:
# Read in RNA-seq data
rnaseq <- read.csv("05_DEGs/faexcess_vs_control_genes.csv")

# Filter to significant genes only (adjusted p-value < 0.05).
# which() keeps only rows where the comparison is TRUE. Indexing with the bare
# logical vector would insert an all-NA row for every gene whose adj.P.Val is
# NA, and those phantom rows would then flow into the overlap analysis.
rnaseq <- rnaseq[which(rnaseq$adj.P.Val < 0.05), ]

# View
head(rnaseq)

Unnamed: 0_level_0,gene_names,external_gene_name,X,logFC,AveExpr,t,P.Value,adj.P.Val,B,entrez_gene_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
6,ENSMUSG00000000056,Narf,ENSMUSG00000000056,0.5745014,6.264568,5.302825,4.478946e-05,0.01708118,2.2039706,67608
15,ENSMUSG00000000127,Fer,ENSMUSG00000000127,0.6462261,3.584868,3.784648,0.001309626,0.0434303,-0.8852651,14158
37,ENSMUSG00000000295,Hddc2,ENSMUSG00000000295,-0.7434656,6.007363,-5.565035,2.549262e-05,0.01708118,2.744937,69692
68,ENSMUSG00000000555,Itga5,ENSMUSG00000000555,0.9858357,2.035853,4.689121,0.0001721743,0.02023809,0.8528376,16402
75,ENSMUSG00000000581,C1d,ENSMUSG00000000581,-0.5972541,8.032868,-4.401301,0.0003271859,0.0265197,0.2916874,57316
90,ENSMUSG00000000740,Rpl13,ENSMUSG00000000740,-0.4829455,10.772814,-4.353117,0.0003644852,0.02763054,0.1959312,270106


In [5]:
# Load the annotated DMR table produced by the WGBS analysis
wgbs <- read_excel(
  path = "/share/lasallelab/Viki/epigenerator/10_DMRichR_SexCombined/DMRs/DMRs_annotated.xlsx"
)

# Preview the first rows
head(wgbs)

chr,start,end,width,CpGs,betaCoefficient,statistic,p.value,q.value,direction,⋯,CpG.Island,CpG.Shore,CpG.Shelf,Open.Sea,annotation,geneId,distanceToTSS,ENSEMBL,geneSymbol,gene
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>
chr2,87525906,87526271,366,27,-0.7075803,-8.439746,0.0001484955,0.8146332,Hypomethylated,⋯,Yes,Yes,No,No,Exon,110511,-5525,ENSMUSG00000061520,Or5w22,olfactory receptor family 5 subfamily W member 22
chr5,131236777,131237235,459,9,0.6906494,7.767359,0.0004611176,0.8146332,Hypermethylated,⋯,No,No,No,Yes,Intron,212996,71262,ENSMUSG00000034040,Galnt17,polypeptide N-acetylgalactosaminyltransferase 17
chr7,92448940,92449667,728,8,0.6094742,7.635594,0.0005392732,0.8146332,Hypermethylated,⋯,No,No,No,Yes,3' UTR,23859,1971709,ENSMUSG00000052572,Dlg2,discs large MAGUK scaffold protein 2
chr4,24077950,24078197,248,5,0.7599784,7.631384,0.0005392732,0.8146332,Hypermethylated,⋯,No,No,No,Yes,Distal Intergenic,212377,-418254,ENSMUSG00000045751,Mms22l,"MMS22-like, DNA repair protein"
chr9,118085608,118085943,336,7,0.8877527,7.520019,0.0006017976,0.8146332,Hypermethylated,⋯,No,No,No,Yes,Intron,67899,64234,ENSMUSG00000039163,Cmc1,COX assembly mitochondrial protein 1
chr17,74424279,74424751,473,13,-0.6255215,-7.46747,0.0006799531,0.8146332,Hypomethylated,⋯,No,No,No,Yes,Downstream,210148,14453,ENSMUSG00000024069,Slc30a6,"solute carrier family 30 (zinc transporter), member 6"


## Find Intersections

In [6]:
# Find the genes present in both data sets, matched on gene symbol
overlapping_genes <- intersect(rnaseq$external_gene_name, wgbs$geneSymbol)

# Drop missing or blank symbols: intersect() treats NA (and "") as ordinary
# values, so unannotated rows in both tables would otherwise be reported as a
# shared "gene" and inflate the count.
overlapping_genes <- overlapping_genes[
  !is.na(overlapping_genes) & nzchar(overlapping_genes)
]

# Get the total number of overlapping genes
total_overlapping <- length(overlapping_genes)

# Print the results
cat("Total number of overlapping genes:", total_overlapping, "\n")
cat("Overlapping genes:\n", paste(overlapping_genes, collapse = ", "), "\n")

Total number of overlapping genes: 20 
Overlapping genes:
 Naa20, Med10, Epb41l4a, Katnal2, D3Ertd751e, Ccdc93, Itga6, Tacr3, Cald1, Ccl17, Atp2c1, Asprv1, Kcnk10, Fau, Egr1, Mest, Fbh1, Syt10, Urm1, Arrdc3 


In [7]:
# Build one summary row per overlapping gene, combining the RNA-seq statistics
# with the annotation of the gene's DMR, then write the table to CSV.
#
# Inputs (created in earlier cells): overlapping_genes, rnaseq, wgbs.
# Output: `results` data frame and "overlapping_genes_analysis.csv".

# Fail early, with a readable message, if an expected column is absent
rnaseq_cols <- c("external_gene_name", "logFC", "adj.P.Val")
wgbs_cols <- c(
  "geneSymbol", "gene", "difference", "direction",
  "CpG.Island", "CpG.Shelf", "Open.Sea", "annotation"
)
missing_cols <- c(
  setdiff(rnaseq_cols, names(rnaseq)),
  setdiff(wgbs_cols, names(wgbs))
)
if (length(missing_cols) > 0) {
  stop(
    "Missing expected column(s): ", paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

rna_symbols <- rnaseq[["external_gene_name"]]
dmr_symbols <- wgbs[["geneSymbol"]]

# Row of the FIRST entry for each gene in each table. match() ignores rows
# whose symbol is NA; subsetting with `symbol == gene` instead returns an
# all-NA row for every missing symbol, which could be picked up in place of
# the real record.
rna_idx <- match(overlapping_genes, rna_symbols)
dmr_idx <- match(overlapping_genes, dmr_symbols)

# Only one row per gene fits in the summary, so say explicitly when
# additional RNA-seq rows or DMRs for a gene are being left out.
multi_rna <- overlapping_genes[
  overlapping_genes %in% rna_symbols[duplicated(rna_symbols)]
]
multi_dmr <- overlapping_genes[
  overlapping_genes %in% dmr_symbols[duplicated(dmr_symbols)]
]
if (length(multi_rna) > 0) {
  message(
    "Multiple RNA-seq rows found; only the first is reported for: ",
    paste(multi_rna, collapse = ", ")
  )
}
if (length(multi_dmr) > 0) {
  message(
    "Multiple DMRs found; only the first is reported for: ",
    paste(multi_dmr, collapse = ", ")
  )
}

# Assemble the results in one vectorised step (also valid for zero genes)
results <- data.frame(
  Gene_Name = overlapping_genes,
  Gene_Function = wgbs[["gene"]][dmr_idx],
  RNAseqlogfoldchange = rnaseq[["logFC"]][rna_idx],
  RNAseqadjpval = rnaseq[["adj.P.Val"]][rna_idx],
  WGBSpercentmeth = wgbs[["difference"]][dmr_idx],
  WGBSdirection = wgbs[["direction"]][dmr_idx],
  WGBScpgisland = wgbs[["CpG.Island"]][dmr_idx],
  WGBScpgshelf = wgbs[["CpG.Shelf"]][dmr_idx],
  WGBSopensea = wgbs[["Open.Sea"]][dmr_idx],
  WGBSannot = wgbs[["annotation"]][dmr_idx],
  stringsAsFactors = FALSE
)

# Write the results to a CSV file
write.csv(results, "overlapping_genes_analysis.csv", row.names = FALSE)