In [None]:
# Install the Bioconductor packages this notebook relies on, but only the
# ones that are not already available. Calling BiocManager::install()
# unconditionally re-checks (and may re-install/upgrade) packages on every
# run of the cell, which is slow and can change package versions unexpectedly.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# GenomicFeatures is attached below with library() and rtracklayer is called
# via `::` further down, so both are listed alongside the annotation packages.
bioc_pkgs <- c(
  "GenomicFeatures",
  "rtracklayer",
  "ChIPpeakAnno",
  "ChIPseeker",
  "clusterProfiler"
)
bioc_available <- vapply(
  bioc_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
bioc_missing <- bioc_pkgs[!bioc_available]
if (length(bioc_missing) > 0) {
  BiocManager::install(bioc_missing)
}

library(GenomicFeatures)
library(ChIPpeakAnno)
library(ChIPseeker)
library(clusterProfiler)

In [None]:
# Fetch the GENCODE M25 mouse annotation (once) and derive gene-level ranges.
#
# The archive is only downloaded when it is not already present. It is first
# written to a ".part" file and renamed after wget reports success, so an
# interrupted transfer can never be mistaken for a complete GTF on a later run.
gtf_url <- "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz"
gtf_path <- "gencode.vM25.annotation.gtf.gz"

if (!file.exists(gtf_path)) {
  gtf_partial <- paste0(gtf_path, ".part")
  # With method = "wget", download.file() returns the external program's exit
  # status instead of raising an error, so it has to be checked explicitly.
  wget_status <- download.file(gtf_url, destfile = gtf_partial, method = "wget")
  if (wget_status != 0) {
    unlink(gtf_partial)
    stop(
      "Download of ", gtf_url, " failed (wget exit status ", wget_status, ")",
      call. = FALSE
    )
  }
  if (!file.rename(gtf_partial, gtf_path)) {
    stop("Could not move ", gtf_partial, " to ", gtf_path, call. = FALSE)
  }
}

txdb <- makeTxDbFromGFF(gtf_path)

# For TxDb input the feature type is chosen with `feature`; `format` is the
# file-format selector used when reading BED/GFF/etc. files.
annoData <- toGRanges(txdb, feature = "gene")
annoData[1:2]

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
"The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored."
OK



GRanges object with 2 ranges and 0 metadata columns:
                        seqnames              ranges strand
                           <Rle>           <IRanges>  <Rle>
   ENSMUSG00000000001.4     chr3 108107280-108146146      -
  ENSMUSG00000000003.15     chrX   77837901-77853623      -
  -------
  seqinfo: 22 sequences (1 circular) from an unspecified genome; no seqlengths

In [None]:
# Read the quadruplex + SIDD (slop 500) peak set from BED into a GRanges
# object and show its first two ranges.
peak_bed <- 'mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop500.bed'
peaks <- toGRanges(peak_bed, format = "BED")
peaks[seq_len(2)]

duplicated or NA names found. Rename all the names by numbers.



GRanges object with 2 ranges and 1 metadata column:
        seqnames          ranges strand |     score
           <Rle>       <IRanges>  <Rle> | <numeric>
  X0001     chr1 6886784-6886808      + |         0
  X0002     chr1 7524225-7524272      + |         0
  -------
  seqinfo: 21 sequences from an unspecified genome; no seqlengths

In [None]:
# TxDb used by annotatePeak() further down. Parsing the GTF is expensive, so
# reuse the `txdb` object when an earlier cell has already built it from the
# same file; otherwise build it here so this cell still works on its own.
txdb_all_annotation <- if (exists("txdb") && inherits(txdb, "TxDb")) {
  txdb
} else {
  makeTxDbFromGFF('gencode.vM25.annotation.gtf.gz')
}

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
"The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored."
OK



In [None]:
# Build a gene_id -> gene_name lookup table from the GENCODE GTF.
# Approach taken from https://www.biostars.org/p/140471/
gtf.file <- "gencode.vM25.annotation.gtf.gz"
gtf.gr <- rtracklayer::import(gtf.file) # GTF records as a GRanges object
gtf.df <- as.data.frame(gtf.gr)

# Keep the first row for every distinct (gene_id, gene_name) pair.
id_name_cols <- c("gene_id", "gene_name")
is_repeat <- duplicated(gtf.df[, id_name_cols])
genes <- gtf.df[!is_repeat, id_name_cols]
genes

Unnamed: 0_level_0,gene_id,gene_name
Unnamed: 0_level_1,<chr>,<chr>
1,ENSMUSG00000102693.1,4933401J01Rik
4,ENSMUSG00000064842.1,Gm26206
7,ENSMUSG00000051951.5,Xkr4
25,ENSMUSG00000102851.1,Gm18956
28,ENSMUSG00000103377.1,Gm37180
31,ENSMUSG00000104017.1,Gm37363
34,ENSMUSG00000103025.1,Gm37686
37,ENSMUSG00000089699.1,Gm1992
41,ENSMUSG00000103201.1,Gm37329
44,ENSMUSG00000103147.1,Gm7341


In [None]:
library(dplyr)
library(stringr)

# BED files to annotate, as a list of file names in processing order:
#   1. one file per structure,
#   2. the "_miRNA" variant of each structure,
#   3. the "_slop<width>" variants for widths 100, 200 and 500
#      (every structure except sidd itself carries an "_and_sidd" infix).
ssdna_list <- local({
  prefix <- "mm10_kouzine_actb_ssdna_enriched_"
  structures <- c("z-dna", "sidd", "quadruplex", "h-dna")
  slop_stems <- c(
    "z-dna_and_sidd",
    "sidd",
    "quadruplex_and_sidd",
    "h-dna_and_sidd"
  )
  slop_widths <- c("100", "200", "500")

  suffixes <- c(
    structures,
    paste0(structures, "_miRNA"),
    # Stems vary fastest so that all files of one width stay together.
    paste0(
      rep(slop_stems, times = length(slop_widths)),
      "_slop",
      rep(slop_widths, each = length(slop_stems))
    )
  )

  as.list(paste0(prefix, suffixes, ".bed"))
})


# Collect promoter-associated genes for every peak set in `ssdna_list`.
#
# For each BED file the peaks are annotated with ChIPseeker against
# `txdb_all_annotation` using a TSS window of -3000..+3000. Rows whose
# annotation contains "Promoter" are kept and reduced to one row per geneId
# (the first one encountered). Each row is tagged with a `group` label taken
# from the file name (the part after "enriched_", without the ".bed"
# extension) and joined to the `genes` lookup to attach gene_name. The
# per-file tables are stacked into `df_final`, which is written to
# 'geneIds_by_group.tsv' and displayed.

# Fail before the slow annotation work starts if an input file is missing.
ssdna_paths <- as.character(unlist(ssdna_list))
ssdna_missing <- ssdna_paths[!file.exists(ssdna_paths)]
if (length(ssdna_missing) > 0) {
  stop(
    "Peak file(s) not found: ", paste(ssdna_missing, collapse = ", "),
    call. = FALSE
  )
}

# One slot per input file; the tables are bound once after the loop instead
# of re-copying a growing data frame on every iteration.
res_by_group <- vector("list", length(ssdna_list))

for (i in seq_along(ssdna_list)) {
  ssdna <- ssdna_list[[i]]

  print('--------------------------------------------')
  print(ssdna)
  print('--------------------------------------------')
  peakAnno <- annotatePeak(
    ssdna,
    tssRegion = c(-3000, 3000),
    TxDb = txdb_all_annotation
  )
  print(peakAnno)

  # as.data.frame() is ChIPseeker's conversion for csAnno objects; it avoids
  # reaching into the S4 slot directly.
  anno_df <- as.data.frame(peakAnno)
  is_promoter <- grepl("Promoter", anno_df$annotation)
  res_df <- anno_df[is_promoter, c("annotation", "geneId"), drop = FALSE]
  res_unique_genes_df <- distinct(res_df, geneId, .keep_all = TRUE)

  # Strip only a literal, trailing ".bed"; an unescaped '.bed' pattern would
  # match any character followed by "bed" anywhere in the label.
  group_label <- str_replace(
    str_split(ssdna, "enriched_")[[1]][2],
    "\\.bed$",
    ""
  )
  # rep() keeps the assignment valid when a peak set has no promoter hits
  # (assigning a length-1 value to a zero-row data frame is an error).
  res_unique_genes_df$group <- rep(group_label, nrow(res_unique_genes_df))

  res_unique_genes_df <- merge(
    res_unique_genes_df,
    genes,
    by.x = "geneId",
    by.y = "gene_id"
  )

  res_by_group[[i]] <- res_unique_genes_df

  print(dim(res_unique_genes_df))
}

df_final <- bind_rows(res_by_group)

write.table(
  df_final,
  'geneIds_by_group.tsv',
  append = FALSE,
  sep = "\t",
  dec = ".",
  row.names = FALSE,
  col.names = TRUE
)
df_final


[1] "--------------------------------------------"
[1] "mm10_kouzine_actb_ssdna_enriched_z-dna.bed"
[1] "--------------------------------------------"
>> loading peak file...				 2022-05-14 04:22:16 PM 
>> preparing features information...		 2022-05-14 04:22:17 PM 
>> identifying nearest features...		 2022-05-14 04:22:17 PM 
>> calculating distance from peak to TSS...	 2022-05-14 04:22:17 PM 
>> assigning genomic annotation...		 2022-05-14 04:22:17 PM 
>> assigning chromosome lengths			 2022-05-14 04:22:20 PM 
>> done...					 2022-05-14 04:22:20 PM 
Annotated peaks generated by ChIPseeker
25059/25059  peaks were annotated
Genomic Annotation Summary:
              Feature   Frequency
9    Promoter (<=1kb) 40.95135480
10   Promoter (1-2kb)  5.89808053
11   Promoter (2-3kb)  3.73518496
4              5' UTR  0.02394349
3              3' UTR  0.81806936
1            1st Exon  0.01596233
7          Other Exon  3.09669181
2          1st Intron  4.93235963
8        Other Intron 12.46258829
6 

geneId,annotation,group,gene_name
<chr>,<chr>,<chr>,<chr>
ENSMUSG00000000093.6,Promoter (<=1kb),z-dna,Tbx2
ENSMUSG00000000094.12,Promoter (<=1kb),z-dna,Tbx4
ENSMUSG00000000120.6,Promoter (<=1kb),z-dna,Ngfr
ENSMUSG00000000125.5,Promoter (<=1kb),z-dna,Wnt3
ENSMUSG00000000126.11,Promoter (<=1kb),z-dna,Wnt9a
ENSMUSG00000000127.15,Promoter (<=1kb),z-dna,Fer
ENSMUSG00000000142.15,Promoter (<=1kb),z-dna,Axin2
ENSMUSG00000000183.6,Promoter (<=1kb),z-dna,Fgf6
ENSMUSG00000000197.8,Promoter (<=1kb),z-dna,Nalcn
ENSMUSG00000000223.13,Promoter (1-2kb),z-dna,Drp2
