## Enable R

In [1]:
%load_ext rpy2.ipython

In [2]:
%%R

library(GenomicFeatures)
library(ChIPpeakAnno)
library(ChIPseeker)
library(clusterProfiler)
library(dplyr)
library(stringr)

R[write to console]: Loading required package: BiocGenerics

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


R[write to console]: Loading required package: S4Vectors

R[write to console]: Loading required package: stats4

R[write to console]: 
Attaching package: ‘S4Vectors’


R[write to console]: The following objects are masked from ‘package:base’:

    expand.grid, I, unname


R[write to console]: Lo

## Python imports and paths

In [17]:
from pathlib import Path
import pandas as pd

# Project layout: all files used by this notebook sit in <project root>/data.
d_root = Path("/home/fpavlov/projects/article_conserved_miRNA")
d_data = d_root.joinpath("data")

# GENCODE vM25 annotation archive. It is kept both as a Path and as a plain
# string; the string form is the one passed to the R session further down.
f_mm_v25_annotation = d_data.joinpath("gencode.vM25.annotation.gtf.gz")
f_mm_v25_annotation_str = str(f_mm_v25_annotation)

# Destination of the final flipon-to-gene annotation table.
f_flipon_to_gene_anno = d_data.joinpath("flipon_to_gene_anno.tsv")


## Gene annotation

In [4]:
# Download the GENCODE mouse release M25 gene annotation (gzipped GTF) to the
# path held in f_mm_v25_annotation. wget flags: -c continues a partially
# downloaded file, -O sets the output file, and -q together with
# --show-progress silences everything except the progress bar.
!wget -c -O {f_mm_v25_annotation} -q --show-progress http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz

# Pass the annotation path to the R session as an input variable (-i).
# The string variant of the path is used here rather than the Path object.
%R -i f_mm_v25_annotation_str



In [7]:
%%R

# Build the TxDb annotation object from the downloaded GTF file; it is the
# annotation source handed to annotatePeak() in a later cell.
txdb_all_annotation <- makeTxDbFromGFF(file = f_mm_v25_annotation_str)

R[write to console]: Import genomic features from the file as a GRanges object ... 
R[write to console]: OK

R[write to console]: Prepare the 'metadata' data frame ... 
R[write to console]: OK

R[write to console]: Make the TxDb object ... 
R[write to console]: OK



In [8]:
%%R

# Build a gene_id -> gene_name lookup table from the annotation GTF.
gtf.file <- f_mm_v25_annotation_str

# rtracklayer::import() reads the GTF records into a GRanges object, which is
# then flattened into a plain data frame.
gtf.gr <- rtracklayer::import(gtf.file)
gtf.df <- as.data.frame(gtf.gr)

# Keep one row per distinct (gene_id, gene_name) pair.
gene_cols <- c("gene_id", "gene_name")
genes <- unique(gtf.df[, gene_cols])

In [9]:
# BED files to annotate, one per file in the ssDNA BED directory.
# The text between "enriched_" and ".bed" in each file name is what the
# annotation loop later uses as the `group` label.
d_ssdna_bed = d_data / "mm10_kouzine_ssDNA_bed"
ssdna_bed_names = (
    "mm10_kouzine_actb_ssdna_enriched_z-dna.bed",
    "mm10_kouzine_actb_ssdna_enriched_quadruplex.bed",
    "mm10_kouzine_actb_ssdna_enriched_sidd.bed",
    "mm10_kouzine_actb_ssdna_enriched_h-dna.bed",
)
ssdna_list = [str(d_ssdna_bed / bed_name) for bed_name in ssdna_bed_names]

# Make the list of BED file paths available in the R session as `ssdna_list`.
%R -i ssdna_list

In [14]:
%%R

# Annotate every BED file in `ssdna_list` against the TxDb and stack the
# per-file results into a single table, `df_final`.
#
# Uses objects created in earlier cells:
#   ssdna_list           - paths of the BED files to annotate
#   txdb_all_annotation  - TxDb built from the annotation GTF
#   genes                - gene_id / gene_name lookup table

# Collect one data frame per file and bind them once after the loop, instead
# of re-copying a growing `df_final` on every iteration.
res_list <- vector("list", length(ssdna_list))

for (i in seq_along(ssdna_list)) {
  ssdna <- ssdna_list[[i]]

  # tssRegion: the window from -3000 to +3000 around the TSS.
  peakAnno <- annotatePeak(
    ssdna,
    tssRegion = c(-3000, 3000),
    TxDb = txdb_all_annotation
  )

  anno_df <- data.frame(peakAnno@anno)
  res_df <- anno_df[c(
    "V4", "seqnames", "start", "end", "geneStrand", "annotation", "geneId",
    "distanceToTSS", "transcriptId", "geneStart", "geneEnd"
  )]

  # Group label = the part of the path after "enriched_" minus the extension.
  # The pattern is a regex, so the dot is escaped and the match is anchored:
  # only a literal trailing ".bed" is removed (a bare "." matches any char).
  res_df$group <- str_replace(
    str_split(ssdna, "enriched_")[[1]][2],
    "\\.bed$",
    ""
  )

  # Attach gene names. merge() defaults to an inner join, so rows whose geneId
  # is absent from `genes` are dropped, and the result is sorted by geneId.
  res_df <- merge(res_df, genes, by.x = "geneId", by.y = "gene_id")

  res_list[[i]] <- res_df

  print("--------------------------------------------")
  print(ssdna)
  print("--------------------------------------------")
  print(peakAnno)
  print(dim(res_df))
}

df_final <- bind_rows(res_list)

# write.table(df_final, 'flipon_to_gene_anno.tsv', append=FALSE, sep="\t", dec=".", row.names=FALSE, col.names = TRUE)
# df_final


>> loading peak file...				 2022-11-14 20:54:54 
>> preparing features information...		 2022-11-14 20:54:54 
>> identifying nearest features...		 2022-11-14 20:54:54 
>> calculating distance from peak to TSS...	 2022-11-14 20:54:55 
>> assigning genomic annotation...		 2022-11-14 20:54:55 
>> assigning chromosome lengths			 2022-11-14 20:54:56 
>> done...					 2022-11-14 20:54:56 
>> loading peak file...				 2022-11-14 20:54:57 
>> preparing features information...		 2022-11-14 20:54:57 
>> identifying nearest features...		 2022-11-14 20:54:57 
>> calculating distance from peak to TSS...	 2022-11-14 20:54:57 
>> assigning genomic annotation...		 2022-11-14 20:54:57 
>> assigning chromosome lengths			 2022-11-14 20:54:59 
>> done...					 2022-11-14 20:54:59 
>> loading peak file...				 2022-11-14 20:54:59 
>> preparing features information...		 2022-11-14 20:54:59 
>> identifying nearest features...		 2022-11-14 20:54:59 
>> calculating distance from peak to TSS...	 2022-11-14 20:54:59 

In [17]:
df_final = %R df_final

df_final = df_final[['V4', 'seqnames', 'start', 'end', 'geneId', 'gene_name', 'geneStart', 'geneEnd', 'transcriptId', 'distanceToTSS', 'geneStrand', 'annotation', 'group']]

df_final.to_csv(f_flipon_to_gene_anno, sep='\t', index=False)
df_final

Unnamed: 0,V4,seqnames,start,end,geneId,gene_name,geneStart,geneEnd,transcriptId,distanceToTSS,geneStrand,annotation,group
1,chr7:142572121-142572208,7,142572122,142572208,ENSMUSG00000000031.16,H19,142575634,142576538,ENSMUST00000140716.1,4330.0,2,Distal Intergenic,z-dna
2,chr4:120414009-120414024,4,120414010,120414024,ENSMUSG00000000085.16,Scmh1,120405428,120463084,ENSMUST00000132116.7,8582.0,1,Intron (ENSMUST00000134375.7/ENSMUSG0000000008...,z-dna
3,chr11:85832528-85832543,11,85832529,85832543,ENSMUSG00000000093.6,Tbx2,85832551,85841948,ENSMUST00000000095.6,-8.0,1,Promoter (<=1kb),z-dna
4,chr11:85832843-85832860,11,85832844,85832860,ENSMUSG00000000093.6,Tbx2,85832551,85841948,ENSMUST00000000095.6,293.0,1,Promoter (<=1kb),z-dna
5,chr11:85833014-85833035,11,85833015,85833035,ENSMUSG00000000093.6,Tbx2,85832551,85841948,ENSMUST00000000095.6,464.0,1,Promoter (<=1kb),z-dna
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77696,chr7:27899690-27899773,7,27899691,27899773,ENSMUSG00000118641.1,AC139063.1,27894062,27894118,ENSMUST00000239484.1,-5573.0,2,Distal Intergenic,h-dna
77697,chr7:27899779-27899799,7,27899780,27899799,ENSMUSG00000118641.1,AC139063.1,27894062,27894118,ENSMUST00000239484.1,-5662.0,2,Distal Intergenic,h-dna
77698,chr7:27899863-27899888,7,27899864,27899888,ENSMUSG00000118641.1,AC139063.1,27894062,27894118,ENSMUST00000239484.1,-5746.0,2,Distal Intergenic,h-dna
77699,chr17:39848378-39848395,17,39848379,39848395,ENSMUSG00000118642.1,CT010467.2,39848103,39848827,ENSMUST00000205406.1,432.0,2,Promoter (<=1kb),h-dna
