# Theoretical NTE

In [1]:
library(dplyr)
library(GenomicFeatures)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
 

In [1]:
# Transform local (transcriptomic) coordinates of the extensions into global
# (genomic) coordinates using the annotation .gtf file.
txdb <- makeTxDbFromGFF(file = "data/gencode.v25.annotation.gtf")
cr <- read.delim("tmp_res/local_coo_primary_NTE.txt", stringsAsFactors = FALSE)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift for IRanges (TODO confirm against the script that wrote the
# input table).
start <- 1 + cr$N_term_start1
end <- cr$N_term_end1

# Select one exon set per input row, by transcript name.
exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/primary_NTE_global_coo.txt", sep = "\t")

ERROR: Error in makeTxDbFromGFF("data/gencode.v25.annotation.gtf"): could not find function "makeTxDbFromGFF"


# 50-codons of NTE 

In [5]:
# Map the ranges listed in local_coo_50codons_and_less.txt to the genome.
cr <- read.delim(
  "tmp_res/local_coo_50codons_and_less.txt",
  stringsAsFactors = FALSE
)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$X50len_flag
end <- cr$N_term_end1

# Print the three vector lengths so they can be compared by eye.
invisible(lapply(list(tr_names, start, end), function(v) print(length(v))))

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/global_coo_50codons_and_less.txt", sep = "\t")

[1] 77820
[1] 77820
[1] 77820


# Gencode25: coding exons 

In [6]:
# Map the CDS range of each transcript in metadata_pc_g25.txt to the genome.
cr <- read.delim("tmp_res/metadata_pc_g25.txt", stringsAsFactors = FALSE)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$cds_start_pos
end <- cr$cds_stop_pos

# Print the three vector lengths so they can be compared by eye.
invisible(lapply(list(tr_names, start, end), function(v) print(length(v))))

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/coding_exons_g25_global.txt", sep = "\t")

[1] 94359
[1] 94359
[1] 94359


# Gencode35: coding exons 

In [8]:
# Same CDS mapping as for Gencode v25, but against the Gencode v35 annotation.
txdb_g35 <- makeTxDbFromGFF(file = "data/gencode.v35.annotation.gtf")

cr <- read.delim("tmp_res/metadata_pc_g35.txt", stringsAsFactors = FALSE)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$cds_start_pos
end <- cr$cds_stop_pos

# Print the three vector lengths so they can be compared by eye.
invisible(lapply(list(tr_names, start, end), function(v) print(length(v))))

exons <- exonsBy(txdb_g35, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/coding_exons_g35_global.txt", sep = "\t")

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



[1] 101486
[1] 101486
[1] 101486


# PCSF per codon: global coordinates of codons in NTE+50 codons of CDS

### PhyloSET

In [2]:
# Build the transcript database from the Gencode v25 annotation and read the
# local codon coordinates for the PhyloSET PCSF plot.
txdb <- makeTxDbFromGFF(file = "data/gencode.v25.annotation.gtf")
cr <- read.delim(
  "tmp_res/local_coo_of_codons_for_PCSF_plot_PhyloSET_plus50codonsCDS.txt",
  stringsAsFactors = FALSE
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [3]:
tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$codon_start
end <- cr$codon_end

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(
  out,
  file = "tmp_res/global_coo_of_codons_for_PCSF_plot_PhyloSET_plus50codonsCDS.txt",
  sep = "\t"
)

### RiboSET

In [5]:
# Read the local codon coordinates for the RiboSET PCSF plot and map them to
# the genome.
cr <- read.delim(
  "tmp_res/local_coo_of_codons_for_PCSF_plot_RiboSET_plus50codonsCDS.txt",
  stringsAsFactors = FALSE
)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$codon_start
end <- cr$codon_end

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(
  out,
  file = "tmp_res/global_coo_of_codons_for_PCSF_plot_RiboSET_plus50codonsCDS.txt",
  sep = "\t"
)

# CYTH2: NTE+50codons of CDS 

In [8]:
# Map transcript positions 303..626 of ENST00000452733.6 to the genome.
tr_names <- "ENST00000452733.6"
start <- 302 + 1
end <- 626

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))

# Keep only the rows flagged as hits.
filter(out, hit == TRUE)

group,group_name,seqnames,start,end,width,strand,exon_id,exon_name,exon_rank,hit
<int>,<chr>,<fct>,<int>,<int>,<int>,<fct>,<int>,<chr>,<int>,<lgl>
1,ENST00000452733.6,chr19,48469334,48469526,193,+,592437,ENSE00001833864.1,1,True
1,ENST00000452733.6,chr19,48470353,48470483,131,+,592447,ENSE00003716163.1,2,True


In [None]:
chr19:48469334-48469526+chr19:48470353-48470483

# MARVELD1: NTE+50codons of CDS 

In [3]:
# Build the transcript database from the Gencode v25 annotation.
txdb <- makeTxDbFromGFF(file = "data/gencode.v25.annotation.gtf")

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [3]:
# Map transcript positions 28..303 of ENST00000285605.7 to the genome.
tr_names <- "ENST00000285605.7"
start <- 27 + 1
end <- 303

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))

# Keep only the rows flagged as hits.
filter(out, hit == TRUE)

group,group_name,seqnames,start,end,width,strand,exon_id,exon_name,exon_rank,hit
<int>,<chr>,<fct>,<int>,<int>,<int>,<fct>,<int>,<chr>,<int>,<lgl>
1,ENST00000285605.7,chr10,97713751,97714026,276,+,331305,ENSE00001019945.6,1,True


In [None]:
chr10:97713751-97714026, +

# ARPC1A

In [4]:
# Map transcript positions 2..274 of ENST00000262942.9 to the genome.
tr_names <- "ENST00000262942.9"
start <- 1 + 1
end <- 274

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))

# Keep only the rows flagged as hits.
filter(out, hit == TRUE)

group,group_name,seqnames,start,end,width,strand,exon_id,exon_name,exon_rank,hit
<int>,<chr>,<fct>,<int>,<int>,<int>,<fct>,<int>,<chr>,<int>,<lgl>
1,ENST00000262942.9,chr7,99325911,99326004,94,+,250551,ENSE00001055468.3,1,True
1,ENST00000262942.9,chr7,99333325,99333417,93,+,250553,ENSE00003508295.1,2,True
1,ENST00000262942.9,chr7,99338181,99338266,86,+,250555,ENSE00003551555.1,3,True


In [None]:
chr7:99325911-99326004+chr7:99333325-99333417+chr7:99338181-99338266

In [4]:
# SFPQ: NTE+50codons of CDS 

In [6]:
# Map transcript positions 1..249 (99 + 50 codons * 3 nt) of
# ENST00000357214.5 to the genome.
tr_names <- "ENST00000357214.5"
start <- 0 + 1
end <- 99 + 50 * 3

exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))

# Keep only the rows flagged as hits.
filter(out, hit == TRUE)

group,group_name,seqnames,start,end,width,strand,exon_id,exon_name,exon_rank,hit
<int>,<chr>,<fct>,<int>,<int>,<int>,<fct>,<int>,<chr>,<int>,<lgl>
1,ENST00000357214.5,chr1,35192900,35193148,249,-,39231,ENSE00001460236.4,1,True


In [None]:
chr1:35192900-35193148, -

# Reproducibility of PhyloCSF-based approach on 24/59 genes from Ivanov et al 2011 paper 

In [2]:
# Build the transcript database from the Gencode v35 annotation.
txdb_g35 <- makeTxDbFromGFF(file = "data/gencode.v35.annotation.gtf")


Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [4]:
# Read the local coordinates for the reproducibility set and prepare the
# transcript names and range bounds.
cr <- read.delim(
  "tmp_res/reprod_24genes_2011_local_coo_df.txt",
  stringsAsFactors = FALSE
)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$X50len_flag
end <- cr$new_N_term_end

# Print the three vector lengths so they can be compared by eye.
invisible(lapply(list(tr_names, start, end), function(v) print(length(v))))

[1] 60
[1] 60
[1] 60


In [5]:
# Map the prepared ranges to the genome using the Gencode v35 database.
exons <- exonsBy(txdb_g35, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(
  out,
  file = "tmp_res/reprod_24genes_2011_global_coo_df.txt",
  sep = "\t"
)

# Predicted starts&ext in RiboSET

In [2]:
# Build the transcript database from the Gencode v25 annotation.
txdb_g25 <- makeTxDbFromGFF(file = "data/gencode.v25.annotation.gtf")

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [11]:
# Read the local coordinates of the predicted starts in RiboSET.
cr <- read.delim(
  "tmp_res/RiboSET_local_coo_pred_start.txt",
  stringsAsFactors = FALSE
)

In [16]:
# Range bounds for the predicted start codons.
# NOTE(review): unlike the other cells in this notebook, the start is used
# without "+ 1" and the end is shifted by "- 1" instead; confirm this matches
# the coordinate convention of the input table.
tr_names <- cr$tr_id
start <- cr$start_codon_local_coo_start
end <- cr$start_codon_local_coo_stop - 1

# Print the three vector lengths so they can be compared by eye.
invisible(lapply(list(tr_names, start, end), function(v) print(length(v))))

[1] 395
[1] 395
[1] 395


In [17]:
# Map the prepared ranges to the genome using the Gencode v25 database.
exons <- exonsBy(txdb_g25, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/RiboSET_global_coo_pred_start.txt", sep = "\t")

In [8]:
# Range bounds running from the predicted start codon to N_term_end1.
# NOTE(review): the start is used without "+ 1" here, unlike the cells that
# pair a "+ 1" start with N_term_end1; confirm the intended convention.
tr_names <- cr$tr_id
start <- cr$start_codon_local_coo_start
end <- cr$N_term_end1

# Print the three vector lengths so they can be compared by eye.
invisible(lapply(list(tr_names, start, end), function(v) print(length(v))))

[1] 395
[1] 395
[1] 395


In [10]:
# Map the prepared ranges to the genome using the Gencode v25 database.
exons <- exonsBy(txdb_g25, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(
  out,
  file = "tmp_res/RiboSET_global_coo_pred_start_ext.txt",
  sep = "\t"
)

# Codons coo (riboset) for COSMIC variants 

In [3]:
# Transform local (transcriptomic) coordinates of the extensions into global
# (genomic) coordinates using the annotation .gtf file.
txdb <- makeTxDbFromGFF(file = "data/gencode.v25.annotation.gtf")
cr <- read.delim(
  "tmp_res/ext_codons_local_coo_cosmic.txt",
  stringsAsFactors = FALSE
)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$codon_start
end <- cr$codon_stop

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [5]:
# Select one exon set per input row, by transcript name.
exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]

x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))

out <- data.frame(pmapFromTranscripts(x, exons))

write.table(out, file = "tmp_res/ext_codons_global_coo_cosmic.txt", sep = "\t")

In [6]:
# Display the mapped coordinate table for inspection.
out

group,group_name,seqnames,start,end,width,strand,exon_id,exon_name,exon_rank,hit
<int>,<chr>,<fct>,<int>,<int>,<int>,<fct>,<int>,<chr>,<int>,<lgl>
1,ENST00000379389.4,chr1,1013520,1013522,3,+,240,ENSE00001480807.4,1,TRUE
1,ENST00000379389.4,chr1,1013984,1013983,0,+,242,ENSE00001480799.4,2,FALSE
2,ENST00000379389.4,chr1,1013523,1013525,3,+,240,ENSE00001480807.4,1,TRUE
2,ENST00000379389.4,chr1,1013984,1013983,0,+,242,ENSE00001480799.4,2,FALSE
3,ENST00000379389.4,chr1,1013526,1013528,3,+,240,ENSE00001480807.4,1,TRUE
3,ENST00000379389.4,chr1,1013984,1013983,0,+,242,ENSE00001480799.4,2,FALSE
4,ENST00000379389.4,chr1,1013529,1013531,3,+,240,ENSE00001480807.4,1,TRUE
4,ENST00000379389.4,chr1,1013984,1013983,0,+,242,ENSE00001480799.4,2,FALSE
5,ENST00000379389.4,chr1,1013532,1013534,3,+,240,ENSE00001480807.4,1,TRUE
5,ENST00000379389.4,chr1,1013984,1013983,0,+,242,ENSE00001480799.4,2,FALSE


# mice CDS v14 

In [10]:
# Transform local (transcriptomic) coordinates of the extensions into global
# (genomic) coordinates using the annotation .gtf file (Gencode vM14 here).
txdb <- makeTxDbFromGFF(file = "data/gencode.vM14.annotation.gtf")


Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [11]:
# Read the local coordinates from mice14_nte_riboset.txt and prepare the
# transcript names and range bounds.
cr <- read.delim("tmp_res/mice14_nte_riboset.txt", stringsAsFactors = FALSE)

tr_names <- cr$tr_id
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr$start
end <- cr$stop

# Show the three vector lengths so they can be compared by eye.
length(tr_names)
length(start)
length(end)

In [14]:
# Select one exon set per input row, by transcript name.
exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]


In [15]:
# Map the prepared ranges through the selected exons and save the result.
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/mice14_nte_riboset_global_coo.txt", sep = "\t")

# mouse CDS

In [17]:
# Read the local CDS coordinates and preview the first rows.
cr <- read.delim("tmp_res/mice14_CDS_local_coo.txt", stringsAsFactors = FALSE)
head(cr, n = 6)

Unnamed: 0_level_0,tr_id,cds_start_pos,cds_stop_pos
Unnamed: 0_level_1,<chr>,<int>,<int>
1,ENSMUST00000070533.4,150,2094
2,ENSMUST00000208660.1,54,4170
3,ENSMUST00000194992.6,0,858
4,ENSMUST00000027032.5,127,6415
5,ENSMUST00000027035.9,1082,2342
6,ENSMUST00000195555.1,635,1511


In [18]:
tr_names <- cr[["tr_id"]]
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr[["cds_start_pos"]]
end <- cr[["cds_stop_pos"]]

# Show the three vector lengths so they can be compared by eye.
length(tr_names)
length(start)
length(end)

In [19]:
# Map the prepared CDS ranges to the genome and save the result.
exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(out, file = "tmp_res/mice14_CDS_global_coo.txt", sep = "\t")

# Gencode annotation genes 

In [2]:
# Transform local (transcriptomic) coordinates of the extensions into global
# (genomic) coordinates using the annotation .gtf file (Gencode v25 here).
txdb <- makeTxDbFromGFF(file = "data/gencode.v25.annotation.gtf")

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [23]:
# Read the local coordinates of the human genes and preview the first rows.
cr <- read.delim(
  "tmp_res/human_genes_for_annotation_in_gencode_local_coo.txt",
  stringsAsFactors = FALSE
)

head(cr, n = 6)

Unnamed: 0_level_0,Gene,tr_id.in.gencode25,new_cds_start,cds_stop,tr_id
Unnamed: 0_level_1,<chr>,<chr>,<int>,<int>,<chr>
1,SFPQ,ENST00000357214,36,2223,ENST00000357214.5
2,VANGL2,ENST00000368061,330,2040,ENST00000368061.2
3,CCDC8,ENST00000307522,564,2391,ENST00000307522.3
4,USP27X,ENST00000621775,51,1677,ENST00000621775.1
5,WWC3,ENST00000380861,199,3670,ENST00000380861.8
6,IFFO2,ENST00000455833,300,1908,ENST00000455833.6


In [24]:
tr_names <- cr[["tr_id"]]
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr[["new_cds_start"]]
end <- cr[["cds_stop"]]

In [25]:
# Map the prepared ranges to the genome and save the result.
exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(
  out,
  file = "tmp_res/human_genes_for_annotation_in_gencode_global_coo.txt",
  sep = "\t"
)

# Genes in mice 

In [None]:
# 

In [26]:
# Transform local (transcriptomic) coordinates of the extensions into global
# (genomic) coordinates using the annotation .gtf file (Gencode vM14 here).
txdb <- makeTxDbFromGFF(file = "data/gencode.vM14.annotation.gtf")

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK



In [40]:
# Read the local coordinates of the mouse genes and preview the first rows.
cr <- read.delim(
  "tmp_res/mouse_genes_for_annotation_local_coo.txt",
  stringsAsFactors = FALSE
)

head(cr, n = 6)

Unnamed: 0_level_0,Gene,mouse.gene,mouse_codon,mouse.tr_id,mouse_new_cds_start,mouse_cds_stop
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,SFPQ,Sfpq,GUG,ENSMUST00000030623.7,30,2193
2,VANGL2,Vangl2,ATA,ENSMUST00000027837.12,346,2056
3,CCDC8,Ccdc8,CTG,ENSMUST00000094805.4,1670,3938
4,USP27X,Usp27X,CTG,ENSMUST00000178293.7,656,2282
5,IFFO2,Iffo2,CTG,ENSMUST00000123827.8,511,1387
6,PELI2,Peli2,ACG,ENSMUST00000073150.4,129,1548


In [41]:
tr_names <- cr[["mouse.tr_id"]]
# "+ 1": the original note says the start "was zero"; presumably a 0-based to
# 1-based shift (TODO confirm).
start <- 1 + cr[["mouse_new_cds_start"]]
end <- cr[["mouse_cds_stop"]]

# Show the three vector lengths so they can be compared by eye.
length(tr_names)
length(start)
length(end)

In [42]:
# Map the prepared ranges to the genome and save the result.
exons <- exonsBy(txdb, by = "tx", use.names = TRUE)
exons <- exons[tr_names]
x <- GRanges(seqnames = tr_names, ranges = IRanges(start = start, end = end))
out <- data.frame(pmapFromTranscripts(x, exons))
write.table(
  out,
  file = "tmp_res/mouse_genes_for_annotation_global_coo.txt",
  sep = "\t"
)