# Downloads for CAGE binding prediction

If necessary, install the prerequisites first by uncommenting and running the relevant lines in the cell below.

In [11]:
# One-off environment setup. The commands are left commented out so that
# "Run All" does not reinstall anything; uncomment only the lines you need.
# They provide R (r-base, r-stringi), the rpy2 bridge used by the %R magics
# (plus tzlocal), and the bedtools/samtools command-line tools.
# NOTE(review): pandas and pybedtools are imported by this notebook but are
# not covered by the commands below - confirm they are already installed.
#!conda install --yes r-base
#!pip install rpy2
#!pip install tzlocal
#!conda install --yes -c bioconda bedtools samtools
#!conda install --yes r-stringi

In [3]:
import os
import pandas as pd
from pybedtools import BedTool
%load_ext rpy2.ipython

In [4]:
# Directory that receives every downloaded and derived file of this notebook.
output = '../data'

# Create it up front; an already existing directory is not an error.
os.makedirs(name=output, exist_ok=True)

## Downloading the data for CAGE prediction


In [5]:
# HepG2, Dnase, paired-end
!wget https://www.encodeproject.org/files/ENCFF591XCX/@@download/ENCFF591XCX.bam -O {output}/dnase.hepg2.bam;
!samtools index {output}/dnase.hepg2.bam

# HepG2, H3K4me3
!wget https://www.encodeproject.org/files/ENCFF736LHE/@@download/ENCFF736LHE.bigWig -O {output}/h3k4me3.hepg2.bigWig;

!wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz -O {output}/gencode.v29.annotation.gtf.gz
!gunzip -f {output}/gencode.v29.annotation.gtf.gz

#CAGE, HepG2, rep1
!wget https://www.encodeproject.org/files/ENCFF177HHM/@@download/ENCFF177HHM.bam -O {output}/cage.hepg2.rep1.bam


--2019-11-07 22:32:03--  https://www.encodeproject.org/files/ENCFF591XCX/@@download/ENCFF591XCX.bam
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2017/08/24/ac0d9343-0435-490b-aa5d-2f14e8275a9e/ENCFF591XCX.bam?response-content-disposition=attachment%3B%20filename%3DENCFF591XCX.bam&Signature=jmijhfrSNqpgHeqcJfMGh2gdvo0%3D&Expires=1573291923&AWSAccessKeyId=ASIATGZNGCNXYJFYFYZZ&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93l


2019-11-07 23:06:20 (2,66 MB/s) - ‘../data/gencode.v29.annotation.gtf.gz’ saved [39387922]

--2019-11-07 23:06:33--  https://www.encodeproject.org/files/ENCFF177HHM/@@download/ENCFF177HHM.bam
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2016/08/04/4d16576e-cc0f-4dc2-942e-d0fe8b473276/ENCFF177HHM.bam?response-content-disposition=attachment%3B%20filename%3DENCFF177HHM.bam&x-amz-security-token=IQoJb3JpZ2luX2VjEH0aCXVzLXdlc3QtMiJHMEUCIADRZwNNtlxpuTIEzjOg1IKZkI1CsGztb7yLNxcHl5%2FzAiEAtSQFSUPnjORUcIz5d5qIg95ep9sGLSalxxwmeucUQDIq2gIIlv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgwyMjA3NDg3MTQ4NjMiDAPjrqM8CxMwi5LNYyquArkCTa8a4F6afLeNoxbBpgT3cTvSsmr7bS7HD%2BmxlwMMjWtzxoTEo6M%2BiANFNyKUhYbjMhKIfdRxdWW%2BEzf2iehVPGEWy4cAlSoasOAOuwthxDBsHr3sokbyBb0zYEVdEY2%2BI%2BAX8eDj93lJbd

In [6]:
# Build the BAM index for the CAGE alignments downloaded above
# (shell call; `{output}` is expanded by IPython to the data directory).
!samtools index {output}/cage.hepg2.rep1.bam

In [7]:
# Paths handed to the R cells below: the full GENCODE annotation that is
# read, and the TSS-only annotation that is written, both inside `output`.
inputannotation, outputannotation = (
    os.path.join(output, gtf_name)
    for gtf_name in ("gencode.v29.annotation.gtf", "gencode.v29.tss.gtf")
)

In [8]:
# Load the R package stringi in the embedded R session; it provides
# stri_detect_fixed(), which the annotation-filtering cell uses.
%R library(stringi)

array(['stringi', 'tools', 'stats', 'graphics', 'grDevices', 'utils',
       'datasets', 'methods', 'base'], dtype='<U9')

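We only need the TSSs of protein-coding genes. The next cell keeps the `gene` records of protein-coding genes and replaces each gene's coordinates with a 200 bp strand-aware window that ends at its TSS (upstream of the TSS on either strand).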

In [9]:
%%R -i inputannotation

# Reduce the GENCODE annotation to one 200 bp promoter window per
# protein-coding gene. The result stays in the R session as `df`
# (columns V1..V9 follow the GTF column order).

# quote="" reads the attribute column (V9) verbatim: GTF attributes contain
# double quotes and may contain apostrophes, which must not be interpreted
# as field quoting. Header lines starting with '#' are skipped.
df = read.table(inputannotation, header=FALSE, sep="\t", quote="",
                comment.char="#", stringsAsFactors=FALSE)

# Keep gene-level records only ...
df = subset(df, V3 == "gene")

# ... and of those only the ones whose attributes mention protein_coding.
df = df[stri_detect_fixed(df$V9, "protein_coding"), ]

plus = df$V7 == "+"
minus = df$V7 == "-"

# Forward strand: the TSS is the gene start (V4).
# Window = [TSS - 200, TSS]; the start is clamped to 1 because GTF
# coordinates are 1-based and must not become zero or negative.
# The end must be set before the start is shifted.
df$V5[plus] = as.integer(df$V4[plus])
df$V4[plus] = as.integer(pmax(1L, df$V4[plus] - 200L))

# Reverse strand: the TSS is the gene end (V5).
# Window = [TSS, TSS + 200]; the start must be set before the end is shifted.
df$V4[minus] = as.integer(df$V5[minus])
df$V5[minus] = as.integer(df$V5[minus] + 200L)


In [10]:
%%R -i outputannotation
# Save the TSS windows held in `df` as a tab-separated file without
# quoting, header line or row names.
write.table(
    df,
    file = outputannotation,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
)