In [1]:
import sys
import numpy as np 
import pandas as pd
import pybedtools

## Load data

In [2]:
genome_dir = '/home/braunger/masterthesis/data/genome_data/'
epigenome_dir = '/home/braunger/masterthesis/data/regulatory_data/regulatory_data_old_fibroblasts/'

Data about the positions of the transcripts on the chromosomes were downloaded using the UCSC Table Browser (assembly: hg19, group: Genes and Gene Predictions, track: Ensembl Genes, table: ensGene)

In [3]:
transcript_annotations = pd.read_csv(genome_dir+'gene_annotation.tsv', sep = '\t', header=0)
selected_cols = ["name", "chrom", "strand", "txStart", "txEnd"]
transcript_annotations = transcript_annotations.loc[:, selected_cols]

transcript_annotations.head()

Unnamed: 0,name,chrom,strand,txStart,txEnd
0,ENST00000237247,chr1,+,66999065,67210057
1,ENST00000371039,chr1,+,66999274,67210768
2,ENST00000424320,chr1,+,66999297,67145425
3,ENST00000371035,chr1,+,66999822,67208882
4,ENST00000468286,chr1,+,66999838,67142779


In [4]:
rna_seq_counts = pd.read_csv(epigenome_dir + 'GSM2072585_ENCFF913ZKI_transcript_quantifications_hg19.tsv', sep = '\t', header = 0)
#select columns
selected_cols = ["transcript_id", "FPKM"]
rna_seq_counts = rna_seq_counts.loc[:, selected_cols]
#convert counts to ints
rna_seq_counts["FPKM"] = rna_seq_counts["FPKM"].astype(int)
#remove zeroes
rna_seq_counts = rna_seq_counts[rna_seq_counts.FPKM > 0]
#remove version number of transcript_id
rna_seq_counts['transcript_id'] = rna_seq_counts['transcript_id'].str.split(r'.').str.get(0)
#rename columns
rna_seq_counts.columns = ["name", "count"]

rna_seq_counts.head()

Unnamed: 0,name,count
624,ENST00000373020,12
631,ENST00000371584,1
632,ENST00000371588,22
639,ENST00000423670,3
642,ENST00000359326,1


## Merge into one data frame

In [5]:
#Merge the two data frames
rna = pd.merge(rna_seq_counts, transcript_annotations, on = "name")

#Duplicate rows according to the count number
rna_long = rna.loc[rna.index.repeat(rna.pop('count'))]
rna_long['name'] = rna_long['name'].astype(str) + '_' + rna_long.groupby(level=0).cumcount().add(1).astype(str)
rna_long = rna_long.reset_index(drop=True)

rna_long.head()

Unnamed: 0,name,chrom,strand,txStart,txEnd
0,ENST00000373020_1,chrX,-,99883666,99891803
1,ENST00000373020_2,chrX,-,99883666,99891803
2,ENST00000373020_3,chrX,-,99883666,99891803
3,ENST00000373020_4,chrX,-,99883666,99891803
4,ENST00000373020_5,chrX,-,99883666,99891803


## Convert to bed format

In [6]:
rna_long = rna_long[["chrom", "txStart", "txEnd", "name", "strand"]]
rna_long.columns = ["chrom", "start", "end", "name", "score"]
rna_bed = pybedtools.BedTool.from_dataframe(rna_long)

In [7]:
rna_bed.saveas(epigenome_dir + "rna-seq.bed")

<BedTool(/home/braunger/masterthesis/data/regulatory_data/regulatory_data_old_fibroblasts/rna-seq.bed)>

In [8]:
rna_long

Unnamed: 0,chrom,start,end,name,score
0,chrX,99883666,99891803,ENST00000373020_1,-
1,chrX,99883666,99891803,ENST00000373020_2,-
2,chrX,99883666,99891803,ENST00000373020_3,-
3,chrX,99883666,99891803,ENST00000373020_4,-
4,chrX,99883666,99891803,ENST00000373020_5,-
...,...,...,...,...,...
5911399,chr19,58790317,58807254,ENST00000608843_1,+
5911400,chr19,58790317,58807254,ENST00000608843_2,+
5911401,chr19,58790317,58807254,ENST00000608843_3,+
5911402,chr4,164450922,164451849,ENST00000609356_1,+
