In [16]:
from importlib import reload
import sys
import numpy as np 
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
plt.ioff()
import seaborn as sns
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import pybedtools
from joblib import Parallel, delayed
import get_feature_matrix_perchr as gfm
from tqdm import tqdm
import time

## Load transcript annotation data

In [17]:
genome_dir = '/home/braunger/masterthesis/data/genome_data/'
epigenome_dir = '/home/braunger/masterthesis/data/regulatory_data/regulatory_data_old_fibroblasts/'

In [18]:
cell_type = 'old_fibroblasts'
resol_str = '250kb'
resol = 250000
quality = 'MAPQGE30'

Data about the positions of the transcripts on the chromosomes were downloaded using the UCSC Table Browser (assembly: hg19, group: Genes and Gene Predictions, track: Ensembl Genes, table: ensGene)

In [19]:
transcript_annotations = pd.read_csv(genome_dir+'gene_annotation.tsv', sep = '\t', header=0)
selected_cols = ["chrom", "txStart", "txEnd", "name", "strand"]
transcript_annotations = transcript_annotations.loc[:, selected_cols]
transcript_annotations.columns = ["chrom", "start", "end", "name", "score"]

transcript_annotations.head()

Unnamed: 0,chrom,start,end,name,score
0,chr1,66999065,67210057,ENST00000237247,+
1,chr1,66999274,67210768,ENST00000371039,+
2,chr1,66999297,67145425,ENST00000424320,+
3,chr1,66999822,67208882,ENST00000371035,+
4,chr1,66999838,67142779,ENST00000468286,+


### Load RNA-Seq data

In [20]:
rna_seq_counts = pd.read_csv(epigenome_dir + 'GSM2072585_ENCFF913ZKI_transcript_quantifications_hg19.tsv', sep = '\t', header = 0)
#select columns
selected_cols = ["transcript_id", "TPM"]
rna_seq_counts = rna_seq_counts.loc[:, selected_cols]
#remove version number of transcript_id
rna_seq_counts['transcript_id'] = rna_seq_counts['transcript_id'].str.split(r'.').str.get(0)
#rename columns
rna_seq_counts.columns = ["name", "count"]

rna_seq_counts.loc[1000:1005,:]

Unnamed: 0,name,count
1000,ENST00000470664,0.0
1001,ENST00000474842,0.0
1002,ENST00000479953,0.01
1003,ENST00000490965,0.76
1004,ENST00000494258,0.0
1005,ENST00000233156,1.42


### Processing for one chromosome

In [21]:
chrom = 1
# Get chromosome size
sizes_filename = genome_dir+'chrom_hg19.sizes'
df_sizes = pd.read_csv(sizes_filename, sep = '\t', header = None, names=['chr','size'])
chrom_size = int(df_sizes.loc[df_sizes['chr']=='chr'+str(chrom)]['size'])
print(chrom_size)
    
# Divide the chromosome into segments of HIC_RESOLN length
stop_pos = np.arange(resol, chrom_size + resol, resol, dtype = 'int')
df_chrom = pd.DataFrame()
df_chrom['chrom'] = ['chr' + str(chrom)]*len(stop_pos)
df_chrom['start'] = stop_pos - resol
df_chrom['stop'] = stop_pos
df_chrom.head()

249250621


Unnamed: 0,chrom,start,stop
0,chr1,0,250000
1,chr1,250000,500000
2,chr1,500000,750000
3,chr1,750000,1000000
4,chr1,1000000,1250000


In [24]:
# Convert to bed file
bed_chrom = pybedtools.BedTool.from_dataframe(df_chrom)
bed_chrom_df = bed_chrom.to_dataframe()
    
# Get bed file of the feature
bed = pybedtools.BedTool.from_dataframe(transcript_annotations).sort()
# Get counts for this feature and this chromosome
out = pybedtools.bedtool.BedTool.map(bed_chrom, bed, c = 4, o = 'collapse', F = 0.5)
counts = out.to_dataframe()['name'].values

In [26]:
counts

array(['ENST00000456328,ENST00000515242,ENST00000518655,ENST00000450305,ENST00000438504,ENST00000423562,ENST00000541675,ENST00000488147,ENST00000538476,ENST00000473358,ENST00000469289,ENST00000607096,ENST00000417324,ENST00000461467,ENST00000606857,ENST00000594647,ENST00000492842,ENST00000335137,ENST00000466430,ENST00000495576,ENST00000477740,ENST00000471248,ENST00000453576,ENST00000442987,ENST00000423372,ENST00000494149,ENST00000595919,ENST00000493797,ENST00000484859,ENST00000490997,ENST00000466557,ENST00000410691,ENST00000496488,ENST00000491962,ENST00000442116,ENST00000448958,ENST00000424429,ENST00000450734',
       'ENST00000424587,ENST00000335577,ENST00000440038,ENST00000608420,ENST00000423728,ENST00000432964,ENST00000601486,ENST00000599771,ENST00000455464,ENST00000419160,ENST00000601814,ENST00000425496,ENST00000514436,ENST00000432723,ENST00000431812,ENST00000445840,ENST00000455207,ENST00000426406,ENST00000437905,ENST00000440163,ENST00000540477,ENST00000453935,ENST00000431321,ENST00

In [52]:
loci_counts = []
gene_locus_df = pd.DataFrame({'locus': [], 'gene': []})
loc = 0

for loc_genes in tqdm(counts):
    time.sleep(.001)
    genes = loc_genes.split(r",")
    # save df with genes and locus
    locus_list = ["chr_"+str(chrom)+"_loc_"+str(loc)] * len(genes)
    gene_locus_df = pd.concat([gene_locus_df, pd.DataFrame({'locus': locus_list, 'gene': genes})])
    # create df with genes in loci and corresponding TPM counts
    genes_df = pd.DataFrame({'name': genes})
    gene_counts = pd.merge(genes_df, rna_seq_counts, on = "name", how = "left")
    # sum TPMs of all genes per locus
    loci_count = sum(gene_counts['count'])
    loci_counts.append(loci_count)
    loc = loc + resol

loci_counts = pd.DataFrame({'RNA-seq':loci_counts}).fillna(0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 998/998 [00:58<00:00, 16.97it/s]


In [55]:
gene_locus_df

Unnamed: 0,locus,gene
0,chr_1_loc_0,ENST00000456328
1,chr_1_loc_0,ENST00000515242
2,chr_1_loc_0,ENST00000518655
3,chr_1_loc_0,ENST00000450305
4,chr_1_loc_0,ENST00000438504
...,...,...
50,chr_1_loc_249000000,ENST00000539153
51,chr_1_loc_249000000,ENST00000462488
52,chr_1_loc_249000000,ENST00000363625
53,chr_1_loc_249000000,ENST00000430973


In [38]:
df_chrom['counts'] = loci_counts
df_chrom.head()

Unnamed: 0,chrom,start,stop,counts
0,chr1,0,250000,2.22
1,chr1,250000,500000,0.19
2,chr1,500000,750000,4.81
3,chr1,750000,1000000,23.87
4,chr1,1000000,1250000,31.85


In [39]:
test = loci_counts.T
test.index = ["abc"]

In [40]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,988,989,990,991,992,993,994,995,996,997
abc,2.22,0.19,4.81,23.87,31.85,132.82,18.56,6.25,4.41,12.87,...,4.63,5.79,0.16,0.0,0.0,0.0,0.0,0.0,7.05,0.0
