# RefSeq GRCh38 parsing to retrieve exon-intron boundaries

In [2]:
import pandas as pd
pd.set_option('display.width', 1000)
refseq_grch38_gff = "/gstock/biolo_datasets/refseq/2020/GRCh38_latest_genomic.gff.gz"
refseq_38_df = pd.read_csv(refseq_grch38_gff, compression='gzip', sep='\t', skiprows=9, names=['NC', 'RefSeq_validation', 'Region_type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes'])
refseq_38_df = refseq_38_df.dropna(subset=['Start', 'End'])
refseq_38_df['Start'] = refseq_38_df['Start'].astype(int)
refseq_38_df['End'] = refseq_38_df['End'].astype(int)
# refseq_38_df['Attributes']
refseq_38_df

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
0,NC_000001.11,RefSeq,region,1,248956422,.,+,.,ID=NC_000001.11:1..248956422;Dbxref=taxon:9606...
1,NC_000001.11,BestRefSeq,pseudogene,11874,14409,.,+,.,"ID=gene-DDX11L1;Dbxref=GeneID:100287102,HGNC:H..."
2,NC_000001.11,BestRefSeq,transcript,11874,14409,.,+,.,ID=rna-NR_046018.2;Parent=gene-DDX11L1;Dbxref=...
3,NC_000001.11,BestRefSeq,exon,11874,12227,.,+,.,ID=exon-NR_046018.2-1;Parent=rna-NR_046018.2;D...
4,NC_000001.11,BestRefSeq,exon,12613,12721,.,+,.,ID=exon-NR_046018.2-2;Parent=rna-NR_046018.2;D...
...,...,...,...,...,...,...,...,...,...
3781987,NC_012920.1,RefSeq,exon,15888,15953,.,+,.,ID=exon-TRNT-1;Parent=rna-TRNT;Dbxref=GeneID:4...
3781988,NC_012920.1,RefSeq,gene,15956,16023,.,-,.,"ID=gene-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494..."
3781989,NC_012920.1,RefSeq,tRNA,15956,16023,.,-,.,ID=rna-TRNP;Parent=gene-TRNP;Dbxref=GeneID:457...
3781990,NC_012920.1,RefSeq,exon,15956,16023,.,-,.,ID=exon-TRNP-1;Parent=rna-TRNP;Dbxref=GeneID:4...


In [3]:
refseq_38_df_chroms = refseq_38_df.loc[refseq_38_df['Region_type'] == 'region']
index_list = list(refseq_38_df_chroms.index)
index_list

chroms = [(i,index_list[j+1]-1) for j,i in enumerate(index_list) if j < (len(index_list)-1)]
# chroms = [i for i,j in enumerate(index_list)]
refseq_38_df_chroms

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
0,NC_000001.11,RefSeq,region,1,248956422,.,+,.,ID=NC_000001.11:1..248956422;Dbxref=taxon:9606...
345657,NT_187361.1,RefSeq,region,1,175055,.,+,.,ID=NT_187361.1:1..175055;Dbxref=taxon:9606;Nam...
345685,NT_187362.1,RefSeq,region,1,32032,.,+,.,ID=NT_187362.1:1..32032;Dbxref=taxon:9606;Name...
345688,NT_187363.1,RefSeq,region,1,127682,.,+,.,ID=NT_187363.1:1..127682;Dbxref=taxon:9606;Nam...
345696,NT_187364.1,RefSeq,region,1,66860,.,+,.,ID=NT_187364.1:1..66860;Dbxref=taxon:9606;Name...
...,...,...,...,...,...,...,...,...,...
3780867,NT_187685.1,RefSeq,region,1,170148,.,+,.,ID=NT_187685.1:1..170148;Dbxref=taxon:9606;Nam...
3781063,NT_187686.1,RefSeq,region,1,215732,.,+,.,ID=NT_187686.1:1..215732;Dbxref=taxon:9606;Nam...
3781390,NT_187687.1,RefSeq,region,1,170537,.,+,.,ID=NT_187687.1:1..170537;Dbxref=taxon:9606;Nam...
3781593,NT_113949.2,RefSeq,region,1,177381,.,+,.,ID=NT_113949.2:1..177381;Dbxref=taxon:9606;Nam...


In [4]:
pd.options.display.max_colwidth = 200
refseq_38_df_pc_genes = refseq_38_df.loc[(refseq_38_df['Attributes'].str.contains('gene_biotype=protein_coding'))]
refseq_38_df_pc_genes

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
43,NC_000001.11,BestRefSeq,gene,65419,71585,.,+,.,"ID=gene-OR4F5;Dbxref=GeneID:79501,HGNC:HGNC:14825;Name=OR4F5;description=olfactory receptor family 4 subfamily F member 5;gbkey=Gene;gene=OR4F5;gene_biotype=protein_coding"
181,NC_000001.11,Gnomon,gene,350706,476822,.,-,.,ID=gene-LOC112268260;Dbxref=GeneID:112268260;Name=LOC112268260;gbkey=Gene;gene=LOC112268260;gene_biotype=protein_coding
243,NC_000001.11,BestRefSeq,gene,450740,451678,.,-,.,"ID=gene-OR4F29;Dbxref=GeneID:729759,HGNC:HGNC:31275;Name=OR4F29;description=olfactory receptor family 4 subfamily F member 29;gbkey=Gene;gene=OR4F29;gene_biotype=protein_coding;gene_synonym=OR7-21"
255,NC_000001.11,Gnomon,gene,586287,611297,.,-,.,ID=gene-LOC105378947;Dbxref=GeneID:105378947;Name=LOC105378947;gbkey=Gene;gene=LOC105378947;gene_biotype=protein_coding
287,NC_000001.11,BestRefSeq%2CGnomon,gene,683910,720115,.,-,.,"ID=gene-OR4F16;Dbxref=GeneID:81399,HGNC:HGNC:15079;Name=OR4F16;description=olfactory receptor family 4 subfamily F member 16;gbkey=Gene;gene=OR4F16;gene_biotype=protein_coding;gene_synonym=OR1-1,O..."
...,...,...,...,...,...,...,...,...,...
3781963,NC_012920.1,RefSeq,gene,10470,10766,.,+,.,"ID=gene-ND4L;Dbxref=GeneID:4539,HGNC:HGNC:7460,MIM:516004;Name=ND4L;gbkey=Gene;gene=ND4L;gene_biotype=protein_coding;gene_synonym=MTND4L"
3781965,NC_012920.1,RefSeq,gene,10760,12137,.,+,.,"ID=gene-ND4;Dbxref=GeneID:4538,HGNC:HGNC:7459,MIM:516003;Name=ND4;gbkey=Gene;gene=ND4;gene_biotype=protein_coding;gene_synonym=MTND4"
3781976,NC_012920.1,RefSeq,gene,12337,14148,.,+,.,"ID=gene-ND5;Dbxref=GeneID:4540,HGNC:HGNC:7461,MIM:516005;Name=ND5;gbkey=Gene;gene=ND5;gene_biotype=protein_coding;gene_synonym=MTND5"
3781978,NC_012920.1,RefSeq,gene,14149,14673,.,-,.,"ID=gene-ND6;Dbxref=GeneID:4541,HGNC:HGNC:7462,MIM:516006;Name=ND6;gbkey=Gene;gene=ND6;gene_biotype=protein_coding;gene_synonym=MTND6"


In [6]:
refseq_38_df_pc_genes.loc[refseq_38_df_pc_genes['Region_type'].str.contains('OR4F5')]

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
43,NC_000001.11,BestRefSeq,gene,65419,71585,.,+,.,"ID=gene-OR4F5;Dbxref=GeneID:79501,HGNC:HGNC:14825;Name=OR4F5;description=olfactory receptor family 4 subfamily F member 5;gbkey=Gene;gene=OR4F5;gene_biotype=protein_coding"


In [8]:
refseq_38_df.loc[43:45]

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
43,NC_000001.11,BestRefSeq,gene,65419,71585,.,+,.,"ID=gene-OR4F5;Dbxref=GeneID:79501,HGNC:HGNC:14825;Name=OR4F5;description=olfactory receptor family 4 subfamily F member 5;gbkey=Gene;gene=OR4F5;gene_biotype=protein_coding"
44,NC_000001.11,BestRefSeq,mRNA,65419,71585,.,+,.,"ID=rna-NM_001005484.2;Parent=gene-OR4F5;Dbxref=GeneID:79501,Genbank:NM_001005484.2,HGNC:HGNC:14825;Name=NM_001005484.2;gbkey=mRNA;gene=OR4F5;product=olfactory receptor family 4 subfamily F member ..."
45,NC_000001.11,BestRefSeq,exon,65419,65433,.,+,.,"ID=exon-NM_001005484.2-1;Parent=rna-NM_001005484.2;Dbxref=GeneID:79501,Genbank:NM_001005484.2,HGNC:HGNC:14825;gbkey=mRNA;gene=OR4F5;product=olfactory receptor family 4 subfamily F member 5;tag=Ref..."


In [5]:
pd.options.display.max_colwidth = 200
refseq_38_df_mrna = refseq_38_df.loc[(refseq_38_df['Attributes'].str.contains('NM_'))  & (refseq_38_df['Region_type'] == 'mRNA')]
refseq_38_df_mrna

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
44,NC_000001.11,BestRefSeq,mRNA,65419,71585,.,+,.,"ID=rna-NM_001005484.2;Parent=gene-OR4F5;Dbxref=GeneID:79501,Genbank:NM_001005484.2,HGNC:HGNC:14825;Name=NM_001005484.2;gbkey=mRNA;gene=OR4F5;product=olfactory receptor family 4 subfamily F member ..."
244,NC_000001.11,BestRefSeq,mRNA,450740,451678,.,-,.,"ID=rna-NM_001005221.2;Parent=gene-OR4F29;Dbxref=GeneID:729759,Genbank:NM_001005221.2,HGNC:HGNC:31275;Name=NM_001005221.2;gbkey=mRNA;gene=OR4F29;product=olfactory receptor family 4 subfamily F memb..."
321,NC_000001.11,BestRefSeq,mRNA,685716,686654,.,-,.,"ID=rna-NM_001005277.1;Parent=gene-OR4F16;Dbxref=GeneID:81399,Genbank:NM_001005277.1,HGNC:HGNC:15079;Name=NM_001005277.1;gbkey=mRNA;gene=OR4F16;product=olfactory receptor family 4 subfamily F membe..."
453,NC_000001.11,BestRefSeq,mRNA,923923,944574,.,+,.,"ID=rna-NM_001385640.1;Parent=gene-SAMD11;Dbxref=GeneID:148398,Genbank:NM_001385640.1,HGNC:HGNC:28706,MIM:616765;Name=NM_001385640.1;gbkey=mRNA;gene=SAMD11;product=sterile alpha motif domain contai..."
482,NC_000001.11,BestRefSeq,mRNA,923923,944574,.,+,.,"ID=rna-NM_001385641.1;Parent=gene-SAMD11;Dbxref=GeneID:148398,Genbank:NM_001385641.1,HGNC:HGNC:28706,MIM:616765;Name=NM_001385641.1;gbkey=mRNA;gene=SAMD11;product=sterile alpha motif domain contai..."
...,...,...,...,...,...,...,...,...,...
3781787,NT_113949.2,BestRefSeq,mRNA,122955,137265,.,-,.,"ID=rna-NM_001291696.1-13;Parent=gene-KIR2DS2-14;Dbxref=GeneID:100132285,Genbank:NM_001291696.1,HGNC:HGNC:6334,MIM:604953;Name=NM_001291696.1;Note=The RefSeq transcript has 1 substitution compared ..."
3781802,NT_113949.2,BestRefSeq,mRNA,122955,137265,.,-,.,"ID=rna-NM_001291701.1-13;Parent=gene-KIR2DS2-14;Dbxref=GeneID:100132285,Genbank:NM_001291701.1,HGNC:HGNC:6334,MIM:604953;Name=NM_001291701.1;Note=The RefSeq transcript has 1 substitution compared ..."
3781817,NT_113949.2,BestRefSeq,mRNA,122955,137265,.,-,.,"ID=rna-NM_001291695.1-13;Parent=gene-KIR2DS2-14;Dbxref=GeneID:100132285,Genbank:NM_001291695.1,HGNC:HGNC:6334,MIM:604953;Name=NM_001291695.1;Note=The RefSeq transcript has 1 substitution compared ..."
3781834,NT_113949.2,BestRefSeq,mRNA,122955,137265,.,-,.,"ID=rna-NM_012312.4-13;Parent=gene-KIR2DS2-14;Dbxref=GeneID:100132285,Genbank:NM_012312.4,HGNC:HGNC:6334,MIM:604953;Name=NM_012312.4;Note=The RefSeq transcript has 1 substitution compared to this g..."


In [22]:
refseq_38_df.loc[(refseq_38_df['Attributes'].str.contains('NM_001005484.2')) & (refseq_38_df['Region_type'] == 'mRNA')]

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
44,NC_000001.11,BestRefSeq,mRNA,65419,71585,.,+,.,"ID=rna-NM_001005484.2;Parent=gene-OR4F5;Dbxref=GeneID:79501,Genbank:NM_001005484.2,HGNC:HGNC:14825;Name=NM_001005484.2;gbkey=mRNA;gene=OR4F5;product=olfactory receptor family 4 subfamily F member ..."


In [31]:
refseq_38_df.loc[2802]

NC                                                                                                                                                                                                              NC_000001.11
RefSeq_validation                                                                                                                                                                                                 BestRefSeq
Region_type                                                                                                                                                                                                             mRNA
Start                                                                                                                                                                                                                1292391
End                                                                                                                 

In [6]:
pd.options.display.max_colwidth = 200
refseq_38_df_cds = refseq_38_df.loc[(refseq_38_df['Attributes'].str.contains('NP_'))  & (refseq_38_df['Region_type'] == 'CDS')]
refseq_38_df_cds

Unnamed: 0,NC,RefSeq_validation,Region_type,Start,End,Score,Strand,Phase,Attributes
48,NC_000001.11,BestRefSeq,CDS,65565,65573,.,+,0,"ID=cds-NP_001005484.2;Parent=rna-NM_001005484.2;Dbxref=CCDS:CCDS30547.1,GeneID:79501,Genbank:NP_001005484.2,HGNC:HGNC:14825;Name=NP_001005484.2;gbkey=CDS;gene=OR4F5;product=olfactory receptor 4F5;..."
49,NC_000001.11,BestRefSeq,CDS,69037,70008,.,+,0,"ID=cds-NP_001005484.2;Parent=rna-NM_001005484.2;Dbxref=CCDS:CCDS30547.1,GeneID:79501,Genbank:NP_001005484.2,HGNC:HGNC:14825;Name=NP_001005484.2;gbkey=CDS;gene=OR4F5;product=olfactory receptor 4F5;..."
246,NC_000001.11,BestRefSeq,CDS,450740,451678,.,-,0,"ID=cds-NP_001005221.2;Parent=rna-NM_001005221.2;Dbxref=CCDS:CCDS72675.1,GeneID:729759,Genbank:NP_001005221.2,HGNC:HGNC:31275;Name=NP_001005221.2;gbkey=CDS;gene=OR4F29;product=olfactory receptor 4F..."
323,NC_000001.11,BestRefSeq,CDS,685716,686654,.,-,0,"ID=cds-NP_001005277.1;Parent=rna-NM_001005277.1;Dbxref=CCDS:CCDS41221.1,GeneID:81399,Genbank:NP_001005277.1,HGNC:HGNC:15079;Name=NP_001005277.1;gbkey=CDS;gene=OR4F16;product=olfactory receptor 4F3..."
468,NC_000001.11,BestRefSeq,CDS,924432,924948,.,+,0,"ID=cds-NP_001372569.1;Parent=rna-NM_001385640.1;Dbxref=GeneID:148398,Genbank:NP_001372569.1,HGNC:HGNC:28706,MIM:616765;Name=NP_001372569.1;Note=isoform 2 is encoded by transcript variant 2;gbkey=C..."
...,...,...,...,...,...,...,...,...,...
3781864,NT_113949.2,BestRefSeq,CDS,147881,148180,.,-,2,"ID=cds-NP_703144.3-47;Parent=rna-NM_153443.4-47;Dbxref=GeneID:115653,Genbank:NP_703144.3,HGNC:HGNC:16312,MIM:610095;Name=NP_703144.3;Note=The RefSeq protein has 3 substitutions compared to this ge..."
3781865,NT_113949.2,BestRefSeq,CDS,146009,146302,.,-,2,"ID=cds-NP_703144.3-47;Parent=rna-NM_153443.4-47;Dbxref=GeneID:115653,Genbank:NP_703144.3,HGNC:HGNC:16312,MIM:610095;Name=NP_703144.3;Note=The RefSeq protein has 3 substitutions compared to this ge..."
3781866,NT_113949.2,BestRefSeq,CDS,140437,140541,.,-,2,"ID=cds-NP_703144.3-47;Parent=rna-NM_153443.4-47;Dbxref=GeneID:115653,Genbank:NP_703144.3,HGNC:HGNC:16312,MIM:610095;Name=NP_703144.3;Note=The RefSeq protein has 3 substitutions compared to this ge..."
3781867,NT_113949.2,BestRefSeq,CDS,139922,139974,.,-,2,"ID=cds-NP_703144.3-47;Parent=rna-NM_153443.4-47;Dbxref=GeneID:115653,Genbank:NP_703144.3,HGNC:HGNC:16312,MIM:610095;Name=NP_703144.3;Note=The RefSeq protein has 3 substitutions compared to this ge..."


In [7]:
from tqdm import tqdm
l = list()
for j, index in tqdm(enumerate(list(refseq_38_df_pc_genes.index))):
#     if j == 10:
#         break

    if j < len(list(refseq_38_df_pc_genes.index)) - 1 :
        start_index, stop_index = index, list(refseq_38_df_pc_genes.index)[j+1]

        current_gene = [e.replace('Name=', '') for e in refseq_38_df_pc_genes.loc[start_index]['Attributes'].split(';') if 'Name' in e][0]
        for cds in list(refseq_38_df_cds.index):
            if cds > start_index and cds < stop_index:

                current_gene_cds = [e.replace('gene=', '') for e in refseq_38_df_cds.loc[cds]['Attributes'].split(';') if 'gene' in e][0]
                parent_mrna = [e.replace('Parent=rna-', '') for e in refseq_38_df_cds.loc[cds]['Attributes'].split(';') if 'Parent' in e][0].split('.')[0]
                mrna_index = refseq_38_df_mrna.loc[refseq_38_df_mrna['Attributes'].str.contains(parent_mrna)].index[0]
                
                

#                 print(j, start_index, stop_index, cds, current_gene, current_gene_cds)

                if current_gene == current_gene_cds:
                    l.append(
                        {
                            'Gene' : current_gene,
                            'Gene_start' : refseq_38_df_pc_genes.loc[start_index]['Start'],
                            'Gene_stop' : refseq_38_df_pc_genes.loc[start_index]['End'],
                            'Exon_start' : refseq_38_df_cds.loc[cds]['Start'],
                            'Exon_stop' : refseq_38_df_cds.loc[cds]['End'],
                            'mRNA' : parent_mrna,
                            'mRNA_start' : refseq_38_df_mrna.loc[mrna_index]['Start'],
                            'mRNA_stop' : refseq_38_df_mrna.loc[mrna_index]['End'],
                        }
                    )
        
            
pd.options.display.max_rows =200
df = pd.DataFrame(l).sort_values(by=['Gene', 'Exon_start', 'Exon_stop']).drop_duplicates()
df['Length'] = df['Exon_stop'] - df['Exon_start']
df = df.sort_values(by=['Gene', 'Exon_start', 'Exon_stop', 'Length'], ascending=[True, True, True, False])
df = df.drop_duplicates(subset=['Gene', 'Exon_start'], keep='last').drop_duplicates(subset=['Gene', 'Exon_stop'], keep='first')

df


480it [11:44,  1.47s/it]

KeyboardInterrupt



In [16]:

output_json = dict()

for ig, gene in tqdm(enumerate(df.Gene.unique())):
    if ig == 10:
        break

    output_json[gene] = dict()

    tmp_gene_df = df.loc[df['Gene'] == gene]
    for chrom in chroms:
        if chrom[0] <= tmp_gene_df.index[0] < chrom[1]:
            
            output_json[gene]['CHROM'] = [c.replace('chromosome=', '') for c in refseq_38_df_chroms.loc[chrom[0]]['Attributes'].split(';') if 'chromosome=' in c][0]
# tmp_gene_df
    
    tmp_gene_df = tmp_gene_df.reset_index(drop=True)
    output_json[gene]['Gene_start_end'] = str(tmp_gene_df.Gene_start.unique()[0]) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
    for j, row in tmp_gene_df.iterrows():
        if 'Exon_list' not in output_json[gene]:
            output_json[gene]['Exon_list'] = list()
        if j == 0:
            output_json[gene]['5_prime_UTR'] = str(tmp_gene_df.Gene_start.unique()[0]) + '_' + str(row.Exon_start - 1)
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
            if j == (tmp_gene_df.shape[0] - 1):
                output_json[gene]['3_prime_UTR'] = str(row.Exon_stop + 1) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
        elif j > 0 and j < (tmp_gene_df.shape[0] - 1):
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
        elif j == (tmp_gene_df.shape[0] - 1):
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
            output_json[gene]['3_prime_UTR'] = str(row.Exon_stop + 1) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
            
for gene in output_json:
    
    output_json[gene]['Exons_length'] = [int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Exon_list']]
    output_json[gene]['Intron_list'] = list()
    output_json[gene]['Exon_size_total'] = sum([int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Exon_list']])
    output_json[gene]['Bin_size_exon'] = round(output_json[gene]['Exon_size_total'] / 20)
    for j, exon in enumerate(output_json[gene]['Exon_list']):
        if j < len(output_json[gene]['Exon_list'])-1:
            intron_start = int(output_json[gene]['Exon_list'][j].split('_')[1]) + 1
            intron_end = int(output_json[gene]['Exon_list'][j+1].split('_')[0]) - 1
            output_json[gene]['Intron_list'].append('{}_{}'.format(str(intron_start), str(intron_end)))
    output_json[gene]['Intron_length'] = [int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Intron_list']]
    output_json[gene]['Intron_size_total'] = sum([int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Intron_list']])
    output_json[gene]['Bin_size_intron'] = round(output_json[gene]['Intron_size_total'] / 20)
    output_json[gene]['Gene_body_size'] = int(output_json[gene]['Gene_start_end'].split('_')[1]) - int(output_json[gene]['Gene_start_end'].split('_')[0])
    output_json[gene]['Bin_size_GB'] = round(output_json[gene]['Gene_body_size'] / 20)
    
    init_gene_body_bin = int(output_json[gene]['Gene_start_end'].split('_')[0])
    output_json[gene]['Bin_size_UpDown'] = round(5000 / 20)
    output_json[gene]['5_prime_UTR_size'] = int(output_json[gene]['5_prime_UTR'].split('_')[1]) - int(output_json[gene]['5_prime_UTR'].split('_')[0])
    output_json[gene]['3_prime_UTR_size'] = int(output_json[gene]['3_prime_UTR'].split('_')[1]) - int(output_json[gene]['3_prime_UTR'].split('_')[0])

    output_json[gene]['Bin_size_5_prime_UTR'] = round(int(output_json[gene]['5_prime_UTR_size']) / 20)
    output_json[gene]['Bin_size_3_prime_UTR'] = round(int(output_json[gene]['3_prime_UTR_size']) / 20)
            
refseq_38_df_transformed = pd.DataFrame.from_dict(output_json).T
refseq_38_df_transformed

10it [00:00, 433.05it/s]


Unnamed: 0,Gene_start_end,Exon_list,5_prime_UTR,3_prime_UTR,Exons_length,Intron_list,Exon_size_total,Bin_size_exon,Intron_length,Intron_size_total,Bin_size_intron,Gene_body_size,Bin_size_GB,Bin_size_UpDown,5_prime_UTR_size,3_prime_UTR_size,Bin_size_5_prime_UTR,Bin_size_3_prime_UTR
ACAP3,1292384_1307930,"[1293564_1293708, 1293823_1293933, 1294090_1294199, 1294402_1294628, 1294718_1294816, 1295447_1295554, 1295736_1295938, 1296015_1296109, 1296211_1296280, 1296425_1296633, 1297822_1297933, 1298013_...",1292384_1293563,1307816_1307930,"[144, 110, 109, 226, 98, 107, 202, 94, 69, 208, 111, 100, 51, 112, 11, 74, 95, 44, 183, 58, 53, 119, 57, 46]","[1293709_1293822, 1293934_1294089, 1294200_1294401, 1294629_1294717, 1294817_1295446, 1295555_1295735, 1295939_1296014, 1296110_1296210, 1296281_1296424, 1296634_1297821, 1297934_1298012, 1298114_...",2481,124,"[113, 155, 201, 88, 629, 180, 75, 100, 143, 1187, 78, 255, 144, 664, 473, 66, 88, 305, 1294, 874, 185, 803, 3624]",11724,586,15546,777,250,1179,114,59,6
AGRN,1020102_1056119,"[1020173_1020373, 1022201_1022462, 1034556_1034703, 1035277_1035324, 1040665_1040880, 1041173_1041397, 1041478_1041702, 1041956_1042162, 1043239_1043457, 1043538_1043732, 1043823_1044023, 1044109_...",1020102_1020172,1054982_1056119,"[200, 261, 147, 47, 215, 224, 224, 206, 218, 194, 200, 148, 105, 116, 164, 143, 124, 105, 338, 137, 127, 114, 119, 353, 192, 215, 229, 134, 96, 164, 111, 11, 116, 192, 87, 23, 32, 224, 103, 157]","[1020374_1022200, 1022463_1034555, 1034704_1035276, 1035325_1040664, 1040881_1041172, 1041398_1041477, 1041703_1041955, 1042163_1043238, 1043458_1043537, 1043733_1043822, 1044024_1044108, 1044258_...",6315,316,"[1826, 12092, 572, 5339, 291, 79, 252, 1075, 79, 89, 84, 75, 720, 80, 208, 86, 70, 130, 83, 368, 117, 87, 115, 500, 175, 113, 106, 194, 96, 133, 193, 208, 82, 81, 176, 1443, 258, 469, 271]",28415,1421,36017,1801,250,70,1137,4,57
ANKRD65,1418420_1422471,"[1419100_1419549, 1420052_1420592, 1420797_1421005]",1418420_1419099,1421006_1422471,"[449, 540, 208]","[1419550_1420051, 1420593_1420796]",1197,60,"[501, 203]",704,35,4051,203,250,679,1465,34,73
ATAD3A,1512143_1534686,"[1512269_1512473, 1516012_1516088, 1517167_1517412, 1517716_1517775, 1518921_1518990, 1520141_1520306, 1520548_1520617, 1522744_1522899, 1523511_1523567, 1523839_1523964, 1524273_1524397, 1525240_...",1512143_1512268,1534073_1534686,"[204, 76, 245, 59, 69, 165, 69, 155, 56, 125, 124, 51, 70, 167, 108, 146]","[1512474_1516011, 1516089_1517166, 1517413_1517715, 1517776_1518920, 1518991_1520140, 1520307_1520547, 1520618_1522743, 1522900_1523510, 1523568_1523838, 1523965_1524272, 1524398_1525239, 1525292_...",1889,94,"[3537, 1077, 302, 1144, 1149, 240, 2125, 610, 270, 307, 841, 1168, 1162, 1359, 4593]",19884,994,22543,1127,250,125,613,6,31
ATAD3B,1471732_1509466,"[1471885_1472089, 1477274_1477350, 1478500_1478745, 1479049_1479108, 1480867_1480936, 1482138_1482303, 1482545_1482614, 1485016_1485171, 1485782_1485838, 1486110_1486235, 1486544_1486668, 1487863_...",1471732_1471884,1495818_1509466,"[204, 76, 245, 59, 69, 165, 69, 155, 56, 125, 124, 51, 70, 167, 108, 332]","[1472090_1477273, 1477351_1478499, 1478746_1479048, 1479109_1480866, 1480937_1482137, 1482304_1482544, 1482615_1485015, 1485172_1485781, 1485839_1486109, 1486236_1486543, 1486669_1487862, 1487915_...",2075,104,"[5183, 1148, 302, 1757, 1200, 240, 2400, 609, 270, 307, 1193, 1288, 981, 137, 4812]",21827,1091,37734,1887,250,152,13648,8,682
ATAD3C,1449689_1470163,"[1450684_1450758, 1452046_1452122, 1452365_1452434, 1454345_1454500, 1455460_1455519, 1455791_1455916, 1456225_1456349, 1457129_1457180, 1459161_1459231, 1460750_1460917, 1462600_1462708, 1468384_...",1449689_1450683,1468531_1470163,"[74, 76, 69, 155, 59, 125, 124, 51, 70, 167, 108, 146]","[1450759_1452045, 1452123_1452364, 1452435_1454344, 1454501_1455459, 1455520_1455790, 1455917_1456224, 1456350_1457128, 1457181_1459160, 1459232_1460749, 1460918_1462599, 1462709_1468383]",1224,61,"[1286, 241, 1909, 958, 270, 307, 778, 1979, 1517, 1681, 5674]",16600,830,20474,1024,250,994,1632,50,82
AURKAIP1,1373730_1375516,"[1373801_1373902, 1374000_1374445, 1374705_1374756]",1373730_1373800,1374757_1375516,"[101, 445, 51]","[1373903_1373999, 1374446_1374704]",597,30,"[96, 258]",354,18,1786,89,250,70,759,4,38
B3GALT6,1232237_1235041,[1232279_1233268],1232237_1232278,1233269_1235041,[989],[],989,49,[],0,0,2804,140,250,41,1772,2,89
C1QTNF12,1242448_1247218,"[1242548_1242646, 1242835_1242913, 1243062_1243152, 1243444_1243552, 1243954_1244105, 1244191_1244275, 1244381_1244497, 1246514_1246690]",1242448_1242547,1246691_1247218,"[98, 78, 90, 108, 151, 84, 116, 176]","[1242647_1242834, 1242914_1243061, 1243153_1243443, 1243553_1243953, 1244106_1244190, 1244276_1244380, 1244498_1246513]",901,45,"[187, 147, 290, 400, 84, 104, 2015]",3227,161,4770,238,250,99,527,5,26
C1orf159,1081823_1116089,"[1082893_1082987, 1083915_1084383, 1084481_1084506, 1085878_1086012, 1087139_1087204, 1087502_1087597, 1090353_1090428, 1090876_1090983, 1091472_1091543]",1081823_1082892,1091544_1116089,"[94, 468, 25, 134, 65, 95, 75, 107, 71]","[1082988_1083914, 1084384_1084480, 1084507_1085877, 1086013_1087138, 1087205_1087501, 1087598_1090352, 1090429_1090875, 1090984_1091471]",1134,57,"[926, 96, 1370, 1125, 296, 2754, 446, 487]",7500,375,34266,1713,250,1069,24545,53,1227


In [None]:
l = list()
for j, index in tqdm(enumerate(list(refseq_38_df_pc_genes.index))):
#     if j == 10:
#         break

    if j < len(list(refseq_38_df_pc_genes.index)) - 1 :
        start_index, stop_index = index, list(refseq_38_df_pc_genes.index)[j+1]

        current_gene = [e.replace('Name=', '') for e in refseq_38_df_pc_genes.loc[start_index]['Attributes'].split(';') if 'Name' in e][0]
        for cds in list(refseq_38_df_cds.index):
            if cds > start_index and cds < stop_index:

                current_gene_cds = [e.replace('gene=', '') for e in refseq_38_df_cds.loc[cds]['Attributes'].split(';') if 'gene' in e][0]

#                 print(j, start_index, stop_index, cds, current_gene, current_gene_cds)

                if current_gene == current_gene_cds:
                    l.append(
                        {
                            'Gene' : current_gene,
                            'Gene_start' : refseq_38_df_pc_genes.loc[start_index]['Start'],
                            'Gene_stop' : refseq_38_df_pc_genes.loc[start_index]['End'],
                            'Exon_start' : refseq_38_df_cds.loc[cds]['Start'],
                            'Exon_stop' : refseq_38_df_cds.loc[cds]['End'],
                        }
                    )
        
            
pd.options.display.max_rows =200
df = pd.DataFrame(l).sort_values(by=['Gene', 'Exon_start', 'Exon_stop']).drop_duplicates()
df['Length'] = df['Exon_stop'] - df['Exon_start']
df = df.sort_values(by=['Gene', 'Exon_start', 'Exon_stop', 'Length'], ascending=[True, True, True, False])
df = df.drop_duplicates(subset=['Gene', 'Exon_start'], keep='last').drop_duplicates(subset=['Gene', 'Exon_stop'], keep='first')

df

output_json = dict()

for ig, gene in tqdm(enumerate(df.Gene.unique())):
    if ig == 10:
        break

    output_json[gene] = dict()

    tmp_gene_df = df.loc[df['Gene'] == gene]
    for chrom in chroms:
        if chrom[0] <= tmp_gene_df.index[0] < chrom[1]:
            
            output_json[gene]['CHROM'] = [c.replace('chromosome=', '') for c in refseq_38_df_chroms.loc[chrom[0]]['Attributes'].split(';') if 'chromosome=' in c][0]
# tmp_gene_df
    
    tmp_gene_df = tmp_gene_df.reset_index(drop=True)
    output_json[gene]['Gene_start_end'] = str(tmp_gene_df.Gene_start.unique()[0]) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
    for j, row in tmp_gene_df.iterrows():
        if 'Exon_list' not in output_json[gene]:
            output_json[gene]['Exon_list'] = list()
        if j == 0:
            output_json[gene]['5_prime_UTR'] = str(tmp_gene_df.Gene_start.unique()[0]) + '_' + str(row.Exon_start - 1)
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
            if j == (tmp_gene_df.shape[0] - 1):
                output_json[gene]['3_prime_UTR'] = str(row.Exon_stop + 1) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
        elif j > 0 and j < (tmp_gene_df.shape[0] - 1):
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
        elif j == (tmp_gene_df.shape[0] - 1):
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
            output_json[gene]['3_prime_UTR'] = str(row.Exon_stop + 1) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
            
for gene in output_json:
    
    output_json[gene]['Intron_list'] = list()
    output_json[gene]['Exon_size_total'] = sum([int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Exon_list']])
    output_json[gene]['Bin_size_exon'] = round(output_json[gene]['Exon_size_total'] / 20)
    for j, exon in enumerate(output_json[gene]['Exon_list']):
        if j < len(output_json[gene]['Exon_list'])-1:
            intron_start = int(output_json[gene]['Exon_list'][j].split('_')[1]) + 1
            intron_end = int(output_json[gene]['Exon_list'][j+1].split('_')[0]) - 1
            output_json[gene]['Intron_list'].append('{}_{}'.format(str(intron_start), str(intron_end)))
    output_json[gene]['Intron_size_total'] = sum([int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Intron_list']])
    output_json[gene]['Bin_size_intron'] = round(output_json[gene]['Intron_size_total'] / 20)
    output_json[gene]['Gene_body_size'] = int(output_json[gene]['Gene_start_end'].split('_')[1]) - int(output_json[gene]['Gene_start_end'].split('_')[0])
    output_json[gene]['Bin_size_GB'] = round(output_json[gene]['Gene_body_size'] / 20)
    
    init_gene_body_bin = int(output_json[gene]['Gene_start_end'].split('_')[0])
    output_json[gene]['Bin_size_UpDown'] = round(5000 / 20)
    output_json[gene]['5_prime_UTR_size'] = int(output_json[gene]['5_prime_UTR'].split('_')[1]) - int(output_json[gene]['5_prime_UTR'].split('_')[0])
    output_json[gene]['3_prime_UTR_size'] = int(output_json[gene]['3_prime_UTR'].split('_')[1]) - int(output_json[gene]['3_prime_UTR'].split('_')[0])

    output_json[gene]['Bin_size_5_prime_UTR'] = round(int(output_json[gene]['5_prime_UTR_size']) / 20)
    output_json[gene]['Bin_size_3_prime_UTR'] = round(int(output_json[gene]['3_prime_UTR_size']) / 20)
    
    output_json[gene]['Gene_body_bins'] = [round((b* output_json[gene]['Bin_size_GB']) + init_gene_body_bin) for b in range(20)]
    output_json[gene]['Gene_body_bins'] = [(e, output_json[gene]['Gene_body_bins'][i+1]) for i, e in enumerate(output_json[gene]['Gene_body_bins']) if i < len(output_json[gene]['Gene_body_bins'])-1]

    output_json[gene]['Upstream_bins'] = sorted([int(output_json[gene]['Gene_start_end'].split('_')[0]) - 1  - round((b* output_json[gene]['Bin_size_UpDown'])) for b in range(20)])
    output_json[gene]['Upstream_bins'] = [(e, output_json[gene]['Upstream_bins'][i+1]) for i, e in enumerate(output_json[gene]['Upstream_bins']) if i < len(output_json[gene]['Upstream_bins'])-1]

    output_json[gene]['Downstream_bins'] = [int(output_json[gene]['Gene_start_end'].split('_')[1]) + 1 + round((b* output_json[gene]['Bin_size_UpDown'])) for b in range(20)]
    output_json[gene]['Downstream_bins'] = [(e, output_json[gene]['Downstream_bins'][i+1]) for i, e in enumerate(output_json[gene]['Downstream_bins']) if i < len(output_json[gene]['Downstream_bins'])-1]


    output_json[gene]['Exon_bins'] = list()
    tmp_bins_list = [sub_pos for exon in output_json[gene]['Exon_list'] for sub_pos in range(int(exon.split('_')[0]), int(exon.split('_')[1]) + 1) ]
    output_json[gene]['Exon_bins'] = [pos for i, pos in enumerate(tmp_bins_list) if (i+1) % output_json[gene]['Bin_size_exon'] == 0]
    output_json[gene]['Exon_bins'] = [output_json[gene]['Exon_list'][0].split('_')[0]] + output_json[gene]['Exon_bins']
    output_json[gene]['Exon_bins'] = [(e, output_json[gene]['Exon_bins'][i+1]) for i, e in enumerate(output_json[gene]['Exon_bins']) if i < len(output_json[gene]['Exon_bins'])-1]
    
    current_size = 0
    previous_size = 0
    exon_bin_counter = 0
    
    for j, exon in enumerate(output_json[gene]['Exon_list']):
                
        if j < len(output_json[gene]['Exon_list'])-1:
            intron_start = int(output_json[gene]['Exon_list'][j].split('_')[1]) + 1
            intron_end = int(output_json[gene]['Exon_list'][j+1].split('_')[0]) - 1
            output_json[gene]['Intron_list'].append('{}_{}'.format(str(intron_start), str(intron_end)))
    
    
    
    output_json[gene]['Intron_bins'] = list()
    if output_json[gene]['Intron_list']:
        tmp_bins_list = [sub_pos for intron in output_json[gene]['Intron_list'] for sub_pos in range(int(intron.split('_')[0]), int(intron.split('_')[1]) + 1) ]
        output_json[gene]['Intron_bins'] = [pos for i, pos in enumerate(tmp_bins_list) if (i+1) % output_json[gene]['Bin_size_intron'] == 0]
        output_json[gene]['Intron_bins'] = [output_json[gene]['Intron_list'][0].split('_')[0]] + output_json[gene]['Intron_bins']
        output_json[gene]['Intron_bins'] = [(e, output_json[gene]['Intron_bins'][i+1]) for i, e in enumerate(output_json[gene]['Intron_bins']) if i < len(output_json[gene]['Intron_bins'])-1]
    
    
    output_json[gene]['BED_boundaries_for_hail_10kbp_added'] = "chr" +  str(output_json[gene]['CHROM']) + ':' + str((int(output_json[gene]['Gene_start_end'].split('_')[0]) - 5000)) + '-' + str((int(output_json[gene]['Gene_start_end'].split('_')[1]) + 5000))

                
                
            

In [241]:
output_json = dict()
bins_position = list()


for ig, gene in tqdm(enumerate(df.Gene.unique())):
    if ig == 10:
        break

    output_json[gene] = dict()

    tmp_gene_df = df.loc[df['Gene'] == gene]
    for chrom in chroms:
        if chrom[0] <= tmp_gene_df.index[0] < chrom[1]:
            
            output_json[gene]['CHROM'] = [c.replace('chromosome=', '') for c in refseq_38_df_chroms.loc[chrom[0]]['Attributes'].split(';') if 'chromosome=' in c][0]
# tmp_gene_df
    
    tmp_gene_df = tmp_gene_df.reset_index(drop=True)
    output_json[gene]['Gene_start_end'] = str(tmp_gene_df.Gene_start.unique()[0]) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
    for j, row in tmp_gene_df.iterrows():
        if 'Exon_list' not in output_json[gene]:
            output_json[gene]['Exon_list'] = list()
        if j == 0:
            output_json[gene]['5_prime_UTR'] = str(tmp_gene_df.Gene_start.unique()[0]) + '_' + str(row.Exon_start - 1)
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
            if j == (tmp_gene_df.shape[0] - 1):
                output_json[gene]['3_prime_UTR'] = str(row.Exon_stop + 1) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
        elif j > 0 and j < (tmp_gene_df.shape[0] - 1):
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
        elif j == (tmp_gene_df.shape[0] - 1):
            output_json[gene]['Exon_list'].append(str(row.Exon_start) + '_' + str(row.Exon_stop))
            output_json[gene]['3_prime_UTR'] = str(row.Exon_stop + 1) + '_' + str(tmp_gene_df.Gene_stop.unique()[0])
for gene in output_json:
    output_json[gene]['Intron_list'] = list()
    output_json[gene]['Exon_size_total'] = sum([int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Exon_list']])
    output_json[gene]['Bin_size_exon'] = round(output_json[gene]['Exon_size_total'] / 20)
    for j, exon in enumerate(output_json[gene]['Exon_list']):
        if j < len(output_json[gene]['Exon_list'])-1:
            intron_start = int(output_json[gene]['Exon_list'][j].split('_')[1]) + 1
            intron_end = int(output_json[gene]['Exon_list'][j+1].split('_')[0]) - 1
            output_json[gene]['Intron_list'].append('{}_{}'.format(str(intron_start), str(intron_end)))
    output_json[gene]['Intron_size_total'] = sum([int(e.split('_')[1]) - int(e.split('_')[0]) for e in output_json[gene]['Intron_list']])
    output_json[gene]['Bin_size_intron'] = round(output_json[gene]['Intron_size_total'] / 20)
    output_json[gene]['Gene_body_size'] = int(output_json[gene]['Gene_start_end'].split('_')[1]) - int(output_json[gene]['Gene_start_end'].split('_')[0])
    output_json[gene]['Bin_size_GB'] = round(output_json[gene]['Gene_body_size'] / 20)
    
    init_gene_body_bin = int(output_json[gene]['Gene_start_end'].split('_')[0])
    output_json[gene]['Bin_size_UpDown'] = round(5000 / 20)
    output_json[gene]['5_prime_UTR_size'] = int(output_json[gene]['5_prime_UTR'].split('_')[1]) - int(output_json[gene]['5_prime_UTR'].split('_')[0])
    output_json[gene]['3_prime_UTR_size'] = int(output_json[gene]['3_prime_UTR'].split('_')[1]) - int(output_json[gene]['3_prime_UTR'].split('_')[0])

    output_json[gene]['Bin_size_5_prime_UTR'] = round(int(output_json[gene]['5_prime_UTR_size']) / 20)
    output_json[gene]['Bin_size_3_prime_UTR'] = round(int(output_json[gene]['3_prime_UTR_size']) / 20)
    
    output_json[gene]['Gene_body_bins'] = [round((b* output_json[gene]['Bin_size_GB']) + init_gene_body_bin) for b in range(20)]
    output_json[gene]['Gene_body_bins'] = ["{}_{}".format(str(e), str(output_json[gene]['Gene_body_bins'][i+1])) for i, e in enumerate(output_json[gene]['Gene_body_bins']) if i < len(output_json[gene]['Gene_body_bins'])-1]

    output_json[gene]['Upstream_bins'] = sorted([int(output_json[gene]['Gene_start_end'].split('_')[0]) - 1  - round((b* output_json[gene]['Bin_size_UpDown'])) for b in range(20)])
    output_json[gene]['Upstream_bins'] = ["{}_{}".format(str(e), str(output_json[gene]['Upstream_bins'][i+1])) for i, e in enumerate(output_json[gene]['Upstream_bins']) if i < len(output_json[gene]['Upstream_bins'])-1]

    output_json[gene]['Downstream_bins'] = [int(output_json[gene]['Gene_start_end'].split('_')[1]) + 1 + round((b* output_json[gene]['Bin_size_UpDown'])) for b in range(20)]
    output_json[gene]['Downstream_bins'] = ["{}_{}".format(str(e), str(output_json[gene]['Downstream_bins'][i+1])) for i, e in enumerate(output_json[gene]['Downstream_bins']) if i < len(output_json[gene]['Downstream_bins'])-1]


    output_json[gene]['Exon_bins'] = list()
    tmp_bins_list = [sub_pos for exon in output_json[gene]['Exon_list'] for sub_pos in range(int(exon.split('_')[0]), int(exon.split('_')[1]) + 1) ]
    output_json[gene]['Exon_bins'] = [pos for i, pos in enumerate(tmp_bins_list) if (i+1) % output_json[gene]['Bin_size_exon'] == 0]
    output_json[gene]['Exon_bins'] = [output_json[gene]['Exon_list'][0].split('_')[0]] + output_json[gene]['Exon_bins']
    output_json[gene]['Exon_bins'] = ["{}_{}".format(str(e), str(output_json[gene]['Exon_bins'][i+1])) for i, e in enumerate(output_json[gene]['Exon_bins']) if i < len(output_json[gene]['Exon_bins'])-1]
    
    current_size = 0
    previous_size = 0
    exon_bin_counter = 0
    
    for j, exon in enumerate(output_json[gene]['Exon_list']):
                
        if j < len(output_json[gene]['Exon_list'])-1:
            intron_start = int(output_json[gene]['Exon_list'][j].split('_')[1]) + 1
            intron_end = int(output_json[gene]['Exon_list'][j+1].split('_')[0]) - 1
            output_json[gene]['Intron_list'].append('{}_{}'.format(str(intron_start), str(intron_end)))
    
    for elem in ['Gene_body_bins', 'Upstream_bins', 'Downstream_bins', 'Exon_bins']:
        bins_position.extend([{'CHR' :  'chr' +  output_json[gene]['CHROM'], 'Start' : e.split('_')[0], 'End' : e.split('_')[1], 'Region_type' : elem.replace('_bins', ''), 'Num' : i,  'Gene': gene} for i,e in zip(list(range(1,21)), output_json[gene][elem])])
    
    output_json[gene]['Intron_bins'] = list()
    if output_json[gene]['Intron_list']:
        tmp_bins_list = [sub_pos for intron in output_json[gene]['Intron_list'] for sub_pos in range(int(intron.split('_')[0]), int(intron.split('_')[1]) + 1) ]
        output_json[gene]['Intron_bins'] = [pos for i, pos in enumerate(tmp_bins_list) if (i+1) % output_json[gene]['Bin_size_intron'] == 0]
        output_json[gene]['Intron_bins'] = [output_json[gene]['Intron_list'][0].split('_')[0]] + output_json[gene]['Intron_bins']
        output_json[gene]['Intron_bins'] = ["{}_{}".format(str(e), str(output_json[gene]['Intron_bins'][i+1])) for i, e in enumerate(output_json[gene]['Intron_bins']) if i < len(output_json[gene]['Intron_bins'])-1]
        bins_position.extend([{'CHR' :  'chr' +  output_json[gene]['CHROM'],  'Start' : e.split('_')[0], 'End' : e.split('_')[1], 'Region_type' : elem.replace('_bins', ''), 'Num' : i, 'Gene' :  gene} for i,e in zip(list(range(1,21)), output_json[gene]['Intron_bins'])])
#         bins_position.extend([{'Num' : i, 'CHR' :  'chr' +  output_json[gene]['CHROM'], 'Region' : e, 'Region_type' : elem.replace('_bins', '')} for i,e in zip(list(range(1,21)), output_json[gene][elem])])

    
   

    
    
    output_json[gene]['BED_boundaries_for_hail_10kbp_added'] = "chr" +  str(output_json[gene]['CHROM']) + ':' + str((int(output_json[gene]['Gene_start_end'].split('_')[0]) - 5000)) + '-' + str((int(output_json[gene]['Gene_start_end'].split('_')[1]) + 5000))

            
refseq_38_df_transformed = pd.DataFrame.from_dict(output_json).T
refseq_38_df_transformed


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




Unnamed: 0,CHROM,Gene_start_end,Exon_list,5_prime_UTR,3_prime_UTR,Intron_list,Exon_size_total,Bin_size_exon,Intron_size_total,Bin_size_intron,...,5_prime_UTR_size,3_prime_UTR_size,Bin_size_5_prime_UTR,Bin_size_3_prime_UTR,Gene_body_bins,Upstream_bins,Downstream_bins,Exon_bins,Intron_bins,BED_boundaries_for_hail_10kbp_added
A1BG,2,58345183_58353492,"[58347022_58347029, 58347353_58347640, 58350370_58350651, 58351391_58351687, 58352283_58352555, 58352928_58353197, 58353292_58353327, 58353404_58353437]",58345183_58347021,58353438_58353492,"[58347030_58347352, 58347641_58350369, 58350652_58351390, 58351688_58352282, 58352556_58352927, 58353198_58353291, 58353328_58353403, 58347030_58347352, 58347641_58350369, 58350652_58351390, 58351...",1480,74,4921,246,...,1838,54,92,3,"[58345183_58345598, 58345598_58346013, 58346013_58346428, 58346428_58346843, 58346843_58347258, 58347258_58347673, 58347673_58348088, 58348088_58348503, 58348503_58348918, 58348918_58349333, 58349...","[58340432_58340682, 58340682_58340932, 58340932_58341182, 58341182_58341432, 58341432_58341682, 58341682_58341932, 58341932_58342182, 58342182_58342432, 58342432_58342682, 58342682_58342932, 58342...","[58353493_58353743, 58353743_58353993, 58353993_58354243, 58354243_58354493, 58354493_58354743, 58354743_58354993, 58354993_58355243, 58355243_58355493, 58355493_58355743, 58355743_58355993, 58355...","[58347022_58347418, 58347418_58347492, 58347492_58347566, 58347566_58347640, 58347640_58350443, 58350443_58350517, 58350517_58350591, 58350591_58351404, 58351404_58351478, 58351478_58351552, 58351...","[58347030_58347275, 58347275_58347809, 58347809_58348055, 58348055_58348301, 58348301_58348547, 58348547_58348793, 58348793_58349039, 58349039_58349285, 58349285_58349531, 58349531_58349777, 58349...",chr2:58340183-58358492
A1CF,1,50799409_50885666,"[50806729_50806880, 50809894_50810042, 50811040_50811176, 50813857_50814062, 50816006_50816279, 50820552_50820649, 50828131_50828295, 50836074_50836312, 50841862_50841992, 50843988_50844122, 50850...",50799409_50806728,50859941_50885666,"[50806881_50809893, 50810043_50811039, 50811177_50813856, 50814063_50816005, 50816280_50820551, 50820650_50828130, 50828296_50836073, 50836313_50841861, 50841993_50843987, 50844123_50850664, 50850...",1896,95,51293,2565,...,7319,25725,366,1286,"[50799409_50803722, 50803722_50808035, 50808035_50812348, 50812348_50816661, 50816661_50820974, 50820974_50825287, 50825287_50829600, 50829600_50833913, 50833913_50838226, 50838226_50842539, 50842...","[50794658_50794908, 50794908_50795158, 50795158_50795408, 50795408_50795658, 50795658_50795908, 50795908_50796158, 50796158_50796408, 50796408_50796658, 50796658_50796908, 50796908_50797158, 50797...","[50885667_50885917, 50885917_50886167, 50886167_50886417, 50886417_50886667, 50886667_50886917, 50886917_50887167, 50887167_50887417, 50887417_50887667, 50887667_50887917, 50887917_50888167, 50888...","[50806729_50806823, 50806823_50809931, 50809931_50810026, 50810026_50811118, 50811118_50813893, 50813893_50813988, 50813988_50816026, 50816026_50816121, 50816121_50816216, 50816216_50820583, 50820...","[50806881_50809445, 50809445_50812296, 50812296_50815067, 50815067_50817906, 50817906_50820471, 50820471_50823134, 50823134_50825699, 50825699_50828429, 50828429_50830994, 50830994_50833559, 50833...",chr1:50794409-50890666
A2M,2,9067708_9116229,"[9067823_9067839, 9068183_9068224, 9068740_9068842, 9069745_9069813, 9070488_9070578, 9072359_9072486, 9072653_9072871, 9074560_9074783, 9076756_9076936, 9077346_9077420, 9077701_9077857, 9079244_...",9067708_9067822,9115850_9116229,"[9067840_9068182, 9068225_9068739, 9068843_9069744, 9069814_9070487, 9070579_9072358, 9072487_9072652, 9072872_9074559, 9074784_9076755, 9076937_9077345, 9077421_9077700, 9077858_9079243, 9079332_...",4389,219,43567,2178,...,114,379,6,19,"[9067708_9070134, 9070134_9072560, 9072560_9074986, 9074986_9077412, 9077412_9079838, 9079838_9082264, 9082264_9084690, 9084690_9087116, 9087116_9089542, 9089542_9091968, 9091968_9094394, 9094394_...","[9062957_9063207, 9063207_9063457, 9063457_9063707, 9063707_9063957, 9063957_9064207, 9064207_9064457, 9064457_9064707, 9064707_9064957, 9064957_9065207, 9065207_9065457, 9065457_9065707, 9065707_...","[9116230_9116480, 9116480_9116730, 9116730_9116980, 9116980_9117230, 9117230_9117480, 9117480_9117730, 9117730_9117980, 9117980_9118230, 9118230_9118480, 9118480_9118730, 9118730_9118980, 9118980_...","[9067823_9069801, 9069801_9072474, 9072474_9072859, 9072859_9074766, 9074766_9077366, 9077366_9079251, 9079251_9079777, 9079777_9089946, 9089946_9091215, 9091215_9093469, 9093469_9095081, 9095081_...","[9067840_9070231, 9070231_9072628, 9072628_9075249, 9075249_9077683, 9077683_9080367, 9080367_9082545, 9082545_9084723, 9084723_9086901, 9086901_9089079, 9089079_9091787, 9091787_9094080, 9094080_...",chr2:9062708-9121229
A2ML1,2,8822554_8887202,"[8822652_8822713, 8823182_8823365, 8823720_8823882, 8829727_8829779, 8834662_8834682, 8835507_8835666, 8836255_8836339, 8837440_8837566, 8838336_8838450, 8839113_8839222, 8841369_8841536, 8843134_...",8822554_8822651,8875012_8887202,"[8822714_8823181, 8823366_8823719, 8823883_8829726, 8829780_8834661, 8834683_8835506, 8835667_8836254, 8836340_8837439, 8837567_8838335, 8838451_8839112, 8839223_8841368, 8841537_8843133, 8843362_...",4332,217,47957,2398,...,97,12190,5,610,"[8822554_8825786, 8825786_8829018, 8829018_8832250, 8832250_8835482, 8835482_8838714, 8838714_8841946, 8841946_8845178, 8845178_8848410, 8848410_8851642, 8851642_8854874, 8854874_8858106, 8858106_...","[8817803_8818053, 8818053_8818303, 8818303_8818553, 8818553_8818803, 8818803_8819053, 8819053_8819303, 8819303_8819553, 8819553_8819803, 8819803_8820053, 8820053_8820303, 8820303_8820553, 8820553_...","[8887203_8887453, 8887453_8887703, 8887703_8887953, 8887953_8888203, 8888203_8888453, 8888453_8888703, 8888703_8888953, 8888953_8889203, 8889203_8889453, 8889453_8889703, 8889703_8889953, 8889953_...","[8822652_8823336, 8823336_8829751, 8829751_8836262, 8836262_8838348, 8838348_8841373, 8841373_8843187, 8843187_8845481, 8845481_8847598, 8847598_8848836, 8848836_8850207, 8850207_8851933, 8851933_...","[8822714_8825458, 8825458_8827856, 8827856_8830307, 8830307_8832705, 8832705_8835124, 8835124_8837894, 8837894_8840517, 8840517_8843083, 8843083_8845773, 8845773_8848467, 8848467_8851266, 8851266_...",chr2:8817554-8892202
A3GALT2,1,33306766_33321098,"[33306766_33307453, 33312052_33312189, 33312501_33312590, 33312807_33312890, 33321076_33321098]",33306766_33306765,33321099_33321098,"[33307454_33312051, 33312190_33312500, 33312591_33312806, 33312891_33321075, 33307454_33312051, 33312190_33312500, 33312591_33312806, 33312891_33321075]",1018,51,13306,665,...,-1,-1,0,0,"[33306766_33307483, 33307483_33308200, 33308200_33308917, 33308917_33309634, 33309634_33310351, 33310351_33311068, 33311068_33311785, 33311785_33312502, 33312502_33313219, 33313219_33313936, 33313...","[33302015_33302265, 33302265_33302515, 33302515_33302765, 33302765_33303015, 33303015_33303265, 33303265_33303515, 33303515_33303765, 33303765_33304015, 33304015_33304265, 33304265_33304515, 33304...","[33321099_33321349, 33321349_33321599, 33321599_33321849, 33321849_33322099, 33322099_33322349, 33322349_33322599, 33322599_33322849, 33322849_33323099, 33323099_33323349, 33323349_33323599, 33323...","[33306766_33306816, 33306816_33306867, 33306867_33306918, 33306918_33306969, 33306969_33307020, 33307020_33307071, 33307071_33307122, 33307122_33307173, 33307173_33307224, 33307224_33307275, 33307...","[33307454_33308118, 33308118_33308783, 33308783_33309448, 33309448_33310113, 33310113_33310778, 33310778_33311443, 33311443_33312246, 33312246_33313085, 33313085_33313750, 33313750_33314415, 33314...",chr1:33301766-33326098
A4GALT,2,42692115_42721301,[42692890_42693951],42692115_42692889,42693952_42721301,[],1061,53,0,0,...,774,27349,39,1367,"[42692115_42693574, 42693574_42695033, 42695033_42696492, 42696492_42697951, 42697951_42699410, 42699410_42700869, 42700869_42702328, 42702328_42703787, 42703787_42705246, 42705246_42706705, 42706...","[42687364_42687614, 42687614_42687864, 42687864_42688114, 42688114_42688364, 42688364_42688614, 42688614_42688864, 42688864_42689114, 42689114_42689364, 42689364_42689614, 42689614_42689864, 42689...","[42721302_42721552, 42721552_42721802, 42721802_42722052, 42722052_42722302, 42722302_42722552, 42722552_42722802, 42722802_42723052, 42723052_42723302, 42723302_42723552, 42723552_42723802, 42723...","[42692890_42692942, 42692942_42692995, 42692995_42693048, 42693048_42693101, 42693101_42693154, 42693154_42693207, 42693207_42693260, 42693260_42693313, 42693313_42693366, 42693366_42693419, 42693...",[],chr2:42687115-42726301
A4GNT,1,138123713_138133310,"[138124264_138124878, 138130849_138131256]",138123713_138124263,138131257_138133310,"[138124879_138130848, 138124879_138130848]",1021,51,5969,298,...,550,2053,28,103,"[138123713_138124193, 138124193_138124673, 138124673_138125153, 138125153_138125633, 138125633_138126113, 138126113_138126593, 138126593_138127073, 138127073_138127553, 138127553_138128033, 138128...","[138118962_138119212, 138119212_138119462, 138119462_138119712, 138119712_138119962, 138119962_138120212, 138120212_138120462, 138120462_138120712, 138120712_138120962, 138120962_138121212, 138121...","[138133311_138133561, 138133561_138133811, 138133811_138134061, 138134061_138134311, 138134311_138134561, 138134561_138134811, 138134811_138135061, 138135061_138135311, 138135311_138135561, 138135...","[138124264_138124314, 138124314_138124365, 138124365_138124416, 138124416_138124467, 138124467_138124518, 138124518_138124569, 138124569_138124620, 138124620_138124671, 138124671_138124722, 138124...","[138124879_138125176, 138125176_138125474, 138125474_138125772, 138125772_138126070, 138126070_138126368, 138126368_138126666, 138126666_138126964, 138126964_138127262, 138127262_138127560, 138127...",chr1:138118713-138138310
AAAS,2,53307456_53321610,"[53307489_53307713, 53307845_53307929, 53308052_53308133, 53308282_53308349, 53308435_53308528, 53308725_53308815, 53308960_53309020, 53309157_53309281, 53309601_53309721, 53314298_53314441, 53314...",53307456_53307488,53321466_53321610,"[53307714_53307844, 53307930_53308051, 53308134_53308281, 53308350_53308434, 53308529_53308724, 53308816_53308959, 53309021_53309156, 53309282_53309600, 53309722_53314297, 53314442_53314750, 53314...",1625,81,12321,616,...,32,144,2,7,"[53307456_53308164, 53308164_53308872, 53308872_53309580, 53309580_53310288, 53310288_53310996, 53310996_53311704, 53311704_53312412, 53312412_53313120, 53313120_53313828, 53313828_53314536, 53314...","[53302705_53302955, 53302955_53303205, 53303205_53303455, 53303455_53303705, 53303705_53303955, 53303955_53304205, 53304205_53304455, 53304455_53304705, 53304705_53304955, 53304955_53305205, 53305...","[53321611_53321861, 53321861_53322111, 53322111_53322361, 53322361_53322611, 53322611_53322861, 53322861_53323111, 53323111_53323361, 53323361_53323611, 53323611_53323861, 53323861_53324111, 53324...","[53307489_53307569, 53307569_53307650, 53307650_53307862, 53307862_53308065, 53308065_53308294, 53308294_53308460, 53308460_53308737, 53308737_53308962, 53308962_53309179, 53309179_53309260, 53309...","[53307714_53308658, 53308658_53309551, 53309551_53310288, 53310288_53310904, 53310904_53311520, 53311520_53312136, 53312136_53312752, 53312752_53313368, 53313368_53313984, 53313984_53314744, 53314...",chr2:53302456-53326610
AACS,2,125065435_125143327,"[125065585_125065717, 125073876_125073979, 125076491_125076611, 125086330_125086443, 125091426_125091523, 125102679_125102793, 125103000_125103081, 125107121_125107268, 125114477_125114557, 125118...",125065435_125065584,125142230_125143327,"[125065718_125073875, 125073980_125076490, 125076612_125086329, 125086444_125091425, 125091524_125102678, 125102794_125102999, 125103082_125107120, 125107269_125114476, 125114558_125118640, 125118...",2001,100,74609,3730,...,149,1097,7,55,"[125065435_125069330, 125069330_125073225, 125073225_125077120, 125077120_125081015, 125081015_125084910, 125084910_125088805, 125088805_125092700, 125092700_125096595, 125096595_125100490, 125100...","[125060684_125060934, 125060934_125061184, 125061184_125061434, 125061434_125061684, 125061684_125061934, 125061934_125062184, 125062184_125062434, 125062434_125062684, 125062684_125062934, 125062...","[125143328_125143578, 125143578_125143828, 125143828_125144078, 125144078_125144328, 125144328_125144578, 125144578_125144828, 125144828_125145078, 125145078_125145328, 125145328_125145578, 125145...","[125065585_125065684, 125065684_125073942, 125073942_125076553, 125076553_125086371, 125086371_125091453, 125091453_125102708, 125102708_125103014, 125103014_125107153, 125107153_125107253, 125107...","[125065718_125069447, 125069447_125073177, 125073177_125077132, 125077132_125080862, 125080862_125084592, 125084592_125088436, 125088436_125092264, 125092264_125095994, 125095994_125099724, 125099...",chr2:125060435-125148327
AADAC,1,151814008_151828488,"[151814163_151814300, 151817366_151817588, 151820383_151820452, 151824663_151824834, 151827576_151828172]",151814008_151814162,151828173_151828488,"[151814301_151817365, 151817589_151820382, 151820453_151824662, 151824835_151827575, 151814301_151817365, 151817589_151820382, 151820453_151824662, 151824835_151827575]",1195,60,12806,640,...,154,315,8,16,"[151814008_151814732, 151814732_151815456, 151815456_151816180, 151816180_151816904, 151816904_151817628, 151817628_151818352, 151818352_151819076, 151819076_151819800, 151819800_151820524, 151820...","[151809257_151809507, 151809507_151809757, 151809757_151810007, 151810007_151810257, 151810257_151810507, 151810507_151810757, 151810757_151811007, 151811007_151811257, 151811257_151811507, 151811...","[151828489_151828739, 151828739_151828989, 151828989_151829239, 151829239_151829489, 151829489_151829739, 151829739_151829989, 151829989_151830239, 151830239_151830489, 151830489_151830739, 151830...","[151814163_151814222, 151814222_151814282, 151814282_151817407, 151817407_151817467, 151817467_151817527, 151817527_151817587, 151817587_151820441, 151820441_151824711, 151824711_151824771, 151824...","[151814301_151814940, 151814940_151815580, 151815580_151816220, 151816220_151816860, 151816860_151817723, 151817723_151818363, 151818363_151819003, 151819003_151819643, 151819643_151820283, 151820...",chr1:151809008-151833488


In [244]:
bed = pd.DataFrame(bins_position)
bed['Start'] = bed['Start'].astype(int)
bed['End'] = bed['End'].astype(int)
bed = bed.loc[(bed['Start'] > 0 ) & (bed['End'] > 0 )]
bed.to_csv('/gstock/EXOTIC/data/GENOMICS/BED_test_h.txt', sep='\t', index=False)
bed

Unnamed: 0,CHR,Start,End,Region_type,Num,Gene
0,chr2,58345183,58345598,Gene_body,1,A1BG
1,chr2,58345598,58346013,Gene_body,2,A1BG
2,chr2,58346013,58346428,Gene_body,3,A1BG
3,chr2,58346428,58346843,Gene_body,4,A1BG
4,chr2,58346843,58347258,Gene_body,5,A1BG
...,...,...,...,...,...,...
945,chr1,151824193,151825005,Exon,16,AADAC
946,chr1,151825005,151825645,Exon,17,AADAC
947,chr1,151825645,151826285,Exon,18,AADAC
948,chr1,151826285,151826925,Exon,19,AADAC


In [46]:
refseq_38_df_transformed.to_parquet('/gstock/EXOTIC/data/GENOMICS/RefSeq_38_precomputed.parquet')

In [91]:
biomart_raw = pd.read_csv('/gstock/EXOTIC/data/OTHERS/biomart_refseq_ensembl_hgnc.txt.gz', compression='gzip', sep='\t')
# biomart_raw['Gene_start_end_ENS'] = biomart_raw['Gene start (bp)'].astype(str) + '_' + biomart_raw['Gene end (bp)'].astype(str)
biomart_raw

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Gene stable ID,Gene end (bp),Gene start (bp),Gene name,Transcript start (bp),Transcript end (bp),Transcription start site (TSS),Transcript stable ID,Transcript support level (TSL),APPRIS annotation,Transcript type,Gene type,HGNC ID,RefSeq mRNA ID,Chromosome/scaffold name
0,ENSG00000198888,4262,3307,MT-ND1,3307,4262,3307,ENST00000361390,tslNA,principal1,protein_coding,protein_coding,HGNC:7455,,MT
1,ENSG00000198763,5511,4470,MT-ND2,4470,5511,4470,ENST00000361453,tslNA,principal1,protein_coding,protein_coding,HGNC:7456,,MT
2,ENSG00000198804,7445,5904,MT-CO1,5904,7445,5904,ENST00000361624,tslNA,principal1,protein_coding,protein_coding,HGNC:7419,,MT
3,ENSG00000198712,8269,7586,MT-CO2,7586,8269,7586,ENST00000361739,tslNA,principal1,protein_coding,protein_coding,HGNC:7421,,MT
4,ENSG00000228253,8572,8366,MT-ATP8,8366,8572,8366,ENST00000361851,tslNA,principal1,protein_coding,protein_coding,HGNC:7415,,MT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194939,ENSG00000171163,248859144,248850006,ZNF692,248857865,248859144,248859144,ENST00000534456,tsl4,,retained_intron,protein_coding,HGNC:26049,,1
194940,ENSG00000171163,248859144,248850006,ZNF692,248857975,248859033,248859033,ENST00000533976,tsl4,,retained_intron,protein_coding,HGNC:26049,,1
194941,ENSG00000185220,248919946,248906196,PGBD2,248906196,248919946,248906196,ENST00000355360,tsl1,,protein_coding,protein_coding,HGNC:19399,NM_001017434,1
194942,ENSG00000185220,248919946,248906196,PGBD2,248906235,248919146,248906235,ENST00000329291,tsl1 (assigned to previous version 5),principal1,protein_coding,protein_coding,HGNC:19399,NM_170725,1


In [185]:
merge_biomart_refseq_38 = pd.merge(biomart_raw.loc[biomart_raw['Gene stable ID'].isin(exotic_genes_not_sqtl), ['Gene stable ID', 'Gene name']].drop_duplicates().sort_values(by='Gene name'), refseq_38_df_transformed.reset_index(), left_on='Gene name', right_on='index')
merge_biomart_refseq_38['Gene_start'] = merge_biomart_refseq_38.Gene_start_end.apply(lambda r: r.split('_')[0])
merge_biomart_refseq_38['Gene_start'] = merge_biomart_refseq_38['Gene_start'].astype(int)
merge_biomart_refseq_38['Gene_end'] = merge_biomart_refseq_38.Gene_start_end.apply(lambda r: r.split('_')[1])
merge_biomart_refseq_38['Gene_end'] = merge_biomart_refseq_38['Gene_end'].astype(int)
merge_biomart_refseq_38

Unnamed: 0,Gene stable ID,Gene name,index,CHROM,Gene_start_end,Exon_list,5_prime_UTR,3_prime_UTR,Intron_list,Exon_size_total,...,Bin_size_3_prime_UTR,Gene_body_bins,Upstream_bins,Downstream_bins,Exon_bins,Bins_position,Intron_bins,BED_boundaries_for_hail_10kbp_added,Gene_start,Gene_end
0,ENSG00000175899,A2M,A2M,2,9067708_9116229,"[9067823_9067839, 9068183_9068224, 9068740_9068842, 9069745_9069813, 9070488_9070578, 9072359_9072486, 9072653_9072871, 9074560_9074783, 9076756_9076936, 9077346_9077420, 9077701_9077857, 9079244_...",9067708_9067822,9115850_9116229,"[9067840_9068182, 9068225_9068739, 9068843_9069744, 9069814_9070487, 9070579_9072358, 9072487_9072652, 9072872_9074559, 9074784_9076755, 9076937_9077345, 9077421_9077700, 9077858_9079243, 9079332_...",4389,...,19,"[9067708_9070134, 9070134_9072560, 9072560_9074986, 9074986_9077412, 9077412_9079838, 9079838_9082264, 9082264_9084690, 9084690_9087116, 9087116_9089542, 9089542_9091968, 9091968_9094394, 9094394_...","[9062957_9063207, 9063207_9063457, 9063457_9063707, 9063707_9063957, 9063957_9064207, 9064207_9064457, 9064457_9064707, 9064707_9064957, 9064957_9065207, 9065207_9065457, 9065457_9065707, 9065707_...","[9116230_9116480, 9116480_9116730, 9116730_9116980, 9116980_9117230, 9117230_9117480, 9117480_9117730, 9117730_9117980, 9117980_9118230, 9118230_9118480, 9118480_9118730, 9118730_9118980, 9118980_...","[9067823_9069801, 9069801_9072474, 9072474_9072859, 9072859_9074766, 9074766_9077366, 9077366_9079251, 9079251_9079777, 9079777_9089946, 9089946_9091215, 9091215_9093469, 9093469_9095081, 9095081_...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]","[9067840_9070231, 9070231_9072628, 9072628_9075249, 9075249_9077683, 9077683_9080367, 9080367_9082545, 9082545_9084723, 9084723_9086901, 9086901_9089079, 9089079_9091787, 9091787_9094080, 9094080_...",chr2:9062708-9121229,9067708,9116229
1,ENSG00000166535,A2ML1,A2ML1,2,8822554_8887202,"[8822652_8822713, 8823182_8823365, 8823720_8823882, 8829727_8829779, 8834662_8834682, 8835507_8835666, 8836255_8836339, 8837440_8837566, 8838336_8838450, 8839113_8839222, 8841369_8841536, 8843134_...",8822554_8822651,8875012_8887202,"[8822714_8823181, 8823366_8823719, 8823883_8829726, 8829780_8834661, 8834683_8835506, 8835667_8836254, 8836340_8837439, 8837567_8838335, 8838451_8839112, 8839223_8841368, 8841537_8843133, 8843362_...",4332,...,610,"[8822554_8825786, 8825786_8829018, 8829018_8832250, 8832250_8835482, 8835482_8838714, 8838714_8841946, 8841946_8845178, 8845178_8848410, 8848410_8851642, 8851642_8854874, 8854874_8858106, 8858106_...","[8817803_8818053, 8818053_8818303, 8818303_8818553, 8818553_8818803, 8818803_8819053, 8819053_8819303, 8819303_8819553, 8819553_8819803, 8819803_8820053, 8820053_8820303, 8820303_8820553, 8820553_...","[8887203_8887453, 8887453_8887703, 8887703_8887953, 8887953_8888203, 8888203_8888453, 8888453_8888703, 8888703_8888953, 8888953_8889203, 8889203_8889453, 8889453_8889703, 8889703_8889953, 8889953_...","[8822652_8823336, 8823336_8829751, 8829751_8836262, 8836262_8838348, 8838348_8841373, 8841373_8843187, 8843187_8845481, 8845481_8847598, 8847598_8848836, 8848836_8850207, 8850207_8851933, 8851933_...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]","[8822714_8825458, 8825458_8827856, 8827856_8830307, 8830307_8832705, 8832705_8835124, 8835124_8837894, 8837894_8840517, 8840517_8843083, 8843083_8845773, 8845773_8848467, 8848467_8851266, 8851266_...",chr2:8817554-8892202,8822554,8887202
