In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
def reformattingPCCR(output_path=None):
    """Parse the RNA-bridge and exon-loopout tables into coordinate columns.

    Both supplementary tables are read and stacked, and only their "ph" and
    "exon" identifier columns are kept. Chromosome, start, end and strand are
    extracted from each identifier with regular expressions, rows with any
    field that could not be extracted are dropped, duplicates are removed, and
    a numeric "strand" column (-1 when exon_strand is "-", otherwise 1) is
    added. The largest and smallest |end - start| of the ph and exon regions
    are printed.

    Parameters
    ----------
    output_path : str or path-like, optional
        When given, the parsed table is written there as a tab-separated
        file. Nothing is written by default.

    Returns
    -------
    pandas.DataFrame
        One row per unique (ph, exon) pair with the columns ph, exon,
        ph_chr, ph_start, ph_end, ph_strand, exon_chr, exon_start, exon_end,
        exon_strand and strand.
    """
    rna_bridges = pd.read_csv("Pervouchine2021_SuppData/Pervouchine2021_SuppData3_RNABridges.txt", sep="\t")
    exon_loopouts = pd.read_csv("Pervouchine2021_SuppData/Pervouchine2021_SuppData4_ExonLoopouts.tsv", sep="\t")
    stacked = pd.concat([rna_bridges, exon_loopouts], axis=0, ignore_index=True)
    # Work on an independent copy: adding columns to a bare column selection
    # is chained assignment and triggers pandas' SettingWithCopyWarning.
    pccr = stacked[["ph", "exon"]].copy()

    # (column suffix, pattern) applied to both identifier columns.
    # The chromosome pattern only matches digits after "chr"; identifiers
    # without digits there yield NaN and are removed by dropna() below.
    field_patterns = (
        ("chr", r"((?<=chr)\d+)"),
        ("start", r"((?<=_)\d+(?=_\d))"),
        ("end", r"((?<=\d_)\d+(?=_\W))"),
        ("strand", r"(\+|-)"),
    )
    for region in ("ph", "exon"):
        for field, pattern in field_patterns:
            pccr[f"{region}_{field}"] = pccr[region].str.extract(pattern, expand=False)

    # Drop unparsable rows BEFORE the integer cast; casting a column that
    # still contains NaN raises instead of letting dropna() discard the row.
    pccr = pccr.dropna().drop_duplicates()
    pccr = pccr.astype({"ph_start": int, "ph_end": int, "exon_start": int, "exon_end": int})
    pccr["strand"] = np.where(pccr["exon_strand"] == "-", -1, 1)

    if output_path is not None:
        pccr.to_csv(output_path, sep="\t")

    ph_length = (pccr["ph_end"] - pccr["ph_start"]).abs()
    exon_length = (pccr["exon_end"] - pccr["exon_start"]).abs()
    print("max pccr length: ", ph_length.max())
    print("min pccr length: ", ph_length.min())
    print("max exon length: ", exon_length.max())
    print("min exon length: ", exon_length.min())

    return pccr


In [None]:
# reformattingPCCR()

In [None]:
import requests, sys

In [None]:
def getEnsemblSeqsByCoords(coordinate, timeout=30):
    """Fetch the sequence of a genomic region from the Ensembl GRCh37 REST server.

    Parameters
    ----------
    coordinate : str
        Region string inserted verbatim into the
        /sequence/region/human/ endpoint path, e.g. "8:100-139:1".
    timeout : float or tuple, optional
        Passed to requests.get so a stalled connection raises instead of
        blocking forever. Defaults to 30 seconds.

    Returns
    -------
    str
        The response body as text.

    Raises
    ------
    requests.HTTPError
        When the server answers with a 4xx/5xx status.
    """
    server = "https://grch37.rest.ensembl.org"
    ext = f"/sequence/region/human/{coordinate}?"

    r = requests.get(server + ext, headers={"Content-Type": "text/plain"}, timeout=timeout)

    # raise_for_status() raises on every non-ok response, so no separate
    # exit path is needed after it.
    r.raise_for_status()

    return r.text

In [None]:
def createEnsemblFormatCoord(window, chromosome, start, strand):
    """Build a "chromosome:start-end:strand" region string.

    The end coordinate is start + window - 1, so the region spans exactly
    `window` positions when both ends are counted.
    """
    end = start + window - 1
    return f"{chromosome!s}:{start!s}-{end!s}:{strand!s}"

In [None]:
def slidingWindow(chromosome, startCoord, endCoord, strand, window=40):
    """List the coordinates of every window that fits inside a region.

    The window is moved one position at a time from startCoord until its
    last position reaches endCoord (both ends counted).

    Parameters
    ----------
    chromosome : int or str
        Chromosome name placed at the front of each coordinate string.
    startCoord, endCoord : int
        First and last position of the region.
    strand : int or str
        Strand placed at the end of each coordinate string.
    window : int, optional
        Number of positions per window. Defaults to 40.

    Returns
    -------
    list of str
        One string from createEnsemblFormatCoord per window start, in
        ascending order. Empty when the region is shorter than the window.
    """
    # The last window starts at endCoord - window + 1; range() excludes its
    # stop value, hence "+ 2". Each window must be built from its own start
    # position, not from the start of the region.
    return [
        createEnsemblFormatCoord(window, chromosome, windowStart, strand)
        for windowStart in range(startCoord, endCoord - window + 2)
    ]
        

In [None]:
def getTentativeBoxes():
    """Collect candidate box coordinates and save them to tentative_boxes.txt.

    Reads Pervouchine2021_pccrFull.txt and splits its rows on
    ph_end - ph_start: rows above 40 are expanded through slidingWindow,
    the remaining rows (40 or less) contribute a single
    "chr:start-end:strand" string. The strings are stored, long rows first,
    in the "ensembl_coordinate" column of a tab-separated file.
    """
    pccr = pd.read_csv("Pervouchine2021_pccrFull.txt", sep="\t")
    ph_span = pccr["ph_end"] - pccr["ph_start"]
    pccr_short = pccr[ph_span <= 40]
    pccr_long = pccr[ph_span > 40]
    print(pccr.shape, pccr_short.shape, pccr_long.shape)
    print(pccr_short)

    # Long regions: every window reported by slidingWindow.
    tentative_coords = [
        window_coord
        for _, long_row in pccr_long.iterrows()
        for window_coord in slidingWindow(
            long_row["ph_chr"], long_row["ph_start"], long_row["ph_end"], long_row["strand"]
        )
    ]

    # Short regions: the region itself, unchanged.
    for _, short_row in pccr_short.iterrows():
        chrom, start, end, strand = (
            short_row[column] for column in ("ph_chr", "ph_start", "ph_end", "strand")
        )
        tentative_coords.append(f"{chrom!s}:{start!s}-{end!s}:{strand!s}")

    tentative_boxes = pd.DataFrame({"ensembl_coordinate": tentative_coords})
    tentative_boxes.to_csv("tentative_boxes.txt", sep="\t")
    

In [None]:
# Reads Pervouchine2021_pccrFull.txt and writes the candidate coordinates to tentative_boxes.txt.
getTentativeBoxes()

In [None]:
def getSequencesByChr(chromosomes=(8,)):
    """Fetch the sequence of every tentative box on the requested chromosomes.

    Reads tentative_boxes.txt, takes the first run of digits of each
    "ensembl_coordinate" as its chromosome, and for every requested
    chromosome looks up each coordinate with getEnsemblSeqsByCoords (one
    call per row). The rows, with an added "sequence" column, are written
    to pccr_chr<chromosome>.txt as a tab-separated file.

    Parameters
    ----------
    chromosomes : iterable of int, optional
        Chromosome numbers to process. Defaults to (8,), which writes
        pccr_chr8.txt. Pass None to process every chromosome that occurs
        in tentative_boxes.txt.
    """
    box_coords = pd.read_csv("tentative_boxes.txt", sep="\t")
    if box_coords.empty:
        # Nothing to look up, so no output file is produced.
        return

    box_coords["chr"] = box_coords["ensembl_coordinate"].str.extract(r"(\d+)", expand=False)
    chr_numbers = box_coords["chr"].astype(int)
    if chromosomes is None:
        chromosomes = chr_numbers.unique()

    for chromosome in chromosomes:
        # .copy() so the new column is added to an independent frame rather
        # than to a filtered slice of box_coords (SettingWithCopyWarning).
        chr_coords = box_coords[chr_numbers == int(chromosome)].copy()
        chr_coords["sequence"] = chr_coords["ensembl_coordinate"].map(getEnsemblSeqsByCoords)
        chr_coords.to_csv(f"pccr_chr{chromosome}.txt", sep="\t")

In [None]:
# Reads tentative_boxes.txt, requests one sequence per coordinate from the Ensembl REST server, and writes pccr_chr8.txt.
getSequencesByChr()