In [1]:
import numpy as np

In [2]:
def read_transcript_bed_file(transcript_bed_file):
    """Returns a dictionary that maps transcript_ids to their genomic locations.

    Each record line is split on whitespace and must provide at least 12
    columns. The columns used are: 0 (chromosome), 1 (start), 2 (end),
    3 (transcript id), 5 (strand), 9 (block count), 10 (comma-separated block
    sizes) and 11 (comma-separated block starts).

    Blank lines, ``#`` comment lines and ``track``/``browser`` header lines are
    skipped instead of raising.

    Args:
        transcript_bed_file: Path to the BED file to read.

    Returns:
        dict mapping transcript id -> dict with keys ``chrom_id``, ``start``,
        ``end``, ``strand``, ``block_count``, ``block_sizes`` and
        ``block_starts``. If an id occurs more than once, the last record wins.

    Raises:
        IndexError: if a record line has fewer than 12 columns.
        ValueError: if a numeric column cannot be parsed as an integer.
    """
    transcripts = {}
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(transcript_bed_file) as bed_handle:
        for line in bed_handle:
            elements = line.strip().split()
            if not elements:
                # Blank (or whitespace-only) line, e.g. a trailing newline at EOF.
                continue
            if elements[0].startswith("#") or elements[0] in ("track", "browser"):
                # Comment / header lines carry no transcript record.
                continue
            transcripts[elements[3]] = {
                "chrom_id": elements[0],
                "start": int(elements[1]),
                "end": int(elements[2]),
                "strand": elements[5],
                "block_count": int(elements[9]),
                # The `if` filters drop the empty item produced by a trailing comma.
                "block_sizes": [int(size) for size in elements[10].split(",") if size],
                "block_starts": [int(start_pos) for start_pos in elements[11].split(",") if start_pos],
            }
    return transcripts

def get_score(transcript, scores):
    """Pair every position covered by the transcript's blocks with a score.

    Blocks are visited in stored order and positions inside a block in
    ascending order; the i-th visited position receives the i-th score. For a
    transcript whose strand is "-", the scores are reversed before pairing.

    Args:
        transcript: dict with keys ``chrom_id``, ``start``, ``strand``,
            ``block_count``, ``block_sizes`` and ``block_starts``.
        scores: indexable, sliceable sequence of per-position scores; it must
            hold at least as many items as the blocks cover.

    Returns:
        list of ``(chrom_id, genomic_position, score)`` tuples.
    """
    oriented_scores = scores[::-1] if transcript["strand"] == "-" else scores

    def genomic_positions():
        # Yield start + block offset + within-block offset, block by block.
        for idx in range(transcript["block_count"]):
            for offset in range(transcript["block_sizes"][idx]):
                yield offset + transcript["block_starts"][idx] + transcript["start"]

    return [
        (transcript["chrom_id"], genome_pos, oriented_scores[score_idx])
        for score_idx, genome_pos in enumerate(genomic_positions())
    ]

In [3]:
def process_tis_transformer_scores(tis_transformer_folder, transcript_bed_file, tis_transformer_prediction_file,
                                   num_prediction_files=6, min_score=1e-10):
    """Write per-position prediction scores to a tab-separated text file.

    Loads ``TIS_predictions_v2_<i>.npy`` for ``i = 1..num_prediction_files``
    from ``tis_transformer_folder``, maps each transcript's scores onto genomic
    positions with ``get_score``, and writes one line per position whose score
    is at least ``min_score``::

        <transcript_name>\\t<chrom_id>\\t<position>\\t<score>

    Args:
        tis_transformer_folder: Directory holding the ``.npy`` prediction files.
        transcript_bed_file: BED file parsed by ``read_transcript_bed_file``.
        tis_transformer_prediction_file: Output path; overwritten if it exists.
        num_prediction_files: Number of numbered ``.npy`` files to read,
            starting at 1. Defaults to 6.
        min_score: Inclusive lower bound a score must reach to be written.
            Defaults to 1e-10.

    Raises:
        KeyError: if a predicted transcript is absent from the BED file.
        ValueError: if a transcript info string does not split into exactly
            two parts on ``"::"``.
    """
    transcript_coordinates = read_transcript_bed_file(transcript_bed_file)

    with open(tis_transformer_prediction_file, "w") as prediction_file:
        for file_idx in range(1, num_prediction_files + 1):
            # NOTE(security): allow_pickle=True can execute arbitrary code while
            # loading; only point this at prediction files from a trusted source.
            predictions = np.load(
                f"{tis_transformer_folder}/TIS_predictions_v2_{file_idx}.npy",
                allow_pickle=True,
            )
            # Each entry is unpacked as a (transcript_info, scores) pair.
            for transcript_info, scores in predictions:
                # The transcript name is the part before "::"; the rest is unused.
                transcript_name, _ = transcript_info.split("::")
                transcript = transcript_coordinates[transcript_name]
                for chrom_id, pos, score in get_score(transcript, scores):
                    if score >= min_score:
                        prediction_file.write(f"{transcript_name}\t{chrom_id}\t{pos}\t{score}\n")

In [4]:
# Directory holding the TIS_predictions_v2_<i>.npy prediction files.
tis_transformer_folder = "/home/ec2-user/efs-mount-point/mnt/efs0/riboseq_callers/TIS_transformer/data_veliadb/"
# BED annotation that supplies the genomic coordinates of each transcript.
transcript_bed_file = "/home/ec2-user/efs-mount-point/mnt/efs0/riboseq_callers/data/ORFrater/human_GRCh38_p14/annotations/veliadb_v2.fixed.bed"
# Destination text file; it is overwritten on every run.
tis_transformer_prediction_file = "/home/ec2-user/efs-mount-point/mnt/efs0/riboseq_callers/data/ORFrater/human_GRCh38_p14/tis_transformer/tis_transformer_predictions.veliadb_v2.fixed.txt"

# Run the conversion: arguments are (folder, BED file, output file).
process_tis_transformer_scores(
    tis_transformer_folder,
    transcript_bed_file,
    tis_transformer_prediction_file,
)