In [1]:
from Bio import Align
from Bio.Seq import Seq
from Bio.Align import substitution_matrices
import pandas as pd
import subprocess
import shutil

In [2]:
def format_alignment(arr, query_start, ori, ref_chr, ref_offset=0):
    output = []
    seq_len = 0 # Track sequence length

    for i in range(0, len(arr), 4):  # Process each set of 3 rows
        if i >= len(arr) or arr[i].strip() == "":
            continue

        first_row = arr[i]
        seq = first_row.split()[2]  # Extract the sequence part
        seq_len = len(seq)
        end_number = query_start + seq_len - 1 if ori else query_start - seq_len + 1 # Calculate end number depending on orientation

        first_row = f"ref            {ref_chr}:{query_start} {seq} {end_number}"

        # Align second row
        second_row = arr[i+1]
        query_sequence_start = first_row.index(seq) - 1
        pipes_only = ' ' * query_sequence_start + ''.join([ch if not ch.isdigit() else '' for ch in second_row.strip()])

        # Change first number to 1-indexed
        third_row = arr[i+2]
        ref_start = ref_offset + 1
        ref_seq = third_row.split()[2]  # Extract the sequence part from third row
        ref_len = len(ref_seq)
        ref_end = ref_start + ref_len - 1  # Corrected end number for ref

        ref_row = f"query          {" "*len(ref_chr)} {ref_start} {ref_seq} {ref_end}"

        # Since ref row number is always less than query, calculate extra space
        space_diff = first_row.index(seq) - ref_row.index(ref_seq)
        ref_row = " " * space_diff + ref_row.lstrip()

        output.append(first_row)
        output.append(pipes_only)
        output.append(ref_row)
        output.append('')

        # Update query_start for the next set
        query_start = query_start + seq_len if ori else query_start - seq_len

        # Update the ref_offset for next set
        ref_offset += ref_len

    return "\n".join(output)

In [3]:
def extract_region(fasta_file, region):
    """
    Extracts a specific region from a FASTA file using samtools.

    :param fasta_file: Path to the indexed FASTA file.
    :param region: Region in the format "chr:start-end" (e.g., "chr18:180-280").
    :return: The sequence as a string.
    """
    print(region)
    result = subprocess.run(
        ["samtools", "faidx", fasta_file, region],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    if result.returncode == 0:
        # The sequence will be in the second line, as the first line contains the header
        return ''.join(result.stdout.splitlines()[1:])
    else:
        raise Exception(f"Error: {result.stderr.strip()}")
    
def generate_scaffolds(fastq1, fastq2, out_dir='out'):
    """
    Extracts a specific region from a FASTA file using samtools.

    :param fasta_file: Path to the indexed FASTA file.
    :param region: Region in the format "chr:start-end" (e.g., "chr18:180-280").
    :return: The sequence as a string.
    """

    shutil.rmtree('./out')
    result = subprocess.run(
        ["python", "./SPAdes-4.2.0-Linux/bin/spades.py", "--meta", "--pe1-1", f"./fastq/{fastq1}", "--pe1-2", f"./fastq/{fastq2}", "-o", out_dir],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    # print(result.stdout)
    # print(result.stderr)
    if result.returncode == 0:
        # The sequence will be in the second line, as the first line contains the header
        with open(f'./{out_dir}/scaffolds.fasta', 'r') as file:
            file_content = file.read()  # Read the entire content of the file into a string
            return [''.join(s.split('\n')[1:]) for s in file_content.split('>')][1:]
    else:
        raise Exception(f"{result.stderr}")

In [4]:
df = pd.read_csv('./K562/K562_summaries.tsv', sep="\t")
# aligner = Align.PairwiseAligner(mode="global", open_gap_score = -10, extend_gap_score = -0.5, match_score = 1.0, mismatch_score = -1.0)
aligner = Align.PairwiseAligner(mode="local", open_gap_score = -10, extend_gap_score = -5, match_score=2, mismatch_score=-2)
df['ref_seq_1'] = df['chrom1'].str.cat((df['pos1'] - 349).astype(str).str.cat((df['pos1'] + 351).astype(str), sep='-'), sep=':').apply(lambda x: extract_region('hg19/hg19full.fa', x))
df['ref_seq_2'] = df['chrom2'].str.cat((df['pos2'] - 349).astype(str).str.cat((df['pos2'] + 351).astype(str), sep='-'), sep=':').apply(lambda x: extract_region('hg19/hg19full.fa', x))
fastqs = ([('b_' + df['chrom1'] + '_' + (df['pos1']+1).astype(str) + '_' + df['chrom2'] + '_' + (df['pos2']+1).astype(str) + '_' + str(x) + '.fastq.gz').to_list() for x in range(1, 3)])
df['scaffolds'] = [generate_scaffolds(fastqs[0][idx], fastqs[1][idx]) for idx, val in enumerate(fastqs[0])]



chr9:133606797-133607497
chr9:134155174-134155874
chr13:81087683-81088383
chr13:81109064-81109764
chr13:81469686-81470386
chr13:90862424-90863124
chr13:92473661-92474361
chr13:94023058-94023758
chr22:18952773-18953473
chr22:20950224-20950924
chr22:23049358-23050058
chr22:23049561-23050261
chr22:23632392-23633092
chr13:108661061-108661761
chr13:81469585-81470285
chr13:90438357-90439057
chr13:81469961-81470661
chr13:90864457-90865157
chr13:92475408-92476108
chr13:108500623-108501323
chr22:22935024-22935724
chr22:20953492-20954192
chr22:23107720-23108420
chr22:23107509-23108209


In [5]:
scaffold_alignments = df.apply(lambda row: [(aligner.align(row['ref_seq_1'], scaffold)[0].score, aligner.align(row['ref_seq_2'], scaffold)[0].score) for scaffold in row['scaffolds']], axis=1)
scaffold_alignments

0                                      [(132.0, 698.0)]
1                         [(28.0, 48.0), (532.0, 12.0)]
2                        [(22.0, 106.0), (702.0, 28.0)]
3                        [(120.0, 18.0), (18.0, 226.0)]
4                                      [(106.0, 106.0)]
5            [(20.0, 22.0), (18.0, 20.0), (18.0, 20.0)]
6           [(458.0, 30.0), (18.0, 26.0), (20.0, 24.0)]
7           [(30.0, 26.0), (22.0, 250.0), (26.0, 32.0)]
8            [(26.0, 28.0), (26.0, 22.0), (30.0, 26.0)]
9                                        [(4.0, 294.0)]
10         [(228.0, 532.0), (16.0, 20.0), (58.0, 76.0)]
11    [(584.0, 124.0), (58.0, 76.0), (66.0, 64.0), (...
dtype: object

In [6]:
for idx, aligns in enumerate(scaffold_alignments):
    print("breakpoint", idx)
    left_max = max(scaffold_alignments[idx], key=lambda x: x[0])
    left_idx_max = scaffold_alignments[idx].index(left_max)
    right_max = max(scaffold_alignments[idx], key=lambda x: x[1])
    right_idx_max = scaffold_alignments[idx].index(right_max)
    print("best_left_scaffold:", left_idx_max, ", score:", left_max)
    print("best_right_scaffold:", right_idx_max, ", score:", right_max)


breakpoint 0
best_left_scaffold: 0 , score: (132.0, 698.0)
best_right_scaffold: 0 , score: (132.0, 698.0)
breakpoint 1
best_left_scaffold: 1 , score: (532.0, 12.0)
best_right_scaffold: 0 , score: (28.0, 48.0)
breakpoint 2
best_left_scaffold: 1 , score: (702.0, 28.0)
best_right_scaffold: 0 , score: (22.0, 106.0)
breakpoint 3
best_left_scaffold: 0 , score: (120.0, 18.0)
best_right_scaffold: 1 , score: (18.0, 226.0)
breakpoint 4
best_left_scaffold: 0 , score: (106.0, 106.0)
best_right_scaffold: 0 , score: (106.0, 106.0)
breakpoint 5
best_left_scaffold: 0 , score: (20.0, 22.0)
best_right_scaffold: 0 , score: (20.0, 22.0)
breakpoint 6
best_left_scaffold: 0 , score: (458.0, 30.0)
best_right_scaffold: 0 , score: (458.0, 30.0)
breakpoint 7
best_left_scaffold: 0 , score: (30.0, 26.0)
best_right_scaffold: 1 , score: (22.0, 250.0)
breakpoint 8
best_left_scaffold: 2 , score: (30.0, 26.0)
best_right_scaffold: 0 , score: (26.0, 28.0)
breakpoint 9
best_left_scaffold: 0 , score: (4.0, 294.0)
best_righ

In [7]:
scaffold_alignments[11]

[(584.0, 124.0), (58.0, 76.0), (66.0, 64.0), (58.0, 14.0)]

In [8]:
df['scaffolds'][1]

['TCCTTTGACATGTGTATATCCTCTTAATTGAAATGTCTCTTCAGGACTTTGGCCTATTTTCTGGATGGGCTATTGGATTTTTTAATGTAGAGTTTTGAGAGTTCTTTATGTATTCTAGATACCAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAATTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGTGATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATGTGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTGAGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTTGGAGCTGGTCAGATCGGAGCTGGCATCCAGCCTCTGAGAAGTCTCTATCCCCTGATCCCTGGGTTTTTGGGTTTCTCTATCTGTGGAGCCCACTGCCTAGGGCTAAGGTCAAGGATAGAAGAGACAGTGCCTGCTCAATGTAACGCCTGACACACAGTGGGTGCTTTACAAACCCAGGCGGAGCCCAGGACGGAGGGGGCGGGGCACTGTCACTCCCAGCTGGCCTTGAAGAGAGCCCACCCCCGGGAAATCACCTCCAGCAGCTTCACTGCTCACAAGGCGGTTCA

In [9]:
for strn in df['scaffolds'][1]:
    print(strn, "\n")

TCCTTTGACATGTGTATATCCTCTTAATTGAAATGTCTCTTCAGGACTTTGGCCTATTTTCTGGATGGGCTATTGGATTTTTTAATGTAGAGTTTTGAGAGTTCTTTATGTATTCTAGATACCAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAATTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGTGATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATGTGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTGAGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTTGGAGCTGGTCAGATCGGAGCTGGCATCCAGCCTCTGAGAAGTCTCTATCCCCTGATCCCTGGGTTTTTGGGTTTCTCTATCTGTGGAGCCCACTGCCTAGGGCTAAGGTCAAGGATAGAAGAGACAGTGCCTGCTCAATGTAACGCCTGACACACAGTGGGTGCTTTACAAACCCAGGCGGAGCCCAGGACGGAGGGGGCGGGGCACTGTCACTCCCAGCTGGCCTTGAAGAGAGCCCACCCCCGGGAAATCACCTCCAGCAGCTTCACTGCTCACAAGGCGGTTCAGG

In [10]:
df["ref_seq_1"][1].upper()

'TGACCTTAGCCCTAGGCAGTGGGCTCCACAGATAGAGAAACCCAAAAACCCAGGGATCAGGGGATAGAGACTTCTCAGAGGCTGGATGCCAGCTCCGATCTGACCAGCTCCAAAGCCCACATTCAGCTTCTTCACCAAGCACACCCCACTTGCTAGCCAGGGCCAGCCCCTCAGAACTAGGATGGGACTGATCACTAGGAAGTCCTACCAGCAGAGGTGTGTGGGCGCCCACATGGTGAGGACTTGCAAAGTAGAAGACCCCTCTCCCTTAACCTGCTCCCCCTTGACATCACCCCTGCACAGGATGTCATCTCAGTCTGCAGGAGAAGGGACCAGGAAGCATCAGTCCATGGGCCTGTTTGCCGGCTTTGAGCTGGGGGGCAGCTCTGCCAGGCAGAGGGTGTGAATGGGCTCGGCTCCTGCATGCACTCCCTGCCCTGGCTTCTGAAGCGCCCACCAGGGTTCTGGTGGCTTCCCCAGCTCGCGGCCCCTCCGGCCCTGCCGCTTCCCTACACAGCCCTTCCTCCTGCTCGTCATTTTCCGGAGCTCCGTGGGTGTTGGGCACAGACCCACGGCAGCCTCAGCATATGTGGAAGGCTCATGACGCATCCACGCGCGTGTGAAGTGCCGGTGTCTGCAGAGCAGAGGAAGGGGACATCTGTTTGAGTCATTGCCCAACAGCACAGGGCACTCCCATTGCT'

In [11]:
df["ref_seq_2"][1].upper()

'CAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAACTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATTTCTTAGACTGCGTTAAAATTTTTATTTCAATCATCACATACATTTTAGATCATGGTATAATTTCTGTTTTAATCACTACATATGATTTAGAAAACCTATGATGAGGATAATCTATGTATTTACCCACAGTTCTGCTCCTTCTGTTGCTGTTTTTTTTTTTCTTCCTGATGTTTCCAGACTTCTTCTTTCACCATTTCCTTTCCTTTTAAAGAATTTTCTGTAACCATTCTTTAAGGATAGATCTGCAATTGACAAATTATTTTACATTCTATTTGTCTGAAAATGTCTTTATTTCTACTTCATTCTTGCTGGATTGTTGTAGCTGATATACAATCCATGGTTGGTAGTT'

In [12]:
df["scaffolds"][1][0]

'TCCTTTGACATGTGTATATCCTCTTAATTGAAATGTCTCTTCAGGACTTTGGCCTATTTTCTGGATGGGCTATTGGATTTTTTAATGTAGAGTTTTGAGAGTTCTTTATGTATTCTAGATACCAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAATTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGTGATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATGTGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTGAGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTTGGAGCTGGTCAGATCGGAGCTGGCATCCAGCCTCTGAGAAGTCTCTATCCCCTGATCCCTGGGTTTTTGGGTTTCTCTATCTGTGGAGCCCACTGCCTAGGGCTAAGGTCAAGGATAGAAGAGACAGTGCCTGCTCAATGTAACGCCTGACACACAGTGGGTGCTTTACAAACCCAGGCGGAGCCCAGGACGGAGGGGGCGGGGCACTGTCACTCCCAGCTGGCCTTGAAGAGAGCCCACCCCCGGGAAATCACCTCCAGCAGCTTCACTGCTCACAAGGCGGTTCAG

In [13]:
def rev_comp(dna):
    complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
    reversed_dna = dna[::-1]
    return "".join(complement[base] for base in reversed_dna)

In [14]:
# breakpoint 8
# best_left_scaffold: 2 , score: (30.0, 26.0)
# best_right_scaffold: 0 , score: (26.0, 28.0)
alignments = aligner.align(df['scaffolds'][8][1], rev_comp(df["ref_seq_1"][8].upper()))
for aln in alignments:
    print(aln)

target           50 CCACCTGGCAGTGACAGCTAAATTGGGAGCTGCTGTTAACACCAGGAGCCCAGCAGGCAG
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 CCACCTGGCAGTGACAGCTAAATTGGGAGCTGCTGTTAACACCAGGAGCCCAGCAGGCAG

target          110 TCAAACCTGGGTTGGGGGGGAGGGGCACCTGTTGAAAGCACCCACGAGCTCCCAGTGGAG
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TCAAACCTGGGTTGGGGGGGAGGGGCACCTGTTGAAAGCACCCACGAGCTCCCAGTGGAG

target          170 CAGGGAAGGCAGGGAAGGCATGGCGATAGCTGTCAGCAGGAGTGGGTGGGGAGTGGAAGG
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 CAGGGAAGGCAGGGAAGGCATGGCGATAGCTGTCAGCAGGAGTGGGTGGGGAGTGGAAGG

target          230 CAGGGTGGACCGCCAAACCCACCTTGGGGCCACAGTGAGATGGTCCCTAGGATGCCTTGG
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 CAGGGTGGACCGCCAAACCCACCTTGGGGCCACAGTGAGATGGTCCCTAGGATGCCTTGG

target          290 CTGG

In [15]:
# breakpoint 8
# best_left_scaffold: 2 , score: (30.0, 26.0)
# best_right_scaffold: 0 , score: (26.0, 28.0)
alignments = aligner.align(df['scaffolds'][8][0], rev_comp(df["ref_seq_2"][8].upper()))
for aln in alignments:
    print(aln)

target          127 GTTACCTCCTTGCTGTGACCTCACATGGTCTTTCCTCCGCCTGCTCATCTCTGGTGTCTG
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 GTTACCTCCTTGCTGTGACCTCACATGGTCTTTCCTCCGCCTGCTCATCTCTGGTGTCTG

target          187 ATTTCTGTGTGTGCAAATTTTCTCTTCTAAGGAGAACACCAGTCAGACTGGATAAGAACC
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 ATTTCTGTGTGTGCAAATTTTCTCTTCTAAGGAGAACACCAGTCAGACTGGATAAGAACC

target          247 CACCCTCATGACCTCATTTTCACTTAATCACCTCCATAAAGACCCTGTCTCCAAATTTAG
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 CACCCTCATGACCTCATTTTCACTTAATCACCTCCATAAAGACCCTGTCTCCAAATTTAG

target          307 TCACACACTGTGATTAAGGCTTCATTGCATGAATCTTGAGGGACACAATTTAGCCAATAA
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 TCACACACTGTGATTAAGGCTTCATTGCATGAATCTTGAGGGACACAATTTAGCCAATAA

target          367 GCCT

In [16]:
# Recall, df is # breakpoint rows. Each row contains x scaffolds.
# We compute a new column which is a list of the highest score PairwiseAlignment objects for each of x scaffolds.
# We take a max of the scores between non-rev comp and rev comp of reference for each scaffold to resolve orientation issues.
df["alignments_left"] = df.apply(
    lambda row: [
        max(
            [aligner.align(scaffold, row["ref_seq_1"].upper())[0],
             aligner.align(scaffold, rev_comp(row["ref_seq_1"].upper()))[0]],
            key=lambda aln: aln.score
        )
        for scaffold in row["scaffolds"]
    ],
    axis=1
)

df["alignments_right"] = df.apply(
    lambda row: [
        max(
            [aligner.align(scaffold, row["ref_seq_2"].upper())[0],
             aligner.align(scaffold, rev_comp(row["ref_seq_2"].upper()))[0]],
            key=lambda aln: aln.score
        )
        for scaffold in row["scaffolds"]
    ],
    axis=1
)

In [17]:
# Reformats formatted PairwiseAlignment lists by adding a custom reference chrom and pos, accounting for orientation
def format_alignment(arr, ori, ref_chr, ref_start=0, query_start=0):
    output = []

    for i in range(0, len(arr), 4):
        if i + 2 >= len(arr) or not arr[i].strip():
            continue

        # --- parse target ---
        _, tgt_start, seq, *rest_t = arr[i].split()
        tgt_start = int(tgt_start)
        if(i==0):
            query_start = query_start + tgt_start + 1
        seq_len = len(seq)
        end_num = query_start + seq_len - 1

        # fixed label + start
        prefix = f"{'target':<15}{query_start} "
        prefix_len = len(prefix)

        # --- parse ref ---
        _, ref_offset, ref_seq, *rest_r = arr[i + 2].split()
        ref_len   = len(ref_seq)
        ref_offset = int(ref_offset)
        if i==0:
            ref_start = ref_start + ref_offset - 350 if ori else ref_start + 350 - ref_offset
        ref_start = ref_start + 1

        ref_end   = ref_start + ref_len - 1 if ori else ref_start - ref_len + 1
        ref_label = f"{'ref':<15}{ref_chr}:{ref_start} "
        ref_row_str = f"{ref_label}{ref_seq} {ref_end}"

        # where the ref bases begin
        ref_seq_col = ref_row_str.index(ref_seq)

        # compute how much to shift the target seq so seq cols match
        pad_spaces = ref_seq_col - prefix_len
        if pad_spaces < 0:
            pad_spaces = 0

        # build target line (label stays flush left)
        first_row_str = prefix + " " * pad_spaces + seq + f" {end_num}"

        # --- build pipes ---
        raw_pipes   = arr[i + 1].strip()
        clean_pipes = ''.join(ch for ch in raw_pipes if not ch.isdigit())
        # reuse exact same pad that moved the target seq
        pipe_line = " " * (prefix_len + pad_spaces - 1) + clean_pipes

        # --- collect ---
        output.append(first_row_str)
        output.append(pipe_line)
        output.append(ref_row_str)
        output.append("")

        # advance
        query_start += seq_len
        ref_start   += ref_len - 1 if ori else -ref_len + 1

    return "\n".join(output)


In [18]:
# format function turns PairwiseAlignment object into string which can be made into list
# 2nd breakpoint, first scaffold, assuming left side alignment is chrom1 and pos1
print(format_alignment(format(df['alignments_left'][0][0]).split('\n'), False, df['chrom1'][0], df['pos1'][0]))
print(df['alignments_left'][0][0])
print(format_alignment(format(df['alignments_right'][0][0]).split('\n'), True, df['chrom2'][0], df['pos2'][0]))
print(df['alignments_right'][0][0])

target         492            GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG 551
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr9:133607147 GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG 133607088

target         552            ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT 611
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr9:133607089 ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT 133607030

target         612            CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT 671
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr9:133607031 CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT 133606972

target         672            AAATGATTCCTGTCAATTGTTTCAAATTTTCAACTTTCAAAATACCTGCTGCTTCTTTCC 731
                              |||||||||||

In [19]:
# format function turns PairwiseAlignment object into string which can be made into list
# 2nd breakpoint, first scaffold, assuming left side alignment is chrom1 and pos1
print(format_alignment(format(df['alignments_left'][10][0]).split('\n'), True, df['chrom1'][10], df['pos1'][10]))

print(format_alignment(format(df['alignments_right'][10][0]).split('\n'), False, df['chrom2'][10], df['pos2'][10]))

target         1              GTCCATTCCACCTGAATGCACACTGATTCTGATCCTATTCTCTGAGGGAAATAGACAAGA 60
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr22:23049722 GTCCATTCCACCTGAATGCACACTGATTCTGATCCTATTCTCTGAGGGAAATAGACAAGA 23049781

target         61             CCTCAGTATCTGCTTCAGTGGTGTAACTGTGCACCTGCAGCCATAGCATTCTGTTCCCAG 120
                              ||||||||||||||||||||..||||||||||||||||||||||||||||||||||||||
ref            chr22:23049782 CCTCAGTATCTGCTTCAGTGAGGTAACTGTGCACCTGCAGCCATAGCATTCTGTTCCCAG 23049841

target         121            AGATCCCCATCTGGCACAGATATTGCTAATGCCACTATTGCCCATCTGGGTCTGTACAAA 180
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr22:23049842 AGATCCCCATCTGGCACAGATATTGCTAATGCCACTATTGCCCATCTGGGTCTGTACAAA 23049901

target         181            TCCAAATTCATCTTCCAGGATCATTCAACTATTGTAGAGATCAGTCCAGTAAACTGAAAA 240
                              |||||||||||||||

In [20]:
print(len(df['scaffolds'][1][0]))
print(df['scaffolds'][7][0])
print(df['ref_seq_1'][7])

1120
AACATGAAGTGCCACGGAGAGACACAGAGGATGCTTAGGGAATTTAGGATAGATAAAATGCATATGTCAAGAACTATGTTACATTTCAACCTAGTTTTCAAAATCAGAATACAACATCTGTCACAACCATCCACATAGCTGTGGACTATGTTTGTCAACATTAACAAAAGTAAGAGATTATTTGTTTTCCAGCAGCGTTCTCCTAACTTGTTTATTTCCACAAACCGTCTTTTCAATTTGTCAAAAGGGTGAATCATTCTAATTGGCTATGGAAGCATGACAATACAATGGAAATAAATGTCATTGGTCTATATAAACATCCATGTAGAAAGTATTGAGCCATTCTCAATGTTTTCTAAAGGAGGAATTACAACGCAATGATACCTCTATGTGAAATGTTTGATAAACAATAAACAAGATTGTTCTTCATTTTTACAGTAAAGGTTTTCCCACACTCTAAGTTAACCAGGTTTGACTATTTAAATCTATGTAACATAATCCCTGGTAATTTGCC
TATCAAACATTTCACATAGAGGTATCATTGCGTTGTAATTCCTCCTTTAGAAAACATTGAGAATGGCTCAATACTTTCTACATGGATGTTTATATAGACCAATGACATTTATTTCCATTGTATTGTCATGCTTCCATAGCCAATTAGAATGATTCACCCTTTTGACAAATTGAAAAGACGGTTTGTGGAAATAAACAAGTTAGGAGAACGCTACTGGAAAACAAATAATCTCTTACTTTTGTTAATGTTGACAAACATAGTCCACAGCTATGTGGATGGTTGTGACAGATGTTGTATTCTGATTTTGAAAACTAGGTTGAAATGTAACATAGTTCTTGACATATGCATTTTTCCAAAGGTCATTGGGATCAGGTTTATGGCTAGAGAGTTAATGATTCGGGAAATATTTTCATTGAGGATGAAGCTTGGCAGGGGTTGGGGGTAGTATAGCCTTCCGGTAGGCATCATAGAGTGTGTCAC

In [21]:
len('TTCTGCTAATACAGAAGAGGGAACA-----GTTTATTTCCCACTCTTAAAAGTGCATGGA')

60

In [22]:
# alignments = aligner.align(df['scaffolds'][3][0], rev_comp(df["ref_seq_1"][3].upper()))
print(df['ref_seq_1'][0])
alignments = aligner.align(df['scaffolds'][0][0], df["ref_seq_2"][0].upper())
for aln in alignments:
    print(aln)

ctttgtgtctaccctaaggtcacaaagaagtttcactctcttacatctcatgtttatagttggcctggaattgttttttctgtgtaggggcgaaggttcattttttccctcatattgacctagctgtttaattgaaaagaacatctgctgctgtgtaaatcaggtatacatgtacgtgtggatctgctggggaactccttcctgttaatttgtttgtctttccttgcaccagtattacaatgtattactgtagctttaaaataaatcttaatgcctggtagaataaggcctcctgttttgttcttcaatattgtcttggcttgctcctttgtatttccatatacattttagagtgggttttatcagcttccatacccaaacagaaatacccttaaggattttcttctctgattgcactaaatctataggtttctttgATCCTAGAAATGAATAATAGAATATAAAGAATTCTTATTGATGAAAGAAAGCTATTTCAGGAAACTtttttttttctgttaatgataaaggctaaatgattcctgtcaattgtttcaaattttcaactttcaaaatacctgctgcttctttccaattcacctttctctattctttttttttttccaattgtatttttctTTGACACTCAGTACTTGTCCCTTGATAGTGGAGACCATGTTTATACATTTTCATGGATCGAACAG
target          145 CTGGCTTACCTTGTGCCAGGCAGATGGCAGCCACACAGTGTCCACCGGATGGTTGATTTT
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 CTGGCTTACCTTGTGCCAGGCAGATGGCAGCCACACAGTGTCCACCGGATGGTTGATTTT

target          205 GAAGCAGAGTTAGCTTGTCACCTGCCTCCCTTTC

In [23]:
# alignments = aligner.align(df['scaffolds'][3][0], rev_comp(df["ref_seq_1"][3].upper()))
print(df['ref_seq_1'][4])
alignments = aligner.align(df['scaffolds'][4][0], rev_comp(df["ref_seq_1"][4].upper()))
for aln in alignments:
    print(aln)

gtcttaatttctttatggccagctgttccacagaaatgtggggaaaattcagagtaatatttttaggcttgatagctgactttagggaaaaatgattttagtttctatgacttgcctcaggcatggatgaggggtaagagacaggagacaggagggcaggagaaggtcagagagaaactgcttttgatgccttcattttggggtattattttcagagccccaGTAGTATCATTTACATTCTTTTCATGGGTTTCTTTGGTCTTTATTCTTCTGTTTTTTTCATTAACTGTCACATTACATAAACTATTTATACACATACACACGGATGTTCCATATGAGTCTGAAATAAAAATGCATAATAGTTACAAAACAATTTAAAGATAAGTTAGATGGTCAAATAGTAGAATGATTAATGAATTtgtaggactattcagtttttctttttcatttcatttttaaagttgtacttgcctagcaatttatttcaaatttgttggcatatgtttatttatattatctcattatcctttaaatgcctgaataatattgatattagttatttgtaacttgtatctctgttttaaaaagtttatctatatgtcttgccacagatttattacttttatcattatgttctaagaaacagtttttgccatcttgattctctttgtttacttttttttctTAATGtttcttttttt
target           47 ATAAAAGTAATAAATCTGTGGCAAGACATATAGATAAACTTTTTAAAACAGAGATACAAG
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            75 ATAAAAGTAATAAATCTGTGGCAAGACATATAGATAAACTTTTTAAAACAGAGATACAAG

target          107 TTACAAATAACTAATATCAATATTATTCAGGCAT

In [24]:
print(df['alignments_left'][1][0])
print(df['ref_seq_1'][1].upper())
print(len(df['ref_seq_1'][1]))
print(df['pos1'][1])

target          471 ATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGT
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           350 ATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGT

target          531 GATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATG
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           410 GATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATG

target          591 TGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTG
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           470 TGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTG

target          651 AGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTT
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           530 AGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTT

target          711 GGAG

In [25]:
print(df['alignments_left'][0][0])

target          491 GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           350 GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG

target          551 ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           410 ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT

target          611 CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           470 CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT

target          671 AAATGATTCCTGTCAATTGTTTCAAATTTTCAACTTTCAAAATACCTGCTGCTTCTTTCC
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           530 AAATGATTCCTGTCAATTGTTTCAAATTTTCAACTTTCAAAATACCTGCTGCTTCTTTCC

target          731 AATT

In [26]:
# format function turns PairwiseAlignment object into string which can be made into list
# 2nd breakpoint, first scaffold, assuming left side alignment is chrom1 and pos1
print(format_alignment(format(df['alignments_left'][0][0]).split('\n'), True, df['chrom1'][0], df['pos1'][0]))

print(format_alignment(format(df['alignments_right'][0][0]).split('\n'), True, df['chrom2'][0], df['pos2'][0]))

target         492            GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG 551
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr9:133607147 GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG 133607206

target         552            ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT 611
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr9:133607207 ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT 133607266

target         612            CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT 671
                              ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
ref            chr9:133607267 CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT 133607326

target         672            AAATGATTCCTGTCAATTGTTTCAAATTTTCAACTTTCAAAATACCTGCTGCTTCTTTCC 731
                              |||||||||||

In [27]:
print(df['alignments_left'][0][0])
print(df['ref_seq_1'][0])
print(len(df['ref_seq_1'][0]))

target          491 GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           350 GAGTGGGTTTTATCAGCTTCCATACCCAAACAGAAATACCCTTAAGGATTTTCTTCTCTG

target          551 ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           410 ATTGCACTAAATCTATAGGTTTCTTTGATCCTAGAAATGAATAATAGAATATAAAGAATT

target          611 CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           470 CTTATTGATGAAAGAAAGCTATTTCAGGAAACTTTTTTTTTTCTGTTAATGATAAAGGCT

target          671 AAATGATTCCTGTCAATTGTTTCAAATTTTCAACTTTCAAAATACCTGCTGCTTCTTTCC
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           530 AAATGATTCCTGTCAATTGTTTCAAATTTTCAACTTTCAAAATACCTGCTGCTTCTTTCC

target          731 AATT

In [28]:
print(df['alignments_right'][0][0])
print(df['ref_seq_2'][0])
print(len(df['ref_seq_2'][0]))

target          145 CTGGCTTACCTTGTGCCAGGCAGATGGCAGCCACACAGTGTCCACCGGATGGTTGATTTT
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 CTGGCTTACCTTGTGCCAGGCAGATGGCAGCCACACAGTGTCCACCGGATGGTTGATTTT

target          205 GAAGCAGAGTTAGCTTGTCACCTGCCTCCCTTTCCCGGGACAACAGAAGCTGACCTCTTT
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 GAAGCAGAGTTAGCTTGTCACCTGCCTCCCTTTCCCGGGACAACAGAAGCTGACCTCTTT

target          265 GATCTCTTGCGCAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCA
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 GATCTCTTGCGCAGATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCA

target          325 CTCAGCCACTGGATTTAAGCAGAGTTCAAGTAAGTACTGGTTTGGGGAGGAGGGTTGCAG
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 CTCAGCCACTGGATTTAAGCAGAGTTCAAGTAAGTACTGGTTTGGGGAGGAGGGTTGCAG

target          385 CGGC

In [29]:
len(df['ref_seq_1'][0])

701

In [30]:
import numpy as np
import pandas as pd
df1 = pd.read_csv("augmented_predictions1.tsv", sep="\t")
df2 = pd.read_csv("augmented_predictions.tsv", sep="\t")

df2["sc_hom_len"] = df2["sc_hom_len"].replace('', np.nan)
df2["sc_hom_len"] = df2["sc_hom_len"].astype(float).astype('Int64')
df2["sc_pos1"] = df2["sc_pos1"].replace('', np.nan)
df2["sc_pos1"] = df2["sc_pos1"].astype(float).astype('Int64')
df2["sc_pos2"] = df2["sc_pos2"].replace('', np.nan)
df2["sc_pos2"] = df2["sc_pos2"].astype(float).astype('Int64')

new_cols = [col for col in df2.columns if col not in df1.columns]
df1_with_new = pd.concat([df1, df2[new_cols]], axis=1)
df1_with_new.to_csv("K562_summaries_refined.tsv", sep="\t", index=False)

In [61]:
alignments = aligner.align(df['scaffolds'][2][1], rev_comp(df["ref_seq_2"][2].upper()))
print(alignments[0])

target            0 GTTAATGAAAAAAACAGAAGAATAAAGACCAAAGAAAC  38
                  0 ||||||||||||||||||||||||||||||||||||||  38
query           313 GTTAATGAAAAAAACAGAAGAATAAAGACCAAAGAAAC 351



In [None]:
alignments = aligner.align(df['scaffolds'][8][1], df["ref_seq_1"][2].upper())
print(alignments[0])

target           35 AACTTTTATCCTCTCTCATGGGGTGCATTACTTTCCAGTCTGGTACATTGTCTAGAAAAG
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           350 AACTTTTATCCTCTCTCATGGGGTGCATTACTTTCCAGTCTGGTACATTGTCTAGAAAAG

target           95 TGTCGGTGACCCTGATTCATCAAGTCATGGGCTACTTTTCACCCCCAAATTTCTTAGTCA
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           410 TGTCGGTGACCCTGATTCATCAAGTCATGGGCTACTTTTCACCCCCAAATTTCTTAGTCA

target          155 TTGTCTCTGCTGTGGGGCCCCACTAGTTCCATGCCATTAAAATTCAGACTTGAGTCATCT
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           470 TTGTCTCTGCTGTGGGGCCCCACTAGTTCCATGCCATTAAAATTCAGACTTGAGTCATCT

target          215 TCATTGAAAATAATATTGATACTATCCAAAGGAAATACAAACAATTTAAAAAATATATAG
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           530 TCATTGAAAATAATATTGATACTATCCAAAGGAAATACAAACAATTTAAAAAATATATAG

target          275 ATCA