In [1]:
from Bio import Align
from Bio.Seq import Seq
from Bio.Align import substitution_matrices
import pandas as pd
import subprocess
import shutil

In [2]:
def extract_region(fasta_file, region):
    """
    Extracts a specific region from a FASTA file using samtools.

    :param fasta_file: Path to the indexed FASTA file.
    :param region: Region in the format "chr:start-end" (e.g., "chr18:180-280").
    :return: The sequence as a string.
    """
    result = subprocess.run(
        ["samtools", "faidx", fasta_file, region],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    if result.returncode == 0:
        # The sequence will be in the second line, as the first line contains the header
        return ''.join(result.stdout.splitlines()[1:])
    else:
        raise Exception(f"Error: {result.stderr.strip()}")
    
def generate_scaffolds(fastq1, fastq2, out_dir='out'):
    """
    Extracts a specific region from a FASTA file using samtools.

    :param fasta_file: Path to the indexed FASTA file.
    :param region: Region in the format "chr:start-end" (e.g., "chr18:180-280").
    :return: The sequence as a string.
    """

    shutil.rmtree('./out')
    result = subprocess.run(
        ["python", "./SPAdes-4.2.0-Linux/bin/spades.py", "--meta", "--pe1-1", f"./fastq/{fastq1}", "--pe1-2", f"./fastq/{fastq2}", "-o", out_dir],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    # print(result.stdout)
    # print(result.stderr)
    if result.returncode == 0:
        # The sequence will be in the second line, as the first line contains the header
        with open(f'./{out_dir}/scaffolds.fasta', 'r') as file:
            file_content = file.read()  # Read the entire content of the file into a string
            return [''.join(s.split('\n')[1:]) for s in file_content.split('>')][1:]
    else:
        raise Exception(f"{result.stderr}")

In [3]:
df = pd.read_csv('./K562/K562_summaries.tsv', sep="\t")
# aligner = Align.PairwiseAligner(mode="global", open_gap_score = -10, extend_gap_score = -0.5, match_score = 1.0, mismatch_score = -1.0)
aligner = Align.PairwiseAligner(mode="local", open_gap_score = -10, extend_gap_score = -5, match_score=2, mismatch_score=-2)
df['ref_seq_1'] = df['chrom1'].str.cat((df['pos1'] - 350).astype(str).str.cat((df['pos1'] + 350).astype(str), sep='-'), sep=':').apply(lambda x: extract_region('hg19/hg19full.fa', x))
df['ref_seq_2'] = df['chrom2'].str.cat((df['pos2'] - 350).astype(str).str.cat((df['pos2'] + 350).astype(str), sep='-'), sep=':').apply(lambda x: extract_region('hg19/hg19full.fa', x))
fastqs = ([('b_' + df['chrom1'] + '_' + (df['pos1']+1).astype(str) + '_' + df['chrom2'] + '_' + (df['pos2']+1).astype(str) + '_' + str(x) + '.fastq.gz').to_list() for x in range(1, 3)])
df['scaffolds'] = [generate_scaffolds(fastqs[0][idx], fastqs[1][idx]) for idx, val in enumerate(fastqs[0])]



In [4]:
scaffold_alignments = df.apply(lambda row: [(aligner.align(row['ref_seq_1'], scaffold)[0].score, aligner.align(row['ref_seq_2'], scaffold)[0].score) for scaffold in row['scaffolds']], axis=1)
scaffold_alignments

0                                      [(132.0, 700.0)]
1                         [(28.0, 48.0), (530.0, 12.0)]
2                        [(22.0, 106.0), (700.0, 28.0)]
3                        [(120.0, 18.0), (18.0, 224.0)]
4                                      [(106.0, 106.0)]
5            [(20.0, 22.0), (18.0, 20.0), (18.0, 20.0)]
6           [(456.0, 30.0), (18.0, 26.0), (20.0, 24.0)]
7           [(30.0, 26.0), (22.0, 248.0), (26.0, 32.0)]
8            [(26.0, 28.0), (26.0, 22.0), (30.0, 26.0)]
9                                        [(4.0, 294.0)]
10         [(226.0, 530.0), (16.0, 20.0), (58.0, 76.0)]
11    [(584.0, 124.0), (58.0, 76.0), (66.0, 64.0), (...
dtype: object

In [5]:
for idx, aligns in enumerate(scaffold_alignments):
    print("breakpoint", idx)
    left_max = max(scaffold_alignments[idx], key=lambda x: x[0])
    left_idx_max = scaffold_alignments[idx].index(left_max)
    right_max = max(scaffold_alignments[idx], key=lambda x: x[1])
    right_idx_max = scaffold_alignments[idx].index(right_max)
    print("best_left_scaffold:", left_idx_max, ", score:", left_max)
    print("best_right_scaffold:", right_idx_max, ", score:", right_max)


breakpoint 0
best_left_scaffold: 0 , score: (132.0, 700.0)
best_right_scaffold: 0 , score: (132.0, 700.0)
breakpoint 1
best_left_scaffold: 1 , score: (530.0, 12.0)
best_right_scaffold: 0 , score: (28.0, 48.0)
breakpoint 2
best_left_scaffold: 1 , score: (700.0, 28.0)
best_right_scaffold: 0 , score: (22.0, 106.0)
breakpoint 3
best_left_scaffold: 0 , score: (120.0, 18.0)
best_right_scaffold: 1 , score: (18.0, 224.0)
breakpoint 4
best_left_scaffold: 0 , score: (106.0, 106.0)
best_right_scaffold: 0 , score: (106.0, 106.0)
breakpoint 5
best_left_scaffold: 0 , score: (20.0, 22.0)
best_right_scaffold: 0 , score: (20.0, 22.0)
breakpoint 6
best_left_scaffold: 0 , score: (456.0, 30.0)
best_right_scaffold: 0 , score: (456.0, 30.0)
breakpoint 7
best_left_scaffold: 0 , score: (30.0, 26.0)
best_right_scaffold: 1 , score: (22.0, 248.0)
breakpoint 8
best_left_scaffold: 2 , score: (30.0, 26.0)
best_right_scaffold: 0 , score: (26.0, 28.0)
breakpoint 9
best_left_scaffold: 0 , score: (4.0, 294.0)
best_righ

In [6]:
scaffold_alignments[11]

[(584.0, 124.0), (58.0, 76.0), (66.0, 64.0), (58.0, 14.0)]

In [None]:
df['scaffolds'][1]

['GTCCATTCCACCTGAATGCACACTGATTCTGATCCTATTCTCTGAGGGAAATAGACAAGACCTCAGTATCTGCTTCAGTGGTGTAACTGTGCACCTGCAGCCATAGCATTCTGTTCCCAGAGATCCCCATCTGGCACAGATATTGCTAATGCCACTATTGCCCATCTGGGTCTGTACAAATCCAAATTCATCTTCCAGGATCATTCAACTATTGTAGAGATCAGTCCAGTAAACTGAAAACAAAACCCCAGAAGTAATCACACGTACTGCGGGAGAGGAGAGAGCTGTCTCAGAGGGCACAGTCTTGTTGAAATTCCTCTGCTTCTTGGGAAGAAACAGCCCAGAAAAGGCTCCAGGGGCTGCGTCTCATCTGGGTTCCTGGATGAGGCACACCCCCTACTGCACTGAGATTCCACAGAAGCTGGGGGTGAAGTGCAAACACCGTCTTTGGTTCATCCTTGGGATTTTCTTTTTATCACACACTCGTCATTTTGTCCACAATGTCCAAAGTTTCTAAAGTGCTGTCTAATAGATAGAGGTTCTTTGTGCGCTGAACTAAAAGCTCTTGGGTTGGACAATGGTGATGTTTGCACAACATTTTTACTGTACTTACTGCCGCTTAATTACA',
 'TTTTTTAGATGGAGTCTCACTCTGTAGCCCAGGCTGGAGTGCAGTGGCCCGATCTTGGCTCACTGCAAGCTCCACCTCCCAGATTCACACCGTTCTCCTGCCTCAGCCTCCCTAGTAGCTGGGACTACAGGCACGCGCCACCGCACCCGGCTAATTTTTTTGTATTTTTTTATTTTAGTAGAGATGAGTTTTCACTGTGTTAGCCAGGATGGTCTCAATCTCCTGACCTCGTGATCTGCCTGCTTCGGCCTCCCAAAGTGCTGGGATTACAGGTGTGAGCCACCGCGCTCGGCCATAACGTGTTGTTTATTTTGTAGGAGACTCTCAGCTGCTCCAGACAACTGAAGTCCCAATTTCAGCTAATA

In [9]:
for str in df['scaffolds'][1]:
    print(str, "\n")

TCCTTTGACATGTGTATATCCTCTTAATTGAAATGTCTCTTCAGGACTTTGGCCTATTTTCTGGATGGGCTATTGGATTTTTTAATGTAGAGTTTTGAGAGTTCTTTATGTATTCTAGATACCAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAATTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGTGATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATGTGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTGAGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTTGGAGCTGGTCAGATCGGAGCTGGCATCCAGCCTCTGAGAAGTCTCTATCCCCTGATCCCTGGGTTTTTGGGTTTCTCTATCTGTGGAGCCCACTGCCTAGGGCTAAGGTCAAGGATAGAAGAGACAGTGCCTGCTCAATGTAACGCCTGACACACAGTGGGTGCTTTACAAACCCAGGCGGAGCCCAGGACGGAGGGGGCGGGGCACTGTCACTCCCAGCTGGCCTTGAAGAGAGCCCACCCCCGGGAAATCACCTCCAGCAGCTTCACTGCTCACAAGGCGGTTCAGG

In [13]:
df["ref_seq_1"][1].upper()

'TTGACCTTAGCCCTAGGCAGTGGGCTCCACAGATAGAGAAACCCAAAAACCCAGGGATCAGGGGATAGAGACTTCTCAGAGGCTGGATGCCAGCTCCGATCTGACCAGCTCCAAAGCCCACATTCAGCTTCTTCACCAAGCACACCCCACTTGCTAGCCAGGGCCAGCCCCTCAGAACTAGGATGGGACTGATCACTAGGAAGTCCTACCAGCAGAGGTGTGTGGGCGCCCACATGGTGAGGACTTGCAAAGTAGAAGACCCCTCTCCCTTAACCTGCTCCCCCTTGACATCACCCCTGCACAGGATGTCATCTCAGTCTGCAGGAGAAGGGACCAGGAAGCATCAGTCCATGGGCCTGTTTGCCGGCTTTGAGCTGGGGGGCAGCTCTGCCAGGCAGAGGGTGTGAATGGGCTCGGCTCCTGCATGCACTCCCTGCCCTGGCTTCTGAAGCGCCCACCAGGGTTCTGGTGGCTTCCCCAGCTCGCGGCCCCTCCGGCCCTGCCGCTTCCCTACACAGCCCTTCCTCCTGCTCGTCATTTTCCGGAGCTCCGTGGGTGTTGGGCACAGACCCACGGCAGCCTCAGCATATGTGGAAGGCTCATGACGCATCCACGCGCGTGTGAAGTGCCGGTGTCTGCAGAGCAGAGGAAGGGGACATCTGTTTGAGTCATTGCCCAACAGCACAGGGCACTCCCATTGC'

In [14]:
df["ref_seq_2"][1].upper()

'CCAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAACTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATTTCTTAGACTGCGTTAAAATTTTTATTTCAATCATCACATACATTTTAGATCATGGTATAATTTCTGTTTTAATCACTACATATGATTTAGAAAACCTATGATGAGGATAATCTATGTATTTACCCACAGTTCTGCTCCTTCTGTTGCTGTTTTTTTTTTTCTTCCTGATGTTTCCAGACTTCTTCTTTCACCATTTCCTTTCCTTTTAAAGAATTTTCTGTAACCATTCTTTAAGGATAGATCTGCAATTGACAAATTATTTTACATTCTATTTGTCTGAAAATGTCTTTATTTCTACTTCATTCTTGCTGGATTGTTGTAGCTGATATACAATCCATGGTTGGTAGT'

In [20]:
df["scaffolds"][1][0]

'TCCTTTGACATGTGTATATCCTCTTAATTGAAATGTCTCTTCAGGACTTTGGCCTATTTTCTGGATGGGCTATTGGATTTTTTAATGTAGAGTTTTGAGAGTTCTTTATGTATTCTAGATACCAATCCCTTTTTGGATATGTAAATGCAATATTTTCTTCCAGAAAAAGTTTGAAGATTTTAAAAATTGTTTTCTGCTTGCTTTCTCTTTTTCTTGTTCCTCTGTTTTCTTTTCTTTCATCCTTTTGTGCTACTGGAACATATTTTTAAAATTTCATCTTGACTTTGTAAATAATGTTTATCCCTTTGTGTAGGTTTCTTGGTGGTTATTCTGAGTATTACAATATTATTACAATATAAATATATAATTTATCACAGCCTAAACATTCAAGTTTTATGACTTTGAGTGAAGTATTTAGGACTTGCTTCTATTGAGGCTATTTTACCTTCCTGCTTTGTCTTGAAAATTATAATGGACTGATGCTTCCTGGTCCCTTCTCCTGCAGACTGAGATGACATCCTGTGCAGGGGTGATGTCAAGGGGGAGCAGGTTAAGGGAGAGGGGTCTTCTACTTTGCAAGTCCTCACCATGTGGGCGCCCACACACCTCTGCTGGTAGGACTTCCTAGTGATCAGTCCCATCCTAGTTCTGAGGGGCTGGCCCTGGCTAGCAAGTGGGGTGTGCTTGGTGAAGAAGCTGAATGTGGGCTTTGGAGCTGGTCAGATCGGAGCTGGCATCCAGCCTCTGAGAAGTCTCTATCCCCTGATCCCTGGGTTTTTGGGTTTCTCTATCTGTGGAGCCCACTGCCTAGGGCTAAGGTCAAGGATAGAAGAGACAGTGCCTGCTCAATGTAACGCCTGACACACAGTGGGTGCTTTACAAACCCAGGCGGAGCCCAGGACGGAGGGGGCGGGGCACTGTCACTCCCAGCTGGCCTTGAAGAGAGCCCACCCCCGGGAAATCACCTCCAGCAGCTTCACTGCTCACAAGGCGGTTCAG

In [31]:
def rev_comp(dna):
    complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
    reversed_dna = dna[::-1]
    return "".join(complement[base] for base in reversed_dna)

In [72]:
# breakpoint 8
# best_left_scaffold: 2 , score: (30.0, 26.0)
# best_right_scaffold: 0 , score: (26.0, 28.0)
alignments = aligner.align(df['scaffolds'][8][1], rev_comp(df["ref_seq_1"][8].upper()))
for aln in alignments:
    print(aln)

target           51 CACCTGGCAGTGACAGCTAAATTGGGAGCTGCTGTTAACACCAGGAGCCCAGCAGGCAGT
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 CACCTGGCAGTGACAGCTAAATTGGGAGCTGCTGTTAACACCAGGAGCCCAGCAGGCAGT

target          111 CAAACCTGGGTTGGGGGGGAGGGGCACCTGTTGAAAGCACCCACGAGCTCCCAGTGGAGC
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 CAAACCTGGGTTGGGGGGGAGGGGCACCTGTTGAAAGCACCCACGAGCTCCCAGTGGAGC

target          171 AGGGAAGGCAGGGAAGGCATGGCGATAGCTGTCAGCAGGAGTGGGTGGGGAGTGGAAGGC
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 AGGGAAGGCAGGGAAGGCATGGCGATAGCTGTCAGCAGGAGTGGGTGGGGAGTGGAAGGC

target          231 AGGGTGGACCGCCAAACCCACCTTGGGGCCACAGTGAGATGGTCCCTAGGATGCCTTGGC
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 AGGGTGGACCGCCAAACCCACCTTGGGGCCACAGTGAGATGGTCCCTAGGATGCCTTGGC

target          291 TGGC

In [69]:
# breakpoint 8
# best_left_scaffold: 2 , score: (30.0, 26.0)
# best_right_scaffold: 0 , score: (26.0, 28.0)
alignments = aligner.align(df['scaffolds'][8][0], rev_comp(df["ref_seq_2"][8].upper()))
for aln in alignments:
    print(aln)

target          128 TTACCTCCTTGCTGTGACCTCACATGGTCTTTCCTCCGCCTGCTCATCTCTGGTGTCTGA
                  0 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query             0 TTACCTCCTTGCTGTGACCTCACATGGTCTTTCCTCCGCCTGCTCATCTCTGGTGTCTGA

target          188 TTTCTGTGTGTGCAAATTTTCTCTTCTAAGGAGAACACCAGTCAGACTGGATAAGAACCC
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            60 TTTCTGTGTGTGCAAATTTTCTCTTCTAAGGAGAACACCAGTCAGACTGGATAAGAACCC

target          248 ACCCTCATGACCTCATTTTCACTTAATCACCTCCATAAAGACCCTGTCTCCAAATTTAGT
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           120 ACCCTCATGACCTCATTTTCACTTAATCACCTCCATAAAGACCCTGTCTCCAAATTTAGT

target          308 CACACACTGTGATTAAGGCTTCATTGCATGAATCTTGAGGGACACAATTTAGCCAATAAG
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           180 CACACACTGTGATTAAGGCTTCATTGCATGAATCTTGAGGGACACAATTTAGCCAATAAG

target          368 CCTC