In [32]:
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

In [33]:

# indelphi sample
# >2_1_4_2_GTGTCGCCGGTCTACATTAC 30 FORWARD
# ATCGCTGAGCGTGTCGCCGGTCTACATTACAGGCTTCCCCCTGGGCTCGTAGCGC
def get_sequence_with_flank_lengths(sequence, pam_index, left_flank_len, right_flank_len, include_spacer=False):
    """Cut a window out of ``sequence`` around position ``pam_index``.

    Args:
        sequence: The full sequence to cut from.
        pam_index: Index splitting the window; the left flank ends just
            before it and the right flank starts at it.
        left_flank_len: Maximum number of characters to take before
            ``pam_index``.
        right_flank_len: Maximum number of characters to take starting at
            ``pam_index``.
        include_spacer: When True, a ``"|"`` is inserted between the two
            flanks to mark the split position.

    Returns:
        The left flank followed by the right flank (optionally separated by
        ``"|"``). Either flank is shorter than requested when the sequence
        does not extend that far.
    """
    # Clamp the start at 0: a negative slice start would wrap around to the
    # end of the sequence and silently yield an empty or wrong left flank.
    left_start = max(0, pam_index - left_flank_len)
    left_flank = sequence[left_start:pam_index]
    right_flank = sequence[pam_index:pam_index + right_flank_len]
    if include_spacer:
        return left_flank + "|" + right_flank
    return left_flank + right_flank
# Worked example on the inDelphi sample record above: keep 30 characters
# before pam_index and 25 characters from pam_index onwards.
sequence = "ATCGCTGAGCGTGTCGCCGGTCTACATTACAGGCTTCCCCCTGGGCTCGTAGCGC"
pam_index, left_flank_len, right_flank_len = 30, 30, 25
after = get_sequence_with_flank_lengths(
    sequence,
    pam_index,
    left_flank_len,
    right_flank_len,
    include_spacer=True,  # show where the window is split
)
print(f"After sequence: {after}")

# FORECasT sample
# >Oligo_43170 42 FORWARD
# CTCCTATAATTCTAATCACTACAAGTCAGGAATGCCTGCGTTTGGCCGTCCAGTTAGTAACAGAAGGTCAGGTAAGAGG
# Same worked example on the FORECasT sample record; the flank lengths are
# the ones already assigned for the inDelphi example.
sequence = "CTCCTATAATTCTAATCACTACAAGTCAGGAATGCCTGCGTTTGGCCGTCCAGTTAGTAACAGAAGGTCAGGTAAGAGG"
pam_index = 42
after = get_sequence_with_flank_lengths(
    sequence,
    pam_index,
    left_flank_len,
    right_flank_len,
    include_spacer=True,  # show where the window is split
)
print(f"After sequence: {after}")





After sequence: ATCGCTGAGCGTGTCGCCGGTCTACATTAC|AGGCTTCCCCCTGGGCTCGTAGCGC
After sequence: TAATCACTACAAGTCAGGAATGCCTGCGTT|TGGCCGTCCAGTTAGTAACAGAAGG


In [None]:

def read_fasta(file_path, left_flank_len=30, right_flank_len=25, max_sequences=11):
    """Read FORWARD-oriented records from a FASTA file as fixed windows.

    Each record header is expected to look like ``<id> <pam_index>
    <orientation>`` (see the sample records earlier in this notebook).
    Records whose orientation is not ``"FORWARD"`` are skipped.

    Args:
        file_path: Path of the FASTA file to parse.
        left_flank_len: Characters to keep before the PAM index.
        right_flank_len: Characters to keep from the PAM index onwards.
        max_sequences: Stop after collecting this many windows. The default
            of 11 reproduces the cap this notebook has been run with; pass
            ``None`` to read the whole file.

    Returns:
        A list of window strings, without the ``"|"`` spacer.

    Raises:
        IndexError: If a header has fewer than three whitespace-separated
            fields.
        ValueError: If the second header field is not an integer.
    """
    if max_sequences is not None and max_sequences <= 0:
        return []

    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        # split() rather than split(" ") so tabs or repeated spaces in the
        # header do not produce empty fields.
        fields = record.description.split()
        orientation = fields[2]
        if orientation != "FORWARD":
            continue
        pam_index = int(fields[1])
        # record.seq is a Seq object; work on its plain-string form.
        sequence = str(record.seq)
        window = get_sequence_with_flank_lengths(sequence,
                                                 pam_index,
                                                 left_flank_len,
                                                 right_flank_len,
                                                 include_spacer=False)
        sequences.append(window)
        if max_sequences is not None and len(sequences) >= max_sequences:
            break
    return sequences

# Load the FORECasT train and test splits and pool their windows.
FORECAST_train_file_path = '/Users/colm/repos/repair-outcome-prediction/local/data/FORECasT/train.fasta'
# Previously this also pointed at train.fasta, so the train split was loaded
# twice and the test split never was.
# NOTE(review): assumes test.fasta sits next to train.fasta, as it does in the
# inDelphi directory — confirm the file exists.
FORECAST_test_file_path = '/Users/colm/repos/repair-outcome-prediction/local/data/FORECasT/test.fasta'
FC_sequences = read_fasta(FORECAST_train_file_path) + read_fasta(FORECAST_test_file_path)

print(f"Number of FORECasT Sequences: {len(FC_sequences)}")

Number of FORECasT Sequences: 22


In [40]:
# Load the inDelphi train and test splits and pool their windows.
# The two path variables previously held each other's file (train -> test.fasta,
# test -> train.fasta); each name now matches the file it points at, so the
# pooled list is ordered train first, then test.
indelphi_train_file_path = '/Users/colm/repos/repair-outcome-prediction/local/data/inDelphi/train.fasta'
indelphi_test_file_path = '/Users/colm/repos/repair-outcome-prediction/local/data/inDelphi/test.fasta'
indelphi_sequences = read_fasta(indelphi_train_file_path) + read_fasta(indelphi_test_file_path)

print(f"Number of Indelphi Sequences: {len(indelphi_sequences)}")

Number of Indelphi Sequences: 22


In [41]:
def smith_waterman_score(seq1, seq2):
    """Return the best local-alignment (Smith-Waterman) score of two sequences.

    Uses ``pairwise2.align.localxx``: identical characters score 1, and there
    are no mismatch or gap penalties.

    Args:
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        The best local alignment score as a float; ``0.0`` when either
        sequence is empty.
    """
    # Nothing can align against an empty sequence; answer directly instead of
    # indexing into a result that would contain no alignments.
    if not seq1 or not seq2:
        return 0.0

    # score_only=True skips the traceback, so pairwise2 does not build every
    # co-optimal alignment just for us to read a single number from the first.
    return float(pairwise2.align.localxx(seq1, seq2, score_only=True))

In [37]:
# Score the first inDelphi window against itself. With localxx scoring (1 per
# identical character, no penalties) a self-alignment is the highest score any
# alignment involving that window can reach, so it serves as the reference maximum.
max_sw_score = smith_waterman_score(indelphi_sequences[0], indelphi_sequences[0])
print(f"Max Smith-Waterman Score is {max_sw_score}")

Max Smith-Waterman Score is 55.0


In [38]:
from tqdm import tqdm
# Benchmarking: time one "row" of the all-vs-all comparison by scoring the
# first inDelphi window against every loaded window. Scores are discarded;
# only the tqdm timing is of interest.
for s in tqdm(indelphi_sequences):
    smith_waterman_score(indelphi_sequences[0], s)

# Takes about 23 seconds to do 1 iteration
# NOTE(review): the run recorded in this cell's output covered 22 windows in
# well under a second, so the 23 s figure presumably comes from a pass over
# the full, uncapped dataset — confirm before relying on it.

100%|██████████| 22/22 [00:00<00:00, 90.75it/s]


In [39]:
# Runtime projection for the all-vs-all comparison.
# 23 is the per-pass cost in seconds quoted in the benchmarking note; one pass
# is needed per loaded sequence, and the total is converted to hours.
one_core_hourly_estimate = (23 * len(indelphi_sequences)) / 60 / 60
# Optimistic figure: assumes the work divides evenly across 8 cores.
eight_core_hourly_estimate = one_core_hourly_estimate/8

print(f"On 1 core, would take about {one_core_hourly_estimate} hours")
print(f"On 8 core, would take about {eight_core_hourly_estimate} hours")


On 1 core, would take about 0.14055555555555557 hours
On 8 core, would take about 0.017569444444444447 hours
