In [163]:
filename ="./sequence.fasta"

In [164]:
from Bio import SeqIO

In [165]:
seq_object= SeqIO.read(filename,"fasta")


In [166]:
print(seq_object.description)

NM_001101.5 Homo sapiens actin beta (ACTB), mRNA


In [167]:
sequence= seq_object.seq
print(len(sequence))

1812


In [168]:
import random
def truncate_sequence(sequence):
    if len(sequence) > 400:
        # Calculate number of deletions required
        num_deletions = len(sequence) - 400
        # Randomly select positions to delete
        delete_positions = random.sample(range(len(sequence)), num_deletions)
        # Create truncated sequence
        truncated_sequence = ''.join([sequence[i] for i in range(len(sequence)) if i not in delete_positions])
        return truncated_sequence
    else:
        return sequence

# Function to introduce mutations based on specified rules
def introduce_mutations(sequence, k):
    # Insertions
    for _ in range(k):
        pos = random.randint(0, len(sequence))
        sequence = sequence[:pos] + random.choice('ACGT') + sequence[pos:]
    
    # Deletions
    for _ in range(k):
        pos = random.randint(0, len(sequence) - 1)
        sequence = sequence[:pos] + sequence[pos+1:]
    
    # Single-point mutations
    for _ in range(k):
        pos = random.randint(0, len(sequence) - 1)
        sequence = sequence[:pos] + random.choice('ACGT'.replace(sequence[pos], '')) + sequence[pos+1:]
    
    # Generate X1 variant
    start1 = 2 * k
    repeat_seq1 = sequence[start1:start1 + k]  # Subsequence starting at 2k
    start2 = 400 - 2 * k
    delete_seq1 = sequence[start2:start2 + k]  # Subsequence starting at 400-2k

    X1 = sequence[:start2] + repeat_seq1 + sequence[start2:]  # Insert repeat_seq1
    X1 = X1.replace(delete_seq1, '')  # Delete delete_seq1 from X1

    # Generate X2 variant
    start3 = 400 - 2 * k
    repeat_seq2 = sequence[start3:start3 + k]  # Subsequence starting at 400-2k
    start4 = 2 * k
    delete_seq2 = sequence[start4:start4 + k]  # Subsequence starting at 2k

    X2 = sequence[:start4] + repeat_seq2 + sequence[start4:]  # Insert repeat_seq2
    X2 = X2.replace(delete_seq2, '')  # Delete delete_seq2 from X2

    return X1, X2




In [169]:
truncated_sequence = truncate_sequence(sequence)
print(len(truncated_sequence))


400


In [170]:
print(truncated_sequence)

GCGCGGCTCCGCCACGCCTCATGACCCGCAGAGCCGGCCCCCCCCATAGGGCGATTCGCGAGCCGACGTACTGCGACCTAGGGAGAGGCTGCCCCAGACGACCCCTCGTGCCCCGGAACGCTCTCCCGTCACGTCACAGGCCCGTCCAGCGAGGAGCGATACCCCGTCCCCCCCCTCCGCGCGATTACTCTGCAGATCCTTTCGCGCACTCACTCGGGCGTGAAGTGCTCACTTAACCTGGTACCCTCGCTGGGTCACGAAATCGTCATCCAAATGTAGATTGCTACAAGGGATATGTTTGTTTTTTTACGAGCGATGTTGCCAAGGTAGTGTCTGATCTCCCATCTGGCACCGGTTTGAGTTTTTGTTATGCCCTTTCCGACTTGGGGCTCTAGACAAG


In [171]:
k = 10
X1, X2 = introduce_mutations(truncated_sequence, k)

print(truncated_sequence)
print("\nVariant X1:")
print(X1)
print("\nVariant X2:")
print(X2)


GCGCGGCTCCGCCACGCCTCATGACCCGCAGAGCCGGCCCCCCCCATAGGGCGATTCGCGAGCCGACGTACTGCGACCTAGGGAGAGGCTGCCCCAGACGACCCCTCGTGCCCCGGAACGCTCTCCCGTCACGTCACAGGCCCGTCCAGCGAGGAGCGATACCCCGTCCCCCCCCTCCGCGCGATTACTCTGCAGATCCTTTCGCGCACTCACTCGGGCGTGAAGTGCTCACTTAACCTGGTACCCTCGCTGGGTCACGAAATCGTCATCCAAATGTAGATTGCTACAAGGGATATGTTTGTTTTTTTACGAGCGATGTTGCCAAGGTAGTGTCTGATCTCCCATCTGGCACCGGTTTGAGTTTTTGTTATGCCCTTTCCGACTTGGGGCTCTAGACAAG

Variant X1:
GCGCGGCTCCGCCACGCATCATGACCGCAGAGCCGGCCCCCCCCATAGGGCGATTCGGAGCCGACGTACGCGACCTAGGGACGAGGCGCCCAGACGACCCCGCGTGCACCGGAACAGCTCTCCCGTCACGTCACAGGCCTGTCCAGTAGAGGAGCGATACCCGTCCCCCCCCTCCGCGCGAGTACTCTGCAGATCCTTTCGCGCACTCACTAGGGCGTGAAGTTCTCACCTTAACCTGGTACCCTCGCTGGGGTCACGAAATCGTCATCCAAATGTAGATGCTACAAGGATATGTTTGTTTTTTACGAACGATGTTGCCAAGGTTAGGTGTCTGATCTCCCATCTGGGCACCGGTCTTGAGATTTTGTTATGCCCTTTCCATGACCGCAGTCTAGACAAG

Variant X2:
GCGCGGCTCCGCCACGCATCGACTTGGGGCAGCCGGCCCCCCCCATAGGGCGATTCGGAGCCGACGTACGCGACCTAGGGACGAGGCGCCCAGACGACCCCGCGTGCACCGGAACAGCTCTCCCGTCACGTCACAGGCCTGTCCAGTAGAGGAGCGATACCCGTCCCCCCCC

In [172]:
from Bio import pairwise2
from Bio.Seq import Seq
from Bio import SeqIO

# Function to perform Needleman-Wunsch alignment
def needleman_wunsch_alignment(seq1, seq2):
    alignments = pairwise2.align.globalms(seq1, seq2, 1, -1, -1, -1)
    best_alignment = alignments[0]
    return best_alignment

# Function to perform FASTA alignment
def fasta_alignment(seq1, seq2):
    alignments = pairwise2.align.localms(seq1, seq2, 1, -1, -1, -1)
    best_alignment = alignments[0]
    return best_alignment


In [173]:
alignment_nw = needleman_wunsch_alignment(X1, X2)
alignment_fasta = fasta_alignment(X1, X2)

print("\nNeedleman-Wunsch Alignment:")
print(pairwise2.format_alignment(*alignment_nw))
print("\nFASTA Alignment:")
print(pairwise2.format_alignment(*alignment_fasta))




Needleman-Wunsch Alignment:
GCGCGGCTCCGCCACGCATCATGACC-GCAG-AGCCGGCCCCCCCCATAGGGCGATTCGGAGCCGACGTACGCGACCTAGGGACGAGGCGCCCAGACGACCCCGCGTGCACCGGAACAGCTCTCCCGTCACGTCACAGGCCTGTCCAGTAGAGGAGCGATACCCGTCCCCCCCCTCCGCGCGAGTACTCTGCAGATCCTTTCGCGCACTCACTAGGGCGTGAAGTTCTCACCTTAACCTGGTACCCTCGCTGGGGTCACGAAATCGTCATCCAAATGTAGATGCTACAAGGATATGTTTGTTTTTTACGAACGATGTTGCCAAGGTTAGGTGTCTGATCTCCCATCTGGGCACCGGTCTTGAGATTTTGTTATGCCCTTTCCATGACC-GCAG-TCTAGACAAG
||||||||||||||||||||  |||. |..| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||  |||. |..| ||||||||||
GCGCGGCTCCGCCACGCATC--GACTTGGGGCAGCCGGCCCCCCCCATAGGGCGATTCGGAGCCGACGTACGCGACCTAGGGACGAGGCGCCCAGACGACCCCGCGTGCACCGGAACAGCTCTCCCGTCACGTCACAGGCCTGTCCAGTAGAGGAGCGATA

In [174]:
import time
from Bio import pairwise2

# Function to calculate identity
def calculate_identity_similarity(alignment):
    aligned_seq1, aligned_seq2, score, begin, end = alignment
    alignment_length = len(aligned_seq1)
    num_identical = sum(1 for i in range(alignment_length) if aligned_seq1[i] == aligned_seq2[i] and aligned_seq1[i] != '-')
    identity = num_identical / alignment_length * 100
    return identity

# List of k values to iterate over
k_values = [20, 30, 40, 50]

# Storage for results
results = []

# Iterate over each k value
for k in k_values:
    print(f"\n### Running for k = {k} ###")

    # Measure time and quality for Needleman-Wunsch alignment
    start_time = time.time()
    alignment_nw = needleman_wunsch_alignment(X1, X2)
    nw_time = time.time() - start_time
    nw_score = alignment_nw[2]
    nw_identity = calculate_identity_similarity(alignment_nw)

    # Measure time and quality for FASTA alignment
    start_time = time.time()
    alignment_fasta = fasta_alignment(X1, X2)
    fasta_time = time.time() - start_time
    fasta_score = alignment_fasta[2]
    fasta_identity = calculate_identity_similarity(alignment_fasta)

    # Store results for current k value
    results.append({
        'k': k,
        'NW_time': nw_time,
        'NW_score': nw_score,
        'NW_identity': nw_identity,
        'FASTA_time': fasta_time,
        'FASTA_score': fasta_score,
        'FASTA_identity': fasta_identity
    })

    # Print results for current k value
    print(f"\nNeedleman-Wunsch Alignment (time: {nw_time:.4f} seconds, score: {nw_score}, identity: {nw_identity:.2f}%):")
    print(pairwise2.format_alignment(*alignment_nw))
    print(f"\nFASTA Alignment (time: {fasta_time:.4f} seconds, score: {fasta_score}, identity: {fasta_identity:.2f}%):")
    print(pairwise2.format_alignment(*alignment_fasta))

# Print summary of results side by side
print("\n### Summary of Results ###")
print(f"{'k':<3} | {'NW Time':<8} | {'NW Score':<8} | {'NW Identity':<12} || {'FASTA Time':<10} | {'FASTA Score':<11} | {'FASTA Identity':<14}")
print("-" * 90)
for result in results:
    print(f"{result['k']:<3} | {result['NW_time']:<8.4f} | {result['NW_score']:<8} | {result['NW_identity']:<12.2f} || {result['FASTA_time']:<10.4f} | {result['FASTA_score']:<11} | {result['FASTA_identity']:<14.2f}")



### Running for k = 20 ###

Needleman-Wunsch Alignment (time: 0.0109 seconds, score: 376.0, identity: 96.53%):
GCGCGGCTCCGCCACGCATCATGACC-GCAG-AGCCGGCCCCCCCCATAGGGCGATTCGGAGCCGACGTACGCGACCTAGGGACGAGGCGCCCAGACGACCCCGCGTGCACCGGAACAGCTCTCCCGTCACGTCACAGGCCTGTCCAGTAGAGGAGCGATACCCGTCCCCCCCCTCCGCGCGAGTACTCTGCAGATCCTTTCGCGCACTCACTAGGGCGTGAAGTTCTCACCTTAACCTGGTACCCTCGCTGGGGTCACGAAATCGTCATCCAAATGTAGATGCTACAAGGATATGTTTGTTTTTTACGAACGATGTTGCCAAGGTTAGGTGTCTGATCTCCCATCTGGGCACCGGTCTTGAGATTTTGTTATGCCCTTTCCATGACC-GCAG-TCTAGACAAG
||||||||||||||||||||  |||. |..| ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||  |||. |..| ||||||||||
GCGCGGCTCCGCCACGCATC--GACTTGGGGCAGCCGGCCCCCCCCATAGGGCGATTCGGAGCCGACGTACGCGACCT