# Code for BLAST-style queries

In [5]:
from collections import defaultdict
import numpy as np

def generate_kmers(sequence, k):
    """Map every k-mer of ``sequence`` to the positions where it starts.

    Args:
        sequence: The sequence string to scan.
        k: Length of each k-mer; must be at least 1.

    Returns:
        A ``defaultdict(list)`` keyed by k-mer. Each value is the list of
        0-based start offsets of that k-mer, in ascending order. The mapping
        is empty when ``k`` is larger than ``len(sequence)``.

    Raises:
        ValueError: If ``k`` is less than 1. A zero-length "k-mer" would
            index the empty string at every offset, and a negative ``k``
            would produce malformed slices.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    kmers = defaultdict(list)
    for start in range(len(sequence) - k + 1):
        kmers[sequence[start:start + k]].append(start)
    return kmers

def build_index(database, k):
    """Index every k-mer occurring in a collection of sequences.

    Args:
        database: Iterable of sequences; each sequence's id is its position
            in the iteration order.
        k: k-mer length, passed through to ``generate_kmers``.

    Returns:
        A ``defaultdict(list)`` mapping each k-mer to a list of
        ``(seq_id, positions)`` tuples, one tuple per sequence that contains
        the k-mer, where ``positions`` are the start offsets within that
        sequence.
    """
    kmer_index = defaultdict(list)
    for seq_number, sequence in enumerate(database):
        # One entry per (k-mer, sequence) pair; the offsets stay grouped.
        for word, offsets in generate_kmers(sequence, k).items():
            kmer_index[word].append((seq_number, offsets))
    return kmer_index

def calculate_score(alignment1, alignment2, scoring_matrix):
    """Sum the pairwise scores of two aligned sequences.

    The sequences are compared position by position; if their lengths
    differ, the extra tail of the longer one is ignored.

    Args:
        alignment1: First aligned sequence.
        alignment2: Second aligned sequence.
        scoring_matrix: Mapping from ``(symbol1, symbol2)`` tuples to a
            numeric score. It may list only one orientation of each pair.

    Returns:
        The total score. A pair contributes ``scoring_matrix[(a, b)]`` when
        present, otherwise ``scoring_matrix[(b, a)]`` when present, and 0
        when the matrix contains neither orientation.
    """
    score = 0
    for a, b in zip(alignment1, alignment2):
        # Scoring matrices are often written as a single triangle; fall back
        # to the mirrored pair so ('C', 'A') is scored like ('A', 'C')
        # instead of silently defaulting to 0.
        key = (a, b) if (a, b) in scoring_matrix else (b, a)
        score += scoring_matrix.get(key, 0)
    return score

def blast(query, database, k, scoring_matrix, threshold):
    """Run a simplified, ungapped BLAST-like search and print the hits.

    Every k-mer shared by ``query`` and a database sequence seeds one
    candidate alignment. A seed at ``(q_pos, d_pos)`` is extended to the
    right only, without gaps, until either the query or the database
    sequence ends; that whole overlap is scored with ``calculate_score``
    and kept when the score reaches ``threshold``.

    No leftward extension and no merging of overlapping seeds is performed,
    so several seeds on the same diagonal are reported as separate
    alignments.

    Args:
        query: Query sequence string.
        database: Indexable collection of sequence strings; a sequence's id
            is its index.
        k: k-mer (seed) length.
        scoring_matrix: Mapping from symbol pairs to scores, as accepted by
            ``calculate_score``.
        threshold: Minimum score an alignment needs in order to be reported.

    Returns:
        None. Alignments are printed to standard output, highest score
        first; alignments with equal scores keep seed-discovery order.
    """
    # Step 1: Build the k-mer index
    index = build_index(database, k)

    # Step 2: Identify initial hits (one seed per shared k-mer occurrence)
    initial_hits = []
    for kmer, q_positions in generate_kmers(query, k).items():
        if kmer not in index:
            continue
        for seq_id, d_positions in index[kmer]:
            for q_pos in q_positions:
                for d_pos in d_positions:
                    initial_hits.append((seq_id, q_pos, d_pos))

    # Step 3: Score each seed over its full rightward overlap.
    # NOTE: an earlier two-directional "extend" loop here was initialised
    # with its bounds already at their limits, so its body could never run;
    # it has been dropped without changing any result.
    alignments = []
    for seq_id, q_pos, d_pos in initial_hits:
        q_sub = query[q_pos:]
        d_sub = database[seq_id][d_pos:]
        overlap = min(len(q_sub), len(d_sub))

        q_alignment, d_alignment = q_sub[:overlap], d_sub[:overlap]
        score = calculate_score(q_alignment, d_alignment, scoring_matrix)

        # Save if score is valid
        if score >= threshold:
            alignments.append(
                (seq_id, q_pos, d_pos, score, q_alignment, d_alignment)
            )

    # Step 4: Sort alignments by score (stable, best first)
    alignments.sort(key=lambda hit: hit[3], reverse=True)

    # Step 5: Output the results
    for seq_id, q_pos, d_pos, score, q_alignment, d_alignment in alignments:
        print(f"Database Sequence ID: {seq_id}")
        print(f"Query Position: {q_pos}, Database Position: {d_pos}")
        print(f"Alignment Score: {score}")
        print(f"Query Alignment:   {q_alignment}")
        print(f"Database Alignment: {d_alignment}")
        print()
        
# Example usage
database = [
    "AGCTAGCTAGCTAGC",
    "GCTAGCTAGCTAGCT",
    "TAGCTAGCTAGCTAG",
]
query = "AGCTAGC"

# Simple nucleotide match/mismatch scoring matrix for example purposes:
# +2 for a match, -1 for a mismatch. Every ordered pair is present so a
# lookup never depends on which sequence supplies which symbol.
scoring_matrix = {
    (a, b): (2 if a == b else -1)
    for a in "ACGT"
    for b in "ACGT"
}

blast(query, database, k=3, scoring_matrix=scoring_matrix, threshold=10)


Database Sequence ID: 0
Query Position: 0, Database Position: 0
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID: 0
Query Position: 0, Database Position: 4
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID: 0
Query Position: 0, Database Position: 8
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID: 1
Query Position: 0, Database Position: 3
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID: 1
Query Position: 0, Database Position: 7
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID: 2
Query Position: 0, Database Position: 1
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID: 2
Query Position: 0, Database Position: 5
Alignment Score: 14
Query Alignment:   AGCTAGC
Database Alignment: AGCTAGC

Database Sequence ID