# Code for BLAST-style queries

build_index function: This function creates an index over a collection of sequences for subsequences (k-mers) of length k. Each k-mer is mapped to every (sequence index, position) pair at which it appears in the collection.

In [5]:
def build_index(sequences, k):
    """Index every length-k substring (k-mer) of a collection of sequences.

    Args:
        sequences: Iterable of sliceable sequences (e.g. strings). The
            position of each sequence in the iterable is used as its id.
        k: Length of the k-mers to extract.

    Returns:
        A dict mapping each k-mer to a list of ``(seq_id, offset)`` tuples,
        in the order the occurrences are encountered.
    """
    index = {}
    for seq_id, sequence in enumerate(sequences):
        # Number of length-k windows; zero or negative means the sequence
        # is shorter than k and contributes nothing.
        window_count = len(sequence) - k + 1
        for start in range(window_count):
            window = sequence[start:start + k]
            index.setdefault(window, []).append((seq_id, start))
    return index

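query_index function: This function takes a query sequence and finds all k-mers in the query that are present in the index. For each such k-mer it returns the index's list of (sequence index, position) hits, wrapped in a list with one entry per occurrence of the k-mer in the query.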

In [6]:
def query_index(query, index, k):
    """Look up every length-k window of the query in a k-mer index.

    Args:
        query: Sliceable query sequence (e.g. a string).
        index: Mapping from k-mer to its hit list, as built by an indexer
            using the same ``k``.
        k: Length of the k-mers to extract from the query.

    Returns:
        A dict mapping each query k-mer found in ``index`` to a list that
        holds ``index[k_mer]`` once per occurrence of that k-mer in the
        query (the hit lists are the index's own objects, not copies).
    """
    matches = {}
    windows = (query[start:start + k] for start in range(len(query) - k + 1))
    for window in windows:
        if window not in index:
            continue
        matches.setdefault(window, []).append(index[window])
    return matches

In [12]:

def merge_matches(matches, k):
    """Merge k-mer hits into one gapped string per database sequence.

    For every database sequence that has at least one hit, the hits are
    laid out at their positions relative to the left-most hit. Positions
    covered by no hit are filled with ``"-"``.

    Args:
        matches: Dict mapping a k-mer to a list of hit lists, where each
            hit list contains ``(seq_id, position)`` tuples. Every hit
            list is used; duplicate hits are collapsed and an empty list
            of hit lists contributes nothing.
        k: Length of the k-mers in ``matches``.

    Returns:
        A list of merged strings, one per database sequence, ordered by
        the first time each sequence id is encountered in ``matches``.
    """
    # Group hits by database sequence. A set collapses the duplicate hit
    # lists recorded when a k-mer occurs several times in the query.
    hits_by_seq = {}
    for k_mer, entries in matches.items():
        for hit_list in entries:
            for seq_id, pos in hit_list:
                hits_by_seq.setdefault(seq_id, set()).add((pos, k_mer))

    merged = []
    for hits in hits_by_seq.values():
        ordered = sorted(hits)  # ascending by position
        span_start = ordered[0][0]
        span_end = ordered[-1][0] + k
        # Mutable buffer of gap characters; avoids rebuilding a string
        # for every hit.
        buffer = ["-"] * (span_end - span_start)
        for pos, k_mer in ordered:
            offset = pos - span_start
            buffer[offset:offset + k] = k_mer
        merged.append("".join(buffer))
    return merged

# Toy database of sequences to search against
sequences = ["APEPTIDE", "PEPTIDEA", "TIDEAPEP"]

# Sequence to look up in the database
query = "PEPTID"

# Step 1: index every k-mer of length 3 across all database sequences
k = 3
index = build_index(sequences=sequences, k=k)
print("Indexes:", index)

# Step 2: look up each k-mer of the query in the index
matches = query_index(query=query, index=index, k=k)
print("Matches:", matches)

# Step 3: stitch the per-sequence hits into gapped strings
merged_alignments = merge_matches(matches=matches, k=k)
print("Merged Alignments:", merged_alignments)

Indexes: {'APE': [(0, 0), (2, 4)], 'PEP': [(0, 1), (1, 0), (2, 5)], 'EPT': [(0, 2), (1, 1)], 'PTI': [(0, 3), (1, 2)], 'TID': [(0, 4), (1, 3), (2, 0)], 'IDE': [(0, 5), (1, 4), (2, 1)], 'DEA': [(1, 5), (2, 2)], 'EAP': [(2, 3)]}
Matches: {'PEP': [[(0, 1), (1, 0), (2, 5)]], 'EPT': [[(0, 2), (1, 1)]], 'PTI': [[(0, 3), (1, 2)]], 'TID': [[(0, 4), (1, 3), (2, 0)]]}
Merged Alignments: ['PEPTID', 'PEPTID', 'TID--PEP']
