In [7]:
def build_index(database, k):
    """Map every length-k substring of `database` to its start offsets.

    Args:
        database: Sequence to index; it is sliced as database[i:i + k] and
            each slice is used as a dictionary key.
        k: Length of the substrings (k-mers) to index.

    Returns:
        dict: k-mer -> list of 0-based start positions, in ascending order.
        The dict is empty when k is larger than len(database).
    """
    index = {}
    for start in range(len(database) - k + 1):
        # setdefault creates the position list the first time a k-mer is seen.
        index.setdefault(database[start:start + k], []).append(start)
    return index

In [18]:
def seeding(query, database, k):
    """Find every exact k-mer match (seed) between `query` and `database`.

    Args:
        query: Sequence whose k-mers are looked up.
        database: Sequence that gets indexed with build_index.
        k: Seed (k-mer) length.

    Returns:
        tuple: (seeds, index) where `seeds` is a list of
        (query_pos, database_pos) start offsets ordered by query position and
        then by database position, and `index` is the k-mer index built from
        `database`.
    """
    index = build_index(database, k)
    seeds = [
        (q_pos, db_pos)
        for q_pos in range(len(query) - k + 1)
        # k-mers absent from the index contribute no seeds.
        for db_pos in index.get(query[q_pos:q_pos + k], ())
    ]
    return seeds, index

In [9]:
def extend_seed(query, db, i, j, k, match=2, mismatch=-1, gap=-2, X=2):
    """Extend an exact k-mer seed in both directions without gaps.

    The seed occupies query[i:i + k] and db[j:j + k] and is scored as
    k * match. Each direction is walked one position at a time along the
    diagonal, adding `match` or `mismatch`; a walk ends at the sequence
    boundary or as soon as the running score is X or more below the best
    score seen in that direction.

    Args:
        query: Query sequence.
        db: Database sequence.
        i: Seed start in `query`.
        j: Seed start in `db`.
        k: Seed length.
        match: Score added for equal characters.
        mismatch: Score added for differing characters.
        gap: Accepted but not used; the extension is ungapped.
        X: Drop-off threshold that terminates a walk.

    Returns:
        tuple: (query_start, db_start, query_end, db_end, score). The end
        coordinates are inclusive. `score` is the sum of the best left and
        right scores with the seed score counted once.
    """
    seed_score = k * match

    def walk(anchor, positions):
        # Follow `positions` from `anchor`; return the peak score and where it occurred.
        peak = running = seed_score
        peak_pos = anchor
        for q_pos, d_pos in positions:
            running += match if query[q_pos] == db[d_pos] else mismatch
            if running > peak:
                peak, peak_pos = running, (q_pos, d_pos)
            if peak - running >= X:
                break
        return peak, peak_pos

    # Leftwards: zip stops as soon as either sequence runs out before index 0.
    left_score, (q_start, d_start) = walk(
        (i, j),
        zip(range(i - 1, -1, -1), range(j - 1, -1, -1)),
    )

    # Rightwards: start just past the last seed character of each sequence.
    q_seed_end, d_seed_end = i + k - 1, j + k - 1
    right_score, (q_end, d_end) = walk(
        (q_seed_end, d_seed_end),
        zip(range(q_seed_end + 1, len(query)), range(d_seed_end + 1, len(db))),
    )

    # Both walks start from the seed score, so subtract it once.
    return q_start, d_start, q_end, d_end, left_score + right_score - seed_score

In [20]:
# Example inputs for the seed-and-extend walkthrough.
database = 'CTAGGATCCAGGCATACGA'
query = 'GGATCCATTCATTA'
k = 4  # seed (k-mer) length
X = 2  # drop-off threshold, intended for extend_seed's X parameter

# seeding() returns (seeds, index): the seeds are (query_pos, database_pos)
# pairs and the index maps each database k-mer to its start positions.
# The earlier extra call with swapped arguments and a hard-coded k was removed:
# its result was overwritten immediately and never used.
seeds, index = seeding(query, database, k)
print(seeds, index)

[(0, 3), (1, 4), (2, 5), (3, 6)] {'CTAG': [0], 'TAGG': [1], 'AGGA': [2], 'GGAT': [3], 'GATC': [4], 'ATCC': [5], 'TCCA': [6], 'CCAG': [7], 'CAGG': [8], 'AGGC': [9], 'GGCA': [10], 'GCAT': [11], 'CATA': [12], 'ATAC': [13], 'TACG': [14], 'ACGA': [15]}


In [25]:
# Extend every seed and collect (query_start, db_start, query_end, db_end, score).
results = []

for i, j in seeds:
    # Use the X configured alongside query/database/k instead of repeating the literal.
    res = extend_seed(query, database, i, j, k, match=2, mismatch=-1, gap=-2, X=X)
    results.append(res)

if not results:
    # Without this guard the results[0] lookup below fails with a bare IndexError.
    raise ValueError(f"no seeds to extend: query and database share no {k}-mer")

# Highest score first; the sort is stable, so ties keep seed order.
results.sort(key=lambda x: x[4], reverse=True)

best = results[0]
# best[0] is the query-side left boundary of the extended hit, so this is the
# k-mer at the start of the alignment, which may lie left of the original seed.
best_kmer = query[best[0]:best[0] + k]

In [26]:
# Report the top-scoring extension stored in `best`, the tuple returned by
# extend_seed: (query_start, db_start, query_end, db_end, score).
# NOTE(review): the string literals below look like UTF-8 text decoded with the
# wrong codec (mojibake) in this copy of the notebook -- confirm the file's
# encoding before editing them.
print("\n ‚ú®‚ö°Ô∏èüíïü´¶üò©–õ–£–ß–®–ï–ï –í–´–†–ê–í–ù–ò–í–ê–ù–ò–ï‚ú®‚ö°Ô∏èüíïü´¶üò©")
# k-mer at the left edge of the hit, with its query and database start positions.
print(f"–õ—É—á—à–∏–π k-–º–µ—Ä: {best_kmer} (–ø–æ–∑. {best[0]},{best[1]})")
# Combined score of the extended hit.
print(f"–ò—Ç–æ–≥–æ–≤—ã–π Smax: {best[4]}")
# Boundaries in query and database; the end coordinates (best[2], best[3]) are
# inclusive even though they are printed in slice-like [start:end] form.
print(f"–ì—Ä–∞–Ω–∏—Ü—ã: –∑–∞–ø—Ä–æ—Å [{best[0]}:{best[2]}], –ë–î [{best[1]}:{best[3]}]")


 ‚ú®‚ö°Ô∏èüíïü´¶üò©–õ–£–ß–®–ï–ï –í–´–†–ê–í–ù–ò–í–ê–ù–ò–ï‚ú®‚ö°Ô∏èüíïü´¶üò©
–õ—É—á—à–∏–π k-–º–µ—Ä: GGAT (–ø–æ–∑. 0,3)
–ò—Ç–æ–≥–æ–≤—ã–π Smax: 14
–ì—Ä–∞–Ω–∏—Ü—ã: –∑–∞–ø—Ä–æ—Å [0:6], –ë–î [3:9]
