In [5]:
from random import random
class RandomHash:
    def __init__(self):
        self.table = dict()
    def of(self, x):
        if x not in self.table:
            self.table[x] = random()
        return self.table[x]

In [6]:
def build_array(s, q):
    """
    build an array A with |s | − q + 1 elements, where the i-th element
    A[i] = F (si..i+q−1), where F : Σq → (0, 1) is a random hash function
    """
    from random import random
    F = RandomHash()
    A = []
    for i in range(len(s)-q+1):
        A.append(F.of(s[i:i+q]))
    return A    

In [7]:
build_array('amigo', 3)

[0.598152218891704, 0.7818686822714424, 0.12328346654685007]

In [10]:
def Rank(s, r, q):
    """
    Input: input string s; minimum rank r
    Output: rank array R = {(pi, Ri )}: pi denotes the index of the letter in
    s corresponding to the i-th pair, and Ri is its rank
    """
    A = build_array(s, q)
    R = [(0, float('inf'))] # the first character of s has ranking infinity
    for i in range(len(A)):
        x = 1
        while i-x >= 0 and i+x < len(A):
            if A[i] < min(A[i+x], A[i-x]):
                x += 1
            else:
                break
        if x > r:
            R.append((i, x-1))
    R.append((len(s)-q, float('inf')))
    return R   

In [19]:
Rank('ACGTTCGACTGGTTAG', 1, 3)

[(0, inf), (1, 1), (3, 1), (5, 5), (12, 1), (13, inf)]

In [3]:
def Partition(s, R, start, end):
    """
    Input: input string s; rank array R = {(pi, Ri )} of s; two indices start and end
    Output: set of partitions P = {(ssub , l)}, where (ssub , l) denotes a
    substring ssub with level l
    """
    if end <= start + 1:
        return []
    maxR = float('-inf')
    M = []
    i = start + 1
    while i < end:
        if R[i][1] > maxR:
            maxR = R[i][1]
            M = [i]
        if R[i][1] == maxR:
            M.append(i)
        i += 1
    P = []
    M = [start] + M + [end]
    M.sort()
    for j in range(len(M)-1):
        u = M[j]
        v = M[j+1]
        pu = R[u][0]
        pv_minus_1 = R[v-1][0]
        P.append((s[pu:pv_minus_1 + 1], min(R[u][1], R[v][1])))
        P.extend(Partition(s, R, u, v))
    return P

In [20]:
R = Rank('ACGTTCGACTGGTTAG', 1, 3)
Partition('ACGTTCGACTGGTTAG', R, 0, len(R)-1)

[('ACG', 4),
 ('A', 1),
 ('', 1),
 ('G', 1),
 ('', 4),
 ('TCGACTG', 4),
 ('TCG', 3),
 ('T', 1),
 ('', 1),
 ('G', 1),
 ('', 3),
 ('G', 3)]

In [25]:
from random import randint
class RandomHashInt:
    def __init__(self):
        self.table = dict()
    def of(self, x):
        if x not in self.table:
            self.table[x] = randint(0,1000)
        return self.table[x]
class HashTablesList:
    def __init__(self):
        self.tables = {}
    def get(self, l):
        if l not in self.tables:
            return None
        return self.tables[l]
    def insert(self, l):
        self.tables[l] = {'table': {}, 'random_function': RandomHashInt() }

In [26]:
def build_index(S, q):
    """
    Input: set of input strings S = {s1, . . . , sn }
    Output: set of hash tables {(Hi, fi )}, where for the i-th hash table, each
    string s is hashed into the fi(s)-th bucket of Hi
    """
    hash_tables = HashTablesList()
    for i,si in enumerate(S):
        Ri = Rank(si, 1, q)
        Pi = Partition(si, Ri, 0, len(Ri)-1)
        for (ssub, l) in Pi:
            if hash_tables.get(l) is None:
                hash_tables.insert(l)
            fl = hash_tables.get(l)['random_function']
            hash_tables.get(l)['table'][fl.of(ssub)] = i
    return hash_tables

In [31]:
S = ['ACGTTCGACTGGTTAG',
     'CCGTTCGAACTGGTTAG',
     'ACATTCGACTGGTTGAG',
     'TCGAACGTTCGAACGT']
index = build_index(S, 3)

In [32]:
index.tables

{5: {'table': {767: 0, 797: 0, 32: 0},
  'random_function': <__main__.RandomHashInt at 0x7f4eb4644d60>},
 1: {'table': {517: 1, 257: 3, 770: 3, 575: 2, 627: 3},
  'random_function': <__main__.RandomHashInt at 0x7f4eb46440a0>},
 2: {'table': {654: 0, 838: 3, 373: 3, 456: 1, 634: 1, 849: 1, 590: 2, 875: 3},
  'random_function': <__main__.RandomHashInt at 0x7f4eb46554f0>},
 6: {'table': {380: 1, 553: 2, 280: 1, 228: 2, 885: 2},
  'random_function': <__main__.RandomHashInt at 0x7f4eb4655340>},
 3: {'table': {131: 2, 175: 3, 790: 3, 42: 3, 774: 3},
  'random_function': <__main__.RandomHashInt at 0x7f4eb4655580>}}