In [131]:
import re
import random
import numpy as np
import time

In [132]:
def timing(function):
    import time   
    
    def wrapper(*a):
        start = time.time()
        function(*a)
        stop = time.time()
        print('Выполнение {} - {:f} секунд.\n'.format(function.__name__, stop - start))
    return wrapper

def make_shingle_set(file, encod='utf8', N_shingle=3):
    set_text = set()
    with open(file, 'r', encoding=encod) as f:
        text = re.sub(r'[^\w\ ]', '', f.read()).lower().split(' ')
        for item in range(len(text) - N_shingle):
            set_text.add(' '.join(text[item:item + N_shingle]))
    return set_text
    

class UniversalHash:
    def __init__(self, size=2**32): 
        self._m = size
        self._p = self.generate_prime()
        self._a = random.randint(0, self._p)
        self._b = random.randint(1, self._p)

    def hash(self, key): 
        if isinstance(key, int):
            summ = key
        else:
            string = str(key)
            summ = 0
            for item in range(len(string)):
                summ = summ + ord(string[item]) * (item + 1)
        return (((self._a * summ + self._b) % self._p) % (self._m - 1))

    def generate_prime(self):
        while True:
            p = random.randrange(self._m, self._m << 1)
            if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
                return p
            
            
def calculate_signature(hash_list):
    result = np.zeros(32, dtype='int')
    for item in hash_list:
        count = []
        for i in range(32):
            count.append(1) if item & 1 else count.append(-1)
            item >>= 1
            count.reverse()
        result += np.array(count, dtype='int')
    result[result < 0] = 0
    result[result > 0] = 1
    return result


def calculate_hamming_distance(sign_1, sign_2):
    return sum(np.bitwise_xor(sign_1,sign_2))

@timing
def minhash(file_1, file_2, encoding='utf8', N_shingle=3, max_error=0.05, hash_size=2**32):  
    
    set_text_1 = make_shingle_set(file_1, encod=encoding, N_shingle=N_shingle)
    set_text_2 = make_shingle_set(file_2, encod=encoding, N_shingle=N_shingle)
    
    # jaccard
    jacc_sim = len(set_text_1.intersection(set_text_2)) / len(set_text_1.union(set_text_2))
    print(f"Jaccard: {jacc_sim}.")
     
    # создаем набор hash функций
    N_hash = round(1 / (max_error * max_error))
    hash_funcs = []
    for i in range(N_hash):
        h = UniversalHash(hash_size)
        hash_funcs.append(h)
        
    # minhash
    hash_set_1 = [min([h.hash(e) for e in set_text_1]) for h in hash_funcs]
    hash_set_2 = [min([h.hash(e) for e in set_text_2]) for h in hash_funcs]
    
    minhash_sim = sum(int(hash_set_1[i] == hash_set_2[i]) for i in range(N_hash)) / N_hash
    print(f"MinHash : {minhash_sim}.")
    
    

In [133]:
@timing
def simhash(file_1, file_2, encoding='utf8', N_shingle=3, hash_size=2 ** 32):  
    set_text_1 = make_shingle_set(file_1, encod=encoding, N_shingle=N_shingle)
    set_text_2 = make_shingle_set(file_2, encod=encoding, N_shingle=N_shingle)
    
    h = UniversalHash(hash_size)
        
    hash_list_1 = [h.hash(e) for e in set_text_1]
    hash_list_2 = [h.hash(e) for e in set_text_2]
    
    signature_1 = calculate_signature(hash_list_1)
    signature_2 = calculate_signature(hash_list_2)
    print(f'Simhash. Hamming distance: {calculate_hamming_distance(signature_1, signature_2)}.')

In [134]:
# сравнение первого и второго тома "Война и мир"
file_1 = 'tom_1.txt'
file_2 = 'tom_2.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6
hash_size = 2 ** 32

minhash(file_1, file_2, encoding, N_shingle, max_error, hash_size)
simhash(file_1, file_2, encoding, N_shingle, hash_size)

Jaccard: 0.00013489936507365505.
MinHash : 0.05.
Выполнение minhash - 120.360846 секунд.

Simhash. Hamming distance: 13.
Выполнение simhash - 3.391059 секунд.



In [135]:
# сравнение произведений разных авторов
file_1 = 'tom_1.txt'
file_2 = 'tolkien.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6
hash_size = 2 ** 32

minhash(file_1, file_2, encoding, N_shingle, max_error, hash_size)
simhash(file_1, file_2, encoding, N_shingle, hash_size)

Jaccard: 0.0.
MinHash : 0.04.
Выполнение minhash - 115.666098 секунд.

Simhash. Hamming distance: 14.
Выполнение simhash - 3.196705 секунд.



In [136]:
with open('tom_1.txt', 'r', encoding='cp1251') as f_r:
    with open('tom_1_reduce.txt', 'w', encoding='cp1251') as f_w: 
        f_w.write(f_r.read()[:-10000])

In [137]:
# сравнение полного текста с незначительно урезанной копией
file_1 = 'tom_1.txt'
file_2 = 'tom_1_reduce.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6
hash_size = 2 ** 32

minhash(file_1, file_2, encoding, N_shingle, max_error, hash_size)
simhash(file_1, file_2, encoding, N_shingle, hash_size)

Jaccard: 0.9864871209773596.
MinHash : 0.97.
Выполнение minhash - 118.907702 секунд.

Simhash. Hamming distance: 0.
Выполнение simhash - 3.243857 секунд.



In [138]:
file_1 = 'tom_1.txt'
file_2 = 'tom_2.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6
hash_size = 2 ** 16

minhash(file_1, file_2, encoding, N_shingle, max_error, hash_size)
simhash(file_1, file_2, encoding, N_shingle, hash_size)

Jaccard: 0.00013489936507365505.
MinHash : 0.82.
Выполнение minhash - 136.508347 секунд.

Simhash. Hamming distance: 7.
Выполнение simhash - 3.460364 секунд.



In [139]:
file_1 = 'tom_1.txt'
file_2 = 'tom_2.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6
hash_size = 2 ** 24

minhash(file_1, file_2, encoding, N_shingle, max_error, hash_size)
simhash(file_1, file_2, encoding, N_shingle, hash_size)

Jaccard: 0.00013489936507365505.
MinHash : 0.05.
Выполнение minhash - 126.236976 секунд.

Simhash. Hamming distance: 10.
Выполнение simhash - 3.318329 секунд.

