In [1]:
import re
import random
import numpy as np

In [2]:
def make_shingle_set(file, encod='utf8', N_shingle=3):
    set_text = set()
    with open(file, 'r', encoding=encod) as f:
        text = re.sub(r'[^\w\ ]', '', f.read()).lower().split(' ')
        for item in range(len(text) - N_shingle):
            set_text.add(' '.join(text[item:item + N_shingle]))
    return set_text
    

class UniversalHash:
    def __init__(self, size): 
        self._m = size
#         self._p = self.generate_prime(min_border)
        self._p = self.generate_prime()
        self._a = random.randint(0, self._p)
        self._b = random.randint(1, self._p)

    def hash(self, key): 
        if isinstance(key, int):
            summ = key
        else:
            string = str(key)
            summ = 0
            for item in range(len(string)):
                summ = summ + ord(string[item]) * (item + 1)
        return (((self._a * summ + self._b) % self._p) % (self._m - 1))

    def generate_prime(self):
        while True:
#             p = random.randrange(min_border, min_border ** 1.5)
            p = random.randrange(2 ** 32, 2 ** 34)
            if all(p % n != 0 for n in range(3, int((p ** 0.5) + 1), 2)):
                return p
            
            
def calculate_signature(set_hash):
    result = np.zeros(32, dtype='int')
    for item in set_hash:
        count = []
        for i in range(32):
            count.append(1) if item & 1 else count.append(-1)
            item >>= 1
            count.reverse()
        result += np.array(count, dtype='int')
    result[result < 0] = 0
    result[result > 0] = 1
    return result


def calculate_hamming_distance(sign_1, sign_2):
    return sum(np.bitwise_xor(sign_1,sign_2))


def minhash(file_1, file_2, encoding='utf8', N_shingle=3, max_error=0.05):  
    
    set_text_1 = make_shingle_set(file_1, encod=encoding, N_shingle=N_shingle)
    set_text_2 = make_shingle_set(file_2, encod=encoding, N_shingle=N_shingle)
    
    # jaccard
    jacc_sim = len(set_text_1.intersection(set_text_2)) / len(set_text_1.union(set_text_2))
    print(f"Jaccard: {jacc_sim}.")
     
    # создаем набор hash функций
    N_hash = round(1 / (max_error * max_error))
    hash_funcs = []
    for i in range(N_hash):
        h = UniversalHash(2 ** 32 - 1)
        hash_funcs.append(h)
        
    # minhash
    hash_set_1 = [min([h.hash(e) for e in set_text_1]) for h in hash_funcs]
    hash_set_2 = [min([h.hash(e) for e in set_text_2]) for h in hash_funcs]
    
    minhash_sim = sum(int(hash_set_1[i] == hash_set_2[i]) for i in range(N_hash)) / N_hash
    print(f"MinHash : {minhash_sim}.")
    
    # simhash    
    # Добавлено только для демонстрации, чтоб повторно не выполнять создание шинглов, хеш функций, хеширования шинглов
    # отдельная функция simhash - представлена далее.
    signature_1 = calculate_signature(hash_set_1)
    signature_2 = calculate_signature(hash_set_2)
    print(f'SimHash. Hamming distance: {calculate_hamming_distance(signature_1, signature_2)}.')
    

def simhash(file_1, file_2, encoding='utf8', N_shingle=3, max_error=0.05):  
    set_text_1 = make_shingle_set(file_1, encod=encoding, N_shingle=N_shingle)
    set_text_2 = make_shingle_set(file_2, encod=encoding, N_shingle=N_shingle)
    
    N_hash = round(1 / (max_error * max_error))
    hash_funcs = []
    for i in range(N_hash):
        h = UniversalHash(2 ** 32 - 1)
        hash_funcs.append(h)
        
    hash_set_1 = [min([h.hash(e) for e in set_text_1]) for h in hash_funcs]
    hash_set_2 = [min([h.hash(e) for e in set_text_2]) for h in hash_funcs]
    
    signature_1 = calculate_signature(hash_set_1)
    signature_2 = calculate_signature(hash_set_2)
    print(f'Simhash. Hamming distance: {calculate_hamming_distance(signature_1, signature_2)}.')

In [3]:
# сравнение первого и второго тома "Война и мир"
file_1 = 'tom_1.txt'
file_2 = 'tom_2.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6

minhash(file_1, file_2, encoding, N_shingle, max_error)
# simhash(file_1, file_2, encoding, N_shingle, max_error)

jaccard: 0.00013489936507365505.
min-hash : 0.08.
Hamming distance: 5.


In [4]:
# сравнение произведений разных авторов
file_1 = 'tom_1.txt'
file_2 = 'tolkien.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6

minhash(file_1, file_2, encoding, N_shingle, max_error)

jaccard: 0.0.
min-hash : 0.07.
Hamming distance: 5.


In [5]:
with open('tom_1.txt', 'r', encoding='cp1251') as f_r:
    with open('tom_1_reduce.txt', 'w', encoding='cp1251') as f_w: 
        f_w.write(f_r.read()[:-10000])

In [6]:
# сравнение полного текста с незначительно урезанной копией
file_1 = 'tom_1.txt'
file_2 = 'tom_1_reduce.txt'
encoding  = 'cp1251'
max_error = 0.1
N_shingle = 6

minhash(file_1, file_2, encoding, N_shingle, max_error)

jaccard: 0.9864871209773596.
min-hash : 0.99.
Hamming distance: 0.
