In [5]:
from text_dedup.minhash_spark import generate_hash_values, generate_edges, ngram_hashes

In [6]:
from itertools import tee
from typing import List
def ngrams(sequence: List[str], n: int, min_length: int = 5):
    """
    Slide a window of width ``n`` over ``sequence`` and produce each window as a tuple.

    Adapted from nltk.util.ngrams.

    Parameters
    ----------
    sequence : List[str]
        Items the ngrams are built from.
    n : int
        Window width, i.e. the number of items in each ngram.
    min_length : int, optional
        Minimum number of items ``sequence`` must contain; shorter inputs
        produce no ngrams at all. Defaults to 5.

    Returns
    -------
    iterable of tuple
        An empty list when ``sequence`` has fewer than ``min_length`` items, a
        one-element list holding the whole sequence when it has fewer than
        ``n`` items, and otherwise a lazy ``zip`` over the sliding windows.

    Examples
    --------
    >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=1))
    [('a', 'b'), ('b', 'c'), ('c', 'd')]
    >>> list(ngrams(["a", "b", "c", "d"], 2, min_length=5))
    []
    >>> list(ngrams(["a", "b"], 3, min_length=1))
    [('a', 'b')]
    """
    total = len(sequence)
    if total < min_length:
        return []
    if total < n:
        return [tuple(sequence)]
    # One independent iterator per window position; the k-th iterator is
    # advanced k items so that zipping them lines up consecutive elements.
    shifted = tee(iter(sequence), n)
    for offset in range(1, n):
        cursor = shifted[offset]
        for _ in range(offset):
            next(cursor, None)
    return zip(*shifted)


In [7]:
import hashlib
import struct

def sha1_hash(data: bytes, d: int = 32) -> int:
    """
    Hash ``data`` with SHA-1 and return the leading ``d`` bits as an integer.

    The first ``d // 8`` bytes of the digest are interpreted as a
    little-endian unsigned integer.

    Parameters
    ----------
    data : bytes
        The data to be hashed.
    d : int
        The number of bits of the hash value.

    Returns
    -------
    int
        The hash value.

    Examples
    --------
    >>> sha1_hash(b"hello world", 32)
    896314922
    >>> sha1_hash(b"hello world", 64)
    13028719972609469994
    >>> sha1_hash(b"hello world", 128)
    310522945683037930239412421226792791594
    """
    digest = hashlib.sha1(data, usedforsecurity=False).digest()
    # struct is faster but only covers the fixed 32- and 64-bit widths;
    # each entry is (struct format, number of digest bytes consumed).
    fast_path = {32: ("<I", 4), 64: ("<Q", 8)}.get(d)
    if fast_path is not None:
        fmt, width = fast_path
        (value,) = struct.unpack(fmt, digest[:width])
        return value
    return int.from_bytes(digest[: d // 8], byteorder="little")


In [8]:
import re
import numpy as np
# Seeded random source so the permutation parameters are reproducible.
SEED = 42
RNG = np.random.RandomState(SEED)

# Every non-word character is treated as a token separator.
NON_ALPHA = re.compile(r"\W", re.UNICODE)
SIGNATURE_COLUMN = "__signatures__"

# Sample document that is hashed in the cells below.
content = "But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?"

# Unique 3-grams of the lowercased text, joined with a single space ...
tokens: set[bytes] = {
    bytes(" ".join(gram).lower(), "utf-8")
    for gram in ngrams(NON_ALPHA.split(content.lower()), 3, 5)
}
# ... and the same 3-grams joined without any separator.
tokens1: set[bytes] = {
    bytes("".join(gram).lower(), "utf-8")
    for gram in ngrams(NON_ALPHA.split(content.lower()), 3, 5)
}

# One hash per token, laid out as a column vector of shape (len(tokens), 1).
hashvalues: np.ndarray = np.array(
    [sha1_hash(tok) for tok in tokens],
    dtype=np.uint64,
).reshape(len(tokens), 1)

# Working dtype, the 32-bit output mask, and the modulus 2**61 - 1 used by the permutations.
dtype = np.uint64
max_hash = np.uint32((1 << 32) - 1)
modulo_prime = np.uint64((1 << 61) - 1)

In [9]:
# Draw the 200 permutation parameters; `a` is drawn before `b` so the RNG stream
# is consumed in a fixed order.
# a is a multiplier so should not be 0, hence the lower bound of 1.
a = RNG.randint(1, modulo_prime, size=(200,), dtype=dtype)
# b is the additive offset and may be 0.
b = RNG.randint(0, modulo_prime, size=(200,), dtype=dtype)
PERMUTATIONS: tuple[np.ndarray, np.ndarray] = (a, b)

In [6]:

# Apply every (a, b) permutation to the token hashes and mask each result with max_hash.
# The result is bound to its own name on purpose: a later cell in this notebook applies
# the same permutation to `hashvalues`, so overwriting `hashvalues` here as well would
# permute the data twice when the notebook is run top to bottom.
permuted_hashvalues = (hashvalues * a + b) % modulo_prime & max_hash

In [9]:
# FROM PYSPARK
def pyspark():
    """
    Build the banded signature records for ``content`` using the Spark-style code path.

    Reads the notebook globals ``content``, ``a``, ``b``, ``dtype``,
    ``modulo_prime`` and ``max_hash``. The per-ngram hashes come from
    ``ngram_hashes`` (imported from text_dedup.minhash_spark), called here
    with the arguments ``2`` and ``5``.

    Returns
    -------
    list of tuple
        One ``(band index, band bytes, idx)`` triple per band, for 33 bands of
        6 signature values each; ``idx`` is always 0.
    """
    doc_idx = 0
    perm_count = 200
    band_bounds = [(6 * band, 6 * band + 6) for band in range(33)]

    shingle_hashes = np.array(list(ngram_hashes(content, 2, 5)), dtype=dtype)
    permuted = ((np.outer(shingle_hashes, a) + b) % modulo_prime) & max_hash
    # The extra row of max_hash keeps the column-wise minimum defined even
    # when there are no ngram hashes.
    sentinel_row = np.full(perm_count, max_hash, dtype=dtype)
    signature = np.vstack([permuted, sentinel_row]).min(axis=0)

    records = []
    for band_no, (lo, hi) in enumerate(band_bounds):
        records.append((band_no, signature[lo:hi].data.tobytes(), doc_idx))
    return records

In [8]:
# Row of 200 max_hash values, used as the upper bound when taking column minima.
masks: np.ndarray = np.full(200, max_hash, dtype=dtype)

In [10]:
# Signature layout: 200 permutations, split into 33 bands of 6 values each.
num_perm = 200
hashranges = [(6 * i, 6 * (i + 1)) for i in range(33)]
masks: np.ndarray = np.full(shape=num_perm, dtype=dtype, fill_value=max_hash)

# Permute every token hash with each (a, b) pair and mask the result with max_hash.
hashvalues = ((hashvalues * a + b) % modulo_prime) & max_hash
# This is where the "min" in MinHash comes from: stack all permuted hashes
# (plus the max_hash row) and keep the minimum of every column.
hashvalues = np.vstack([hashvalues, masks]).min(axis=0)

# Originally, byteswap was done for speed. Testing shows it has a negligible impact;
# it is kept for backward compatibility, even though theoretically and empirically
# it doesn't matter whether it is there or not. github.com/ekzhu/datasketch/issues/114
Hs: list[bytes] = [
    bytes(hashvalues[lo:hi].byteswap().data)
    for lo, hi in hashranges
]

In [22]:
# Only the first band is inspected, hence the one-element slice.
for start, end in hashranges[:1]:  # hashranges is a list of (start, end) index pairs
    print(start, end)
    mem = hashvalues[start:end].data
    utf8 = bytes(mem)
    # These are raw bytes, not text: they are printed as-is and never decoded.
    print(utf8)

0 6
b'\xd8\xc6\x8e\x00\x00\x00\x00\x00s\x8a\xe1\x00\x00\x00\x00\x00\x8dM{\x00\x00\x00\x00\x00\x05\x8f\x00\x00\x00\x00\x00\x00\xccV*\x02\x00\x00\x00\x00\xa9\xe7\x86\x01\x00\x00\x00\x00'


In [23]:
# Display the signature values for the `start`/`end` pair left over from the loop in the preceding cell.
hashvalues[start:end]

array([ 9357016, 14781043,  8080781,    36613, 36329164, 25618345],
      dtype=uint64)

In [19]:
# Output pasted from the Rust side as a raw string, then encoded to UTF-8 bytes.
# The literal contains replacement characters (shown as �), so the encoded bytes
# are the bytes of this escaped text rather than the bytes Rust actually produced.
rust_out = r"�L�\u{f}\0\0\0\0Q�@\r\0\0\0\0���\u{3}\0\0\0\0%��\u{c}\0\0\0\0Z��\u{1}\0\0\0\0F�\u{7f}\u{3}\0\0\0\0".encode("utf-8")

In [28]:
# Print the first band's bytes as a list of integers, one per byte.
print(list(Hs[0]))

[0, 0, 0, 0, 0, 142, 198, 216, 0, 0, 0, 0, 0, 225, 138, 115, 0, 0, 0, 0, 0, 123, 77, 141, 0, 0, 0, 0, 0, 0, 143, 5, 0, 0, 0, 0, 2, 42, 86, 204, 0, 0, 0, 0, 1, 134, 231, 169]


In [29]:
# NOTE(review): `rust_int` is assigned in a cell that appears further down in this file,
# so that cell has to be run before this one.
print(rust_int)

[0, 0, 0, 0, 15, 146, 76, 214, 0, 0, 0, 0, 13, 64, 158, 81, 0, 0, 0, 0, 3, 247, 225, 216, 0, 0, 0, 0, 12, 219, 184, 37, 0, 0, 0, 0, 1, 197, 133, 90, 0, 0, 0, 0, 3, 127, 203, 70]


In [31]:
# Same values as `rust_int`, with the byte order reversed inside each 8-byte group.
bytes([214, 76, 146, 15, 0, 0, 0, 0, 81, 158, 64, 13, 0, 0, 0, 0, 216, 225, 247, 3, 0, 0, 0, 0, 37, 184, 219, 12, 0, 0, 0, 0, 90, 133, 197, 1, 0, 0, 0, 0, 70, 203, 127, 3, 0, 0, 0, 0])

b'\xd6L\x92\x0f\x00\x00\x00\x00Q\x9e@\r\x00\x00\x00\x00\xd8\xe1\xf7\x03\x00\x00\x00\x00%\xb8\xdb\x0c\x00\x00\x00\x00Z\x85\xc5\x01\x00\x00\x00\x00F\xcb\x7f\x03\x00\x00\x00\x00'

In [24]:
# Byte values taken from the Rust output, written as six groups of eight bytes
# (one group per row).
rust_int = [
    0, 0, 0, 0, 15, 146, 76, 214,
    0, 0, 0, 0, 13, 64, 158, 81,
    0, 0, 0, 0, 3, 247, 225, 216,
    0, 0, 0, 0, 12, 219, 184, 37,
    0, 0, 0, 0, 1, 197, 133, 90,
    0, 0, 0, 0, 3, 127, 203, 70,
]

In [2]:
from text_dedup.minhash import embed_func
from text_dedup.dedup_rs import EmbedFunc
import datasets as ds

# Instantiate the EmbedFunc imported from text_dedup.dedup_rs with the settings under test.
Emb = EmbedFunc(
    threshold=0.5,
    num_perm=200,
    false_positive=0.5,
    false_negative=0.5,
    main_col="__SIGNATURE__",
    idx_col="__idx__",
)

In [3]:
import pickle

# Serialize first and only then open the file: if pickling fails (as the recorded
# output of this cell shows it does for EmbedFunc), no empty "embed_func.pkl" is
# created and an existing file is not truncated.
payload = pickle.dumps(Emb)
with open("embed_func.pkl", "wb") as f:
    f.write(payload)

PicklingError: Can't pickle <class 'builtins.EmbedFunc'>: attribute lookup EmbedFunc on builtins failed