In [1]:
from minhash import MinHash
from preprocess import get_n_grams
from jaccard import jaccard
import os

def n_grams_from_files(n):
    n_grams = {}
    for file in os.listdir("data"):
        print(f"Processing {file}..")
        n_grams[file] = get_n_grams(n, f"data/{file}")
    return n_grams

def jaccard_cross_check(n_grams):
    res = {}
    files = list(n_grams.keys())

    for i in range(len(files) - 1):
        for j in range(i + 1, len(files)):
            file1 = files[i]
            file2 = files[j]
            n_grams_1 = n_grams[file1]
            n_grams_2 = n_grams[file2]
            exact_sim = round(jaccard(n_grams_1, n_grams_2), 5)
            
            print(f"{file1}, {file2}: exact={exact_sim}")
            res[(file1, file2)] = exact_sim

    return res

def min_hash_cross_check(k, n_grams):
    res = {}
    files = list(n_grams.keys())

    for i in range(len(files) - 1):
        for j in range(i + 1, len(files)):
            file1 = files[i]
            file2 = files[j]
            n_grams_1 = n_grams[file1]
            n_grams_2 = n_grams[file2]
            minhash1 = MinHash.from_set(k, n_grams_1)
            minhash2 = MinHash.from_set(k, n_grams_2)
            minhash_sim = round(minhash1.jaccard(minhash2), 5)

            print(f"\n{file1}, {file2}: minhash={minhash_sim}")
            res[(file1, file2)] = minhash_sim

    return res


In [7]:
n = 7
n_grams = n_grams_from_files(n)
jaccard_sims = jaccard_cross_check(n_grams)

Processing ulyss12.txt..
Processing hamlet.txt..
Processing romeo_and_juliet.txt..
Processing king_lear.txt..
Processing tempest.txt..
ulyss12.txt, hamlet.txt: exact=0.06233
ulyss12.txt, romeo_and_juliet.txt: exact=0.05994
ulyss12.txt, king_lear.txt: exact=0.05703
ulyss12.txt, tempest.txt: exact=0.04028
hamlet.txt, romeo_and_juliet.txt: exact=0.11019
hamlet.txt, king_lear.txt: exact=0.11853
hamlet.txt, tempest.txt: exact=0.09464
romeo_and_juliet.txt, king_lear.txt: exact=0.11289
romeo_and_juliet.txt, tempest.txt: exact=0.08973
king_lear.txt, tempest.txt: exact=0.09876


In [8]:
minhash_sims = {}

for k in [64, 128, 256]:
    print(f"\nk={k}")
    minhash_sims[k] = min_hash_cross_check(k, n_grams)


k=64

ulyss12.txt, hamlet.txt: minhash=0.04688

ulyss12.txt, romeo_and_juliet.txt: minhash=0.04688

ulyss12.txt, king_lear.txt: minhash=0.01562

ulyss12.txt, tempest.txt: minhash=0.03125

hamlet.txt, romeo_and_juliet.txt: minhash=0.0625

hamlet.txt, king_lear.txt: minhash=0.07812

hamlet.txt, tempest.txt: minhash=0.0625

romeo_and_juliet.txt, king_lear.txt: minhash=0.04688

romeo_and_juliet.txt, tempest.txt: minhash=0.04688

king_lear.txt, tempest.txt: minhash=0.04688

k=128

ulyss12.txt, hamlet.txt: minhash=0.07031

ulyss12.txt, romeo_and_juliet.txt: minhash=0.05469

ulyss12.txt, king_lear.txt: minhash=0.04688

ulyss12.txt, tempest.txt: minhash=0.01562

hamlet.txt, romeo_and_juliet.txt: minhash=0.05469

hamlet.txt, king_lear.txt: minhash=0.07812

hamlet.txt, tempest.txt: minhash=0.0625

romeo_and_juliet.txt, king_lear.txt: minhash=0.04688

romeo_and_juliet.txt, tempest.txt: minhash=0.0625

king_lear.txt, tempest.txt: minhash=0.07031

k=256

ulyss12.txt, hamlet.txt: minhash=0.06641

u

In [9]:
for pair in jaccard_sims.keys():
    print()
    print(pair)
    print(f"exact:       {jaccard_sims[pair]}")
    for k in minhash_sims:
        print(f"minhash_{k}:  {minhash_sims[k][pair]}")


('ulyss12.txt', 'hamlet.txt')
exact:       0.06233
minhash_64:  0.04688
minhash_128:  0.07031
minhash_256:  0.06641

('ulyss12.txt', 'romeo_and_juliet.txt')
exact:       0.05994
minhash_64:  0.04688
minhash_128:  0.05469
minhash_256:  0.08203

('ulyss12.txt', 'king_lear.txt')
exact:       0.05703
minhash_64:  0.01562
minhash_128:  0.04688
minhash_256:  0.05078

('ulyss12.txt', 'tempest.txt')
exact:       0.04028
minhash_64:  0.03125
minhash_128:  0.01562
minhash_256:  0.05078

('hamlet.txt', 'romeo_and_juliet.txt')
exact:       0.11019
minhash_64:  0.0625
minhash_128:  0.05469
minhash_256:  0.07422

('hamlet.txt', 'king_lear.txt')
exact:       0.11853
minhash_64:  0.07812
minhash_128:  0.07812
minhash_256:  0.07031

('hamlet.txt', 'tempest.txt')
exact:       0.09464
minhash_64:  0.0625
minhash_128:  0.0625
minhash_256:  0.08984

('romeo_and_juliet.txt', 'king_lear.txt')
exact:       0.11289
minhash_64:  0.04688
minhash_128:  0.04688
minhash_256:  0.08203

('romeo_and_juliet.txt', 'tem