In [1]:
def jaccard(set1, set2):
    nominator = float(len(set1.intersection(set2)))
    denominator = float(len(set1.union(set2)))
    return nominator / denominator

In [2]:
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

english_stopwords = set(stopwords.words('english'))

ignored_tokens = set.union(
    english_stopwords,
    punctuation,
    {"''", "'re", "...", "....", "'m", "'ve", "``",
        "--", "'s", "'d", "n't", "'ll", "the"}
)


def get_words(text):
    stemmer = PorterStemmer()
    words = []
    for token in word_tokenize(text):
        stem = stemmer.stem(token.lower())
        if not stem in ignored_tokens and len(stem) > 2:
            words.append(stem)

    return words


def get_n_grams(n, text):
    words = get_words(text)
    text = ' '.join(words)
    return set(text[i:i+n] for i in range(len(text) - n + 1))

def read_docs():
    docs = []
    for file in os.listdir("data"):
        with open(f"data/{file}") as f:
            docs.append((file, f.read()))

    return docs

In [30]:
import mmh3
import numpy as np

class MinHash:
    def __init__(self, k):
        self.k = k
        self.elems = [float('inf')] * k

    def add(self, elem):
        for i in range(self.k):
            h = mmh3.hash(elem, i, signed=False)
            self.elems[i] = min(self.elems[i], h)

    def from_set(k, elems):
        min_hash = MinHash(k)
        for elem in elems:
            min_hash.add(elem)
        return min_hash

    def jaccard(self, other):
        return np.sum(np.array(self.elems) == np.array(other.elems)) / self.k

In [39]:
import os

def n_grams_from_docs(n, docs):
    n_grams = {}
    for file, doc in docs:
        print(f"Processing {file}..")
        n_grams[file] = get_n_grams(n, doc)

    return n_grams

def jaccard_cross_check(n_grams):
    res = {}
    files = list(n_grams.keys())

    for i in range(len(files) - 1):
        for j in range(i + 1, len(files)):
            file1 = files[i]
            file2 = files[j]
            n_grams_1 = n_grams[file1]
            n_grams_2 = n_grams[file2]
            exact_sim = round(jaccard(n_grams_1, n_grams_2), 5)
            
            print(f"{file1}, {file2}: exact={exact_sim}")
            res[(file1, file2)] = exact_sim

    return res

def min_hash_cross_check(k, n_grams):
    res = {}
    files = list(n_grams.keys())

    for i in range(len(files) - 1):
        for j in range(i + 1, len(files)):
            file1 = files[i]
            file2 = files[j]
            n_grams_1 = n_grams[file1]
            n_grams_2 = n_grams[file2]
            minhash1 = MinHash.from_set(k, n_grams_1)
            minhash2 = MinHash.from_set(k, n_grams_2)
            minhash_sim = round(minhash1.jaccard(minhash2), 5)

            print(f"\n{file1}, {file2}: minhash={minhash_sim}")
            res[(file1, file2)] = minhash_sim

    return res

In [40]:
n = 7
docs = read_docs()
n_grams = n_grams_from_docs(n, docs)
jaccard_sims = jaccard_cross_check(n_grams)

Processing ulyss12.txt..
Processing hamlet.txt..
Processing romeo_and_juliet.txt..
Processing king_lear.txt..
Processing tempest.txt..
ulyss12.txt, hamlet.txt: exact=0.06233
ulyss12.txt, romeo_and_juliet.txt: exact=0.05994
ulyss12.txt, king_lear.txt: exact=0.05703
ulyss12.txt, tempest.txt: exact=0.04028
hamlet.txt, romeo_and_juliet.txt: exact=0.11019
hamlet.txt, king_lear.txt: exact=0.11853
hamlet.txt, tempest.txt: exact=0.09464
romeo_and_juliet.txt, king_lear.txt: exact=0.11289
romeo_and_juliet.txt, tempest.txt: exact=0.08973
king_lear.txt, tempest.txt: exact=0.09876


In [41]:
minhash_sims = {}

for k in [64, 128, 256]:
    print(f"\nk={k}")
    minhash_sims[k] = min_hash_cross_check(k, n_grams)


k=64

ulyss12.txt, hamlet.txt: minhash=0.04688

ulyss12.txt, romeo_and_juliet.txt: minhash=0.04688

ulyss12.txt, king_lear.txt: minhash=0.01562

ulyss12.txt, tempest.txt: minhash=0.03125

hamlet.txt, romeo_and_juliet.txt: minhash=0.0625

hamlet.txt, king_lear.txt: minhash=0.07812

hamlet.txt, tempest.txt: minhash=0.0625

romeo_and_juliet.txt, king_lear.txt: minhash=0.04688

romeo_and_juliet.txt, tempest.txt: minhash=0.04688

king_lear.txt, tempest.txt: minhash=0.04688

k=128

ulyss12.txt, hamlet.txt: minhash=0.07031

ulyss12.txt, romeo_and_juliet.txt: minhash=0.05469


KeyboardInterrupt: 

In [None]:
for pair in jaccard_sims.keys():
    print()
    print(pair)
    print(f"exact:       {jaccard_sims[pair]}")
    for k in minhash_sims:
        print(f"minhash_{k}:  {minhash_sims[k][pair]}")

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

def cluster_docs(docs, n_gram, n_clusters):
    texts = [x[1] for x in docs]
    doc_labels = [x[0] for x in docs]

    vectorizer = CountVectorizer(ngram_range=(n_gram, n_gram))
    X = vectorizer.fit_transform(texts)
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)

    clusters = {}
    for i in range(len(km.labels_)):
        cluster = km.labels_[i]
        doc_label = doc_labels[i]
        if not clusters.get(cluster):
            clusters[cluster] = [doc_label]
        else:
            clusters[cluster].append(doc_label)

    return clusters

In [55]:
cluster_docs(docs, 7, 2)

{1: ['ulyss12.txt'],
 0: ['hamlet.txt', 'romeo_and_juliet.txt', 'king_lear.txt', 'tempest.txt']}