# Document deduplication

Perform a basic deduplication of documents using cosine similarity.


Use **scipy=1.3.1** to avoid the library load error `(Library not loaded: @rpath/libopenblas.dylib)`.

In [1]:
import os
import re
import glob
import warnings

from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from gensim.similarities import Similarity

In [2]:
# Module-level cache of cleaned documents, keyed by document id (see
# `cleaned_doc_generator`). It outlives individual generator runs so repeated
# passes over the same files skip the read-and-clean step.
clean_doc_cache = {}


def cleaned_doc_generator(dir, cleaner, id_pattern=None, extension='txt'):
    '''Yield the cleaned content of every `*.{extension}` file found in `dir`.

    Files are visited in sorted path order so that repeated passes produce the
    documents in the same positions. Cleaned results are stored in the
    module-level `clean_doc_cache` and reused on later passes.

    Parameters:
        dir: directory to scan (non-recursive).
        cleaner: callable that receives the decoded file content (str) and
            returns the cleaned document.
        id_pattern: optional regex searched in the file name; the matched text
            becomes the document id. Files without a match trigger a warning
            and are skipped. When None, `hash(file name)` is used as the id.
        extension: file extension to load, without the leading dot.

    Yields:
        The cleaned document for each file, one at a time.

    Note:
        The cache key is the document id only. A later call with a different
        `cleaner` or a different `dir` containing the same ids returns the
        previously cached result; clear `clean_doc_cache` to force a re-clean.
    '''

    # glob returns paths in arbitrary order; sort to keep positions stable.
    for fpath in sorted(glob.glob(os.path.join(dir, f'*.{extension}'))):
        # basename handles the platform's path separator, unlike split('/').
        fname = os.path.basename(fpath)

        if id_pattern is None:
            file_hash = hash(fname)
        else:
            match = re.search(id_pattern, fname)
            if match is None:
                warnings.warn(f'No valid id found in file {fname}. Skipping...')
                continue
            file_hash = match.group(0)

        if file_hash not in clean_doc_cache:
            # Read raw bytes and close the file before the cleaning step.
            with open(fpath, 'rb') as fl:
                raw = fl.read()

            # Undecodable bytes are dropped rather than raising.
            doc = raw.decode('utf-8', errors='ignore')
            clean_doc_cache[file_hash] = cleaner(doc)

        yield clean_doc_cache[file_hash]

In [3]:
# Directory scanned for documents and the extension of the files to load.
DATA_DIR = '../../src/wb_nlp'
EXTENSION = 'py'


def cleaner(doc):
    '''Tokenize `doc` with gensim's `simple_preprocess`.

    Accents are stripped (`deacc=True`) and the maximum token length is raised
    to 30 characters (`max_len=30`). Returns the list of tokens.
    '''
    return simple_preprocess(doc, deacc=True, max_len=30)

In [4]:
%%time
doc_generator = cleaned_doc_generator(DATA_DIR, cleaner, extension=EXTENSION)
cleaned_scripts = [s for s in doc_generator]

CPU times: user 1.48 ms, sys: 1.37 ms, total: 2.84 ms
Wall time: 2.2 ms


In [5]:
%%time
doc_generator = cleaned_doc_generator(DATA_DIR, cleaner, extension=EXTENSION)
cleaned_scripts = [s for s in doc_generator]

CPU times: user 371 µs, sys: 318 µs, total: 689 µs
Wall time: 449 µs


In [6]:
# A generator is exhausted after one pass, so build a fresh one for this step.
doc_generator = cleaned_doc_generator(
    dir=DATA_DIR, cleaner=cleaner, extension=EXTENSION)

# Collect phrase statistics from the tokenized documents; min_count=1 means
# no token or bigram is dropped for being too rare.
bigram = Phrases(sentences=doc_generator, min_count=1)

# Wrap the trained model in the phraser used to transform documents below.
bigram_phraser = Phraser(bigram)

In [16]:
# Re-create the generator (the previous one has been consumed), then run each
# cleaned document through the phraser to get its token list with detected
# bigrams applied.
doc_generator = cleaned_doc_generator(
    dir=DATA_DIR, cleaner=cleaner, extension=EXTENSION)
doc_bigrams = [list(bigram_phraser[doc]) for doc in doc_generator]

# Token <-> integer id mapping built from all phrased documents.
dictionary = corpora.Dictionary(doc_bigrams)

In [19]:
# Bag-of-words representation of every document, in the same order as
# doc_bigrams.
corpus = list(map(dictionary.doc2bow, doc_bigrams))

# Similarity index over the corpus; its shard files are written to disk using
# the 'sim_out' prefix.
index = Similarity(
    output_prefix='sim_out',
    corpus=corpus,
    num_features=len(dictionary),
)

In [22]:
# Map each indexed document's position to its similarity against every
# document in the index, stored as (other_position, similarity) pairs.
similar_docs = {}

# Iterating the index yields one array of similarities per indexed document.
for doc_id, similarities in enumerate(index):
    similar_docs[doc_id] = list(enumerate(similarities))

In [23]:
# Display the mapping of document position -> (position, similarity) pairs.
similar_docs

{0: [(0, 1.0000004), (1, 0.016426586)], 1: [(0, 0.016426586), (1, 0.99999994)]}

In [25]:
# Inspect the vector stored in the index for the document at position 0.
index.vector_by_id(0)

array([0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.34299716,
       0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.34299716, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.17149858, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ], dtype=float32)