# Document deduplication

Perform a basic deduplication of documents using cosine similarity.


Use **scipy=1.3.1** to avoid the library load error `(Library not loaded: @rpath/libopenblas.dylib)`.

In [1]:
import os
import re
import glob
import pickle
import warnings
import importlib

import numpy as np

from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora
from gensim.similarities import Similarity

In [2]:
from wb_nlp.cleaning import cleaner
from wb_nlp import dir_manager
# Re-execute the already-imported `cleaner` module in this kernel.
# NOTE(review): presumably here so on-disk edits to cleaner.py are picked up
# without restarting the kernel — confirm it is still wanted in the final notebook.
importlib.reload(cleaner)

<module 'wb_nlp.cleaning.cleaner' from '/Users/avsolatorio/WBG/wb_nlp/src/wb_nlp/cleaning/cleaner.py'>

## Configs and constants

In [3]:
# Corpus location and file extension handed to `cleaner.CorpusCleaner`.
DATA_DIR = '../../src/wb_nlp'
EXTENSION = 'py'

# Documents whose similarity score is strictly above this value are treated
# as duplicates of each other.
SIMILARITY_THRESHOLD = 0.9

# Directory that receives every artifact written by this notebook.
DUPLICATES_DIR = dir_manager.get_data_dir('preprocessed', 'duplicates')

# `exist_ok=True` creates the directory when it is missing and does nothing
# when it already exists, without a separate check-then-create step that can
# race with another process.
os.makedirs(DUPLICATES_DIR, exist_ok=True)

# Pickled corpus generator (written by `corpus_generator.save`).
DUPLICATES_CORPUS_FILE = os.path.join(DUPLICATES_DIR, 'corpus_generator.pickle')
# Passed to gensim's `Similarity` as its `output_prefix`.
SIMILARITY_OUTPUT_PREFIX = os.path.join(DUPLICATES_DIR, 'corpus_similarity.gensim')
# Pickled dict holding the unique document ids and the duplicate groups.
DUPLICATE_DOC_IDS_FILE = os.path.join(DUPLICATES_DIR, 'duplicate_doc_ids.dict.pickle')

In [4]:
# The cleaner supplies the per-document text-cleaning callable; the corpus
# generator receives it together with the corpus location and file extension.
simple_cleaner = cleaner.SimpleCleaner()
simple_corpus_generator = cleaner.CorpusCleaner(
    DATA_DIR,
    simple_cleaner.clean_text,
    extension=EXTENSION,
)

# Alternative cleaners (disabled). Each cleaner must be created before the
# generator that uses it.
#
# lda_cleaner = cleaner.LDACleaner()
# lda_corpus_generator = cleaner.CorpusCleaner(DATA_DIR, lda_cleaner.clean_text, extension=EXTENSION)
#
# word2vec_cleaner = cleaner.Word2VecCleaner()
# word2vec_corpus_generator = cleaner.CorpusCleaner(DATA_DIR, word2vec_cleaner.clean_text, extension=EXTENSION)

In [5]:
# Select which corpus generator the rest of the notebook works with; the
# following cells only refer to `corpus_generator`.
corpus_generator = simple_corpus_generator

### Load documents and compute phrases

In [6]:
# Reset the corpus generator before streaming it into the phrase model.
# NOTE(review): `reset()` can evidently raise ValueError (presumably when there
# is nothing to reset yet); the error is deliberately ignored here — confirm
# against `CorpusCleaner.reset`.
try:
    corpus_generator.reset()
except ValueError:
    pass

# Collect bigram statistics over the whole corpus; `min_count=1` means even
# word pairs seen a single time are kept as candidates.
bigram = Phrases(corpus_generator, min_count=1)
# Wrap the trained Phrases model in a Phraser, which is what gets applied to
# the documents in the next step.
bigram_phraser = Phraser(bigram)

### Generate the bigram docs, dictionary, and corpus

In [7]:
# Materialise the phrased documents as a list, because they are traversed twice
# below (presumably each item is one document's tokens after the bigram phraser
# has been applied).
doc_bigrams = [*corpus_generator.stream_gensim_transformer(bigram_phraser)]

# Token-to-id mapping built from all phrased documents.
dictionary = corpora.Dictionary(doc_bigrams)

# Bag-of-words representation of every document, in the same order as
# `doc_bigrams`.
corpus = list(map(dictionary.doc2bow, doc_bigrams))

### Build the similarity index

In [8]:
# Index every bag-of-words document so that document-to-document similarity
# can be queried by position. The feature count is the dictionary size, and
# the index is given SIMILARITY_OUTPUT_PREFIX as its output prefix.
index = Similarity(
    output_prefix=SIMILARITY_OUTPUT_PREFIX,
    corpus=corpus,
    num_features=len(dictionary),
)

In [9]:
# Sanity check on the first document: its stored vector, followed by its
# similarity to every indexed document (including itself).
print(
    index.vector_by_id(0),
    index.similarity_by_id(0),
    sep='\n',
)

[0.17149858 0.17149858 0.17149858 0.17149858 0.34299716 0.17149858
 0.17149858 0.17149858 0.17149858 0.17149858 0.17149858 0.17149858
 0.34299716 0.17149858 0.17149858 0.17149858 0.17149858 0.17149858
 0.17149858 0.17149858 0.17149858 0.17149858 0.17149858 0.17149858
 0.17149858 0.17149858 0.17149858 0.17149858 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
[0.99999994 0.01642659]


### Collect duplicate documents based on similarity threshold

In [10]:
# Split the corpus into documents that have no near-duplicate and groups of
# documents that are similar to each other.
#
# unique_doc_ids  -- ids of documents that match nothing but themselves.
# duplicated_docs -- one array of matching document ids per duplicated
#                    document; a group therefore appears once for each of its
#                    members that exceeds the threshold.
unique_doc_ids = set()
duplicated_docs = []

for doc_id in range(len(corpus)):
    # Query the index with *this* document's id so that every document is
    # compared against the whole corpus (a fixed id would classify all
    # documents from a single similarity row).
    similarities = index.similarity_by_id(doc_id)
    indices = np.where(similarities > SIMILARITY_THRESHOLD)[0]

    # A document normally matches itself, so exactly one hit means no other
    # document is similar. Zero hits (presumably only possible for an empty
    # document, whose vector is all zeros) is also treated as unique rather
    # than being recorded as an empty duplicate group.
    if len(indices) <= 1:
        unique_doc_ids.add(doc_id)
    else:
        duplicated_docs.append(indices)

In [11]:
# Bundle both results under fixed keys so they can be restored together.
payload = {
    'unique_doc_ids': unique_doc_ids,
    'duplicated_docs': duplicated_docs,
}

# Persist the bundle as a binary pickle in the duplicates directory.
with open(DUPLICATE_DOC_IDS_FILE, 'wb') as out_file:
    pickle.dump(payload, out_file)

### Save the corpus generator

In [12]:
# Persist the corpus generator to DUPLICATES_CORPUS_FILE so it can be
# restored later without rebuilding it.
corpus_generator.save(DUPLICATES_CORPUS_FILE)

### Load saved corpus generator

In [13]:
# Create a new CorpusCleaner with the same arguments as the original one, then
# load the state saved in DUPLICATES_CORPUS_FILE.
# NOTE(review): the return value of `load` is discarded, so this assumes it
# restores state in place on `load_corpus_generator` — confirm against
# `CorpusCleaner.load`.
load_corpus_generator = cleaner.CorpusCleaner(DATA_DIR, simple_cleaner.clean_text, extension=EXTENSION)
load_corpus_generator.load(DUPLICATES_CORPUS_FILE)

In [14]:
# load_corpus_generator.reset()
# for i in load_corpus_generator:
#     print(i)