# Document deduplication

Perform a basic deduplication of documents using cosine similarity.


Pin **scipy=1.3.1** to avoid the library load error (`Library not loaded: @rpath/libopenblas.dylib`).

In [1]:
import contextlib
import glob
import importlib
import os
import re
import warnings

from gensim import corpora
from gensim.models.phrases import Phrases, Phraser
from gensim.similarities import Similarity
from gensim.utils import simple_preprocess

In [2]:
from wb_nlp.cleaning import cleaner
# Re-execute the already-imported module, presumably so that edits to
# cleaner.py are picked up without restarting the kernel. Kept as the cell's
# last expression so the reloaded module object is displayed.
importlib.reload(cleaner)

<module 'wb_nlp.cleaning.cleaner' from '/Users/avsolatorio/WBG/wb_nlp/src/wb_nlp/cleaning/cleaner.py'>

In [3]:
# Root directory handed to every CorpusCleaner, and the value passed as its
# `extension` argument (presumably a file-extension filter -- confirm against
# cleaner.CorpusCleaner).
DATA_DIR = '../../src/wb_nlp'
EXTENSION = 'py'


def _make_corpus_generator(clean_text):
    """Build a CorpusCleaner for DATA_DIR/EXTENSION with `clean_text` as its second argument."""
    return cleaner.CorpusCleaner(DATA_DIR, clean_text, extension=EXTENSION)


# Instantiate the three cleaners first, then wrap each one's `clean_text`
# method in its own corpus generator.
simple_cleaner = cleaner.SimpleCleaner()
lda_cleaner = cleaner.LDACleaner()
word2vec_cleaner = cleaner.Word2VecCleaner()

simple_corpus_generator = _make_corpus_generator(simple_cleaner.clean_text)
lda_corpus_generator = _make_corpus_generator(lda_cleaner.clean_text)
word2vec_corpus_generator = _make_corpus_generator(word2vec_cleaner.clean_text)

In [4]:
# Choose which corpus generator the following cells consume; the LDA and
# Word2Vec generators defined above can be substituted here.
corpus_generator = simple_corpus_generator

## Load documents and compute phrases

In [5]:
# Rewind the corpus generator before streaming it into the phrase model.
# A ValueError from reset() is deliberately ignored (best-effort); presumably
# it signals that there is nothing to rewind -- confirm against
# cleaner.CorpusCleaner.reset.
with contextlib.suppress(ValueError):
    corpus_generator.reset()

# Learn bigram collocations from the corpus. min_count=1 means no word or
# bigram is discarded for being too rare.
bigram = Phrases(corpus_generator, min_count=1)

# Wrap the trained model in a Phraser, which is the object used to transform
# documents in the next step.
bigram_phraser = Phraser(bigram)

In [6]:
# Materialise the phrase-transformed documents up front: they are traversed
# twice below (once to build the vocabulary, once to vectorise).
doc_bigrams = [*corpus_generator.stream_gensim_transformer(bigram_phraser)]

# Token <-> integer id mapping over all documents.
dictionary = corpora.Dictionary(documents=doc_bigrams)

# Bag-of-words representation of every document.
corpus = list(map(dictionary.doc2bow, doc_bigrams))

# Similarity index over the bag-of-words corpus. The first argument is the
# filename prefix gensim uses for the index shards it writes to disk; the
# feature count is the vocabulary size.
index = Similarity('corpus_similarities', corpus, len(dictionary))

In [7]:
# Inspect the vector the index stores for document 0.
index.vector_by_id(0)

array([0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.34299716,
       0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.34299716, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.17149858, 0.17149858, 0.17149858,
       0.17149858, 0.17149858, 0.17149858, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ], dtype=float32)

In [8]:
# Similarity of document 0 against every indexed document (itself included).
index.similarity_by_id(0)

array([0.99999994, 0.01642659], dtype=float32)

In [9]:
# Similarity of document 1 against every indexed document (itself included).
index.similarity_by_id(1)

array([0.01642659, 1.        ], dtype=float32)