# Find Duplicate Documents
In this notebook we build a large corpus by merging several distinct corpora and excluding duplicate documents.

In [1]:
# Load IPython's autoreload extension and set mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging 
import random
import os
from tqdm import tqdm
from cltk.corpus.readers import get_corpus_reader

In [3]:
import sys
import inspect
from pathlib import Path 
# Put the parent of the current working directory at the front of the import
# path *before* importing from `mlyoucanuse` below. `os` comes from the earlier
# imports cell.
# NOTE(review): this presumably assumes the notebook is launched from a
# directory that sits directly inside the project root — confirm.
currentdir = Path.cwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
from mlyoucanuse.text_deduplicater import TextDeduplicater

In [4]:
# Configure the root logger at INFO level so INFO-and-above messages are shown.
logging.basicConfig(level=logging.INFO)

## Load our Corpus readers

In [5]:
# One reader per source corpus; both are requested for the Latin language.
# NOTE(review): presumably both corpora must already be downloaded locally for
# these calls to succeed — confirm against the CLTK setup instructions.
perseus_latin_reader = get_corpus_reader(corpus_name='latin_text_perseus', language='latin')
latin_library_reader = get_corpus_reader(corpus_name='latin_text_latin_library', language='latin')

## Create our Text Deduplicator

In [6]:
# Create a deduplicater using the class's default constructor arguments.
deduper = TextDeduplicater()

## Simple proof of the Dedupe functionality
Add the documents whose file name contains "caesar", then pick one of those files at random and add it a second time so that it shows up as a duplicate.

In [7]:
# Demo: index every Latin Library file whose id contains 'caesar', then add one
# of those files a second time and check that it is reported as a duplicate.
DEMO_SEED = 42  # fixed seed so the same file is re-added on every run

# Use the public fileids() accessor rather than the private `_fileids` attribute.
caesar = [fileid for fileid in latin_library_reader.fileids() if 'caesar' in fileid]
print(caesar)

# Start from an empty deduplicater so nothing added earlier affects the demo.
deduper = TextDeduplicater()
for fileid in tqdm(caesar):
    # docs() is given a single file id; keep the first document it yields.
    text = list(latin_library_reader.docs(fileid))[0]
    deduper.add_document(fileid, text)

# A dedicated Random instance keeps the pick reproducible without touching the
# global random state used elsewhere.
dupe_file = random.Random(DEMO_SEED).choice(caesar)
text = list(latin_library_reader.docs(dupe_file))[0]
deduper.add_document(dupe_file, text)  # same name and text added a second time

print(f'Unique doc names: {deduper.get_unique_doc_names()}')
print(f'Duplicate doc names: {deduper.get_possible_duplicate_doc_names()}')

['caesar/alex.txt', 'caesar/bc1.txt', 'caesar/bc2.txt', 'caesar/bc3.txt', 'caesar/bellafr.txt', 'caesar/gall1.txt', 'caesar/gall2.txt', 'caesar/gall3.txt', 'caesar/gall4.txt', 'caesar/gall5.txt', 'caesar/gall6.txt', 'caesar/gall7.txt', 'caesar/gall8.txt', 'caesar/hisp.txt', 'suetonius/suet.caesar.txt', 'xylander/caesar.txt']


100%|██████████| 16/16 [00:00<00:00, 16.41it/s]


Unique doc names: ['suetonius/suet.caesar.txt', 'xylander/caesar.txt', 'caesar/gall2.txt', 'caesar/gall1.txt', 'caesar/bc2.txt', 'caesar/bellafr.txt', 'caesar/bc1.txt', 'caesar/bc3.txt', 'caesar/gall6.txt', 'caesar/gall7.txt', 'caesar/gall4.txt', 'caesar/gall3.txt', 'caesar/alex.txt', 'caesar/hisp.txt', 'caesar/gall5.txt', 'caesar/gall8.txt']
Duplicate doc names: [('caesar/bc3.txt', 'caesar/bc3.txt')]


## Let's check the whole corpus contents
If a file triggers a warning that no trigrams were found, that is an indicator that the file is empty.

In [8]:
# Reset the deduplicater so the demo documents added earlier are not counted.
deduper = TextDeduplicater()

# Index every file of the Latin Library corpus. The public fileids() accessor
# is used instead of reaching into the reader's private `_fileids` attribute.
for fileid in tqdm(latin_library_reader.fileids()):
    # docs() is given a single file id; keep the first document it yields.
    text = list(latin_library_reader.docs(fileid))[0]
    deduper.add_document(fileid, text)

100%|██████████| 2141/2141 [01:29<00:00, 21.09it/s]


## Surprise, some problematic files

In [9]:
# Keep the flagged name pairs in a variable for the following cells; the bare
# expression on the last line makes the notebook display them.
possible_dupes = deduper.get_possible_duplicate_doc_names()
possible_dupes

[('albertanus/albertanus.sermo4.txt', 'albertanus/albertanus.sermo2.txt'),
 ('lucan/lucan7.txt', 'lucan/lucan8.txt'),
 ('albertanus/albertanus.sermo4.txt', 'albertanus/albertanus.sermo3.txt'),
 ('albertanus/albertanus.sermo1.txt', 'albertanus/albertanus.sermo3.txt'),
 ('albertanus/albertanus.sermo1.txt', 'albertanus/albertanus.sermo2.txt'),
 ('albertanus/albertanus.sermo1.txt', 'albertanus/albertanus.sermo4.txt'),
 ('albertanus/albertanus.sermo2.txt', 'albertanus/albertanus.sermo3.txt')]

## Let's check their similarity scores

In [10]:
print('Duplicate documents and similarity scores')
for doc_one, doc_two in possible_dupes:
    # Re-read both documents from the corpus and score the pair directly.
    text_one = list(latin_library_reader.docs(doc_one))[0]
    text_two = list(latin_library_reader.docs(doc_two))[0]
    score = deduper.calculate_similarity(text_one, text_two)
    print(doc_one, doc_two, score)

Duplicate documents and similarity scores
albertanus/albertanus.sermo4.txt albertanus/albertanus.sermo2.txt 1.0
lucan/lucan7.txt lucan/lucan8.txt 0.9987385682749921
albertanus/albertanus.sermo4.txt albertanus/albertanus.sermo3.txt 1.0
albertanus/albertanus.sermo1.txt albertanus/albertanus.sermo3.txt 1.0
albertanus/albertanus.sermo1.txt albertanus/albertanus.sermo2.txt 1.0
albertanus/albertanus.sermo1.txt albertanus/albertanus.sermo4.txt 1.0
albertanus/albertanus.sermo2.txt albertanus/albertanus.sermo3.txt 1.0


## Note: the reported duplicate files are actually errors in the corpus
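The Albertanus files all begin with the same server error message ("an error occurred while processing this directive") in place of the expected text, and the Lucan Book VIII file is, somehow, almost an exact copy of Book VII.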

In [11]:
# (file id, number of leading characters to show) for each flagged file.
previews = [
    ('albertanus/albertanus.sermo1.txt', 75),
    ('albertanus/albertanus.sermo2.txt', 75),
    ('albertanus/albertanus.sermo3.txt', 75),
    ('albertanus/albertanus.sermo4.txt', 75),
    ('lucan/lucan7.txt', 200),
    ('lucan/lucan8.txt', 200),
]
for fileid, n_chars in previews:
    # Print only the start of the first document read from each file.
    print(list(latin_library_reader.docs(fileid))[0][:n_chars])

Albertano of Brescia 
[an error occurred while processing this directive]


Albertano of Brescia 
[an error occurred while processing this directive]


Albertano of Brescia 
[an error occurred while processing this directive]


Albertano of Brescia 
[an error occurred while processing this directive]


Lucan Liber VII
		 

		 
		 
	 
	
 

 M. ANNAEI LVCANI BELLI CIVILIS LIBER SEPTIMVS
 

 

Segnior, Oceano quam lex aeterna uocabat,
luctificus Titan numquam magis aethera contra
egit equos cursumque p
Lucan Liber VIII
		 

		 
		 
	 
	
 

 M. ANNAEI LVCANI BELLI CIVILIS LIBER OCTAVVS
 

 

 
Segnior, Oceano quam lex aeterna uocabat,
luctificus Titan numquam magis aethera contra
egit equos cursumque


## Let's compare with the Perseus corpus anyway

In [12]:
# Add every Perseus document to the deduplicater that already holds the Latin
# Library documents, so duplicates across the two corpora can be detected.
# NOTE: this cell appends to the existing `deduper`; running it twice adds each
# Perseus file a second time. Re-create and re-fill `deduper` before re-running.
for fileid in tqdm(perseus_latin_reader.fileids()):  # public accessor, not `_fileids`
    # Concatenate the file's paragraphs (no separator) into one text; join()
    # consumes the iterable directly, so no intermediate list is needed.
    text = ''.join(perseus_latin_reader.paras(fileid))
    deduper.add_document(fileid, text)

100%|██████████| 293/293 [00:28<00:00, 10.36it/s]


## Find out which files are duplicate
Note: of course, this does not catch files where several texts have been put together.

In [13]:
# Keep only the pairs that were not already reported for the Latin Library
# alone, i.e. the pairs that appeared after the Perseus documents were added.
possible_dupes2 = [
    pair
    for pair in deduper.get_possible_duplicate_doc_names()
    if pair not in possible_dupes
]
possible_dupes2

[('cicero__in-the-senate-after-his-return__latin.json',
  'cicero/postreditum.txt'),
 ('nepos/nepos.han.txt', 'nepos-cornelius__hannibal__latin.json'),
 ('seneca-lucius-annaeus-plays__apocolocyntosis__latin.json',
  'sen/sen.apoc.txt')]