In [1]:
import glob
import re

INPUT_FOLDER = '../../../travelogues-corpus/16th_century/books'

def read_file(f):
  with open(f, 'r') as file:
    return file.read()

filenames = [f for f in glob.glob(INPUT_FOLDER + '**/*.txt')]
texts = [ read_file(f) for f in filenames ]

f'Loaded {len(texts)} documents'

'Loaded 66 documents'

In [2]:
import re

blank_line_regex = r"(?:\r?\n){2,}"
pages_per_text = list(map(lambda l: re.split(blank_line_regex, l.strip()), texts))

cleaned_pages_per_text = []
for pages in pages_per_text:
  # ASCII characters only
  cleaned = [ re.sub('[^A-Za-z0-9 ]+', '', page) for page in pages ]

  # Replace multiple spaces by one
  cleand = [ re.sub('\\s+', ' ', page) for page in cleaned ]

  # Remove empty pages
  cleaned = [ p.strip() for p in cleaned if len(p.strip()) > 0 ]

  cleaned_pages_per_text.append(cleaned)

In [3]:
from datasketch import MinHashLSH
from util.text import Text
import pickle

# 'Text' minhash helper objects
texts = []

for text_idx, text in enumerate(cleaned_pages_per_text):
  for page_idx, page in enumerate(text):
    texts.append(Text(text_idx, page_idx, page))

lsh = MinHashLSH(threshold=0.8, num_perm=128)

for t in texts:
  lsh.insert(t.id(), t.minhash())

# Save for later (computation takes a lot of time...)
pickle.dump(lsh, open('./lsh.bin', 'wb'))

f'Built LSH index for {len(texts)} pages'

'Built LSH index for 15906 pages'

In [4]:
import pandas as pd 

neighbour_pairs = []

for t in texts:
  result = lsh.query(t.minhash())

  # Result will always contain the message itself - remove
  result = [r for r in result if r != t.id()]

  for neighbour in result:
    neighbour_pairs.append([ t.id(), neighbour ])

df = pd.DataFrame(neighbour_pairs, columns=['a', 'b'])
df

Unnamed: 0,a,b
0,07,736
1,07,5323
2,07,53631
3,07,5274
4,07,53194
5,07,768
6,07,10291
7,07,0381
8,07,6261
9,07,20628


In [5]:
df.to_csv('results_pagewise.csv', index=False)