In [3]:
#Libraries
import sys
import fitz #version 1.21.1
from operator import itemgetter
import unicodedata
import re
import json
from dataclasses import dataclass
from nltk.corpus import stopwords #version 3.8.1

#Working directory root
dir = '/home/jovyan/work/'
#Source PDF to be loaded
document = fitz.open('/home/jovyan/work/source/biller.pdf')
#0 for Latin, #1 for English
language = 1
#Page range to be extracted
startPage = 192
endPage = 997
#Number of pages to be extracted
pageCount = 1000
#Font size to be extracted
mainTextSize = 10.960000038146973
#Behind edges any text is ignored
leftEdge = None
rightEdge = 370
#Number of characters needed for new span detection
newSpanChars = 5
#Total number of persons plus one
deposCount = 76 + 1
#Heading to be removed
heading = 'the edition and translation'
#Stop words
interpunction = [',', '.', '…', ':', ';', '\'', '|', '/', '-', '--', '?', '!', '‘', '’', '\\', '[', ']', '(', ')', 's']
stopWords = set(stopwords.words('english')).union(interpunction)

#Predefined strings
personAllowed = [
    "The deposition of",
    "The depositions of",
    "The deposition made by",
    "The deposition made before",
    "The deposition made",
    "The inquisition conducted by the prior",
    "The hearing of"
]
personForbidden = [
    "Esclarmonda"
]

deposAllowed = [
    "Another deposition",
    "days before the kalends",
    "days before the nones",
    "days before the ides",
    "days before the Ides",
    "From the nones",
    "From the month",
    "From the same day",
    "From the day",
    "From the year",
    "From the years",
    "From the same day and year",
    "From the same year",
    "From the Monday",
    "From the Tuesday",
    "From the Wednesday",
    "From the Thursday",
    "From the Saturday",
    "From the Sunday",
    "From Saint Denis",
    "From Saint Vincent",
    "From the eve of Pentecost",
    "the following day",
    "the following Tuesday",
    "the same year and on the same day",
    "the same year as above",
    "the same year as",
    "the year of Our Lord one",
    "The deposition of Esclarmonda",
    "saying that in front of the house",
    "he added that Maina",
    "he added that he heard",
    "he added to his confession",
    "He also added there",
    "he said that Estampas",
    "he said that on a certain day",
    "when Bernard Godalh",
    "From the morrow",
    "in the same year of Our Lord",
    "the aforesaid witness , asked about",
    "the Tuesday after the next feast",
    "Afterwards , on the same day",
    "On which day he did not appear",
    "he returned the next morning",
    "Julian [ the ] Angevin",
    "Stephen Brito",
    "John Moret",
    "Stephen Brito",
    "William Colom",
    "Bernard Cunhard",
    "Martin Bergonho",
    "John Board",
    "Perrin Bergonho",
    "John Bergonho"  
]
deposForbidden = [
    "go the following day"
]

characterMap = [
    ("\uf643", "0"),
    ("\uf644", "1"),
    ("\uf645", "2"),
    ("\uf646", "3"),
    ("\uf647", "4"),
    ("\uf648", "5"),
    ("\uf649", "6"),
    ("\uf64a", "7"),
    ("\uf64b", "8"),
    ("\uf64c", "9"),
    ("-", " - "),
    ("—", " -- "),
    (",", " , "),
    ("\.", " . "),
    (";", " ; "),
    (":", " : "),
    ("‘", " ' "),
    ("’", " ' "),
    ("\?", " ? "),
    ("!", " ! "),
    ("\[", " [ "),
    ("\]", " ] "),
    ("\(", " ) "),
    ("\(", " ) "),
    (" +", " "),
]

itemAllowed = [
    "Item"
]
itemForbidden = [
]

@dataclass
class Token:
    word: str
    person: int
    depos: int
    item: int
    page: int
    page_book: int
    prgrph: int
    line: int
    prgrph_word: int
    line_word: int

def fixCharacters(characterMap, text):
    for charTuple in characterMap:
        text = re.sub(charTuple[0], charTuple[1], text)
    return text

def fixHyphenation(text):
    for pageIndex, page in enumerate(text):
        for blockIndex, block in enumerate(page):
            for lineIndex, line in enumerate(block):
                if len(line) > 0 and line[-1] == '-' and len(text[pageIndex][blockIndex]) > lineIndex + 1:
                    text[pageIndex][blockIndex][lineIndex] = text[pageIndex][blockIndex][lineIndex][:-1] #all words except the last one
                    text[pageIndex][blockIndex][lineIndex] += ((text[pageIndex][blockIndex][lineIndex + 1]).split(' '))[0]
                    cutLenght = len(((text[pageIndex][blockIndex][lineIndex + 1]).split(' '))[0])
                    cutLenght += 1
                    text[pageIndex][blockIndex][lineIndex + 1] = (text[pageIndex][blockIndex][lineIndex + 1])[cutLenght:]
    return text

def removeHeadings(text, heading):
    for page in text:
        if (page[0][0].startswith(heading)):
            del page[0]
    return text

def detectPattern(text, allowed, forbidden):
    for sentence in forbidden:
        if sentence in text:
            return False
    for sentence in allowed:
        if sentence in text:
            return True
    return False
    
def analyzeDocument(document, startPage, endPage, pageCount, mainTextSize, leftEdge, rightEdge, newSpanChars):
    actualPage = 0
    text = []    
    
    for page in document:
        actualPage = actualPage + 1
        if actualPage < startPage:
            continue
        if actualPage > endPage:
            continue
        if actualPage % 2 == language:
            continue
        pageCount -= 1
        if pageCount < 0:
            continue

        pageText = []
        blocks = page.get_text('dict')['blocks']
        for block in blocks:
            blockText = []
            for line in block['lines']:
                for span in line['spans']:
                    if (span['size'] != mainTextSize): #Only main text
                        continue
                    if (span['bbox'][0] > rightEdge ): #Ignore right edge notes
                        continue
                    lineText = fixCharacters(characterMap, span['text'])

                    #if (span['font'] == 'MinionPro-It' ): #Ignore right edge notes
                    #    print (span['bbox'][0]);
                    #    print (lineText);
                                        
                    if len(lineText) <= newSpanChars and len(blockText) > 0:
                        blockText[-1] += lineText
                    else:
                        blockText.append(lineText)
            if blockText != []:
                pageText.append(blockText)
        text.append(pageText)
    return text

def tokensOfPerson(which):
    person_tokens = list(filter(lambda token: token.person == which and token.depos != 0, table))
    return person_tokens

def tokensOfPersonWithSummary(which):
    person_tokens = list(filter(lambda token: token.person == which, table))
    return person_tokens

def tokensOfDeposition(which, block):
    person_tokens = list(filter(lambda token: token.person == which and token.depos == block, table))
    return person_tokens

def wordsFromObject(tokens, stops):
    words = []
    for token in tokens:
        if (token.word.lower() not in stops):
            words.append(token.word)
    return words

def textOfPerson(which, stops):
    words = wordsFromObject(tokensOfPerson(which), stops)
    person_words = ' '.join(words)
    return person_words

def textOfDeposition(which, block, stops):
    words = wordsFromObject(tokensOfDeposition(which, block), stops)
    person_words = ' '.join(words)
    return person_words

def createPersonTextFiles():
    text_file_all = open(dir + "/depos/all.txt", "w")
    text_file_word_all = open(dir + "/depos-word/all.txt", "w")
    text_file_stop_all = open(dir + "/depos-stop/all.txt", "w")
    for i in range(1, deposCount):
        text_file = open(dir + "/depos/" + str("{:02d}".format(i)) + ".txt", "w")
        text_file.write(textOfPerson(i, []))
        text_file_all.write(textOfPerson(i, []))
        text_file.close()
        text_file_word = open(dir + "/depos-word/" + str("{:02d}".format(i)) + ".txt", "w")
        text_file_word.write(textOfPerson(i, interpunction))
        text_file_word_all.write(textOfPerson(i, interpunction))
        text_file_word.close()
        text_file_stop = open(dir + "/depos-stop/" + str("{:02d}".format(i)) + ".txt", "w")
        text_file_stop.write(textOfPerson(i, stopWords))
        text_file_stop_all.write(textOfPerson(i, stopWords))
        text_file_stop.close()
    text_file_all.close()
    text_file_stop_all.close()

def createDepositionTextFiles():
    for i in range(1, deposCount):
        for x in range(0, depos_count[i] + 1):
            text_file = open(dir + "/depos/" + str("{:02d}".format(i)) + '-' + str("{:02d}".format(x)) + ".txt", "w")
            text_file.write(textOfDeposition(i, x, []))
            text_file.close()
            text_file_word = open(dir + "/depos-word/" + str("{:02d}".format(i)) + '-' + str("{:02d}".format(x)) + ".txt", "w")
            text_file_word.write(textOfDeposition(i, x, interpunction))
            text_file_word.close()
            text_file_stop = open(dir + "/depos-stop/" + str("{:02d}".format(i)) + '-' + str("{:02d}".format(x)) + ".txt", "w")
            text_file_stop.write(textOfDeposition(i, x, stopWords))
            text_file_stop.close()

In [5]:
#Get text and process it
text = analyzeDocument(document, startPage, endPage, pageCount, mainTextSize, leftEdge, rightEdge, newSpanChars)
text = removeHeadings(text, heading);
text = fixHyphenation(text);

#Table of words
table = []
person = 0
depos = 0
depos_count = []
depos_count.append(0)
dist = 0
item = 0
for pageIndex, page in enumerate(text):
    for prgrphIndex, prgrph in enumerate(page):
        prgrph_word = 0
        for lineIndex, line in enumerate(prgrph):
            if (lineIndex == 0 and detectPattern(line, personAllowed, personForbidden) == True):
                person += 1
                depos_count.append(0)
                depos = 0
                item = 0
            if ((prgrph_word < 5 or person == 3) and dist <= 0 and detectPattern(line, deposAllowed, deposForbidden) == True):
                dist = 30
                depos += 1
                depos_count[person] += 1
                item = 0
            if (prgrph_word == 0 and detectPattern(line, itemAllowed, itemForbidden) == True):
                item += 1
            line_word = 0
            words = line.split(' ')
            for word in words:
                prgrph_word += 1
                line_word += 1
                dist -= 1
                t = Token(word, person, depos, item, pageIndex + 1, startPage - 17 + pageIndex * 2, prgrphIndex + 1, lineIndex + 1, prgrph_word, line_word)
                table.append(t)

In [None]:
#Create text files
createPersonTextFiles()
createDepositionTextFiles()

In [8]:
#Print structure
person = []
person.append("")
for i in range(1, deposCount):
    person.append(textOfPerson(i, stopWords))
    print (i, '==', person[i][:100])
    
    tokens = tokensOfPerson(i)
    depos = 1;
    item = 0;
    for tokenIndex, token in enumerate(tokens):
        if (depos != token.depos):
            depos = token.depos
            short = ''
            for y in range(tokenIndex, tokenIndex + 10):
                short += ' ' + tokens[y].word
            print ('  ', depos, '  ', token.page_book, ' ', short)
            item = 0
        if (item != token.item):
            item = token.item
            print ('    - ' + str(tokens[tokenIndex]))

1 == month June 1273  year Lord one thousand two hundred seventy three  day kalends June   William Molièr
   2    181    Another deposition .  In the same year as above
   3    183    In the same year as above , on the morrow
   4    183    In the same year as above , five days before
2 == 7 days kalends July 1273  year seven days kalends July   Petron  illa wife William Castanet   Verfei
   2    189    In the same year as above , on the kalends
    - Token(word='Item', person=2, depos=2, item=1, page=9, page_book=191, prgrph=1, line=1, prgrph_word=1, line_word=1)
3 == year Lord one thousand two hundred seventy three day kalends July   Michael Pech Rodil   Burgundian 
   2    195    In the same year and on the same day as
   3    195    In the same year and on the same day as
   4    195    In the same year and on the same day as
   5    195    In the same year and on the same day as
   6    195    In the same year and on the same day as
   7    195    In the same year and on the same 