In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
import regex as re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from PyPDF2 import PdfWriter, PdfReader

In [2]:
# Folder holding the input files used below (cains_jawbone.txt and cains_jawbone.pdf).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
source_material_dir = Path("/Users/vigji/code/cainsjb/source_material")

#### Renumber pages

In [2]:
# Load the whole book as a single string.
fname = source_material_dir / "cains_jawbone.txt"
# Explicit UTF-8: the text contains non-ASCII punctuation (e.g. ’), so relying on the
# platform's default encoding could fail or garble those characters.
with open(fname, "r", encoding="utf-8") as f:
    text = f.read()

# Tokens that are always collected in addition to the chunker's entities
# (presumably names the chunker does not tag — TODO confirm).
forced_words = ["Head", "May"]


def extract_ne(quote):
    """Find named entities in ``quote`` and the sentences that mention them.

    The text is tokenised and POS-tagged with NLTK, then chunked with
    ``nltk.ne_chunk(..., binary=True)``. Every chunk labelled ``"NE"`` is kept,
    its tokens joined with single spaces. Tokens that appear in the
    module-level ``forced_words`` list are collected as well.

    Parameters
    ----------
    quote : str
        Text to analyse (the notebook passes one page at a time).

    Returns
    -------
    interesting_words : list of str
        Entity names in order of first appearance, each listed once.
    interesting_sents : dict
        Maps every entity name to the list of sentences (as produced by
        ``sent_tokenize``) that contain the name as a substring.
    """
    words = word_tokenize(quote, language="english")
    sents = sent_tokenize(quote, language="english")
    tree = nltk.ne_chunk(nltk.pos_tag(words), binary=True)

    # Chunked entities are subtrees with a label; plain tokens have none.
    chunked = [
        " ".join(leaf[0] for leaf in subtree)
        for subtree in tree
        if hasattr(subtree, "label") and subtree.label() == "NE"
    ]
    forced = [w for w in words if w in forced_words]

    # dict.fromkeys drops repeats while keeping first-seen order, so a name that
    # occurs several times in the text is reported only once.
    interesting_words = list(dict.fromkeys(chunked + forced))

    # Plain substring test: "Alexander" therefore also matches "Alexander’s".
    interesting_sents = {
        word: [sentence for sentence in sents if word in sentence]
        for word in interesting_words
    }
    return interesting_words, interesting_sents

# Pages are separated in the text by a run of underscores.
pages = text.split("________________")

# Extract all named entities, page by page:
all_ne = [extract_ne(page) for page in pages]

# Put together occurrences from all pages:
# pages_dict[name] -> {"pages": [0-based page indices], "fragments": [sentence list per page]}
pages_dict = dict()
for i, (words, sents) in enumerate(all_ne):
    for word in words:
        entry = pages_dict.setdefault(word, dict(pages=[], fragments=[]))
        # De-duplicate on the page index (a name may be listed more than once for
        # a page). Comparing the sentence lists instead would silently drop a later
        # page whose sentences happen to equal those of an earlier page.
        if i not in entry["pages"]:
            entry["pages"].append(i)
            entry["fragments"].append(sents[word])


# Write a copy of the text in which every page is preceded by a dashed separator
# and its 1-based number in square brackets.
chunks = []
for i, page in enumerate(pages):
    chunks.append(f"------------------\n\n[{i + 1}]\n" + page + "\n\n\n")
tosave = "".join(chunks)  # single join instead of repeated string concatenation

# Explicit UTF-8 so the non-ASCII punctuation in the text is written the same way
# regardless of the platform's default encoding.
with open(fname.parent / f"{fname.stem}_renumbered.txt", "w", encoding="utf-8") as f:
    f.write(tosave)

#### Split pdf

In [3]:
# Split the book PDF into one single-page file per page: pages/page_<0-based index>.pdf
pages_dir = source_material_dir / "pages"
pages_dir.mkdir(parents=True, exist_ok=True)  # create the output folder if it is missing

# Keep the source PDF open only while its pages are being copied out, so the
# file handle is released once the split is done.
with open(source_material_dir / "cains_jawbone.pdf", "rb") as pdf_file:
    inputpdf = PdfReader(pdf_file)
    for i in range(len(inputpdf.pages)):
        output = PdfWriter()
        output.add_page(inputpdf.pages[i])
        with open(pages_dir / f"page_{i}.pdf", "wb") as outputStream:
            output.write(outputStream)

In [132]:
# Pages (0-based indices) and matching sentences recorded for the entity "Aquarius".
pages_dict["Aquarius"]

{'pages': [0, 42],
 'fragments': [['Call me nervous, call me fey, if you will ; at least this little pen, this mottled black and silver Aquarius, with its nib specially tempered to my order in Amsterdam, is greedy.'],
  ['Where would my income be if Aquarius were to turn Gemini?']]}

In [133]:
# Pages (0-based indices) and matching sentences recorded for the entity "Alexander".
pages_dict["Alexander"]

{'pages': [14, 22, 36, 42, 54, 67],
 'fragments': [['Alexander, the only noteworthy Pope of my native land, was demonstrably affected.'],
  ['I must say I envy Alexander having his first, and perhaps his second, in there.'],
  ['I looked across the table to the great brimming bowl of yellow jasmine ; young Alexander had sent him up the night before with an invitation to a private view of the Paulo Post Avorticists.'],
  ['Alexander’s my name.'],
  ['James had got off, the Earl and his brother Alexander had emphatically not.'],
  ['Just like reverting to old tunes after they were damned and dead : how often had I not caught myself whistling Alexander’s Ragtime Wedding Feast in my frugal bath.']]}

In [134]:
# Pages (0-based indices) and matching sentences recorded for "Head", one of the forced words.
pages_dict["Head"]

{'pages': [11, 65, 66, 73],
 'fragments': [['Then I fumbled two aspirin tablets into my mouth : Noel Coward’s King Charles’s Head.'],
  ['Green and white and rose, grit, wisdom and reliability, the find old Head, as we called him, had quipped it.'],
  ['\nthe darker works of Beardsley and Felicien Rops, and ringing with the gloat curses of the Head, as we called him, lubriciously gasping in the grip of ether.'],
  ['\nphotographs of young and laughing athletes, lads who had profited and gone on, and ringing with those words of the Head, as we called him, that one by one the touch of life has turned to truth.']]}

In [135]:
# Pages (0-based indices) and matching sentences recorded for "May", one of the forced words.
pages_dict["May"]

{'pages': [22, 57, 70, 74, 78, 79, 97, 99],
 'fragments': [['Bryant and, of course, May.'],
  ['Nuts in May, how Freudian.'],
  ['I thought of May.',
   'Over them came old odour of red May.',
   'I felt that I was letting May down.'],
  ['It seemed almost certain that the blight would be destroyed : the blight on the May, or on the delight that is as wide-eyed as a marigold.'],
  ['I understood why he had once said to me about something being as flush as May.'],
  ['And at the same time, of course, I wanted to do my best for May.'],
  ['I thought of May.', 'May be.', 'May be not.'],
  ['He, the reckless old cock, slips down past Woolworth’s and she continues full-sail toward the Kursal, as flush---oh, you wicked woman---as May.']]}

In [114]:
# Raw text of the page at 0-based index 57 (numbered 58 in the renumbered file).
pages[57]

'\nConsidering it was my name month, I wasn’t having too much luck. Henry, though a bit on the spectacular side---to fly the viscera of his third, of the old family lawyer, at his small flagstaff, a little argued the exhibitionist---was sane enough. And this stranger, to judge by the over-vague conversation he began to force on me---different in this from the agriculturist, who had been utterly silent save for the burning question, and the brats who had only uttered mutually---was distinctly nuts. Nuts in May, how Freudian. Be not a Freud ; thy help is near. But was it? Henry was in desperate case, and this other was short-sighted enough not to realise that I should care. The former was stooping over the cooling remains of his fourth---the rash intruding charlady---when there came a horribly official knock at the little blue door. (Was it Inspector Barraclough, or only some stolid-witted local?) But little the latter cared. He went on talking about Browning.\n'

In [62]:
# Scratch cell: the same tokenise / tag / chunk steps that extract_ne performs, run on one page.
# NOTE(review): `page` is not assigned in this cell; it presumably relies on a value left
# over from an earlier loop — set it explicitly (e.g. page = pages[n]) before running.
words = word_tokenize(page, language="english")  # tokens of the page
sentencies = sent_tokenize(page, language="english")  # sentences of the page
tags = nltk.pos_tag(words)  # part-of-speech tag for every token
tree = nltk.ne_chunk(tags, binary=True)  # chunk tree; extract_ne looks for subtrees labelled "NE"

In [160]:
# Report, in alphabetical order, every entity found on more than one page:
# its name, its 1-based page numbers and the sentence list stored for each page.
for name in sorted(pages_dict):
    entry = pages_dict[name]
    page_idxs = entry["pages"]
    if len(set(page_idxs)) <= 1:
        continue  # single-page entities are left out of this report
    print("---------")
    print(name)
    print([idx + 1 for idx in page_idxs])
    for fragment in entry["fragments"]:
        print(f"--{fragment}")

---------
Alexander
[15, 23, 37, 43, 55, 68]
--['Alexander, the only noteworthy Pope of my native land, was demonstrably affected.']
--['I must say I envy Alexander having his first, and perhaps his second, in there.']
--['I looked across the table to the great brimming bowl of yellow jasmine ; young Alexander had sent him up the night before with an invitation to a private view of the Paulo Post Avorticists.']
--['Alexander’s my name.']
--['James had got off, the Earl and his brother Alexander had emphatically not.']
--['Just like reverting to old tunes after they were damned and dead : how often had I not caught myself whistling Alexander’s Ragtime Wedding Feast in my frugal bath.']
---------
Aquarius
[1, 43]
--['Call me nervous, call me fey, if you will ; at least this little pen, this mottled black and silver Aquarius, with its nib specially tempered to my order in Amsterdam, is greedy.']
--['Where would my income be if Aquarius were to turn Gemini?']
---------
Bill
[16, 23, 61]
--

In [162]:
# Raw text of the page at 0-based index 85 (numbered 86 in the renumbered file).
pages[85]

'\nNext day I let Caroline Jasmine---what a name!---do her very damnedest for my guest. But I was doubtful of her influence all the while. What a man! Henry, I supposed, was about his business and concern, such as it was. What is removed drops horribly in a pail. Why should that stick in my head? Just because a tool I have used, and shall use again, turned, as it were, under my hand last week and said it? And what more had he said? Has anyone supposed it lucky to be born? I hasten to inform him or her it is just as lucky to die, and I know it. That should be, I thought, a consolation for my patient. Surely such a confirmed old tub-thumper would not have had the wit to think out the Mithradates inoculation for himself, and put it into practice? Perish the thought, and the fellow. Also my ravishing correspondent would have told me. I found myself thinking with a strange weakness of the poet’s lines :\n\n\n        But we have all bent low and low and kissed the\n                quiet feet

In [156]:
# Character offset of the first case-insensitive occurrence of "fox" in the full text
# (str.index raises ValueError if there is none).
text.lower().index("fox")

5155

In [158]:
# Show the stretch of text from character 3000 to 5300, which includes the "fox" offset found above.
text[3000:5300]

' piled abomination is actually before me. But the queen can do no wrong. The rain that came heavily is drying off lightly. There, jauntily tripping from the edge of one puddle to another is crisp Sir Roland Mowthalorn, shuddering old thing, intent to buy the day’s buttonhole from gin-faced Annie behind the church. I remember clearly, perhaps because I ought to have my wits about me for another purpose, how Sir Roland’s father, Sir Weedon, once saw Henry taking the part of Lesurques and mixed him up with Le Cirque d’Hiver. Instead of really explaining, she points me gaily to a little boy about, she says, to tumble into the sea. Perhaps he has already tumbled in. In the snowy cumulus above the orange there seems to be now a hole. She tells me to mash all with a spoon. If she had said a mashie. But she is so beautiful. Can I suspect her?\n________________\n\n\nI hated my eye for being caught by what didn’t concern me : the powerful grip of the new young man. But it was parading a couple 

In [148]:
# Print every entity in alphabetical order with its 1-based page numbers and the
# sentence list stored for each page (no filter: single-page entities are listed too).
for k in sorted(pages_dict):
    # Bound to `pgs`, not `pages`: `pages` is the list of page texts and must not
    # be overwritten by this loop.
    pgs, sents = pages_dict[k]["pages"], pages_dict[k]["fragments"]
    print("---------")
    print(k)
    print([p+1 for p in pgs])
    for s in sents:
        print(f"--{s}")

---------
Achard
[45]
--['I stayed myself with devilled Epicam and Royans aux Achard, levered into me with Peter Barleys and washed down and out, foul thought, with Villacabras.']
---------
Achilles
[6]
--['\n\n\nI considered that venerable whose winter Achilles thought to take from the lips of Cressida.']
---------
Adriatic
[21]
--['Far, far from here the Adriatic breaks in a warm bay among the green Illyrian hills.']
---------
Alexander
[15, 23, 37, 43, 55, 68]
--['Alexander, the only noteworthy Pope of my native land, was demonstrably affected.']
--['I must say I envy Alexander having his first, and perhaps his second, in there.']
--['I looked across the table to the great brimming bowl of yellow jasmine ; young Alexander had sent him up the night before with an invitation to a private view of the Paulo Post Avorticists.']
--['Alexander’s my name.']
--['James had got off, the Earl and his brother Alexander had emphatically not.']
--['Just like reverting to old tunes after they were 

In [30]:
# List the 0-based page indices on which each entity was found, in insertion order.
# The loop variable is `entry` rather than `pages`, so the global list of page
# texts is not overwritten; only the page indices of each entry are printed.
for word, entry in pages_dict.items():
    print(word, entry["pages"])

Amsterdam [0]
Aquarius [0, 42]
Casy Ferris [0]
Which [1]
London [1, 81]
Henry [1, 3, 4, 10, 12, 13, 15, 20, 25, 30, 31, 34, 35, 40, 41, 43, 44, 46, 48, 57, 59, 62, 67, 70, 75, 76, 77, 79, 81, 82, 84, 86, 90, 91, 93, 96, 99]
Hodder [2]
Oppenheim [2]
Austin Freeman [2]
Clement [2]
Lesurques [3]
Sir Roland [3]
Le Cirque [3]
Sir Weedon [3]
Maestro Jimson [3]
Thomas Hardy [4]
Cerebos [4]
Emperor [4]
Electric Febrifuge [5]
Achilles [5]
Speckled Band [5]
Cressida [5]
Search [6]
Conrad [6]
Typhoon [6]
Forth [6]
Baker Steet [6]
Charles Victor Hugo [6]
English [6]
Mr. Hall [7, 18]
Kilmarnock [7]
Belmerino [7]
Guido [7]
Charles [8, 59]
Knight [8]
Peebles University [8]
Boots [9]
Hyacinth [9]
Etam [9]
Egyptian [9]
Jasmine [9, 45, 98]
Ireland [10]
Grundy Sapphic [10]
Orchard [10]
Dean [10]
Atimetus [11]
Bloomsbury [12]
Rintrah [12]
Janetta Sheringham [13]
John [13, 20, 21]
John Hewit [14]
Bart [14]
Sarah [14]
England [14, 19, 40, 45, 46, 82]
Sarah Drew [14]
Alexander [14, 22, 36, 42, 54, 67]
Bill [