In [1]:
# reading files
from os import listdir
from os.path import isfile, join

# re
import re

# sentence tokenize / parse: `en` is the shared spaCy pipeline that the
# parsing helpers in this notebook call as en(text)
import spacy
en = spacy.load('en_core_web_sm')
# raise the pipeline's max_length (the character limit spaCy enforces on a
# single input text) to 1.5M, since whole cleaned books are parsed in one call
en.max_length = 1500000

In [2]:
def read_data(directory_name):
    '''
    Read every .txt file directly inside a directory and return its lowercased text.
    These files should be the "cleaned" output of the c-w gutenberg routine.

    Args:
        directory_name: str. Directory to scan (not recursive). A trailing
            path separator is optional.

    Returns:
        texts: list of str. One lowercased text per .txt file, ordered by file name.
    '''
    texts = []

    # listdir returns names in arbitrary order; sort so that texts[i] refers to
    # the same file on every run / machine
    for name in sorted(listdir(directory_name)):
        # join (rather than string concatenation) works with or without a trailing slash
        path = join(directory_name, name)

        # keep only regular files whose name really ends in .txt
        # (a plain substring check would also accept e.g. "book.txt.bak")
        if not (name.endswith(".txt") and isfile(path)):
            continue

        with open(path, encoding="utf-8") as f:
            # lowercase text and append
            texts.append(f.read().lower())

    return texts

In [3]:
def preprocess(raw_text):
    '''
    Roughly clean the raw text of a single book.
    The input should be the "cleaned" output of the c-w gutenberg routine.

    - lowercase
    - remove new lines / normalize all whitespace to single spaces
    - advance past headmatter (everything up to and including the first
      "chapter 1" / "chapter i"; if no such marker exists nothing is dropped)
    - remove chapter / vol / part numbers

    Args:
        raw_text: str (or None)

    Returns:
        text: roughly cleaned text as a single str, or None if raw_text is None
    '''
    if raw_text is None:
        return None

    # lowercase, then tokenize on any run of whitespace; this removes new lines
    # and means blank lines between "chapter" and its number cannot hide the marker
    tokens = raw_text.lower().split()

    # advance us past headmatter
    for i in range(len(tokens) - 1):
        if tokens[i] == "chapter" and tokens[i + 1] in ("1", "i"):
            tokens = tokens[i + 2:]
            break

    # back to string
    text = ' '.join(tokens)

    # roman numerals, digits, or the spelled-out numbers the original handled
    numeral = r"(?:[ivxlc]+|[0-9]+|one|two|three)"

    # the \b anchors stop the patterns from eating the start of ordinary words
    # (e.g. "part in" must not become "n")

    # remove volume numbers
    text = re.sub(r"\bvolume " + numeral + r"\b", "", text)

    # remove part numbers
    text = re.sub(r"\bpart " + numeral + r"\b", "", text)

    # remove chapters (the word "chapter" plus the following word or number)
    text = re.sub(r"\bchapter (?:[a-z]+|[0-9]+)\b", "", text)

    # the removals above leave gaps behind, so collapse whitespace last
    text = re.sub(r"\s{2,}", " ", text).strip()

    return text

In [5]:
def clause_parse(text):
    '''
    This method relies on the spacy parser and returns a clause-level tokenization of a given text.
    Where "clause" is simply text between two delimiters like commas.
    E.g. the sentence
    "this is a test, of many things, but mostly your abilities." would split into:

    - this is a test
    - of many things
    - but mostly your abilities

    See https://stackoverflow.com/a/65300589 for germ of method logic

    Args:
        text: document-level rep of text after being read in and roughly-cleaned

    Returns:
        clauses: list of str holding each non-empty clause of the text, ordered
            by the position of the chunk it came from
    '''
    doc = en(text)

    # keep track of covered words
    seen = set()

    # (token index, text) pairs; the index is only used to restore document order
    chunks = []
    for sent in doc.sents:
        # each conjunct hanging directly off the sentence root becomes its own chunk
        heads = [child for child in sent.root.children if child.dep_ == 'conj']

        for head in heads:
            words = list(head.subtree)
            seen.update(words)
            chunks.append((head.i, ' '.join(w.text for w in words)))

        # whatever no conjunct subtree claimed forms the chunk of the root itself
        remainder = ' '.join(w.text for w in sent if w not in seen)
        chunks.append((sent.root.i, remainder))

    chunks.sort(key=lambda pair: pair[0])

    clauses = []
    for _, chunk in chunks:
        # split on the boundary chars "," and "." directly (no sentinel string that
        # could collide with the text), appending in place instead of repeatedly
        # concatenating lists
        for piece in chunk.replace(".", ",").split(","):
            piece = piece.strip()
            # drop empties and orphan "and" sections
            if piece and piece != "and":
                clauses.append(piece)

    return clauses

In [6]:
# directory holding the cleaned Gutenberg .txt files (path is relative to the notebook)
directory = "../Gutenberg/cleaned_texts/"

# load the lowercased text of every .txt file in that directory
texts = read_data(directory)

# roughly clean the first loaded text
# NOTE(review): the name implies texts[0] is "Germinal"; that depends on the order in which read_data returns files -- confirm
germinal = preprocess(texts[0])

# split the cleaned text into a list of clause strings
germinal_clauses = clause_parse(germinal)

In [7]:
# preview: the opening 558 characters of the cleaned text ...
print(germinal[:558])

print('============')

# ... followed by the first 11 clauses parsed out of it
for c in germinal_clauses[:11]:
    print(c)

 over the open plain, beneath a starless sky as dark and thick as ink, a man walked alone along the highway from marchiennes to montsou, a straight paved road ten kilometres in length, intersecting the beetroot fields. he could not even see the black soil before him, and only felt the immense flat horizon by the gusts of march wind, squalls as strong as on the sea, and frozen from sweeping leagues of marsh and naked earth. no tree could be seen against the sky, and the road unrolled as straight as a pier in the midst of the blinding spray of darkness. 
over the open plain
beneath a starless sky as dark and thick as ink
a man walked alone along the highway from marchiennes to montsou
a straight paved road ten kilometres in length
intersecting the beetroot fields
he could not even see the black soil before him
only felt the immense flat horizon by the gusts of march wind
squalls as strong as on the sea
and frozen from sweeping leagues of marsh and naked earth
no tree could be seen agains

In [271]:
def nouns_and_modifiers(clauses):
    '''
    Print, for each clause, the dependency subtree of every noun and verb that has dependents.

    Each clause is parsed with the module-level spaCy pipeline `en`. A separator
    line is printed per clause; then, for every token whose coarse part of speech
    is NOUN or VERB and whose subtree holds more than one token, the texts of that
    subtree are printed as a singly-nested list, e.g. [['the', 'open', 'plain']].

    Args:
        clauses: iterable of str, e.g. a slice of the output of clause_parse

    Returns:
        None. Results are only printed.
    '''
    for clause in clauses:
        print("==============")
        doc = en(clause)
        for word in doc:
            if word.pos_ not in ("NOUN", "VERB"):
                continue

            # walk the subtree once (it contains the token itself plus all of its descendants)
            subtree_texts = [descendant.text for descendant in word.subtree]

            # a subtree of size one is a bare word with nothing attached; skip it
            if len(subtree_texts) > 1:
                print([subtree_texts])
                    

In [272]:
# run on the first 11 clauses only (the same clauses shown in the preview cell)
nouns_and_modifiers(germinal_clauses[0:11])

[['the', 'open', 'plain']]
[['a', 'starless', 'sky']]
[['a', 'man']]
[['a', 'man', 'walked', 'alone', 'along', 'the', 'highway', 'from', 'marchiennes', 'to', 'montsou']]
[['the', 'highway']]
[['to', 'montsou']]
[['a', 'straight', 'paved', 'road', 'ten', 'kilometres', 'in', 'length']]
[['ten', 'kilometres', 'in', 'length']]
[['intersecting', 'the', 'beetroot', 'fields']]
[['the', 'beetroot', 'fields']]
[['he', 'could', 'not', 'even', 'see', 'the', 'black', 'soil', 'before', 'him']]
[['the', 'black', 'soil', 'before', 'him']]
[['only', 'felt', 'the', 'immense', 'flat', 'horizon', 'by', 'the', 'gusts', 'of', 'march', 'wind']]
[['the', 'immense', 'flat', 'horizon', 'by', 'the', 'gusts', 'of', 'march', 'wind']]
[['the', 'gusts', 'of', 'march', 'wind']]
[['march', 'wind']]
[['squalls', 'as', 'strong', 'as', 'on', 'the', 'sea']]
[['the', 'sea']]
[['and', 'frozen', 'from', 'sweeping', 'leagues', 'of', 'marsh', 'and', 'naked', 'earth']]
[['sweeping', 'leagues', 'of', 'marsh', 'and', 'naked', 'e