In [258]:
# re
import re

# data reading / cleaning utils
from gut_tokenize import read_data

# pandas
import pandas as pd

# nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [236]:
# Read the test corpus with the project helper. read_data returns two values,
# unpacked here as titles and texts; they are zipped together further down,
# so they are presumably parallel lists (one entry per book) - confirm against
# gut_tokenize.read_data.
# The former `titles = []` / `texts = []` placeholders were dead stores that
# were overwritten immediately, so they are gone.
directory = "../Gutenberg/relic_test_set/"
titles, texts = read_data(directory)

In [237]:
# Drop the last 10 characters of every title (presumably a fixed-length
# filename suffix - confirm against what read_data returns).
# NOTE: not idempotent - re-running this cell trims another 10 characters.
titles = [raw_title[:-10] for raw_title in titles]

In [238]:
def dialogue_helper(d):
    """Return ``d`` if the paragraph should be kept, or ``''`` if it should be dropped.

    A paragraph is dropped when any of the following holds:

    * it contains a straight (") or curly (“ / ”) double quote;
    * it starts with a dash once surrounding whitespace is stripped;
    * it consists of exactly one whitespace-separated token.

    Callers are expected to filter on ``dialogue_helper(p) != ''``. Note that an
    empty input is returned unchanged, which is also ``''``.
    """
    stripped = d.strip()
    has_quote = any(mark in d for mark in ('"', '“', '”'))

    if has_quote or stripped.startswith('-') or len(stripped.split()) == 1:
        return ''
    return d

In [278]:
def paragraph_tokenize(text, title, min_tokens=10):
    """Break the raw text of one book into cleaned, non-dialogue paragraphs.

    Sentence tokenization is deliberately not used here because the goal is
    paragraphs: the text is split on blank lines instead.

    Processing steps:

    1. Trim front/back matter using the per-title boundary markers below.
    2. Split the remaining text on blank lines.
    3. For each paragraph: turn line breaks into spaces, strip, replace
       em-dashes and double hyphens with a single "-", and collapse runs of
       whitespace (per the original note, to adhere to the RELiC format).
    4. Drop paragraphs rejected by ``dialogue_helper``.
    5. Drop paragraphs with fewer than ``min_tokens`` word tokens
       (punctuation counts as tokens).

    Args:
        text: Full text of the book as a single string.
        title: Book identifier used to look up boundary markers.
        min_tokens: Minimum number of ``word_tokenize`` tokens a paragraph
            needs in order to be kept.

    Returns:
        A list of cleaned paragraph strings.
    """
    # title -> (start marker, offset added to the marker position, stop marker)
    # NOTE(review): the markers are lowercase, so the text is presumably
    # lowercased by the loader - confirm against read_data.
    boundaries = {
        'north_and_south': ("chapter i.", 0, "the end."),
        'germinal': ("chapter i", 0, None),
        'pride_and_prejudice': ("chapter 61", 1, None),
        'the_jungle': ("chapter xxxi", 1, None),
        'the_great_gatsby': ("thomas parke d’invilliers", 0, None),
    }

    # default start and stop boundaries
    start_index = 0
    stop_index = len(text)

    if title in boundaries:
        start_marker, offset, stop_marker = boundaries[title]
        # str.find returns -1 when the marker is missing; using that as a
        # slice index would silently discard (almost) the whole book, so the
        # defaults are kept in that case.
        found = text.find(start_marker)
        if found != -1:
            start_index = found + offset
        if stop_marker is not None:
            found = text.find(stop_marker)
            if found != -1:
                stop_index = found

    body = text[start_index:stop_index]

    paragraphs = []
    for raw in body.split("\n\n"):
        flattened = raw.replace("\n", " ").strip().replace("—", "-").replace("--", "-")
        paragraphs.append(re.sub(r"\s{2,}", " ", flattened))

    # remove dialogue
    paragraphs = [p for p in paragraphs if dialogue_helper(p) != '']
    # remove short paragraphs; word_tokenize is the name imported at the top
    # of the file (the bare `nltk` module is never imported)
    return [p for p in paragraphs if len(word_tokenize(p)) >= min_tokens]

In [279]:
# One list of cleaned paragraphs per book, in the same order as `titles`.
tokenized_texts = [
    paragraph_tokenize(book_text, book_title)
    for book_text, book_title in zip(texts, titles)
]

In [280]:
def create_paragraph_dictionary(text, title, *, group_size=4, max_groups=3,
                                split_sentences=None):
    """Group each paragraph's sentences into blocks of at most ``group_size``.

    RELiC data is grouped in arrangements of between 1 and 4 sentences, so by
    default every paragraph is cut into consecutive (non-overlapping) blocks of
    4 sentences, and at most the first 3 blocks (12 sentences) are kept;
    any sentences beyond that are discarded.

    Keys are built from the title, the paragraph index and, for paragraphs that
    are split, the 1-based sentence range of the block:

    * ``<title>_para_<i>`` for a paragraph that fits in one block;
    * ``<title>_para_<i>_<first>_<last>`` otherwise, e.g. ``_1_4``, ``_5_8``,
      ``_9_12``.

    Args:
        text: List of paragraph strings for one book.
        title: Book identifier used as the key prefix.
        group_size: Maximum number of sentences per block.
        max_groups: Maximum number of blocks kept per paragraph, or ``None``
            to keep every block.
        split_sentences: Callable mapping a paragraph to a list of sentences.
            Defaults to the ``sent_tokenize`` imported at the top of the file.

    Returns:
        A dict mapping block ids to the sentences of the block joined by a
        single space.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    # Resolved lazily so that a caller-supplied splitter never needs NLTK.
    splitter = sent_tokenize if split_sentences is None else split_sentences

    text_dict = {}
    for i, paragraph in enumerate(text):
        sentences = splitter(paragraph)
        num_sentences = len(sentences)
        prefix = f"{title}_para_{i}"

        # Short paragraphs are stored whole, without a range suffix.
        if num_sentences <= group_size:
            text_dict[prefix] = " ".join(sentences)
            continue

        # Number of leading sentences that are actually kept.
        limit = num_sentences
        if max_groups is not None:
            limit = min(limit, group_size * max_groups)

        for start in range(0, limit, group_size):
            stop = min(start + group_size, limit)
            text_dict[f"{prefix}_{start + 1}_{stop}"] = " ".join(sentences[start:stop])

    return text_dict

In [281]:
def write_dict_to_file(title, paragraph_dict, out_dir='data/relic/relic-test-corpus/'):
    """Write one book's paragraph blocks to ``<out_dir>/<title>_paragraphs.csv``.

    The CSV has an ``ids`` column (the dictionary keys) and a ``text`` column
    (the dictionary values), plus the default pandas index column. A short
    summary line is printed once the file has been written.

    Args:
        title: Book identifier, used in the file name and the summary line.
        paragraph_dict: Mapping of block id to block text.
        out_dir: Directory the CSV is written to; created if it does not exist.
    """
    import os

    # Materialise the dict views so DataFrame construction does not depend on
    # how a given pandas version treats dict_keys / dict_values.
    para_ids = list(paragraph_dict)
    paras = list(paragraph_dict.values())

    book_df = pd.DataFrame({'ids': para_ids, 'text': paras})

    # to_csv does not create missing directories.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    book_df.to_csv(os.path.join(out_dir, title + '_paragraphs.csv'))

    print(f"wrote {len(paras)} paragraphs to .csv for {title}")

In [282]:
# Build the sentence-block dictionary for each book and write it to its CSV.
for book_title, book_paragraphs in zip(titles, tokenized_texts):
    paragraph_dict = create_paragraph_dictionary(book_paragraphs, book_title)
    write_dict_to_file(book_title, paragraph_dict)

wrote 2474 paragraphs to .csv for germinal
wrote 833 paragraphs to .csv for pride_and_prejudice
wrote 800 paragraphs to .csv for the_jungle
wrote 907 paragraphs to .csv for north_and_south
wrote 536 paragraphs to .csv for the_great_gatsby
