In [39]:
import requests, re, nltk
from bs4 import BeautifulSoup
from nltk import clean_html
from collections import Counter
import operator

In [44]:
def get_base_text(URL, timeout=30):
  """Download `URL` and return the response body as text.

  Args:
    URL: Address of the resource to fetch (e.g. a Project Gutenberg .txt file).
    timeout: Seconds to wait for the server before giving up. Without a
      timeout `requests.get` can block indefinitely on a stalled connection.

  Returns:
    The decoded response body (`Response.text`).

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status, so an
      error page is never mistaken for the book text.
    requests.RequestException: For connection problems or timeouts.
  """
  r = requests.get(URL, timeout=timeout)
  # Fail loudly on HTTP errors instead of returning the error page's body.
  r.raise_for_status()
  return r.text

# Remove the header that Project Gutenberg adds at the start of the text
def clean_guttenberg_header(text):
  """Drop everything up to and including the Project Gutenberg START marker.

  The marker line looks like
  ``*** START OF THE PROJECT GUTENBERG EBOOK <TITLE> ***``; older files use
  ``THIS`` instead of ``THE`` and may omit the space after the asterisks, so
  both spellings are accepted (case-insensitively). Cutting at the marker
  itself, rather than requiring a specific first line, also removes a leading
  byte-order mark and works for files whose opening line is worded differently.

  Args:
    text: Full text of a Project Gutenberg plain-text ebook.

  Returns:
    The text that follows the START marker, or `text` unchanged when no
    marker is found.
  """
  start_marker = re.search(
    r"\*\*\* ?START OF TH(?:E|IS) PROJECT GUTENBERG EBOOK.*?\*\*\*",
    text,
    flags=re.DOTALL | re.IGNORECASE,
  )
  if start_marker is None:
    return text
  return text[start_marker.end():]

# Remove the footer (extra info) that Project Gutenberg appends to the end of the text
def clean_guttenberg_footer(text):
  """Drop the Project Gutenberg END marker and everything after it.

  Accepts both ``*** END OF THE PROJECT GUTENBERG EBOOK`` and the older
  ``*** END OF THIS PROJECT GUTENBERG EBOOK`` spelling, with or without a
  space after the asterisks, case-insensitively.

  Args:
    text: Text of a Project Gutenberg plain-text ebook.

  Returns:
    The text that precedes the first END marker, or `text` unchanged when no
    marker is found.
  """
  end_marker = re.search(
    r"\*\*\* ?END OF TH(?:E|IS) PROJECT GUTENBERG EBOOK",
    text,
    flags=re.IGNORECASE,
  )
  if end_marker is None:
    return text
  return text[:end_marker.start()]

def clean_guttenberg(text):
  """Return `text` after applying the header cleaner, then the footer cleaner."""
  return clean_guttenberg_footer(clean_guttenberg_header(text))

In [41]:
# Example usage with "The Murder of Roger Ackroyd" by Agatha Christie
# (Project Gutenberg ebook #69087, plain-text file).
base_text = get_base_text("https://www.gutenberg.org/cache/epub/69087/pg69087.txt")


In [46]:
# Strip the Project Gutenberg header and footer from the downloaded text.
book_full = clean_guttenberg(base_text)
# print(book_full)

In [79]:
# Takes a string, collapses repeated spaces/tabs and blank lines, and lowercases it (removing articles and punctuation is not implemented yet). This is done after the sections of text that we want are found, so no information is lost
def NormalizeInput(text):
    """Return a whitespace-normalized, lowercased copy of `text`.

    The substitutions run in order:
      1. every run of spaces/tabs becomes a single space;
      2. every run of newlines becomes a single newline;
      3. any whitespace directly before a newline is removed.
    A single leading space on a line is kept. The result is then lowercased.
    """
    # (pattern, replacement) pairs; order matters, see docstring.
    substitutions = (
        (r'[ \t]+', ' '),
        (r'\n+', '\n'),
        (r'\s*\n', '\n'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Further clean-up steps can be added to `substitutions` if needed.
    return cleaned.lower()

In [80]:
book_norm = NormalizeInput(book_full)
# Show only a preview: printing the whole novel floods the cell output
# and bloats the saved notebook.
print(book_norm[:1000])


 the murder of
 roger ackroyd
 by
 agatha christie
 author of
 the secret of chimneys,
 the murder on the links, etc.
 [illustration]
 grosset & dunlap
 publishers new york
 copyright, 1926,
 by dodd, mead and company, inc.
 to punkie,
 who likes an orthodox detective
 story, murder, inquest, and suspicion
 falling on every one in turn!
 contents
 chapter page
 i dr. sheppard at the breakfast table 1
 ii who’s who in king’s abbot 7
 iii the man who grew vegetable marrows 17
 iv dinner at fernly 31
 v murder 49
 vi the tunisian dagger 65
 vii i learn my neighbor’s profession 75
 viii inspector raglan is confident 92
 ix the goldfish pond 106
 x the parlormaid 118
 xi poirot pays a call 136
 xii round the table 145
 xiii the goose quill 156
 xiv mrs. ackroyd 165
 xv geoffrey raymond 178
 xvi an evening at mah jong 190
 xvii parker 202
 xviii charles kent 218
 xix flora ackroyd 226
 xx miss russell 238
 xxi the paragraph in the paper 251
 xxii ursula’s story 260
 xxiii poirot’s little re

In [89]:
# helper function to convert an integer to a Roman numeral
def int_to_roman(n):
  """Convert a positive integer to a lowercase Roman numeral.

  Lowercase is used because the chapter headings are matched against
  lowercased text.

  Args:
    n: Number to convert. Values of 0 or below yield an empty string.

  Returns:
    The numeral as a string, e.g. 27 -> 'xxvii', 400 -> 'cd', 1994 -> 'mcmxciv'.
  """
  # Ordered from largest to smallest so the greedy subtraction below is correct.
  # Includes 400/500/900/1000 so values of 400 and above use cd/d/cm/m
  # instead of repeating 'c'.
  roman_numerals = (
    (1000, 'm'), (900, 'cm'), (500, 'd'), (400, 'cd'),
    (100, 'c'), (90, 'xc'), (50, 'l'), (40, 'xl'),
    (10, 'x'), (9, 'ix'), (5, 'v'), (4, 'iv'), (1, 'i'),
  )
  result = ''
  for value, symbol in roman_numerals:
    while n >= value:
      result += symbol
      n -= value
  return result

# Generate an array with chapter titles "chapter i", "chapter ii", ...
def generate_roman_chapters(num_chapters):
  """Return the headings "chapter i" through the numeral for `num_chapters`."""
  headings = []
  for number in range(1, num_chapters + 1):
    headings.append(f"chapter {int_to_roman(number)}")
  return headings

# Function to split up text into chapters
def split_text_into_chapters(text, chapter_titles):
  """Split `text` into chunks that each begin at a chapter title.

  Titles are matched case-insensitively and as whole words, so "chapter i"
  does not match inside "chapter is" or "chapter in". Text before the first
  title is discarded.

  Args:
    text: The full text to split.
    chapter_titles: Iterable of title strings (e.g. "chapter i"). Empty
      strings are ignored.

  Returns:
    A list of stripped chunks, one per title occurrence, in order of
    appearance. Empty when there are no titles or none of them occurs.
  """
  def _title_pattern(title):
    # Escape the title and guard its word-character edges so it cannot match
    # as part of a longer word. Edges that are punctuation get no guard.
    escaped = re.escape(title)
    if re.fullmatch(r"\w", title[0]):
      escaped = r"(?<!\w)" + escaped
    if re.fullmatch(r"\w", title[-1]):
      escaped = escaped + r"(?!\w)"
    return escaped

  # An empty alternative would match at every position of the text.
  titles = [title for title in chapter_titles if title]
  if not titles:
    return []

  pattern = re.compile("|".join(_title_pattern(title) for title in titles), re.IGNORECASE)
  starts = [match.start() for match in pattern.finditer(text)]

  # Each chapter ends where the next begins; None slices through to the end.
  ends = starts[1:] + [None]
  return [text[start:end].strip() for start, end in zip(starts, ends)]


In [84]:
# Example use of the helper functions with the book selected above. For some books the
# chapter titles might have to be set manually, or another helper function may be needed.
chapter_titles = generate_roman_chapters(27)
print(chapter_titles)
# Hard-coded copy of the list printed above. NOTE(review): the other visible cells do not
# reference it (the split uses chapter_titles) — confirm it is still needed.
chapter_array = ['chapter i', 'chapter ii', 'chapter iii', 'chapter iv', 'chapter v', 'chapter vi', 'chapter vii', 'chapter viii', 'chapter ix', 'chapter x', 'chapter xi', 'chapter xii', 'chapter xiii', 'chapter xiv', 'chapter xv', 'chapter xvi', 'chapter xvii', 'chapter xviii', 'chapter xix', 'chapter xx', 'chapter xxi', 'chapter xxii', 'chapter xxiii', 'chapter xxiv', 'chapter xxv', 'chapter xxvi', 'chapter xxvii']


['chapter i', 'chapter ii', 'chapter iii', 'chapter iv', 'chapter v', 'chapter vi', 'chapter vii', 'chapter viii', 'chapter ix', 'chapter x', 'chapter xi', 'chapter xii', 'chapter xiii', 'chapter xiv', 'chapter xv', 'chapter xvi', 'chapter xvii', 'chapter xviii', 'chapter xix', 'chapter xx', 'chapter xxi', 'chapter xxii', 'chapter xxiii', 'chapter xxiv', 'chapter xxv', 'chapter xxvi', 'chapter xxvii']


In [90]:
# Split the normalized book at every occurrence of a generated chapter title.
book_chapters = split_text_into_chapters(book_norm, chapter_titles)

In [95]:
# Inspect the 27th chunk (0-based index 26); for this book it is "chapter xxvii".
print(book_chapters[26])

chapter xxvii
 apologia
five a.m. i am very tired—but i have finished my task. my arm aches
from writing.
a strange end to my manuscript. i meant it to be published some day as
the history of one of poirot’s failures! odd, how things pan out.
all along i’ve had a premonition of disaster, from the moment i saw
ralph paton and mrs. ferrars with their heads together. i thought then
that she was confiding in him; as it happened i was quite wrong there,
but the idea persisted even after i went into the study with ackroyd
that night, until he told me the truth.
poor old ackroyd. i’m always glad that i gave him a chance. i urged him
to read that letter before it was too late. or let me be honest—didn’t
i subconsciously realize that with a pig-headed chap like him, it was
my best chance of getting him _not_ to read it? his nervousness that
night was interesting psychologically. he knew danger was close at
hand. and yet he never suspected _me_.
the dagger was an afterthought. i’d brought up a v