In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import string
from nltk.corpus import stopwords
from collections import Counter
import re 

In [2]:
def remove_illistrations(text: str) -> str:
    """
    Remove Illustrations from the text
    """

    # Remove Illustrations
    cleaned = re.sub(
        r"\[Illustration(?::.*?\]{1,2}|\])",
        "",
        text,
        flags=re.DOTALL
    )

    # Remove left over line
    cleaned = re.sub(r"\n\s*\n", "\n", cleaned)
    return cleaned

In [17]:
def remove_gutenberg_header_footer(text: str) -> str:
    """
    Remove the Project Gutenberg header and footer, and trim the text.
    Only keeps text starting from the last occurrence of 'CHAPTER I' followed by a newline.
    """

    start_marker = "CHAPTER I\n"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    # Use the last occurrence of the chapter marker followed by a newline
    start_idx = text.find(start_marker)
    if start_idx != -1:
        # Keep text starting at the final chapter marker,
        # then remove its heading line
        text = text[start_idx + len(start_marker):]

    # Locate and remove the footer
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]

    return text.strip()

In [23]:
def split_by_chapter(text: str) -> list[str]:
    """
    Split a book into chapters, remove chapter titles,
    and skip very short sections (like table of contents).
    """

    chapters = []
    chapter_marker = "CHAPTER"

    # Split the text at each occurrence of the chapter marker
    parts = text.split(chapter_marker)

    for part in parts:
        # Remove leading/trailing whitespace
        chapter_text = part.strip()

        # Skip very short sections (likely TOC or preface)
        if len(chapter_text.split()) < 20:
            continue

        # Convert newlines to spaces
        chapter_text = chapter_text.replace("\n", " ")

        chapters.append(chapter_text)

    return chapters

In [24]:
def create_corpus(books: list[str]) -> list[str]:
    Corpus = None
    
    for book in books:
        book_no_images = remove_illistrations(book)
        book_corpus = remove_gutenberg_header_footer(book_no_images)
        print(book_corpus)
        book_chapters = split_by_chapter(book_corpus)
        print(book_chapters[0])

        if Corpus is None:
            Corpus = book_chapters
        else:
            Corpus += book_chapters

    return Corpus

# Fully Clean Corpus of Chapter

In [25]:
books = []
with open('Books/Emma.txt', 'r') as emma:
    books.append(emma.read())

with open('Books/Pride_and_Prejudice_Jane_Austin.txt', 'r') as pride:
    books.append(pride.read())

with open('Books/Sense_and_Sensibility.txt', 'r') as sense:
    books.append(sense.read())

In [26]:
Corpus = create_corpus(books)

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and
happy disposition, seemed to unite some of the best blessings of
existence; and had lived nearly twenty-one years in the world with very
little to distress or vex her.
She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister’s marriage,
been mistress of his house from a very early period. Her mother had
died too long ago for her to have more than an indistinct remembrance
of her caresses; and her place had been supplied by an excellent woman
as governess, who had fallen little short of a mother in affection.
Sixteen years had Miss Taylor been in Mr. Woodhouse’s family, less as a
governess than a friend, very fond of both daughters, but particularly
of Emma. Between _them_ it was more the intimacy of sisters. Even
before Miss Taylor had ceased to hold the nominal office of governess,
the mildness of her temper had hardly allowed her to impose any
restr

In [27]:
vec = CountVectorizer()

vec.fit(Corpus)

vec.get_feature_names_out()

array(['000', '10', '18th', ..., 'zealously', 'zigzags', 'èclat'],
      shape=(10626,), dtype=object)

In [28]:
def text_process(corp):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """    
    STOPWORDS = stopwords.words('english')
    # Check characters to see if they are in punctuation
    nopunc = [char for char in corp if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # remove new lines
    nopunc.replace('\n', '')
    
    # Now just remove any stopwords
    return ' '.join([word.lower() for word in nopunc.split() if word.lower() not in STOPWORDS])

In [35]:
clean_corpus = []
for chapter in Corpus:
    clean_corpus.append(text_process(chapter))

In [37]:
def word_ngrams(texts, n=2):
    counts = Counter()
    
    for chapter in texts:
        words = str(chapter).split()
        for i in range(len(words) - n + 1):
            ngram = tuple(words[i:i+n])
            counts[ngram] += 1
    
    return counts

# Example: bigrams
bigrams = word_ngrams(clean_corpus, n=3)
print(bigrams.most_common(20))

[(('mr', 'frank', 'churchill'), 37), (('mr', 'john', 'knightley'), 26), (('mrs', 'john', 'dashwood'), 23), (('said', 'mr', 'knightley'), 18), (('miss', 'de', 'bourgh'), 16), (('said', 'mrs', 'jennings'), 15), (('dear', 'miss', 'woodhouse'), 14), (('mr', 'mrs', 'weston'), 14), (('said', 'mrs', 'weston'), 13), (('said', 'mrs', 'bennet'), 13), (('mrs', 'john', 'knightley'), 12), (('lady', 'catherine', 'de'), 12), (('without', 'saying', 'word'), 11), (('catherine', 'de', 'bourgh'), 11), (('mr', 'mrs', 'gardiner'), 11), (('said', 'mr', 'woodhouse'), 10), (('dare', 'say', 'shall'), 10), (('said', 'mr', 'bennet'), 10), (('said', 'miss', 'bingley'), 10), (('said', 'mrs', 'dashwood'), 10)]
