In [1]:
import re 
from nltk.corpus import stopwords
import string
from collections import Counter
from collections import defaultdict
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Preprocessing

In [2]:
def remove_illistrations(text: str) -> str:
    """
    Strip ``[Illustration ...]`` markers out of ``text``.

    Both the bare ``[Illustration]`` tag and the captioned
    ``[Illustration: ...]`` form are removed; a caption may span several
    lines and may be closed by one or two brackets.

    Afterwards every newline / optional-whitespace / newline run in the
    result is collapsed to a single newline, so blank lines anywhere in the
    text (not only those left behind by a removed tag) disappear.
    """
    # DOTALL lets the lazy ``.*?`` cross line breaks inside a caption.
    illustration_tag = re.compile(
        r"\[Illustration(?::.*?\]{1,2}|\])",
        re.DOTALL,
    )
    blank_run = re.compile(r"\n\s*\n")

    without_tags = illustration_tag.sub("", text)
    return blank_run.sub("\n", without_tags)

In [3]:
def remove_gutenberg_header_footer(text: str) -> str:
    """
    Trim the Project Gutenberg front matter and footer from a book.

    Everything up to and including the FIRST 'CHAPTER I' heading line (the
    marker immediately followed by a newline) is dropped, so later
    occurrences of that heading stay in the returned text. Everything from
    the '*** END OF THE PROJECT GUTENBERG EBOOK' banner onwards is dropped
    as well. A marker that is absent leaves that side of the text untouched.

    Returns the remaining text with surrounding whitespace stripped.
    """
    chapter_heading = "CHAPTER I\n"
    footer_banner = "*** END OF THE PROJECT GUTENBERG EBOOK"

    # partition() splits on the first match; an empty separator slot means
    # the heading was not found and the text is kept whole.
    _, heading, after_heading = text.partition(chapter_heading)
    body = after_heading if heading else text

    # Keep only what precedes the footer banner (all of it when absent).
    body, _, _ = body.partition(footer_banner)

    return body.strip()

In [4]:
def split_by_chapter(text: str, min_words: int = 20) -> list[str]:
    """
    Split a book into chapters, remove the chapter numbering,
    and skip very short sections (like table of contents entries).

    The text is split on the literal marker ``CHAPTER``. Every piece that
    follows a marker starts with what is left of the heading (for example
    ``II`` or ``III.``); that leading Roman/Arabic numeral is removed so it
    does not end up as a token in the chapter text. The piece before the
    first marker has no heading and is kept as is.

    Args:
        text: Book text with chapter headings of the form ``CHAPTER <number>``.
        min_words: Pieces with fewer whitespace-separated words than this
            (counted after the heading numeral is removed) are discarded.

    Returns:
        One string per kept chapter, with newlines and underscores replaced
        by spaces.
    """
    chapter_marker = "CHAPTER"
    # Heading residue right after the marker: a numeral plus an optional period.
    # Anchored with \A so only the very start of a piece can match.
    heading_number = re.compile(r"\A(?:[IVXLCDM]+|\d+)\b\.?")

    chapters = []
    for index, part in enumerate(text.split(chapter_marker)):
        chapter_text = part.strip()

        # Only pieces after a marker carry a heading; stripping the first
        # piece too could eat a leading "I" from ordinary prose.
        if index > 0:
            chapter_text = heading_number.sub("", chapter_text, count=1).lstrip()

        # Skip very short sections (likely TOC or preface)
        if len(chapter_text.split()) < min_words:
            continue

        # Convert newlines to spaces
        chapter_text = chapter_text.replace("\n", " ")

        # Project Gutenberg wraps emphasised words as _word_; drop the markers
        chapter_text = chapter_text.replace("_", " ")

        chapters.append(chapter_text)

    return chapters

In [5]:
def create_corpus(books: list[str]) -> list[str]:
    """
    Turn raw book texts into one flat list of chapter strings.

    Each book is passed through ``remove_illistrations``,
    ``remove_gutenberg_header_footer`` and ``split_by_chapter`` in that
    order; the resulting chapters are concatenated in book order.

    Args:
        books: Full text of each book.

    Returns:
        All chapters of all books. An empty ``books`` list yields an empty
        list (never ``None``).
    """
    # Start from a fresh list so the result never aliases a list returned by
    # split_by_chapter and an empty input still honours the return type.
    corpus: list[str] = []

    for book in books:
        book_no_images = remove_illistrations(book)
        book_body = remove_gutenberg_header_footer(book_no_images)
        corpus.extend(split_by_chapter(book_body))

    return corpus

In [6]:
# Source texts, one file per novel (paths are relative to the notebook).
BOOK_PATHS = [
    'Books/Emma.txt',
    'Books/Pride_and_Prejudice_Jane_Austin.txt',
    'Books/Sense_and_Sensibility.txt',
]

books = []
for book_path in BOOK_PATHS:
    # Pin the encoding so decoding does not depend on the platform default.
    # NOTE(review): assumes the files are UTF-8 (Project Gutenberg's current
    # plain-text format) - confirm if they were saved differently.
    with open(book_path, 'r', encoding='utf-8') as book_file:
        books.append(book_file.read())

In [7]:
# Clean every loaded book and flatten the results into one list of chapter strings.
Corpus = create_corpus(books)

# N-Grams

In [8]:
def text_process(corp, stop_words=None):
    """
    Normalise a string of text for n-gram counting:
    1. Remove all ASCII punctuation (``string.punctuation``)
    2. Lower-case every word
    3. Remove all stopwords
    4. Return the remaining words joined by single spaces

    Punctuation is deleted, not replaced by a space, so ``well-bred``
    becomes ``wellbred``. Newlines count as word separators.

    Args:
        corp: Text to clean.
        stop_words: Optional collection of lower-case words to drop.
            Defaults to NLTK's English stop word list.

    Returns:
        The cleaned text as a single string.
    """
    if stop_words is None:
        stop_words = set(stopwords.words("english"))

    # Delete every punctuation character in a single pass.
    no_punct = corp.translate(str.maketrans("", "", string.punctuation))

    # split() already treats newlines as whitespace; deleting them first
    # would glue the last word of one line to the first word of the next.
    lowered = (word.lower() for word in no_punct.split())

    return " ".join(word for word in lowered if word not in stop_words)

In [9]:
# Normalised copy of every chapter (punctuation and stop words removed), in corpus order.
clean_corpus = [text_process(chapter) for chapter in Corpus]

In [10]:
def word_ngrams(texts, n=2):
    """
    Count word n-grams over a collection of texts.

    Each item of ``texts`` is converted with ``str()`` and split on
    whitespace. N-grams are taken within a single item only, so no n-gram
    spans two items; an item with fewer than ``n`` words contributes nothing.

    Args:
        texts: Iterable of texts (one per chapter).
        n: Number of words per n-gram.

    Returns:
        A ``Counter`` mapping each n-gram (a tuple of words) to its count.
    """
    tally = Counter()

    for text in texts:
        tokens = str(text).split()
        last_start = len(tokens) - n
        # Slide a window of n tokens over the text, one position at a time.
        tally.update(
            tuple(tokens[start:start + n]) for start in range(last_start + 1)
        )

    return tally

In [11]:
# Example: count adjacent word pairs over the cleaned chapters and show the 20 most frequent.
bigrams = word_ngrams(clean_corpus, n=2)
top_bigrams = bigrams.most_common(20)
print(top_bigrams)

[(('mr', 'knightley'), 233), (('mr', 'darcy'), 219), (('mrs', 'weston'), 210), (('mrs', 'jennings'), 198), (('every', 'thing'), 192), (('mr', 'elton'), 163), (('mrs', 'bennet'), 137), (('mr', 'weston'), 129), (('miss', 'woodhouse'), 129), (('young', 'man'), 129), (('mr', 'collins'), 121), (('frank', 'churchill'), 120), (('every', 'body'), 117), (('mrs', 'elton'), 113), (('dare', 'say'), 109), (('mrs', 'dashwood'), 109), (('great', 'deal'), 108), (('colonel', 'brandon'), 105), (('mr', 'woodhouse'), 103), (('sir', 'john'), 97)]


In [12]:
def ngram_lookup(ngram_counts, n):
    """
    Build a next-word table from n-gram counts.

    Every n-gram is split into its context (all words but the last) and the
    word that follows it. The following word is stored once per occurrence,
    so a uniform random pick from a context's list is weighted by frequency.

    Args:
        ngram_counts: Mapping of n-gram tuples to their counts.
        n: N-gram order. Not used by the body; the context length comes
            from the n-grams themselves.

    Returns:
        A ``defaultdict(list)`` mapping each context tuple to the list of
        words seen after it.
    """
    successors = defaultdict(list)

    for ngram, occurrences in ngram_counts.items():
        context, following = ngram[:-1], ngram[-1]
        successors[context].extend([following] * occurrences)

    return successors

In [13]:
def generate_sentence(lookup, n=1, length=20):
    """
    Generate text by repeatedly sampling from an n-gram next-word table.

    A random context from ``lookup`` seeds the sentence. At each step the
    last ``n - 1`` words form the key, and one of the words recorded for
    that key is appended at random. Generation stops early when the current
    key has no recorded continuation.

    Args:
        lookup: Mapping of context tuples (``n - 1`` words) to lists of
            candidate next words, as built by ``ngram_lookup``.
        n: N-gram order the table was built with. For ``n=1`` the context is
            the empty tuple.
        length: Maximum number of words appended after the seed context.

    Returns:
        The generated words joined by spaces; an empty string when
        ``lookup`` is empty.
    """
    if not lookup:
        return ""

    context_size = max(n - 1, 0)
    sentence = list(random.choice(list(lookup)))

    for _ in range(length):
        # A plain sentence[-(n-1):] breaks for n == 1: [-0:] is the whole
        # list, not an empty context.
        key = tuple(sentence[-context_size:]) if context_size else ()

        # .get() avoids inserting keys into a defaultdict while probing.
        candidates = lookup.get(key)
        if not candidates:
            break

        sentence.append(random.choice(candidates))

    return " ".join(sentence)

In [14]:
# N-gram order shared by the counting, lookup and generation steps.
N = 5
# NOTE: despite its name, `trigrams` holds N-grams of order N (5 here), not 3-grams.
trigrams = word_ngrams(clean_corpus, n=N)
lookup = ngram_lookup(trigrams, n=N)

# generate_sentence samples with `random`; no seed is set in the cells shown, so the output varies per run.
print(generate_sentence(lookup, n=N))


spend large fortune said mrs dashwood children rich without help must begin improvements house observed elinor difficulties soon vanish magnificent orders would travel family


# TF-IDF

Just did this for fun; it turns out that most of the algorithms that need TF-IDF have it built in.

In [15]:
# TF-IDF configuration: drop scikit-learn's built-in English stop words, and
# ignore terms that occur in more than 95% of the documents (max_df) or in
# fewer than 2 documents (min_df).
tr_idf_model  = TfidfVectorizer(
    stop_words="english",
    max_df=0.95,
    min_df=2,
)
# Fit on the raw chapter strings (Corpus, not clean_corpus); each chapter is one document.
tf_idf_vector = tr_idf_model.fit_transform(Corpus)

In [16]:
# Vocabulary terms learned by the vectorizer; used as the column labels below.
words_set = tr_idf_model.get_feature_names_out()
# Convert the fit_transform result to a dense array so it can be wrapped in a DataFrame.
tf_idf_array = tf_idf_vector.toarray()
# One row per chapter of Corpus, one column per vocabulary term.
df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)

# Bare last expression so the notebook renders the frame.
df_tf_idf

Unnamed: 0,26th,28th,abatement,abbey,abhor,abhorred,abhorrence,abide,abilities,ability,...,york,yorkshire,young,younge,younger,youngest,youth,youthful,zeal,zealous
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.021750,0.0,0.0,0.019941,0.000000,0.000000,0.0,0.000000
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.041932,0.036662,0.0,0.0,0.000000,0.060812,0.000000,0.0,0.000000
2,0.0,0.0,0.000000,0.033557,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.059847,0.0,0.0,0.000000,0.089343,0.000000,0.0,0.000000
3,0.0,0.0,0.000000,0.040157,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.100264,0.0,0.0,0.000000,0.017819,0.026559,0.0,0.000000
4,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.020600,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,0.0,0.0,0.038396,0.000000,0.0,0.0,0.028446,0.0,0.0,0.0,...,0.0,0.000000,0.009299,0.0,0.0,0.000000,0.023137,0.000000,0.0,0.038396
161,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.029928,0.0,0.0,0.027438,0.000000,0.000000,0.0,0.000000
162,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.043978,0.000000,0.000000,0.0,0.000000
163,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
