# Preprocessing

In [1]:
import re 

In [2]:
def remove_illistrations(text: str) -> str:
    """
    Remove Illustrations from the text
    """

    # Remove Illustrations
    cleaned = re.sub(
        r"\[Illustration(?::.*?\]{1,2}|\])",
        "",
        text,
        flags=re.DOTALL
    )

    # Remove left over line
    cleaned = re.sub(r"\n\s*\n", "\n", cleaned)
    return cleaned

In [3]:
def remove_gutenberg_header_footer(text: str) -> str:
    """
    Remove the Project Gutenberg header and footer, and trim the text.
    Only keeps text starting from the last occurrence of 'CHAPTER I' followed by a newline.
    """

    start_marker = "CHAPTER I\n"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    # Use the last occurrence of the chapter marker followed by a newline
    start_idx = text.find(start_marker)
    if start_idx != -1:
        # Keep text starting at the final chapter marker,
        # then remove its heading line
        text = text[start_idx + len(start_marker):]

    # Locate and remove the footer
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]

    return text.strip()

In [4]:
def split_by_chapter(text: str) -> list[str]:
    """
    Split a book into chapters, remove chapter titles,
    and skip very short sections (like table of contents).
    """

    chapters = []
    chapter_marker = "CHAPTER"

    # Split the text at each occurrence of the chapter marker
    parts = text.split(chapter_marker)

    for part in parts:
        # Remove leading/trailing whitespace
        chapter_text = part.strip()

        # Skip very short sections (likely TOC or preface)
        if len(chapter_text.split()) < 20:
            continue

        # Convert newlines to spaces
        chapter_text = chapter_text.replace("\n", " ")

        # _word_ used for formatting somehow by project gutenberg
        chapter_text = chapter_text.replace("_", " ")

        chapters.append(chapter_text)

    return chapters

In [5]:
def create_corpus(books: list[str]) -> list[str]:
    Corpus = None
    
    for book in books:
        book_no_images = remove_illistrations(book)
        book_corpus = remove_gutenberg_header_footer(book_no_images)
        book_chapters = split_by_chapter(book_corpus)

        if Corpus is None:
            Corpus = book_chapters
        else:
            Corpus += book_chapters

    return Corpus

In [6]:
books = []
with open('Books/Emma.txt', 'r') as emma:
    books.append(emma.read())

with open('Books/Pride_and_Prejudice_Jane_Austin.txt', 'r') as pride:
    books.append(pride.read())

with open('Books/Sense_and_Sensibility.txt', 'r') as sense:
    books.append(sense.read())

In [7]:
Corpus = create_corpus(books)

# LDA

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
def print_topics(model, vectorizer, n_words=10):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic #{idx}")
        print("  " + " ".join(words[i] for i in topic.argsort()[-n_words:]))

In [11]:
vectorizer = CountVectorizer(
    stop_words="english",
    max_df=0.95,
    min_df=2,
)

X = vectorizer.fit_transform(Corpus)

In [12]:
lda = LatentDirichletAllocation(
    n_components=10,         # experiment with 5–30
    learning_method="batch", # stable & reproducible
    random_state=42,
)
lda.fit(X)

0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",10
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [13]:
print_topics(lda, vectorizer)

Topic #0
  woman heart affection marianne ferrars accomplished brother robert miss edward
Topic #1
  miss wickham did jane bingley said bennet darcy elizabeth mr
Topic #2
  heart day mr think mother did said sister elinor marianne
Topic #3
  think good weston said elton knightley miss harriet emma mr
Topic #4
  bennet lady bingley colonel room mr miss elizabeth darcy said
Topic #5
  father said miss thing john did mr good think dear
Topic #6
  did woodhouse thing think know weston said miss emma mr
Topic #7
  did sister know miss jennings lucy edward said marianne elinor
Topic #8
  campbell cole dixon emma bates thing mr fairfax miss jane
Topic #9
  did shall know dear said sister dashwood willoughby elinor marianne


In [14]:
chapter_topic_distrib = lda.transform(X)

for i, distrib in enumerate(chapter_topic_distrib):
    print(f"Chapter {i}: dominant topic = {distrib.argmax()}")

Chapter 0: dominant topic = 3
Chapter 1: dominant topic = 3
Chapter 2: dominant topic = 8
Chapter 3: dominant topic = 3
Chapter 4: dominant topic = 6
Chapter 5: dominant topic = 3
Chapter 6: dominant topic = 3
Chapter 7: dominant topic = 3
Chapter 8: dominant topic = 5
Chapter 9: dominant topic = 5
Chapter 10: dominant topic = 3
Chapter 11: dominant topic = 5
Chapter 12: dominant topic = 3
Chapter 13: dominant topic = 3
Chapter 14: dominant topic = 3
Chapter 15: dominant topic = 3
Chapter 16: dominant topic = 3
Chapter 17: dominant topic = 3
Chapter 18: dominant topic = 8
Chapter 19: dominant topic = 8
Chapter 20: dominant topic = 6
Chapter 21: dominant topic = 3
Chapter 22: dominant topic = 6
Chapter 23: dominant topic = 3
Chapter 24: dominant topic = 3
Chapter 25: dominant topic = 3
Chapter 26: dominant topic = 6
Chapter 27: dominant topic = 6
Chapter 28: dominant topic = 6
Chapter 29: dominant topic = 3
Chapter 30: dominant topic = 3
Chapter 31: dominant topic = 3
Chapter 32: domina

# TextRank

<https://medium.com/@yassineerraji/understanding-textrank-a-deep-dive-into-graph-based-text-summarization-and-keyword-extraction-905d1fb5d266>

How it works conceptually:

1. Each sentence = a node

2. Sentences are connected if they share words

3. Sentences that connect to many others score higher

4. Top-scoring sentences become the summary

In [17]:
import spacy
import pytextrank

/Users/wilsonbeima/Local_Documents/Shenandoah_Work/5_Sem_Fall25/AI/research_project/venv/lib/python3.12/site-packages


In [18]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x13a8698e0>

In [19]:
chapter_num = 0

print('Original Document Size:',len(Corpus[chapter_num]), '\n')
doc = nlp(Corpus[chapter_num])

final_summary = None
phrase_count = 1
for sent in doc._.textrank.summary(limit_phrases=2, limit_sentences=5):
    print(f"Phrase: #{phrase_count}\n")
    print(sent, '\n')
    print('------------------------------------------------------------------', '\n')
    
    if final_summary is None:
        final_summary = str(sent)
    else:
        final_summary += " " + str(sent)

    phrase_count += 1
    
print('Total Summary Length:',len(final_summary), '\n')

Original Document Size: 17790 

Phrase: #1

I am sure she will be an excellent servant; and it will be a great comfort to poor Miss Taylor to have somebody about her that she is used to see. 

------------------------------------------------------------------ 

Phrase: #2

“Poor Mr. and Miss Woodhouse, if you please; but I cannot possibly say ‘poor Miss Taylor.’ 

------------------------------------------------------------------ 

Phrase: #3

“But, Mr. Knightley, she is really very sorry to lose poor Miss Taylor, and I am sure she  will  miss her more than she thinks for.” 

------------------------------------------------------------------ 

Phrase: #4

Sixteen years had Miss Taylor been in Mr. Woodhouse’s family, less as a governess than a friend, very fond of both daughters, but particularly of Emma. 

------------------------------------------------------------------ 

Phrase: #5

Even before Miss Taylor had ceased to hold the nominal office of governess, the mildness of her tempe

# Basic Sentence Frequency Summarization

<https://stackabuse.com/text-summarization-with-nltk-in-python/>

In [20]:
from collections import Counter

In [21]:
word_frequencies = Counter(Corpus)

# Normalize frequencies
max_freq = max(word_frequencies.values())
for word in word_frequencies:
    word_frequencies[word] /= max_freq

In [22]:
Corpus1 = Corpus[0]
sentences = Corpus1.split('.')  # naive sentence split

sentence_scores = {}
for sent in sentences:
    sentence_words = re.findall(r'\b\w+\b', sent.lower())
    score = sum(word_frequencies.get(word, 0) for word in sentence_words)
    if len(sentence_words) < 30:  # optional length filter
        sentence_scores[sent] = score

In [23]:
top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:5]

summary = '. '.join(top_sentences).strip() + '.'
print(summary)


Sixteen years had Miss Taylor been in Mr.  Woodhouse’s family, less as a governess than a friend, very fond of both daughters, but particularly of Emma.  Between  them  it was more the intimacy of sisters.  The danger, however, was at present so unperceived, that they did not by any means rank as misfortunes with her.  Sorrow came—a gentle sorrow—but not at all in the shape of any disagreeable consciousness.


# Transformer

In [26]:
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6"  # explicitly calls model
)

Device set to use mps:0


In [28]:
text = Corpus[0]  # one chapter
summary = summarizer(text, min_length=30, do_sample=False)

print(summary[0]['summary_text'])

Token indices sequence length is longer than the specified maximum sequence length for this model (4310 > 1024). Running this sequence through the model will result in indexing errors


 Emma Woodhouse had lived nearly twenty-one years in the world with a comfortable home and happy disposition . She was the youngest of the two daughters of a most affectionate, indulgent father; and had, in consequence of her sister’s marriage, been mistress of his house from a very early period . Emma had been a friend and companion of Isabella Weston, who had been generous and kind . Emma said: “I am so happy, I am so sad, I can’t think that I am going to marry my daughter, but I am not a good friend,” Emma said, “My father is a good man, and I am a good


In [29]:
summarizer = pipeline(
    "summarization",
    model="allenai/led-base-16384"  # longer context length model
)

Device set to use mps:0


In [30]:
text = Corpus[0]  # one chapter
summary = summarizer(text, do_sample=False)

print(summary[0]['summary_text'])

Input ids are automatically padded from 4311 to 5120 to be a multiple of `config.attention_window`: 1024


s husband. He was a very good man. He was a very good man. He was a very good man. He was a very good man. He was a very good man. He was a very good man. He was a very good man. He was a very good man. He was a very good man. He was a very good man, to be sure, and a very good young man, and I have a great regard for him. But if you want to shew him any attention, my dear, ask him to come and dine with us some day. That will be a much better thing. Mr. Knightley had a cheerful manner, which always did him good; and his many inquiries after “poor Isabella” and her children were answered most satisfactorily. When this was over, Mr. Knightley gratefully observed, “It is very kind of you, Mr. Knightley, to come out at this late hour to call upon us. I am afraid you must have had a shocking walk.” “Not at all, sir. It is a beautiful moonlight night; and so mild that I must draw back from your great fire.” “But you must have found it
