In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import string
from nltk.corpus import stopwords
from collections import Counter
import re 

# Preprocessing

In [15]:
def remove_illistrations(text: str) -> str:
    """
    Remove Illustrations from the text
    """

    # Remove Illustrations
    cleaned = re.sub(
        r"\[Illustration(?::.*?\]{1,2}|\])",
        "",
        text,
        flags=re.DOTALL
    )

    # Remove left over line
    cleaned = re.sub(r"\n\s*\n", "\n", cleaned)
    return cleaned

In [16]:
def remove_gutenberg_header_footer(text: str) -> str:
    """
    Remove the Project Gutenberg header and footer, and trim the text.
    Only keeps text starting from the last occurrence of 'CHAPTER I' followed by a newline.
    """

    start_marker = "CHAPTER I\n"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    # Use the last occurrence of the chapter marker followed by a newline
    start_idx = text.find(start_marker)
    if start_idx != -1:
        # Keep text starting at the final chapter marker,
        # then remove its heading line
        text = text[start_idx + len(start_marker):]

    # Locate and remove the footer
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]

    return text.strip()

In [17]:
def split_by_chapter(text: str) -> list[str]:
    """
    Split a book into chapters, remove chapter titles,
    and skip very short sections (like table of contents).
    """

    chapters = []
    chapter_marker = "CHAPTER"

    # Split the text at each occurrence of the chapter marker
    parts = text.split(chapter_marker)

    for part in parts:
        # Remove leading/trailing whitespace
        chapter_text = part.strip()

        # Skip very short sections (likely TOC or preface)
        if len(chapter_text.split()) < 20:
            continue

        # Convert newlines to spaces
        chapter_text = chapter_text.replace("\n", " ")

        # _word_ used for formatting somehow by project gutenberg
        chapter_text = chapter_text.replace("_", " ")

        chapters.append(chapter_text)

    return chapters

In [18]:
def create_corpus(books: list[str]) -> list[str]:
    Corpus = None
    
    for book in books:
        book_no_images = remove_illistrations(book)
        book_corpus = remove_gutenberg_header_footer(book_no_images)
        book_chapters = split_by_chapter(book_corpus)

        if Corpus is None:
            Corpus = book_chapters
        else:
            Corpus += book_chapters

    return Corpus

In [19]:
def print_topics(model, vectorizer, n_words=10):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic #{idx}")
        print("  " + " ".join(words[i] for i in topic.argsort()[-n_words:]))

In [20]:
books = []
with open('Books/Emma.txt', 'r') as emma:
    books.append(emma.read())

with open('Books/Pride_and_Prejudice_Jane_Austin.txt', 'r') as pride:
    books.append(pride.read())

with open('Books/Sense_and_Sensibility.txt', 'r') as sense:
    books.append(sense.read())

In [21]:
Corpus = create_corpus(books)

# LDA

In [9]:
vectorizer = CountVectorizer(
    stop_words="english",
    max_df=0.95,
    min_df=2,
)

X = vectorizer.fit_transform(Corpus)

In [19]:
lda = LatentDirichletAllocation(
    n_components=10,         # experiment with 5–30
    learning_method="batch", # stable & reproducible
    random_state=42,
)
lda.fit(X)

0,1,2
,n_components,10
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [20]:
print_topics(lda, vectorizer)

Topic #0
  feelings soon man said bingley bennet elizabeth wickham darcy mr
Topic #1
  pressed positively xii tenderly confirming cordially intruding calculated wisely new
Topic #2
  lady edward house dashwood sister said willoughby jennings elinor marianne
Topic #3
  good jane say think thing know said emma miss mr
Topic #4
  churchill woodhouse thing did elton miss knightley emma harriet mr
Topic #5
  darcy miss bennet did sister jane bingley said mr elizabeth
Topic #6
  sister brandon said marianne ferrars mother dashwood lucy edward elinor
Topic #7
  thing weston knightley think good said miss harriet emma mr
Topic #8
  said bingley miss catherine bennet lady darcy collins elizabeth mr
Topic #9
  master did thought wickham uncle gardiner said darcy mr elizabeth


In [22]:
chapter_topic_distrib = lda.transform(X)

for i, distrib in enumerate(chapter_topic_distrib):
    print(f"Chapter {i}: dominant topic = {distrib.argmax()}")

Chapter 0: dominant topic = 7
Chapter 1: dominant topic = 3
Chapter 2: dominant topic = 7
Chapter 3: dominant topic = 7
Chapter 4: dominant topic = 7
Chapter 5: dominant topic = 7
Chapter 6: dominant topic = 7
Chapter 7: dominant topic = 7
Chapter 8: dominant topic = 7
Chapter 9: dominant topic = 3
Chapter 10: dominant topic = 7
Chapter 11: dominant topic = 7
Chapter 12: dominant topic = 4
Chapter 13: dominant topic = 3
Chapter 14: dominant topic = 3
Chapter 15: dominant topic = 7
Chapter 16: dominant topic = 4
Chapter 17: dominant topic = 3
Chapter 18: dominant topic = 3
Chapter 19: dominant topic = 3
Chapter 20: dominant topic = 3
Chapter 21: dominant topic = 4
Chapter 22: dominant topic = 3
Chapter 23: dominant topic = 3
Chapter 24: dominant topic = 3
Chapter 25: dominant topic = 3
Chapter 26: dominant topic = 7
Chapter 27: dominant topic = 4
Chapter 28: dominant topic = 7
Chapter 29: dominant topic = 3
Chapter 30: dominant topic = 3
Chapter 31: dominant topic = 3
Chapter 32: domina

# N-Grams

In [11]:
def text_process(corp):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """    
    STOPWORDS = stopwords.words('english')
    # Check characters to see if they are in punctuation
    nopunc = [char for char in corp if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # remove new lines
    nopunc.replace('\n', '')
    
    # Now just remove any stopwords
    return ' '.join([word.lower() for word in nopunc.split() if word.lower() not in STOPWORDS])

In [12]:
clean_corpus = []
for chapter in Corpus:
    clean_corpus.append(text_process(chapter))

In [13]:
def word_ngrams(texts, n=2):
    counts = Counter()
    
    for chapter in texts:
        words = str(chapter).split()
        for i in range(len(words) - n + 1):
            ngram = tuple(words[i:i+n])
            counts[ngram] += 1
    
    return counts

# Example: bigrams
bigrams = word_ngrams(clean_corpus, n=3)
print(bigrams.most_common(20))

[(('mr', 'frank', 'churchill'), 37), (('mr', 'john', 'knightley'), 26), (('mrs', 'john', 'dashwood'), 24), (('said', 'mr', 'knightley'), 18), (('miss', 'de', 'bourgh'), 16), (('said', 'mrs', 'jennings'), 15), (('dear', 'miss', 'woodhouse'), 14), (('mr', 'mrs', 'weston'), 14), (('said', 'mrs', 'weston'), 13), (('said', 'mrs', 'bennet'), 13), (('mrs', 'john', 'knightley'), 12), (('lady', 'catherine', 'de'), 12), (('without', 'saying', 'word'), 11), (('catherine', 'de', 'bourgh'), 11), (('mr', 'mrs', 'gardiner'), 11), (('said', 'mr', 'woodhouse'), 10), (('dare', 'say', 'shall'), 10), (('said', 'mr', 'bennet'), 10), (('said', 'miss', 'bingley'), 10), (('said', 'mrs', 'dashwood'), 10)]


# TF-IDF

In [9]:
tr_idf_model  = TfidfVectorizer(
    stop_words="english",
    max_df=0.95,
    min_df=2,
)
tf_idf_vector = tr_idf_model.fit_transform(Corpus)

In [10]:
words_set = tr_idf_model.get_feature_names_out()
tf_idf_array = tf_idf_vector.toarray()
df_tf_idf = pd.DataFrame(tf_idf_array, columns = words_set)

df_tf_idf

Unnamed: 0,26th,28th,abatement,abbey,abhor,abhorred,abhorrence,abide,abilities,ability,...,york,yorkshire,young,younge,younger,youngest,youth,youthful,zeal,zealous
0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.021750,0.0,0.0,0.019941,0.000000,0.000000,0.0,0.000000
1,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.041932,0.036662,0.0,0.0,0.000000,0.060812,0.000000,0.0,0.000000
2,0.0,0.0,0.000000,0.033557,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.059847,0.0,0.0,0.000000,0.089343,0.000000,0.0,0.000000
3,0.0,0.0,0.000000,0.040157,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.100264,0.0,0.0,0.000000,0.017819,0.026559,0.0,0.000000
4,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.020600,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,0.0,0.0,0.038396,0.000000,0.0,0.0,0.028446,0.0,0.0,0.0,...,0.0,0.000000,0.009299,0.0,0.0,0.000000,0.023137,0.000000,0.0,0.038396
161,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.029928,0.0,0.0,0.027438,0.000000,0.000000,0.0,0.000000
162,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.043978,0.000000,0.000000,0.0,0.000000
163,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000


In [22]:
import spacy
import pytextrank

In [23]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x116fb1810>

In [30]:
print('Original Document Size:',len(Corpus[0]))
doc = nlp(Corpus[0])

for sent in doc._.textrank.summary(limit_phrases=4, limit_sentences=4):
    print(sent)
    print('Summary Length:',len(sent))

Original Document Size: 17790
Mr. Weston is such a good-humoured, pleasant, excellent man, that he thoroughly deserves a good wife;—and you would not have had Miss Taylor live with us for ever, and bear all my odd humours, when she might have a house of her own?”
Summary Length: 52
But if, which I rather imagine, your making the match, as you call it, means only your planning it, your saying to yourself one idle day, ‘I think it would be a very good thing for Miss Taylor if Mr. Weston were to marry her,’ and saying it again to yourself every now and then afterwards, why do you talk of success?
Summary Length: 73
I am sure she will be an excellent servant; and it will be a great comfort to poor Miss Taylor to have somebody about her that she is used to see.
Summary Length: 33
“Poor Mr. and Miss Woodhouse, if you please; but I cannot possibly say ‘poor Miss Taylor.’
Summary Length: 23


In [29]:
print(Corpus[0])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world with very little to distress or vex her. She was the youngest of the two daughters of a most affectionate, indulgent father; and had, in consequence of her sister’s marriage, been mistress of his house from a very early period. Her mother had died too long ago for her to have more than an indistinct remembrance of her caresses; and her place had been supplied by an excellent woman as governess, who had fallen little short of a mother in affection. Sixteen years had Miss Taylor been in Mr. Woodhouse’s family, less as a governess than a friend, very fond of both daughters, but particularly of Emma. Between  them  it was more the intimacy of sisters. Even before Miss Taylor had ceased to hold the nominal office of governess, the mildness of her temper had hardly allowed her to impose any restr