In [2]:
# Paths
# Directory that is scanned for the chapter ".txt" files; the path is relative,
# so it is resolved against the notebook's current working directory.
input_dir = "../data/chapters"

import re
import os
def chapter_number(filename):
    """Return the first run of digits found in ``filename`` as an integer.

    Example: ``"chapter_10_nostopwords.txt"`` gives ``10``.
    A name that contains no digits at all gives ``0``.
    """
    found = re.search(r"(\d+)", filename)
    if not found:
        # No digits anywhere in the name.
        return 0
    return int(found.group(1))

# Chapter files in reading order: every ".txt" entry of input_dir, sorted by the
# first number in its name. os.listdir returns names in arbitrary order and
# sorted() is stable, so names that share a chapter number (or have none, which
# maps to 0) would otherwise keep that arbitrary order; the filename itself is
# used as a tie-breaker to make the result deterministic.
files = sorted(
    (f for f in os.listdir(input_dir) if f.endswith(".txt")),
    key=lambda name: (chapter_number(name), name),
)


In [None]:
# Read every chapter file (in the order given by `files`) into memory.
# `chapters` holds the raw text and `chapter_names` the matching filename,
# so both lists stay index-aligned.
chapters = []
chapter_names = []

for name in files:
    chapter_path = os.path.join(input_dir, name)
    with open(chapter_path, "r", encoding="utf-8") as handle:
        chapters.append(handle.read())
    chapter_names.append(name)

['chapter_1.txt',
 'chapter_2.txt',
 'chapter_3.txt',
 'chapter_4.txt',
 'chapter_5.txt',
 'chapter_6.txt',
 'chapter_7.txt',
 'chapter_8.txt',
 'chapter_9.txt',
 'chapter_10.txt']

In [16]:
%pip install spacy
%pip install sklearn
import sklearn
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def preprocess(text):
    """Lowercase ``text``, run it through ``nlp`` and return the kept lemmas.

    A token is kept only when ``is_alpha`` is true and ``is_stop`` is false.
    The lemmas of the kept tokens are returned as one space-separated string.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip anything that is not alphabetic, and skip stop words.
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# One cleaned, lemmatised string per chapter, in the same order as `chapters`.
processed_chapters = list(map(preprocess, chapters))

Note: you may need to restart the kernel to use updated packages.
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Bag-of-Words
vectorizer = CountVectorizer(max_df=0.9, min_df=2)
X_bow = vectorizer.fit_transform(processed_chapters)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2)
X_tfidf = tfidf_vectorizer.fit_transform(processed_chapters)

^C
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.




ModuleNotFoundError: No module named 'numpy.rec'

  You can safely remove it manually.


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.3.0.post1-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.17.3-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
Using cached gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
Using cached scipy-1.13.1-cp311-cp311-win_amd64.whl (46.2 MB)
Using cached smart_open-7.3.0.post1-py3-none-any.whl (61 kB)
Using cached wrapt-1.17.3-cp311-cp311-win_amd64.whl (38 kB)
Installing collected packages: wrapt, numpy, smart-open, scipy, gensim

  Attempting uninstall: wrapt

    Found existing installation: wrapt 1.17.3

   

In [None]:
from gensim import corpora, models

# Split each pre-processed chapter on whitespace to get one token list per chapter.
tokenized_chapters = list(map(str.split, processed_chapters))

# Build the token <-> id mapping, then turn every chapter into a bag-of-words.
dictionary = corpora.Dictionary(tokenized_chapters)
corpus = list(map(dictionary.doc2bow, tokenized_chapters))

# Fixed random_state keeps the fitted topics reproducible between runs.
lda_settings = dict(
    num_topics=5,  # try 3–6
    id2word=dictionary,
    passes=15,
    random_state=42,
)
lda_model = models.LdaModel(corpus, **lda_settings)

# Print the 10 highest-weighted words of every topic.
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Turn on pyLDAvis' notebook display mode before anything is shown.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the fitted LDA model, the bag-of-words
# corpus and the dictionary that were used to train it.
vis = gensimvis.prepare(lda_model, corpus, dictionary)
# Bare last expression: the notebook displays `vis` as this cell's output.
vis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-chapter topic proportions, one dense row per chapter.
# get_document_topics returns sparse (topic_id, probability) pairs and can leave
# topics out (gensim floors minimum_probability at 1e-8 even when 0 is passed),
# so each probability is written into the slot given by its topic id instead of
# relying on list position. Omitted topics stay at 0.0 and every row keeps the
# same length, which keeps the columns below correctly aligned.
num_topics = lda_model.num_topics
topic_dist = []
for bow in corpus:
    row = [0.0] * num_topics
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0):
        row[topic_id] = float(prob)
    topic_dist.append(row)

df_topics = pd.DataFrame(topic_dist, columns=[f"Topic {i}" for i in range(num_topics)])
# chapter_names is index-aligned with corpus, so it labels the rows directly.
df_topics["chapter"] = chapter_names

# Stacked bars: one bar per chapter, one coloured segment per topic.
ax = df_topics.set_index("chapter").plot(kind="bar", stacked=True, figsize=(12, 6))
ax.set_ylabel("Topic proportion")
ax.set_title("Topic distribution across chapters")
plt.show()