In [7]:
file_path = "../data/full_text.txt"

with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

In [8]:
print(text[:1000]) 

Chapter One

Arrest--Conversation with Mrs. Grubach--Then Miss Bürstner


Someone must have been telling lies about Josef K., he knew he had done
nothing wrong but, one morning, he was arrested. Every day at eight in
the morning he was brought his breakfast by Mrs. Grubach's cook--Mrs.
Grubach was his landlady--but today she didn't come. That had never
happened before. K. waited a little while, looked from his pillow at the
old woman who lived opposite and who was watching him with an
inquisitiveness quite unusual for her, and finally, both hungry and
disconcerted, rang the bell. There was immediately a knock at the door
and a man entered. He had never seen the man in this house before. He
was slim but firmly built, his clothes were black and close-fitting,
with many folds and pockets, buckles and buttons and a belt, all of
which gave the impression of being very practical but without making it
very clear what they were actually for. "Who are you?" asked K., sitting
half upright in his

In [9]:
import re

# Split on "Chapter" followed by word characters
chapters = re.split(r'\bChapter\s+[A-Za-z]+\b', text)

# First element before Chapter One will be empty/introduction → remove it
chapters = [ch.strip() for ch in chapters if ch.strip()]

# Print the first chapter to verify
print("Number of chapters:", len(chapters))
print("First chapter content:")
print(chapters[0][:500])

Number of chapters: 10
First chapter content:
Arrest--Conversation with Mrs. Grubach--Then Miss Bürstner


Someone must have been telling lies about Josef K., he knew he had done
nothing wrong but, one morning, he was arrested. Every day at eight in
the morning he was brought his breakfast by Mrs. Grubach's cook--Mrs.
Grubach was his landlady--but today she didn't come. That had never
happened before. K. waited a little while, looked from his pillow at the
old woman who lived opposite and who was watching him with an
inquisitiveness quite u


In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

# clean first chapter
words = word_tokenize(chapters[0].lower())
filtered_words = [w for w in words if w.isalpha() and w not in stop_words]

#clean all chapters
filtered_words = []
for i in range(0, len(chapters)):
    words = word_tokenize(chapters[i].lower())
    filtered_words.extend([w for w in words if w.isalpha() and w not in stop_words]) #this will keep all words that are alphabetic and not in stop words

length_of_filtered_words = len(filtered_words)
print("Number of words in first chapter after cleaning:", len(filtered_words))
print("First 100 words in first chapter after cleaning:", filtered_words[:100])
print("Total number of words across all chapters after cleaning:", length_of_filtered_words)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\drawn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\drawn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of words in first chapter after cleaning: 35265
First 100 words in first chapter after cleaning: ['arrest', 'conversation', 'grubach', 'miss', 'bürstner', 'someone', 'must', 'telling', 'lies', 'josef', 'knew', 'done', 'nothing', 'wrong', 'one', 'morning', 'arrested', 'every', 'day', 'eight', 'morning', 'brought', 'breakfast', 'grubach', 'cook', 'grubach', 'landlady', 'today', 'come', 'never', 'happened', 'waited', 'little', 'looked', 'pillow', 'old', 'woman', 'lived', 'opposite', 'watching', 'inquisitiveness', 'quite', 'unusual', 'finally', 'hungry', 'disconcerted', 'rang', 'bell', 'immediately', 'knock', 'door', 'man', 'entered', 'never', 'seen', 'man', 'house', 'slim', 'firmly', 'built', 'clothes', 'black', 'many', 'folds', 'pockets', 'buckles', 'buttons', 'belt', 'gave', 'impression', 'practical', 'without', 'making', 'clear', 'actually', 'asked', 'sitting', 'half', 'upright', 'bed', 'man', 'however', 'ignored', 'question', 'arrival', 'simply', 'accepted', 'merely', 'replied'

In [15]:
# one file for each chapter
import os       
output_dir = "../data/cleaned_chapters"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
for i, chapter in enumerate(chapters):
    chapter_file_path = os.path.join(output_dir, f"chapter_{i+1}_nostopwords.txt")
    with open(chapter_file_path, "w", encoding="utf-8") as f:
        f.write(chapter.strip())        
# one file for all chapters
all_chapters_file_path = os.path.join(output_dir, "all_chapters_nostopwords.txt")
with open(all_chapters_file_path, "w", encoding="utf-8") as f:
    f.write("\n\n".join(chapters).strip())  

