In [5]:
import os
import pandas as pd
import re

In [4]:
FINALIZED_DIR = "../data/finalized"

files = []
texts = []

# Loop through all .txt files in the finalized directory.
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted first; otherwise the row order of the DataFrame (and of
# anything built from it) could differ between machines and runs.
for filename in sorted(os.listdir(FINALIZED_DIR)):
    # Case-insensitive check so files ending in ".TXT" are picked up as well.
    if filename.lower().endswith(".txt"):
        path = os.path.join(FINALIZED_DIR, filename)

        # Read the whole document as UTF-8 text.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        files.append(filename)
        texts.append(text)

# Create DataFrame: one row per source document, holding its full text.
df = pd.DataFrame({
    "source_file": files,
    "text": texts
})

df

Unnamed: 0,source_file,text
0,cureus_cleaned.txt,This article discusses the power of meditation...
1,guided_meditation_befriending_yourself.docx_cl...,Guided Meditation: Befriending Yourself Allow ...
2,guided_meditation_connecting_to_community.docx...,Guided Meditation: Connecting to Community For...
3,guided_meditation_connecting_with_values.docx_...,Guided Meditation: Connecting with Values For ...
4,guided_meditation_no_agenda.docx_cleaned.txt,"Guided Meditation: No Agenda So, notice the po..."
5,guided_meditation_self-compassion.docx_cleaned...,Guided meditation: Self Compassion Allow yours...
6,guided_meditation_thoughts_and_emotions.docx_c...,Guided Meditation: Thoughts and Emotions This ...
7,introduction_to_no_agenda_practice.docx_cleane...,Introduction to no agenda practice The no agen...
8,just_like_me_female_voice.docx_cleaned.txt,Just Like Me Female Voice When practicing this...
9,week_five_just_like_me_male_voice.docx_cleaned...,"Week Five: Just Like Me So for this practice, ..."


In [7]:
def chunk_text(text: str, chunk_size: int = 150) -> list:
    """
    Splits text into word-based chunks of at most chunk_size words.

    Words are the whitespace-separated tokens produced by str.split(), and
    each chunk is re-joined with single spaces, so original line breaks and
    repeated spaces are not preserved. Every chunk holds exactly chunk_size
    words except the last one, which holds the remainder and may be much
    shorter.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. Empty or whitespace-only
        text gives an empty list.

    Raises:
        ValueError: If chunk_size is zero or negative.
    """
    # A negative step makes range() empty, which would silently discard the
    # whole text; reject it up front together with zero.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()

    # Joining split() tokens never leaves leading/trailing whitespace, so no
    # extra strip() is needed on the result.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

In [8]:
# Flat list with one record (dict) per chunk, across every loaded document.
chunk_records = []

# Walk the two columns side by side instead of materialising a row object.
for source_name, document in zip(df["source_file"], df["text"]):
    pieces = chunk_text(document, chunk_size=150)

    # chunk_index restarts at 0 for each source file; chunk_id combines the
    # file name with that index.
    chunk_records.extend(
        {
            "chunk_id": f"{source_name}_chunk_{position}",
            "source_file": source_name,
            "chunk_index": position,
            "text": piece,
            "word_count": len(piece.split()),
        }
        for position, piece in enumerate(pieces)
    )

In [9]:
# Assemble the per-chunk records into a DataFrame. The column list is given
# explicitly so the frame keeps the same schema even when chunk_records is
# empty; without it an empty list yields a frame with no columns and the
# later chunks_df['word_count'] lookup raises KeyError.
chunks_df = pd.DataFrame(
    chunk_records,
    columns=["chunk_id", "source_file", "chunk_index", "text", "word_count"],
)
chunks_df

Unnamed: 0,chunk_id,source_file,chunk_index,text,word_count
0,cureus_cleaned.txt_chunk_0,cureus_cleaned.txt,0,This article discusses the power of meditation...,150
1,cureus_cleaned.txt_chunk_1,cureus_cleaned.txt,1,yield more beneficial clinical outcomes. Every...,150
2,cureus_cleaned.txt_chunk_2,cureus_cleaned.txt,2,that constant reinforcement of happy thoughts ...,150
3,cureus_cleaned.txt_chunk_3,cureus_cleaned.txt,3,energy and possibility is there remains a myst...,150
4,cureus_cleaned.txt_chunk_4,cureus_cleaned.txt,4,benefits at the genetic or immunological level...,150
...,...,...,...,...,...
131,week_two_stop_meditation_9_minutes.docx_cleane...,week_two_stop_meditation_9_minutes.docx_cleane...,0,Week Two: STOP Meditation So you can decide fo...,150
132,week_two_stop_meditation_9_minutes.docx_cleane...,week_two_stop_meditation_9_minutes.docx_cleane...,1,this stressful situation with as much vividnes...,150
133,week_two_stop_meditation_9_minutes.docx_cleane...,week_two_stop_meditation_9_minutes.docx_cleane...,2,"freeze frame that, I want you to imagine that ...",150
134,week_two_stop_meditation_9_minutes.docx_cleane...,week_two_stop_meditation_9_minutes.docx_cleane...,3,there's any signals. It's like the little yell...,150


In [None]:
# save CSV file
CHUNKS_CSV_PATH = "../data/chunks/meditation_chunks.csv"

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(os.path.dirname(CHUNKS_CSV_PATH), exist_ok=True)

# index=False keeps the positional row index out of the file.
chunks_df.to_csv(CHUNKS_CSV_PATH, index=False)

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the number of
# words per chunk. Chunks are cut at 150 words, so values below that come from
# the final, remainder chunk of a document.
chunks_df['word_count'].describe()

count    136.000000
mean     137.433824
std       33.687607
min        8.000000
25%      150.000000
50%      150.000000
75%      150.000000
max      150.000000
Name: word_count, dtype: float64