In [6]:
import os
import pandas as pd
import re

In [9]:
PROJECT_ROOT = "/content/MEDITATION_RAG"
FINALIZED_DIR = os.path.join(PROJECT_ROOT, "data", "finalized")

files = []
texts = []

for filename in os.listdir(FINALIZED_DIR):
    if filename.lower().endswith(".txt"):
        path = os.path.join(FINALIZED_DIR, filename)
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        files.append(filename)
        texts.append(text)

df = pd.DataFrame({
    "source_file": files,
    "text": texts
})

df

Unnamed: 0,source_file,text
0,week_six_appreciation_meditation.docx_cleaned.txt,Week Six: Appreciation Allowing yourself to fi...
1,week_three_heal_13_minutes.docx_cleaned.txt,"Week Three: Heal With this practice, we learn ..."
2,guided_meditation_connecting_with_values.docx_...,Guided Meditation: Connecting with Values For ...
3,week_six_gratitude.docx_cleaned.txt,Week Six: Gratitude This is a breath and grati...
4,introduction_to_no_agenda_practice.docx_cleane...,Introduction to no agenda practice The no agen...
5,guided_meditation_thoughts_and_emotions.docx_c...,Guided Meditation: Thoughts and Emotions This ...
6,week_one_grounding_meditation_5_minutes.docx_c...,Week One: Grounding Meditation So find a relax...
7,guided_meditation_self-compassion.docx_cleaned...,Guided meditation: Self Compassion Allow yours...
8,week_four_thoughts_and_emotions.docx_cleaned.txt,Week Four: Thoughts and Emotions This is a gui...
9,week_four_working_with_thoughts.docx_1_cleaned...,Week Four: Working with Thoughts 00:01 So allo...


In [10]:
def chunk_text(text, chunk_size=150):
    """
    Splits text into word-based chunks. Each chunk has approx chunk_size words.
    Returns a list of chunk strings.
    """
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk.strip())

    return chunks

In [11]:
chunk_records = []

for idx, row in df.iterrows():
    source = row["source_file"]
    full_text = row["text"]

    chunks = chunk_text(full_text, chunk_size=150)

    for i, chunk in enumerate(chunks):
        chunk_records.append({
            "chunk_id": f"{source}_chunk_{i}",
            "source_file": source,
            "chunk_index": i,
            "text": chunk,
            "word_count": len(chunk.split())
        })

In [12]:
chunks_df = pd.DataFrame(chunk_records)
chunks_df

Unnamed: 0,chunk_id,source_file,chunk_index,text,word_count
0,week_six_appreciation_meditation.docx_cleaned....,week_six_appreciation_meditation.docx_cleaned.txt,0,Week Six: Appreciation Allowing yourself to fi...,150
1,week_six_appreciation_meditation.docx_cleaned....,week_six_appreciation_meditation.docx_cleaned.txt,1,you being alive. It's nourishment to the body....,150
2,week_six_appreciation_meditation.docx_cleaned....,week_six_appreciation_meditation.docx_cleaned.txt,2,offering another genuine heartfelt thank you. ...,150
3,week_six_appreciation_meditation.docx_cleaned....,week_six_appreciation_meditation.docx_cleaned.txt,3,"good fortune or good health, try offering, aga...",75
4,week_three_heal_13_minutes.docx_cleaned.txt_ch...,week_three_heal_13_minutes.docx_cleaned.txt,0,"Week Three: Heal With this practice, we learn ...",150
...,...,...,...,...,...
131,cureus_cleaned.txt_chunk_15,cureus_cleaned.txt,15,". In today's world, a very under-spoken proble...",150
132,cureus_cleaned.txt_chunk_16,cureus_cleaned.txt,16,"only groups . As seen by the results, both med...",150
133,cureus_cleaned.txt_chunk_17,cureus_cleaned.txt,17,"the above, we can say that meditation has been...",150
134,cureus_cleaned.txt_chunk_18,cureus_cleaned.txt,18,"effect it has on the disease; moreover, the ex...",150


In [14]:
chunks_df['word_count'].describe()

Unnamed: 0,word_count
count,136.0
mean,137.433824
std,33.687607
min,8.0
25%,150.0
50%,150.0
75%,150.0
max,150.0
