# Preprocess Data and Generate Chunks

In [3]:
import re
from pathlib import Path

def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words repeated at the start of the next chunk.
            Must be smaller than ``chunk_size`` so the window always
            advances.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Text with no words yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # A step of zero or less would never move the window forward.
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This window already reached the last word; another one would
            # only repeat the overlapping tail of the chunk just emitted.
            break
    return chunks

def clean_text(text):
    """Normalize *text* for chunking.

    Removes every character that is not a word character, whitespace, or one
    of ``. , ; : ! ? ' " -``, then collapses each run of whitespace into a
    single space, trims the ends, and lowercases the result.

    Args:
        text: Raw text to normalize.

    Returns:
        The cleaned, lowercased text with single spaces between words.
    """
    # Strip disallowed characters first: removing a symbol that sits between
    # two spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,;:!?\'"-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

def process_documents(folder_path):
    """Clean and chunk every ``*.txt`` file directly inside *folder_path*.

    Files are processed in sorted path order so the order of the returned
    chunks is the same on every run, independent of how the filesystem
    happens to list the directory.

    Args:
        folder_path: Directory to scan (not recursive) for ``.txt`` files.

    Returns:
        A list of dicts, one per chunk, with keys ``"text"`` (the chunk
        produced by ``chunk_text`` from the ``clean_text`` output) and
        ``"source"`` (the name of the file the chunk came from). The list is
        empty when no ``.txt`` files are found.
    """
    all_chunks = []

    for file_path in sorted(Path(folder_path).glob("*.txt")):
        # Best-effort decoding: bytes that are not valid UTF-8 are dropped
        # rather than aborting the whole run.
        text = file_path.read_text(encoding='utf-8', errors='ignore')
        cleaned_text = clean_text(text)
        all_chunks.extend(
            {
                "text": chunk,
                "source": file_path.name,
            }
            for chunk in chunk_text(cleaned_text)
        )

    return all_chunks

# Run the preprocessing pipeline over the .txt files in the "Data" folder.
folder_path = "Data"
processed_chunks = process_documents(folder_path)
print(f"\nTotal chunks generated: {len(processed_chunks)}\n")

# Optional console preview of every chunk (disabled):
# for idx, chunk in enumerate(processed_chunks, 1):
#     print(f"Chunk #{idx}")
#     print(f"Source: {chunk['source']}")
#     print(f"Text Preview: {chunk['text'][:150]}...")
#     print(f"Length: {len(chunk['text'].split())} words")
#     print("-" * 80)

# Write all chunks to chunks.txt, numbered from 1, each record followed by
# an 80-dash separator line and a blank line.
with open("chunks.txt", "w", encoding="utf-8") as out_file:
    for number, record in enumerate(processed_chunks, start=1):
        out_file.writelines((
            f"Chunk #{number}\n",
            f"Source: {record['source']}\n",
            f"Text: {record['text']}\n",
            "-" * 80 + "\n\n",
        ))


Total chunks generated: 15

