Let’s Build a Realistic Mini Pipeline<br>
Below is a fully working example that:



    Downloads multiple large text files from the web.

    Saves them locally.

    Reads them back.

    Chunks them efficiently (token-based).


Download → Read → Chunk

In [1]:
import os
import requests
import tiktoken
from typing import List, Dict, Generator

In [3]:
# -----------------------------------------
# STEP 1: Download large text files
# -----------------------------------------
def download_files(urls: List[str], save_dir: str = "data"):
    os.makedirs(save_dir, exist_ok=True)
    downloaded_files = []
    for url in urls:
        filename = os.path.basename(url).split("?")[0] or "file.txt"
        filepath = os.path.join(save_dir, filename)
        print(f"Downloading {filename} ...")
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(response.content)
        downloaded_files.append(filepath)
    return downloaded_files


# -----------------------------------------
# STEP 2: Read files into memory
# -----------------------------------------
def load_text_files(folder_path: str):
    docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs.append({"id": filename, "text": text})
    return docs


# -----------------------------------------
# STEP 3: Token-based chunking
# -----------------------------------------
def chunk_documents(
    docs,
    max_tokens=500,
    overlap_tokens=100,
    model_name="gpt-3.5-turbo"
) -> Generator[Dict[str, str], None, None]:
    enc = tiktoken.encoding_for_model(model_name)

    for d in docs:
        tokens = enc.encode(d["text"])
        i = 0
        chunk_id = 0
        while i < len(tokens):
            chunk_tokens = tokens[i:i + max_tokens]
            text_chunk = enc.decode(chunk_tokens)
            yield {
                "doc_id": d["id"],
                "chunk_id": f"{d['id']}_chunk_{chunk_id}",
                "text": text_chunk
            }
            i += max_tokens - overlap_tokens
            chunk_id += 1


Usage

In [4]:
urls = [
    "https://www.gutenberg.org/cache/epub/1661/pg1661.txt",  # Sherlock Holmes
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",  # Moby Dick
    "https://www.gutenberg.org/cache/epub/1342/pg1342.txt"   # Pride and Prejudice
]

save_dir = "data"
download_files(urls, save_dir)

docs = load_text_files(save_dir)
print(f"✅ Loaded {len(docs)} documents")


Downloading pg1661.txt ...
Downloading pg2701.txt ...
Downloading pg1342.txt ...
✅ Loaded 3 documents


In [22]:
chunks = list(chunk_documents(docs, max_tokens=400, overlap_tokens=80))
print(f"✅ Created {len(chunks)} chunks")

✅ Created 1967 chunks


In [24]:


# Show a sample
for c in chunks[:3]:
    print("\n---")
    print("doc:", c["doc_id"])
    print("chunk:", c["chunk_id"])
    print("preview:", c["text"][:150])


---
doc: pg2701.txt
chunk: pg2701.txt_chunk_0
preview: ﻿The Project Gutenberg eBook of Moby Dick; Or, The Whale
    
This ebook is for the use of anyone anywhere in the United States and
most other parts o

---
doc: pg2701.txt
chunk: pg2701.txt_chunk_1
preview: CHAPTER 11. Nightgown.

CHAPTER 12. Biographical.

CHAPTER 13. Wheelbarrow.

CHAPTER 14. Nantucket.

CHAPTER 15. Chowder.

CHAPTER 16. The Ship.

CHAP

---
doc: pg2701.txt
chunk: pg2701.txt_chunk_2
preview: . Ahab’s Boat and Crew. Fedallah.

CHAPTER 51. The Spirit-Spout.

CHAPTER 52. The Albatross.

CHAPTER 53. The Gam.

CHAPTER 54. The Town-Ho’s Story.


