In [4]:
from typing import List

def split_into_chunks(doc_file: str) -> List[str]:
    """Splits a doc into lists of chunks.

    Args:
        doc_file: The doc to be split.

    Returns:
        List[str]: A list of chunks.
    """

    with open(doc_file, 'r', encoding='utf-8') as file:
        content = file.read()

    return [chunk for chunk in content.split('\n\n')]

print("Splitting 'PridePrejudice.txt' into chunks...\n")

chunks = split_into_chunks('PridePrejudice.txt')

for i, chunk in enumerate(chunks):
    print(f"Chunk {i}:{chunk}\n")

Splitting 'PridePrejudice.txt' into chunks...

Chunk 0:﻿The Project Gutenberg eBook of Pride and Prejudice
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Chunk 1:Title: Pride and Prejudice

Chunk 2:Author: Jane Austen

Chunk 3:Release date: June 1, 1998 [eBook #1342]
                Most recently updated: September 22, 2025

Chunk 4:Language: English

Chunk 5:Credits: Chuck Greif and the Online Distributed Proofreading Team at http://www.pgdp.net (This file was produced from images available at The Internet Archive)

Chunk 6:
*** START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***



In [5]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def embed_chunk(chunk: str) -> List[float]:
    embedding = embedder.encode(chunk)
    return embedding.tolist()

test_embedding = embed_chunk("test chunk")
print(f"Embedding for 'test chunk': {test_embedding}\n")
print(f"Length of embedding: {len(test_embedding)}\n")

  from .autonotebook import tqdm as notebook_tqdm


Embedding for 'test chunk': [-0.036458637565374374, 0.04724870249629021, -0.003158319741487503, -0.006889220792800188, 0.005261742975562811, -0.07006584852933884, 0.008725923486053944, 0.01518509816378355, -0.03395070508122444, -0.01867808774113655, -0.02176148071885109, -0.08186377584934235, -0.06083095073699951, -0.015296144410967827, 0.06043209135532379, -0.0961143746972084, 0.010825298726558685, 0.020066402852535248, -0.04342357814311981, -0.051691994071006775, -0.001781388302333653, -0.007015865296125412, -0.04097861051559448, 0.05564867705106735, 0.06080647185444832, 0.0465521402657032, -0.0611729770898819, -0.02040955424308777, 0.05964475870132446, -0.0669371709227562, 0.09273034334182739, -0.030671831220388412, 0.016356755048036575, 0.030302146449685097, 0.0891784206032753, 0.016223398968577385, 0.004152817185968161, 0.031111542135477066, 0.004403314087539911, 0.005862946156412363, 0.027244165539741516, -0.07630465924739838, 0.03954094648361206, 0.01670697331428528, 0.026030860

In [6]:
embeddings = [embed_chunk(chunk) for chunk in chunks]
print(f"Generated {len(embeddings)} embeddings.\n")
print(f"First embedding: {embeddings[0]}\n")


Generated 2695 embeddings.

First embedding: [-0.023426789790391922, -0.010952586308121681, -0.031114036217331886, 0.017601247876882553, 0.006154946051537991, 0.11365875601768494, -0.026358095929026604, -0.03085598535835743, -0.030203621834516525, 0.02281181886792183, 0.001973570790141821, 0.05235270783305168, -0.02917432226240635, -0.02552647516131401, -0.08323349803686142, 0.010747233405709267, 0.012266529724001884, 0.04029672592878342, 0.02920977957546711, -0.04604239761829376, -0.029710358008742332, 0.0353192463517189, 0.031167134642601013, 0.026255052536725998, -0.0009883471066132188, -0.04884086921811104, 0.03049473650753498, 0.009913278743624687, -0.035050418227910995, -0.008327393792569637, 0.023207200691103935, -0.027868246659636497, 0.09484678506851196, -0.05346160754561424, -0.041333407163619995, -0.03052891232073307, 0.057773344218730927, -0.023918770253658295, 0.017471862956881523, 0.034345805644989014, -0.028945814818143845, -0.027677519246935844, -0.007710382807999849, 0

In [8]:
import chromadb

chromadb_client = chromadb.EphemeralClient()
chromadb_collection = chromadb_client.get_or_create_collection(name="pride_prejudice")

def save_embeddings(chunks: List[str], embeddings: List[List[float]]):
    """Saves chunks and their embeddings to ChromaDB.

    Args:
        chunks: List of text chunks.
        embeddings: Corresponding list of embeddings.
    """
    ids = [str(i) for i in range(len(chunks))]
    chromadb_collection.add(
        documents=chunks,
        embeddings=embeddings,
        ids=ids
    )


save_embeddings(chunks, embeddings)