### Example Code for Creating and Querying a ChromaDB with Chunks Generated from Yourbench
Adapted from UDA-benchmark. Run with the conda environment at `/tmp/zlu39/.conda_envs/UDA-benchmark`.

In [None]:
from datasets import load_dataset, DownloadMode

# Load the 'chunked' configuration of the Yourbench dataset from the Hub.
# FORCE_REDOWNLOAD bypasses any locally cached copy on every run.
dataset = load_dataset(
    'zhichul/0718_yourbench_five_papers',
    name='chunked',
    download_mode=DownloadMode.FORCE_REDOWNLOAD,
)
# Show the available splits, their features and row counts.
print(dataset)

Generating train split: 100%|██████████| 5/5 [00:00<00:00, 573.46 examples/s]

DatasetDict({
    train: Dataset({
        features: ['document_id', 'document_text', 'document_filename', 'document_metadata', 'raw_chunk_summaries', 'chunk_summaries', 'raw_document_summary', 'document_summary', 'summarization_model', 'chunks', 'multihop_chunks', 'chunk_info_metrics', 'chunking_model'],
        num_rows: 5
    })
})





In [20]:
# Flatten the per-document chunk lists into one flat list of chunk texts,
# keeping document order and, within a document, chunk order.
text_chunks = []
for doc_chunks in dataset['train']['chunks']:
    text_chunks.extend(piece['chunk_text'] for piece in doc_chunks)
# Report how many chunks were collected in total.
print(len(text_chunks))

520


In [17]:
import chromadb
import torch
import chromadb.utils.embedding_functions as embedding_functions

# Create (or reopen) the "demo_vdb" Chroma collection and store one record per
# text chunk. Only documents are passed in; the collection's embedding function
# is what turns them into vectors.
model_name = "all-MiniLM-L6-v2"
chroma_client = chromadb.Client()
# Run the sentence-transformer model on the GPU when one is available.
device_info = "cuda" if torch.cuda.is_available() else "cpu"
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name, device=device_info
)
# get_or_create=True lets this cell be re-run without failing when the
# collection already exists; "hnsw:space": "cosine" selects cosine distance.
collection = chroma_client.create_collection(
    "demo_vdb", embedding_function=ef, metadata={"hnsw:space": "cosine"}, get_or_create=True
)
# Each chunk's ID is its position in text_chunks, rendered as a string.
id_list = [str(i) for i in range(len(text_chunks))]
# upsert rather than add: because the collection may already exist, add() would
# skip IDs that are already present and keep stale documents if text_chunks
# changed between runs. upsert overwrites existing IDs and inserts new ones.
# NOTE(review): IDs beyond len(text_chunks) left by an earlier, longer run are
# not removed here — delete the collection first if the chunk count can shrink.
collection.upsert(documents=text_chunks, ids=id_list)

In [23]:
# Retrieve the top_k chunks that are closest to the question.
top_k = 5
question = "What is the goal of synthetic continued pretraining?"
requested_fields = ['metadatas', 'documents', 'distances', 'embeddings']
fetct_res = collection.query(
    query_texts=[question],
    n_results=top_k,
    include=requested_fields,
)
# Results are grouped per query text; a single question was sent, so take index 0.
contexts = fetct_res["documents"][0]

# Print every retrieved chunk under a numbered header.
print(f"The most relevant contexts to the question: {question}")
for rank, passage in enumerate(contexts, start=1):
    print(f"===== Context {rank} =======")
    print(passage)


The most relevant contexts to the question: What is the goal of synthetic continued pretraining?
; Schumann & Rehbein, 2019). Contemporary works employ co-training (Lang et al. , 2022) and self-training to improve language model performance, often on mathematical reasoning tasks (Huang et al. , 2023; Gulcehre et al. , 2023; Zhang et al. , 2024a), or synthesize input-output pairs for instruction tuning, usually by con- ditioning on a curated seed set (Wang et al. , 2023b; Honovich et al. , 2023; Taori et al. , 2023; Peng et al. , 2023; Yuan et al. , 2024b; Li et al. , 2024). Continual learning and pretraining. Continual learning is rooted in historical work on connec- tionist networks (McCloskey & Cohen, 1989; Ratcliff, 1990) and considers learning with tasks ar- riving in an online manner (Schlimmer & Fisher, 1986; Grossberg, 2012). The main focus is on mitigating a neural net’s “catastrophic forgetting” of previously encountered tasks (Robins, 
 2. Related Works  Post-training. Post-t

In [26]:
# Display the raw result object returned by collection.query for inspection.
fetct_res

{'ids': [['163', '395', '53', '61', '71']],
 'embeddings': [array([[-0.04351589, -0.06309311, -0.02049338, ...,  0.05304797,
           0.0011077 ,  0.00092487],
         [-0.05412519, -0.06544636,  0.01237015, ..., -0.01653033,
          -0.04229851,  0.01795014],
         [-0.09530476, -0.07220636, -0.02631474, ..., -0.06363436,
          -0.00936357,  0.02696936],
         [-0.03810095, -0.08134458,  0.05342281, ..., -0.03218824,
           0.00582854,  0.04326789],
         [-0.0706144 , -0.09598251,  0.0337091 , ..., -0.09405132,
          -0.04735962, -0.07118285]], shape=(5, 384))],
 'documents': [['; Schumann & Rehbein, 2019). Contemporary works employ co-training (Lang et al. , 2022) and self-training to improve language model performance, often on mathematical reasoning tasks (Huang et al. , 2023; Gulcehre et al. , 2023; Zhang et al. , 2024a), or synthesize input-output pairs for instruction tuning, usually by con- ditioning on a curated seed set (Wang et al. , 2023b; Honovic