# Working With ChromaDB
We will be using ChromaDB as our VectorDB. This allows us to be able to use OpenAI embeddings to query our collection (Although, we are not only limited to OpenAI embeddings).


In [None]:
# %pip install llama-index sentence-transformers huggingface-hub
# %pip install pydantic --upgrade
# %pip install langchain


# Init Llama Index

In [11]:
# import
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index import SimpleDirectoryReader, ServiceContext
from llama_index import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex
from llama_index.composability import ComposableGraph
from llama_index.llms import OpenAI
from llama_index.embeddings import HuggingFaceEmbedding
from IPython.display import Markdown, display
import os
import urllib.request
import chromadb
from dotenv import load_dotenv
load_dotenv()
import openai
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, set_global_service_context
import nest_asyncio
import logging
import sys





# optionally set a global service context


# ChromaDB client 
chroma_client = chromadb.Client()

# OpenAI API key

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Embedding functions
import chromadb.utils.embedding_functions as embedding_functions
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=OPENAI_API_KEY,
                model_name="text-embedding-ada-002"
            )


from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from llama_index import ServiceContext, set_global_service_context

embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")
# embed_model = OpenAIEmbedding(embed_batch_size=10)

hf_embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)



service_context = ServiceContext.from_defaults(embed_model=embed_model)

# optionally set a global service context
set_global_service_context(service_context)

In [None]:


# # Create directory if it doesn't exist
# os.makedirs('data/paul_graham/', exist_ok=True)

# # Download the file
# url = 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt'
# file_path = 'data/paul_graham/paul_graham_essay.txt'
# urllib.request.urlretrieve(url, file_path)

## Document Loader

In [13]:


nest_asyncio.apply()
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
reader = SimpleDirectoryReader("./necrovox_docs/")
documents = reader.load_data()

In [None]:
from llama_index.node_parser import SentenceSplitter

nodes = SentenceSplitter().get_nodes_from_documents(documents)

In [None]:
from llama_index.storage.docstore import SimpleDocumentStore

docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

## Creating Collection

## Changing the distance function
create_collection also takes an optional metadata argument which can be used to customize the distance method of the embedding space by setting the value of hnsw:space.

```python
 collection = client.create_collection(
        name="collection_name",
        metadata={"hnsw:space": "cosine"} # l2 is the default
    )

```

Valid options for hnsw:space are "l2", "ip, "or "cosine". The default is "l2" which is the squared L2 norm.

In [None]:
# collection = client.create_collection(name="necromunda", embedding_function=openai_ef)

In [8]:
collection = chroma_client.get_or_create_collection(name="necromunda", embedding_function=openai_ef)
# Alt embed function 2
# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# collection = chroma_client.get_or_create_collection(name="necromunda", embedding_function=embed_model)

### Ephemeral Collection (Optional)

In [14]:
# create client and a new collection
# chroma_client = chromadb.EphemeralClient()
# chroma_collection = chroma_client.create_collection("quickstart")

# define embedding function and embed model
embed_model = OpenAIEmbedding(embed_batch_size=10)
# embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# load documents
# documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, service_context=service_context
)

# Query Data
query_engine = index.as_query_engine()
response = query_engine.query("How do you make an injury roll?")
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.open

<b>To make an injury roll, the opposing player rolls two six-sided dice (D6) one after the other, also known as a D66 roll. The result of the roll is then looked up on the Lasting Injuries table. If there are multiple Out of Action results from the injury roll, a separate roll is made for each result on the Lasting Injuries table. This process is typically done during Campaign play, but can be skipped during Skirmish play if desired.</b>

### Persistent Collection (Optional)


#### Save to Disk

In [15]:
# save to disk

db = chromadb.PersistentClient(path="./necromunda_db/")
chroma_collection = db.get_or_create_collection("necromunda")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, service_context=service_context, show_progress=True
)
index.storage_context.persist(persist_dir="./necromunda_index")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Parsing nodes:   0%|          | 0/1896 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1956 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.open

#### Load from Disk

In [9]:

# load from disk
db = chromadb.PersistentClient(path="./necromunda_db/")
chroma_collection = db.get_or_create_collection("necromunda")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store,
    service_context=service_context,
)

In [17]:
# Query Data from the persisted index
query_engine = index.as_query_engine()
response = query_engine.query("How would I build a 1000 point Cawdor gang?  Please provide a list of fighters and equipment.")
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>To build a 1000 point Cawdor gang, you would need to follow the guidelines provided in the context information. The gang should have 1,250 to 2,000 credits, but in this case, you are aiming for 1000 points. 

Based on the given information, you would need a minimum of 10 fighters and one vehicle, and a maximum of 20 fighters and four vehicles. The gang composition rules for a campaign would also apply.

For the Cawdor gang, you can include Leaders and Champions who get a free starting skill or ability. You would need to record this on the fighter's Fighter card.

When it comes to equipment, you should follow the standard equipment rules. You can also include rare and/or illegal items from the Trading Post, but you need to pre-determine the Rare (X) and Illegal (X) level, such as up to Rare (10).

Additionally, you can hire Hired Guns, Brutes, and Hangers-on. They do not count towards the minimum number of fighters but do count towards the maximum number of fighters. The number allowed is detailed in the Reputation section, and they must be represented by appropriate models.

Unfortunately, without further information or a specific list of fighters and equipment for a 1000 point Cawdor gang, I cannot provide a detailed list.</b>

## Fine-Tuning

### Fine-Tuning Imports

In [None]:
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

### Autotrain with HuggingFace

#### Install Autotrain if needed

In [None]:
%pip install autotrain-advanced
# !pip install huggingface_hub
!python -m autotrain setup --update-torch 

### Login to HuggingFace

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [18]:
def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [19]:
train_nodes = load_corpus([".\\necrovox_docs\\rules.md", ".\\necrovox_docs\\gangs.md"], verbose=True)

Loading files ['.\\necrovox_docs\\rules.md', '.\\necrovox_docs\\gangs.md']
Loaded 948 docs


NameError: name 'SentenceSplitter' is not defined

In [None]:
val_nodes = load_corpus([".\\necrovox_docs\\rules.md", ".\\necrovox_docs\\gangs.md"], verbose=True)

In [21]:
from llama_index.finetuning import (
    generate_qa_embedding_pairs,
    EmbeddingQAFinetuneDataset,
)

In [20]:
llm = OpenAI(model="gpt-3.5-turbo")

In [22]:
from llama_index.finetuning.cross_encoders.cross_encoder import (
    CrossEncoderFinetuneEngine,
)

from llama_index.finetuning.cross_encoders.dataset_gen import (
    generate_ce_fine_tuning_dataset,
    generate_synthetic_queries_over_documents,
)


In [33]:
retriver = index.as_retriever()

In [37]:
dir(index)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_add_nodes_to_index',
 '_aget_node_with_embedding',
 '_async_add_nodes_to_index',
 '_build_index_from_nodes',
 '_delete_node',
 '_docstore',
 '_get_node_with_embedding',
 '_graph_store',
 '_index_struct',
 '_insert',
 '_insert_batch_size',
 '_is_protocol',
 '_service_context',
 '_show_progress',
 '_storage_context',
 '_store_nodes_override',
 '_use_async',
 '_vector_store',
 'as_chat_engine',
 'as_query_engine',
 'as_retriever',
 'build_index_from_nodes',
 'delete',
 'delete

In [36]:
query_list = generate_synthetic_queries_over_documents(
    documents=retriver, llm=llm
)

TypeError: 'VectorIndexRetriever' object is not iterable

In [None]:
train_dataset = generate_qa_embedding_pairs(train_nodes, llm=llm)
val_dataset = generate_qa_embedding_pairs(val_nodes, llm=llm)

train_dataset.save_json("train_dataset2.json")
val_dataset.save_json("val_dataset2.json")

In [None]:
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset2.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset2.json")

In [None]:
import json


# load data in from .jsonl format
def load_dataset_from_other_nb(path):
    fp = open(path, "r")
    qr_pairs = []
    for line in fp:
        qa_pair = json.loads(line)
        query_str = qa_pair["query"]
        response_str = qa_pair["response"]
        qr_pairs.append((query_str, response_str))

    return qr_pairs

In [None]:
# qr_pairs = load_dataset_from_other_nb("train_dataset2.json")
# eval_dataset = QueryResponseDataset.from_qr_pairs(qr_pairs)

In [None]:
vector_index = VectorStoreIndex(nodes)

In [None]:
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)

In [None]:
eval_dataset = QueryResponseDataset.from_json("./train_dataset2.json")

In [None]:
from llama_index.prompts import PromptTemplate

qa_prompt_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

vector_retriever = vector_index.as_retriever(similarity_top_k=1)

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine


In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# finetune_engine = SentenceTransformersFinetuneEngine(
#     train_dataset,
#     model_id="BAAI/bge-small-en-v1.5",
#     model_output_path="test_model",
#     val_dataset=val_dataset,

# )

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en-v1.5",
    model_output_path="test_model",
    val_dataset=val_dataset,

)

In [None]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained("BAAI/bge-small-en-v1.5")
config.save_pretrained("test_model")


In [None]:
config2 = AutoConfig.from_pretrained("test_model")

In [None]:
config

In [None]:
config.save_pretrained(save_directory="./necromunda_model")

In [None]:
finetune_engine.epochs = 4

In [None]:
finetune_engine.model_output_path ="./necromunda_model"

In [None]:
finetune_engine.model_output_path ="test_model"

In [None]:
finetune_engine.finetune()

In [None]:
embed_model

In [None]:
from llama_index.embeddings import OpenAIEmbedding
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=True,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, service_context=service_context, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]
        is_hit = expected_id in retrieved_ids  # assume 1 relevant doc

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }
        eval_results.append(eval_result)
    return eval_results

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(
    dataset,
    model_id,
    name,
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    output_path = "results/"
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [None]:
# %pip install jupyter --upgrade

In [None]:
# %pip install ipywidgets --upgrade

In [None]:
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

In [None]:
df_ada = pd.DataFrame(ada_val_results)

In [None]:
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

In [None]:
# bge = "local:BAAI/bge-small-en"
bge = "local:BAAI/bge-small-en-v1.5"
# "BAAI/bge-small-en-v1.5"
bge_val_results = evaluate(val_dataset, bge)

In [None]:
df_bge = pd.DataFrame(bge_val_results)

In [None]:
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

In [None]:
# evaluate_st(val_dataset, "BAAI/bge-small-en", name="bge")
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")


In [None]:
finetuned = "local:test_model"
val_results_finetuned = evaluate(val_dataset, finetuned)

In [None]:
df_finetuned = pd.DataFrame(val_results_finetuned)

In [None]:
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

In [None]:
evaluate_st(val_dataset, "test_model", name="finetuned")

In [None]:
df_ada["model"] = "ada"
df_bge["model"] = "bge"
df_finetuned["model"] = "fine_tuned"

In [None]:
df_all = pd.concat([df_ada, df_bge, df_finetuned])
df_all.groupby("model").mean("is_hit")

In [None]:
df_st_bge = pd.read_csv(
    "results/Information-Retrieval_evaluation_bge_results.csv"
)
df_st_finetuned = pd.read_csv(
    "results/Information-Retrieval_evaluation_finetuned_results.csv"
)

In [None]:
df_st_bge["model"] = "bge"
df_st_finetuned["model"] = "fine_tuned"
df_st_all = pd.concat([df_st_bge, df_st_finetuned])
df_st_all = df_st_all.set_index("model")
df_st_all

In [None]:
ft_model = finetune_engine.get_finetuned_model()

In [None]:
ft_model.query_instruction

### Update and Delete Documents

In [None]:
doc_to_update = chroma_collection.get(limit=1)
doc_to_update["metadatas"][0] = {
    **doc_to_update["metadatas"][0],
    **{"author": "Paul Graham"},
}
chroma_collection.update(
    ids=[doc_to_update["ids"][0]], metadatas=[doc_to_update["metadatas"][0]]
)
updated_doc = chroma_collection.get(limit=1)
print(updated_doc["metadatas"][0])

# delete the last document
print("count before", chroma_collection.count())
chroma_collection.delete(ids=[doc_to_update["ids"][0]])
print("count after", chroma_collection.count())

## Adding Documents

In [None]:

with open("D:\\I_Drive_Backup\\Projects\\necroman\\necroman\\necroman_python\\necrovox_docs\\rules.md", "r", encoding="utf-8") as f:
    rules_text = f.readlines()
    rules_text = "".join(rules_text)

    # chunk the text into 1000 character chunks
    rules_text = [rules_text[i:i+1000] for i in range(0, len(rules_text), 1000)]

    documents = []
    metadatas = []
    ids = []

    for i, chunk in enumerate(rules_text):
        documents.append(chunk)
        metadatas.append({
            "name": "rules",
            "source": "necrovox_docs",
            "type": "markdown",
            "content-type" : "rules",
            "content-subtype" : "core",
            })
        ids.append(f"rules_{i}")

    # create a document
    collection.add(
        documents=documents,
        metadatas=metadatas,
            ids=ids
    )

In [None]:
with open("D:\\I_Drive_Backup\\Projects\\necroman\\necroman\\necroman_python\\necrovox_docs\\gangs.md", "r", encoding="utf-8") as f:
    rules_text = f.readlines()
    rules_text = "".join(rules_text)

    # chunk the text into 1000 character chunks
    rules_text = [rules_text[i:i+1000] for i in range(0, len(rules_text), 1000)]

    documents = []
    metadatas = []
    ids = []

    for i, chunk in enumerate(rules_text):
        documents.append(chunk)
        metadatas.append({
            "name": "gangs",
            "source": "necrovox_docs",
            "type": "markdown",
            "content-type" : "rules",
            "content-subtype" : "gangs",
            })
        ids.append(f"gangs_{i}")

    # create a document
    collection.add(
        documents=documents,
        metadatas=metadatas,
            ids=ids
    )

In [None]:
results = collection.query(
    query_texts=["How do I roll on the injury table?"],
    n_results=2
)


In [None]:
results

In [None]:
collection.delete(ids=["id1", "id2"])

In [None]:
collection.get(
    ids=["rules_1", "rules_2"],
    where={"content-type": "rules"}
)

In [None]:
client.heartbeat()

In [None]:
client = chromadb.PersistentClient(path="necromunda.db")

In [None]:
%pip install chromadb openai