# Create HF vector database


In [None]:
from dotenv import load_dotenv

# Load environment variables from the .env file two directories above this notebook.
# NOTE(review): presumably this provides the API keys used by the OpenAI
# embedding/LLM clients in later cells — confirm against the .env contents.
load_dotenv("../../.env")

In [None]:
import nest_asyncio

# Apply the nest_asyncio patch before any async work is started.
# NOTE(review): presumably needed because later cells pass use_async=True while
# the notebook kernel already runs an event loop — confirm.
nest_asyncio.apply()

### Create a set of LlamaIndex Documents, one for each section in the JSONL file


In [None]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
import json
import pickle


def create_docs(input_file):
    """Build one llama-index ``Document`` per record of a JSON Lines file.

    Every non-blank line of ``input_file`` is parsed as a JSON object that must
    provide the keys ``doc_id``, ``content``, ``url``, ``name``, ``tokens``,
    ``retrieve_doc`` and ``source``.

    Args:
        input_file: Path of the ``.jsonl`` file to read.

    Returns:
        A list of ``Document`` objects, in the order the records appear in the
        file. The list is empty when the file has no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    documents = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "retrieve_doc": data["retrieve_doc"],
                        "source": data["source"],
                    },
                    # LLM will see the 'url' of each chunk
                    excluded_llm_metadata_keys=[
                        # "url",
                        "title",
                        "tokens",
                        "retrieve_doc",
                        "source",
                    ],
                    # Embedding model will embed the 'title' of each chunk
                    excluded_embed_metadata_keys=[
                        "url",
                        # "title",
                        "tokens",
                        "retrieve_doc",
                        "source",
                    ],
                )
            )
    return documents


# Select the dataset to index: exactly one create_docs(...) line is active;
# the commented lines are the alternative source files.
# documents = create_docs("../transformers_data.jsonl")
# documents = create_docs("../peft_data.jsonl")
# documents = create_docs("../trl_data.jsonl")
# documents = create_docs("../llama_index_data.jsonl")
documents = create_docs("../openai-cookbook_data.jsonl")
# Sanity check: show the first document and its metadata dict.
print(documents[0])
print(documents[0].metadata)

In [None]:
# Inspect the content of the first document as rendered for a given metadata
# mode. The commented-out call shows the MetadataMode.LLM rendering; the active
# call prints the MetadataMode.EMBED rendering.
# print(
#     "The LLM sees this: \n",
#     documents[0].get_content(metadata_mode=MetadataMode.LLM),
# )
print(
    "The Embedding model sees this: \n",
    documents[0].get_content(metadata_mode=MetadataMode.EMBED),
)

In [None]:
import chromadb

# create client and a new collection
# The collection name is also used as the on-disk directory name (../<name>).
DB_COLLECTION = "chroma-db-openai-cookbooks"
chroma_client = chromadb.PersistentClient(path=f"../{DB_COLLECTION}")
# NOTE(review): create_collection presumably fails if the collection already
# exists, so re-running this cell against an existing database may need
# get_or_create_collection or a fresh directory — confirm.
chroma_collection = chroma_client.create_collection(DB_COLLECTION)


from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Define a storage context object using the created vector database.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Map each doc_id to its full Document; a later cell uses this mapping to swap
# a retrieved chunk for its whole source document.
document_dict = {doc.doc_id: doc for doc in documents}
# The pickle is stored inside the same directory as the Chroma database.
DOCUMENT_NAME = f"../{DB_COLLECTION}/document_dict_openai.pkl"

with open(DOCUMENT_NAME, "wb") as f:
    pickle.dump(document_dict, f)

# Reload path, kept for reference. Only unpickle files you produced yourself:
# pickle.load executes arbitrary code from untrusted data.
# with open(DOCUMENT_NAME, "rb") as f:
#     document_dict = pickle.load(f)

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

# Build the vector index from the loaded documents:
#   - documents are passed through SentenceSplitter(chunk_size=800, chunk_overlap=400),
#   - embeddings come from OpenAI's "text-embedding-3-large" model,
#   - results are stored through storage_context (backed by the Chroma collection).
# NOTE(review): this presumably issues paid OpenAI embedding requests for every
# chunk — confirm before re-running.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
    transformations=[SentenceSplitter(chunk_size=800, chunk_overlap=400)],
    show_progress=True,
    use_async=True,
    storage_context=storage_context,
)

### Test the DB

In [None]:
# Create a retriever over the index, configured with similarity_top_k=10 and
# the same embedding model/mode that was used to build the index.
retriever = index.as_retriever(
    similarity_top_k=10,
    use_async=True,
    embed_model=OpenAIEmbedding(model="text-embedding-3-large", mode="similarity"),
)

In [None]:
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore, BaseNode, TextNode


# Test query; the commented lines are alternative phrasings to try.
# query = "fine-tune a pretrained model"
# query = "fine-tune an llm"
query = "how to fine-tune an llm?"

# nodes_context collects the nodes that will be handed to the response
# synthesizer; it is filled by the loop further down in this cell.
nodes_context = []
nodes = retriever.retrieve(query)


# Filter nodes with the same ref_doc_id
def filter_nodes_by_unique_doc_id(nodes):
    """Keep only the first node encountered for each source document.

    Args:
        nodes: Iterable of objects exposing ``.node.ref_doc_id``.

    Returns:
        A list containing, in first-seen order, one node per distinct
        ``ref_doc_id``. Nodes whose ``ref_doc_id`` is ``None`` are dropped.
    """
    first_seen = {}
    for candidate in nodes:
        ref_id = candidate.node.ref_doc_id
        # Skip nodes without a parent document and any later duplicates.
        if ref_id is None or ref_id in first_seen:
            continue
        first_seen[ref_id] = candidate
    return [*first_seen.values()]


# Keep a single retrieved node per source document, then report how many remain.
nodes = filter_nodes_by_unique_doc_id(nodes)
print(len(nodes))

# Print each retrieved node and build nodes_context, the list later passed to
# the response synthesizer.
for node in nodes:
    print("Node ID\t", node.node_id)
    print("Title\t", node.metadata["title"])
    print("Text\t", node.text)
    print("Score\t", node.score)
    print("Metadata\t", node.metadata)
    print("-_" * 20)
    # NOTE(review): `== True` only matches values equal to True; if
    # retrieve_doc is always a bool a plain truthiness test would do — confirm.
    if node.metadata["retrieve_doc"] == True:
        print("This node will be replaced by the document")
        # Look up the full source document by the chunk's ref_doc_id.
        doc = document_dict[node.node.ref_doc_id]
        # print(doc.text)
        # Wrap the full document text in a new node that keeps the chunk's
        # metadata and retrieval score.
        new_node = NodeWithScore(
            node=TextNode(text=doc.text, metadata=node.metadata), score=node.score
        )
        print(new_node.text)
        nodes_context.append(new_node)
    else:
        # Otherwise the retrieved chunk itself is used as context.
        nodes_context.append(node)

print(len(nodes_context))

In [None]:
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import ChatMessage, MessageRole
from pydantic import BaseModel, Field

# System prompt for the question-answering step. It is assembled from adjacent
# string literals, so every fragment must end with its own separator (a space
# or newline); otherwise consecutive sentences are fused together.
# "{context_str}" is the placeholder that receives the retrieved context.
system_prompt = (
    "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine-tuning models, giving 'memory' to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, Llama-Index, LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context. "
    "You are provided information found in Hugging Face's documentation and the RAG course. "
    "Only some information might be relevant to the question, so ignore the irrelevant part and use the relevant part to answer the question. "
    "Only respond with information given to you documentation. DO NOT use additional information, even if you know the answer. "
    "If the answer is somewhere in the documentation, answer the question (depending on the questions and the variety of relevant information in the documentation, give complete and helpful answers. "
    "Here is the information you can use, the order is not important: \n\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n\n"
    "REMEMBER:\n"
    "You are a witty AI teacher, helpfully answering questions from students of an applied artificial intelligence course on Large Language Models (LLMs or llm). Topics covered include training models, fine tuning models, giving memory to LLMs, prompting, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, making LLMs interact with tool use, AI agents, reinforcement learning with human feedback. Questions should be understood with this context. "
    "You are provided information found in Hugging Face's documentation and the RAG course. "
    "Here are the rules you must follow:\n"
    "* Only respond with information inside the documentation. DO NOT provide additional information, even if you know the answer. "
    "* If the answer is in the documentation, answer the question (depending on the questions and the variety of relevant information in the json documentation. Your answer needs to be pertinent and not redundant giving a clear explanation as if you were a teacher. "
    "* Only use information summarized from the documentation, do not respond otherwise. "
    "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
    "* Do not reference any links, urls or hyperlinks in your answers.\n"
    "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
    "Now answer the following question: \n"
)

# Two-message chat prompt: the system message carries system_prompt (which
# contains the "{context_str}" placeholder) and the user message carries the
# "{query_str}" placeholder.
chat_text_qa_msgs: list[ChatMessage] = [
    ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
    ChatMessage(
        role=MessageRole.USER,
        content="{query_str}",
    ),
]

# Prompt template handed to the response synthesizer as text_qa_template.
TEXT_QA_TEMPLATE = ChatPromptTemplate(chat_text_qa_msgs)

In [None]:
from IPython.display import Markdown
from llama_index.core.data_structs import Node
from llama_index.core.schema import NodeWithScore
from llama_index.core import get_response_synthesizer
from llama_index.llms.gemini import Gemini
from llama_index.llms.openai import OpenAI

# Choose the LLM used to write the answer: exactly one assignment is active;
# the commented lines are alternative models.
# llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=None)
# llm = Gemini(model="models/gemini-1.5-pro", temperature=1, max_tokens=None)
# llm = OpenAI(temperature=1, model="gpt-3.5-turbo", max_tokens=None)
llm = OpenAI(temperature=1, model="gpt-4o-mini", max_tokens=None)

# Synthesizer configured with response_mode="simple_summarize" and the custom
# chat prompt defined in TEXT_QA_TEMPLATE.
response_synthesizer = get_response_synthesizer(
    llm=llm, response_mode="simple_summarize", text_qa_template=TEXT_QA_TEMPLATE
)

# Answer the test query using the context nodes assembled earlier.
response = response_synthesizer.synthesize(query, nodes=nodes_context)
# print(response.response)
# NOTE(review): `display` is not imported in this cell; presumably it is the
# built-in provided by the Jupyter/IPython runtime — confirm if run elsewhere.
display(Markdown(response.response))

# for src in response.source_nodes:
#     print(src.node.ref_doc_id)
#     print("Node ID\t", src.node_id)
#     print("Title\t", src.metadata["title"])
#     print("Text\t", src.text)
#     print("Score\t", src.score)
#     print("Metadata\t", src.metadata)
#     print("-_" * 20)