In [2]:
import os

from second_brain.config import settings

# Publish the project-configured OpenAI key into this process's environment.
# Presumably the OpenAI-backed clients created in later cells pick it up from
# there instead of receiving it explicitly -- TODO confirm.
os.environ.update(OPENAI_API_KEY=settings.OPENAI_API_KEY)

In [3]:
from langchain_mongodb.retrievers import (
    MongoDBAtlasParentDocumentRetriever,
)

from second_brain.application.rag import get_splitter
from second_brain.application.rag.embeddings import EmbeddingModelBuilder

# Embedding model produced by the project's builder; it is handed to the
# retriever below.
embedding_model = EmbeddingModelBuilder().get_model()

# All retriever settings gathered in one mapping so they can be inspected or
# tweaked before the retriever is built.
# NOTE(review): 200 / 800 look like chunk sizes for the child and parent
# splitters -- confirm against `get_splitter`.
retriever_options = {
    "connection_string": settings.MONGODB_URI,
    "embedding_model": embedding_model,
    "child_splitter": get_splitter(200),
    "parent_splitter": get_splitter(800),
    "database_name": settings.MONGODB_DATABASE_NAME,
    "collection_name": "rag_data",
    "text_key": "page_content",
    "search_kwargs": {"k": 10},
}

# Parent-document retriever built from the MongoDB connection string and the
# options above.
parent_doc_retriever = MongoDBAtlasParentDocumentRetriever.from_connection_string(
    **retriever_options
)

In [4]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

def join_page_contents(docs):
    """Concatenate the `page_content` of each document, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# First stage of the chain: the incoming question is sent to the retriever
# (whose documents are flattened into one context string) and is also passed
# through unchanged under the "question" key.
retrieve = {
    "context": parent_doc_retriever | join_page_contents,
    "question": RunnablePassthrough(),
}

# Prompt text; the trailing backslash on the first line keeps `{context}` on
# the same line as the instruction.
template = """Answer the question based only on the following context. If no context is provided, respond with I DON'T KNOW: \
{context}

Question: {question}
"""
# Chat prompt built from the template above
prompt = ChatPromptTemplate.from_template(template)
# Chat-completion model, pinned to a dated snapshot with temperature 0
llm = ChatOpenAI(model="gpt-4o-2024-11-20", temperature=0)
# Converts the model's reply into a plain string
parse_output = StrOutputParser()
# Naive RAG chain: retrieve -> prompt -> model -> string
rag_chain = (
    retrieve
    | prompt
    | llm
    | parse_output
)

In [5]:
# Ask the RAG chain a question and print the answer it returns.
question = "How can I optimize LLMs for inference?"
answer = rag_chain.invoke(question)
print(answer)


To optimize LLMs for inference, you can use the following techniques:

1. **Lower Precision (Quantization):** Operate at reduced numerical precision, such as 8-bit or 4-bit, to achieve computational advantages without a significant decline in model performance. This reduces memory requirements and speeds up inference.

2. **Flash Attention:** Use Flash Attention, a memory-efficient variation of the attention algorithm, which optimizes GPU memory utilization by relying on faster on-chip memory (SRAM) instead of slower VRAM. It provides mathematically identical outputs while being faster and more memory-efficient.

3. **Architectural Innovations:** Leverage specialized model architectures designed for efficient inference in autoregressive text generation. Examples include Alibi, Rotary embeddings, Multi-Query Attention (MQA), and Grouped-Query Attention (GQA). These innovations improve the handling of long input contexts and reduce computational overhead.


In [6]:
# Ask the RAG chain a question and print the answer it returns.
question = "What is RLHF?"
answer = rag_chain.invoke(question)
print(answer)

RLHF stands for Reinforcement Learning from Human Feedback. It is a technique used to fine-tune models, such as language models (LMs), by incorporating human feedback to guide the learning process. This feedback can take the form of preferences, scores, or other evaluative signals provided by humans, which are used to train a reward model. The reward model then helps optimize the policy of the model using reinforcement learning algorithms like PPO (Proximal Policy Optimization). RLHF has been applied to improve the performance of large language models by aligning their outputs more closely with human preferences and values.


In [8]:
# Ask the RAG chain a question and print the answer it returns; the prompt
# tells the model to reply "I DON'T KNOW" when no context is provided.
question = "What is RAGAS?"
answer = rag_chain.invoke(question)
print(answer)

I DON'T KNOW
