In [None]:
from dotenv import load_dotenv
from pathlib import Path

# Load settings from a .env file into the environment before any client is built.
load_dotenv(override=True)
# Parent of the resolved current working directory; later cells build the data
# path and the vector-store path from it.
base_dir = Path().resolve().parent

In [None]:
from langchain_openai import ChatOpenAI

# Chat model shared by every chain in this notebook.
llm_model = "gpt-3.5-turbo"
llm = ChatOpenAI(
    model=llm_model,
    temperature=0,
)

In [None]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

# Chunking parameters shared by both splitters.
chunk_size = 600
chunk_overlap = 150

# Separator list handed to the recursive splitter.
recursive_separators = ["\n", ".", " ", ""]
r_splitter = RecursiveCharacterTextSplitter(
    separators=recursive_separators,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Single-separator splitter configured with the same sizes.
c_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# The course outline PDF lives in the data folder under base_dir.
course_outline_path = f"{base_dir}/data/CourseOutline.pdf"
loader = PyPDFLoader(course_outline_path)
pages = loader.load()
len(pages)

In [None]:
# Split the loaded PDF pages into chunks using the recursive splitter.
spitted_docs = r_splitter.split_documents(pages)

# Only the final expression of a cell is echoed, so the chunk count has to be
# printed explicitly; as a bare mid-cell expression its value was discarded.
print(len(spitted_docs))

# Preview the text of the first chunk.
spitted_docs[0].page_content

In [None]:
# Folder (under base_dir) where the vector store is persisted.
persist_directory_name = "persist_vectorstore"
persist_directory = base_dir / persist_directory_name
# POSIX-style string form of the same path, for APIs that take a string.
persist_directory_str = persist_directory.as_posix()
persist_directory_str

In [None]:
import shutil

# Delete the previously persisted vector store before rebuilding it.
# The old shell command targeted "../peripersist_vectorstore", which does not
# match persist_directory ("persist_vectorstore" under base_dir), so the stale
# store was never removed. Using the variable keeps the two in sync, and
# ignore_errors=True mirrors `rm -rf` when the folder does not exist yet.
shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
# Chroma is imported from langchain_community (already used by this notebook
# for PyPDFLoader); the `langchain.vectorstores` path is a deprecated alias.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed every chunk and persist the resulting store at persist_directory_str.
vectordb = Chroma.from_documents(
    documents=spitted_docs, embedding=embedding, persist_directory=persist_directory_str
)

In [None]:
# Chroma is imported from langchain_community (already used by this notebook
# for PyPDFLoader); the `langchain.vectorstores` path is a deprecated alias.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Reuse the persisted vector store: open it from persist_directory_str with the
# same embedding model instead of building it from the documents again.
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory_str)

In [None]:
# Sample query used by the similarity-search cell below.
question = "How many topics in the course?"

In [None]:
# Max-marginal-relevance search for the sample question (k=2, fetch_k=3).
similar_vector_embeds = vectordb.max_marginal_relevance_search(
    question,
    fetch_k=3,
    k=2,
)
len(similar_vector_embeds)

In [None]:
# Migrating from RetrievalQA (deprecated) to the LCEL implementation
# https://python.langchain.com/docs/versions/migrating_chains/retrieval_qa/
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Pull the "rlm/rag-prompt" prompt through the hub client.
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Merge documents into one string, with a blank line between page contents.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents joined by two newlines (empty string for no docs).
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


# The chain input is the bare question string. It goes to the retriever, whose
# results are flattened by format_docs, and is also passed through unchanged.
qa_inputs = {
    "context": vectordb.as_retriever() | format_docs,
    "question": RunnablePassthrough(),
}
qa_chain = qa_inputs | prompt | llm | StrOutputParser()

qa_chain.invoke("How many lecture in the course?")

In [None]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage, AIMessage
from pydantic import BaseModel, Field


class InMemoryHistory(BaseChatMessageHistory, BaseModel):
    """Chat message history kept in a plain list on the instance.

    Messages live only in the ``messages`` field; this class writes them
    nowhere else.
    """

    # Stored messages; default_factory gives each instance its own empty list.
    messages: list[BaseMessage] = Field(default_factory=list)

    def add_messages(self, messages: list[BaseMessage]) -> None:
        """Append ``messages`` to the end of the stored history.

        Args:
            messages: Messages to add, in the order they should be stored.
        """
        self.messages.extend(messages)

    def clear(self) -> None:
        """Drop every stored message by rebinding ``messages`` to a new empty list."""
        self.messages = []


# Module-level registry of chat histories, keyed by session id
# (filled by get_by_session_id below).
# Keeping it global makes it easy to print and inspect after each chain call.
store = {}


def get_by_session_id(session_id: str) -> BaseChatMessageHistory:
    """Return the history registered for ``session_id``, creating it on first use.

    A new, empty InMemoryHistory is stored in the module-level ``store`` the
    first time an id is seen; later calls return that same object.

    Args:
        session_id: Key identifying the conversation.

    Returns:
        The history object held in ``store`` for that key.
    """
    try:
        return store[session_id]
    except KeyError:
        store[session_id] = InMemoryHistory()
        return store[session_id]


# Smoke test: fetch (or create) the history for session "1", add one AI
# message to it, and print the registry to inspect the result.
history = get_by_session_id("1")
history.add_message(AIMessage(content="hello"))
print(store)  # noqa: T201

In [None]:
# Experiment: refining the question based on the sample split document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda

refine_template = (
    "You are a helpful assistant. Detect what the context might about and Rewrite the query to match the given context. "
    "If the context is unclear, refine the question in general:\n\n"
    "Context: {context}\nOriginal: {query}\nRewritten:"
)
prompt_refine_prompt = PromptTemplate.from_template(refine_template)


def _to_question_payload(rewritten):
    """Wrap the stripped model output under the "question" key."""
    return {"question": rewritten.strip()}


# Refinement pipeline: read "context" and "query" from the input mapping,
# render the prompt, call the model, parse the reply to text, then wrap it as
# {"question": ...}.
question_refinement_chain = (
    {
        "context": lambda inputs: inputs["context"],
        "query": lambda inputs: inputs["query"],
    }
    | prompt_refine_prompt
    | llm
    | StrOutputParser()
    | RunnableLambda(_to_question_payload)
)

# Example usage: refine a sample query against the text of the first chunk.
sample_refinement_input = {
    "context": spitted_docs[0].page_content,
    "query": "How the student will be evaluated?",
}
refined_sample = question_refinement_chain.invoke(sample_refinement_input)
print(refined_sample)

In [None]:
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.runnables import RunnableLambda

# Pull the "rlm/rag-prompt" prompt again; `hub` is imported in an earlier cell.
# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")

refine_instructions = (
    "You are a helpful assistant. Detect what the text might about and Rewrite the query to match the given context. "
    "If the text is unclear, refine the question in general:\n\n"
)
refine_layout = "Text: {text}\nOriginal: {query}\nRewritten:"
prompt_refine_prompt = PromptTemplate.from_template(refine_instructions + refine_layout)

# Input mapping -> prompt variables: "text" is read from the input's "context"
# key and "query" from its "question" key, so the input must carry both keys.
refinement_inputs = {
    "text": lambda inputs: inputs["context"],
    "query": lambda inputs: inputs["question"],
}
# Final step: wrap the stripped model output as {"question": ...}.
wrap_as_question = RunnableLambda(lambda rewritten: {"question": rewritten.strip()})

question_refinement_chain = (
    refinement_inputs | prompt_refine_prompt | llm | StrOutputParser() | wrap_as_question
)


def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    separator = "\n\n"
    collected = []
    for document in docs:
        collected.append(document.page_content)
    return separator.join(collected)


# Picks the "question" entry out of the chain's input mapping.
question_input = RunnableLambda(lambda payload: payload["question"])


def get_text():
    """Return the page content of the first entry in ``spitted_docs``."""
    first_chunk = spitted_docs[0]
    return first_chunk.page_content


# Retriever configured for MMR search (k=4, fetch_k=20, lambda_mult=0.5).
mmr_retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 20, "lambda_mult": 0.5},
)

# Both prompt variables derive from the input's "question" entry: it is sent
# through the retriever and format_docs for "context", and used as-is for
# "question".
rag_inputs = {
    "context": question_input | mmr_retriever | format_docs,
    "question": question_input,
}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

# Question refinement followed by the RAG chain.
full_chain = question_refinement_chain | rag_chain


# Wrap rag_chain with message-history handling; histories are looked up per
# session id through get_by_session_id.
# NOTE(review): the wrapped runnable is rag_chain, not full_chain, and
# rag_chain only reads the "question" entry of its input, so messages supplied
# under "history" never reach the prompt. Confirm whether follow-up questions
# are meant to use earlier turns.
chain_with_history = RunnableWithMessageHistory(
    rag_chain,
    # Uses the get_by_session_id function defined in the example
    # above.
    get_by_session_id,
    input_messages_key="question",
    history_messages_key="history",
)

# Ask two questions in the same session ("foo"). After each answer, print the
# store defined in the example above so the recorded history can be inspected.
session_config = {"configurable": {"session_id": "foo"}}
demo_questions = (
    "Who is the Course teacher?",
    "Do we have more information about him?",
)

for demo_question in demo_questions:
    answer = chain_with_history.invoke({"question": demo_question}, config=session_config)
    print(answer)  # noqa: T201
    print(store)  # noqa: T201