In [None]:
from dotenv import load_dotenv
from pathlib import Path

load_dotenv(override=True)
base_dir = Path().resolve().parent

In [11]:
from langchain_openai import ChatOpenAI

llm_model = "gpt-3.5-turbo"
llm = ChatOpenAI(temperature=0, model=llm_model)

In [36]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

chunk_size = 600
chunk_overlap = 150

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n", ".", " ", ""],
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
)

In [37]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(f"{base_dir}/data/CourseOutline.pdf")
pages = loader.load()
len(pages)

4

In [38]:
spitted_docs = r_splitter.split_documents(pages)
len(spitted_docs)

17

In [39]:
persist_directory_name = "persist_vectorstore"
persist_directory = Path.joinpath(base_dir, persist_directory_name)
persist_directory_str = persist_directory.as_posix()
persist_directory_str

'/home/zhbdripon/Documents/document-chat/persist_vectorstore'

In [40]:
!rm -rf ../peripersist_vectorstore

In [41]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents=spitted_docs, embedding=embedding, persist_directory=persist_directory_str
)

In [236]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

# Reusing the persistant
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory_str)

In [18]:
question = "How many topics in the course?"

In [19]:
similar_vector_embeds = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=3)
len(similar_vector_embeds)
similar_vector_embeds

[Document(metadata={'title': 'SOUTHEAST UNIVERSITY', 'source': '/home/zhbdripon/Documents/document-chat/data/CourseOutline.pdf', 'creator': 'Microsoft® Word 2016', 'total_pages': 4, 'author': 'Monirul Hasan', 'page_label': '3', 'creationdate': '2017-10-08T22:16:43+06:00', 'page': 2, 'moddate': '2017-10-08T22:16:43+06:00', 'producer': 'Microsoft® Word 2016'}, page_content='13-15  Requirements, Design Concepts 6,7,8 \n MIDTERM   \n16-17 Design Concepts continue  9,11,12 \n18-19 Part 3 (Quality Concepts) Quality Concepts, SQA, Testing 14,16,17,18 \n20-21 Product Metrics 23 \n   \n22-23 Part 4 (Managing Software Projects) 24-28 \n24-25 Project Estimation, Project Scheduling, Risk Mgt 26,27,28 \n26-27 Recap   \n28 Recap  \n Final  \n \n \n \nGrading Scheme and Marks Distribution \n \nCategory  Marks% Description  \nAttendance  5 No marks will be deducted for one absence. But for every \nsubsequent absences 0.5 marks will be deducted. If a student'),
 Document(metadata={'creationdate': '2017

In [21]:
# Migrating from RetrievalQA (Depricated) the LCEL implementation
# https://python.langchain.com/docs/versions/migrating_chains/retrieval_qa/
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


qa_chain = (
    {
        "context": vectordb.as_retriever() | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

qa_chain.invoke("How many lecture in the course?")

'The course consists of weekly two 1.5-hour lectures covering various topics related to software and software engineering. The course outline includes a midterm and final exam, as well as assignments based on case studies and analysis. The total number of lectures in the course is not explicitly mentioned in the provided context.'

In [None]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage, AIMessage
from pydantic import BaseModel, Field


class InMemoryHistory(BaseChatMessageHistory, BaseModel):
    """In memory implementation of chat message history."""

    messages: list[BaseMessage] = Field(default_factory=list)

    def add_messages(self, messages: list[BaseMessage]) -> None:
        """Add a list of messages to the store"""
        self.messages.extend(messages)

    def clear(self) -> None:
        self.messages = []


# Here we use a global variable to store the chat message history.
# This will make it easier to inspect it to see the underlying results.
store = {}


def get_by_session_id(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = InMemoryHistory()
    return store[session_id]


history = get_by_session_id("1")
history.add_message(AIMessage(content="hello"))
print(store)  # noqa: T201

{'1': InMemoryHistory(messages=[AIMessage(content='hello', additional_kwargs={}, response_metadata={})])}


In [None]:
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.runnables import RunnableLambda

# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


question_input = RunnableLambda(lambda d: d["question"])


chain = (
    {
        "context": question_input
        | vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 4})
        | format_docs,
        "question": question_input,
    }
    | prompt
    | llm
    | StrOutputParser()
)


chain_with_history = RunnableWithMessageHistory(
    chain,
    # Uses the get_by_session_id function defined in the example
    # above.
    get_by_session_id,
    input_messages_key="question",
    history_messages_key="history",
)

print(
    chain_with_history.invoke(  # noqa: T201
        {"question": "What is the faculty name?"},
        config={"configurable": {"session_id": "foo"}},
    )
)

# Uses the store defined in the example above.
print(store)  # noqa: T201

print(
    chain_with_history.invoke(  # noqa: T201
        {"question": "Do we have more information about him?"},
        config={"configurable": {"session_id": "foo"}},
    )
)

print(store)  # noqa: T201

The faculty name is AKM Ahsanul Hoque. He holds the designation of Assistant Professor. His contact information includes email ahoque707@gmail.com and phone number 01713363703.
{'1': InMemoryHistory(messages=[AIMessage(content='hello', additional_kwargs={}, response_metadata={})]), 'foo': InMemoryHistory(messages=[HumanMessage(content='Who is the course teacher?', additional_kwargs={}, response_metadata={}), AIMessage(content='The course teacher is an Assistant Professor. They can be contacted via email at ahoque707@gmail.com or by phone at 01713363703. Counseling is available by appointment during class hours on Sunday to Tuesday from 10:00 AM to 11:30 AM.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Do we have more information about him?', additional_kwargs={}, response_metadata={}), AIMessage(content='Yes, we have his email address ahoque707@gmail.com and phone number 01713363703. He is available for counseling by appointment on Sundays to Tuesdays from 10:00

In [45]:
docs = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 4}).invoke(
    "What is the faculty name?"
)

for doc in docs:
    print(doc.page_content)

AKMAH 
 
Faculty Name 
 
AKM Ahsanul Hoque 
 
Designation Asst. Professor   
Email & Phone 
 
ahoque707@gmail.com 
01713363703  
 
 
Room # 
 
 
 
Counseling by appointment 
 
Class/ 
Counseling 
Hours 
 
Sunday ­ Tuesday 
10:00 AM to 11:30 AM  
 
Course Objectives 
 
This is an interesting, challenging, advance, and higher level course in the area of Computer Science 
and Computer Science & Engineering.  This course will give an idea to the graduating student about t he 
way of develop or engineer, software or system.  In this particular course student has to use his/her all
AKMAH 
 
Faculty Name 
 
AKM Ahsanul Hoque 
 
Designation Asst. Professor   
Email & Phone 
 
ahoque707@gmail.com 
01713363703  
 
 
Room # 
 
 
 
Counseling by appointment 
 
Class/ 
Counseling 
Hours 
 
Sunday ­ Tuesday 
10:00 AM to 11:30 AM  
 
Course Objectives 
 
This is an interesting, challenging, advance, and higher level course in the area of Computer Science 
and Computer Science & Engineering.  This cou