In [159]:
from dotenv import load_dotenv
from pathlib import Path
from langchain.callbacks.manager import tracing_v2_enabled

load_dotenv(override=True)
base_dir = Path().resolve().parent

In [160]:
from langchain_openai import ChatOpenAI

llm_model = "gpt-3.5-turbo"
llm = ChatOpenAI(temperature=0, model=llm_model)

In [217]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

chunk_size = 1000
chunk_overlap = 150

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="\n"
)

In [258]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(f"{base_dir}/data/CourseOutline.pdf")
pages = loader.load()
len(pages)

4

In [259]:
spitted_docs = c_splitter.split_documents(pages)
len(spitted_docs)

9

In [260]:
persist_directory_name = "persist_vectorstore"
persist_directory = Path.joinpath(base_dir, persist_directory_name)
persist_directory_str = persist_directory.as_posix()
persist_directory_str

'/home/zhbdripon/Documents/document-chat/persist_vectorstore'

In [234]:
!rm -rf ../peripersist_vectorstore

In [235]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

vectordb = Chroma.from_documents(
    documents=spitted_docs, embedding=embedding, persist_directory=persist_directory_str
)

In [236]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

# Reusing the persistant
vectordb = Chroma(embedding_function=embedding, persist_directory=persist_directory_str)

In [261]:
question = "How many topics in the course?"

In [262]:
similar_vector_embeds = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=3)
len(similar_vector_embeds)
similar_vector_embeds

[Document(metadata={'source': '/home/zhbdripon/Documents/document-chat/data/CourseOutline.pdf', 'author': 'Monirul Hasan', 'title': 'SOUTHEAST UNIVERSITY', 'creator': 'Microsoft® Word 2016', 'page': 2, 'moddate': '2017-10-08T22:16:43+06:00', 'page_label': '3', 'creationdate': '2017-10-08T22:16:43+06:00', 'total_pages': 4, 'producer': 'Microsoft® Word 2016'}, page_content='13-15  Requirements, Design Concepts 6,7,8 \n MIDTERM   \n16-17 Design Concepts continue  9,11,12 \n18-19 Part 3 (Quality Concepts) Quality Concepts, SQA, Testing 14,16,17,18 \n20-21 Product Metrics 23 \n   \n22-23 Part 4 (Managing Software Projects) 24-28 \n24-25 Project Estimation, Project Scheduling, Risk Mgt 26,27,28 \n26-27 Recap   \n28 Recap  \n Final  \n \n \n \nGrading Scheme and Marks Distribution \n \nCategory  Marks% Description  \nAttendance  5 No marks will be deducted for one absence. But for every \nsubsequent absences 0.5 marks will be deducted. If a student'),
 Document(metadata={'total_pages': 4, 'pa

In [267]:
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectordb.as_retriever())

In [265]:
with tracing_v2_enabled(project_name="document_chat"):
    result = qa_chain({"query": question})
    print(result["result"])

There are a total of 28 topics in the course based on the provided context.


In [266]:
with tracing_v2_enabled(project_name="document_chat"):
    result = qa_chain({"query": "What are they?"})
    print(result["result"])

I'm sorry, I don't have enough information to determine what "they" are based on the context provided.


In [242]:
from langchain.prompts import PromptTemplate

template = """Use following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up the answer. Use three sentence maximum. keep the sentence as concise as possible. Always say thanks for asking at the end of the answer
{context}
Question: {question}
Helpul Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [243]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [244]:
with tracing_v2_enabled(project_name="document_chat"):
    result = qa_chain({"query": question})
    print(result["result"])

The document explains different types of relationships in a database, such as one to many, one to one, and many to many. It also discusses the importance of entities and relationships in the entity relationship data model. Thanks for asking.


In [176]:
result["source_documents"]

[Document(metadata={'title': 'SOUTHEAST UNIVERSITY', 'creator': 'Microsoft® Word 2016', 'page': 0, 'total_pages': 4, 'page_label': '1', 'author': 'Monirul Hasan', 'source': '/home/zhbdripon/Documents/document-chat/data/CourseOutline.pdf', 'producer': 'Microsoft® Word 2016', 'creationdate': '2017-10-08T22:16:43+06:00', 'moddate': '2017-10-08T22:16:43+06:00'}, page_content='Program: \n \nCSE \n \nCourse Code \n \nCSE3035 \n \nSection \n \n3 \n \nCourse Title \n \nInformation System Design & \nSoftware Engineering \n \nClass Room \n \nAR Tower –1201 \n \nCourse Group \n \n \nFaculty Code \n \nAKMAH \n \nFaculty Name \n \nAKM Ahsanul Hoque \n \nDesignation Asst. Professor   \nEmail & Phone'),
 Document(metadata={'moddate': '2017-10-08T22:16:43+06:00', 'creator': 'Microsoft® Word 2016', 'author': 'Monirul Hasan', 'page_label': '1', 'title': 'SOUTHEAST UNIVERSITY', 'source': '/home/zhbdripon/Documents/document-chat/data/CourseOutline.pdf', 'page': 0, 'producer': 'Microsoft® Word 2016', 'tota

In [245]:
# demo map reduce
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=vectordb.as_retriever(), chain_type="map_reduce"
)

In [246]:
with tracing_v2_enabled(project_name="document_chat"):
    result = qa_chain({"query": question})
    print(result["result"])

The document explains different types of relationships in a relational database design, including one to many (1:M), one to one (1:1), and many to many (M:N) relationships. It also discusses how relationships are defined in the entity relationship (ER) data model as associations or interactions between entities. Furthermore, the document provides guidelines for explaining relationship rules and discusses the concept of cardinality for each relationship in a database schema.


In [279]:
# Migrating from RetrievalQA (Depricated) the LCEL implementation
# https://python.langchain.com/docs/versions/migrating_chains/retrieval_qa/
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# See full prompt at https://smith.langchain.com/hub/rlm/rag-prompt
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


qa_chain = (
    {
        "context": vectordb.as_retriever() | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

qa_chain.invoke("How many lecture in the course?")

'There are 28 lectures in the course. Each lecture covers different topics related to software engineering and information system design. The course also includes assignments and exams for assessment purposes.'

In [None]:
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# See full prompt at https://smith.langchain.com/hub/langchain-ai/retrieval-qa-chat
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
rag_chain = create_retrieval_chain(vectordb.as_retriever(), combine_docs_chain)

rag_chain.invoke({"input": "How many lecture in the course?"})

{'input': 'How many lecture in the course?',
 'context': [Document(metadata={'producer': 'Prince 9.0 rev 5 (www.princexml.com)', 'creator': 'PyPDF', 'total_pages': 136, 'source': '/home/zhbdripon/Documents/document-chat/data/CS403-1.10-Database-Design-2nd-Edition-CCBY.pdf', 'page_label': '121', 'page': 120, 'title': 'Database Design - 2nd Edition', 'creationdate': ''}, page_content='is either a 15-point course or a 30-point course. A course may have a quota for the number of students enrolled in it\nat any one presentation. A course need not have any students enrolled in it (such as a course that has just been written\nand offered for study).\nStudents are constrained in the number of courses they can be enrolled in at any one time. They may not take courses\nsimultaneously if their combined points total exceeds 180 points.\nFor assessment purposes, a 15-point course may have up to three assignments per presentation and a 30-point course\nmay have up to five assignments per presentatio

In [None]:
# from operator import itemgetter

from langchain_openai.chat_models import ChatOpenAI

from langchain_core.chat_history import BaseChatMessageHistory

# from langchain_core.documents import Document
from langchain_core.messages import BaseMessage, AIMessage

# from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field
# from langchain_core.runnables import (
#     RunnableLambda,
#     ConfigurableFieldSpec,
#     RunnablePassthrough,
# )
# from langchain_core.runnables.history import RunnableWithMessageHistory


class InMemoryHistory(BaseChatMessageHistory, BaseModel):
    """In memory implementation of chat message history."""

    messages: list[BaseMessage] = Field(default_factory=list)

    def add_messages(self, messages: list[BaseMessage]) -> None:
        """Add a list of messages to the store"""
        self.messages.extend(messages)

    def clear(self) -> None:
        self.messages = []


# Here we use a global variable to store the chat message history.
# This will make it easier to inspect it to see the underlying results.
store = {}


def get_by_session_id(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = InMemoryHistory()
    return store[session_id]


history = get_by_session_id("1")
history.add_message(AIMessage(content="hello"))
print(store)  # noqa: T201