In [None]:
pip install chromadb pypdf langchain_community

In [None]:
pip install langchain --upgrade

In [None]:
pip install -U langchain-openai

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory,ConversationSummaryBufferMemory, ConversationBufferWindowMemory, ChatMessageHistory
from langchain.chains import ConversationChain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
import chromadb
from langchain.vectorstores import Chroma

In [None]:
import os
import openai
import sys
# Appends '../..' to the module search path.
# NOTE(review): no import visible in this notebook appears to rely on this
# entry -- confirm whether it is still needed.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Read the local .env file; the return value is deliberately discarded into `_`.
_ = load_dotenv(find_dotenv()) # read local .env file

# Direct os.environ indexing: raises KeyError right here if OPENAI_API_KEY is
# not defined, rather than failing later on the first API call.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [None]:
# Load every PDF matched by the glob inside 'PDFs/' using PyPDFLoader.
# (Earlier single-file variant, kept for reference:
#  TextLoader('single_text_file.txt'))
loader = DirectoryLoader(
    path='PDFs/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
docs = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
# so that each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

In [None]:
# Inspect the first chunk produced by the splitter.
documents[0]

In [None]:
# Number of chunks produced by the splitter.
len(documents)

In [None]:
# Use the maintained `langchain_openai` integration. Importing
# OpenAIEmbeddings from `langchain.embeddings.openai` here would rebind the
# name to the deprecated class and shadow the `langchain_openai` one that the
# notebook already installs and imports.
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [None]:
# Directory handed to Chroma as `persist_directory` when the vector store is built.
persist_directory = 'docs/'

In [None]:
# Build the vector store only once.
# `Chroma.from_documents` with a persist_directory ADDS to whatever is already
# stored there, so re-running this cell would re-embed every chunk (API cost)
# and store duplicates that then crowd the top-k retrieval results.
# Open the persisted collection first and reuse it when it already has entries.
# Delete the persist directory to force a rebuild after the PDFs change.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
if vectordb._collection.count() == 0:
    # Empty (or brand-new) collection: embed and index all chunks.
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

In [None]:
# Print how many entries the underlying Chroma collection holds.
# NOTE(review): `_collection` is a private attribute of the vector store and
# may change between library versions.
print(vectordb._collection.count())

In [None]:
# Expose the vector store as a retriever: similarity search, k=4 per query.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

In [None]:
# Smoke-test the retriever. `invoke` is the Runnable entry point that replaces
# the deprecated `get_relevant_documents`.
# NOTE: this rebinds `docs` (previously the loaded PDF pages) to the retrieved
# documents, so re-run the loading cell before re-running the splitter.
docs = retriever.invoke("what does traditional growth strategies focuses on")

In [None]:
# Inspect the first document returned by the retriever.
docs[0]

In [None]:
# Chat model shared by the question-rewriting step and the answering step.
# max_tokens was 100, which cuts completions off far short of the
# "about 200-300 words" the QA system prompt asks for. 512 tokens leaves room
# for roughly 300 words plus some slack.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0.6, max_tokens=512)

In [None]:
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

### Contextualize question ###
# System instruction for the rewrite step: turn a follow-up question into a
# standalone one without answering it. Adjacent literals are concatenated into
# a single line of text (no embedded newlines).
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
# Message layout for the rewrite step: system instruction, then the previous
# turns (injected under "chat_history"), then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Combine the LLM, the base retriever and the rewrite prompt into a retriever
# that takes the chat history into account.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

In [None]:
### Answer question ###
# System instruction for the answering step. Every continued line ends with
# ". \" so the sentences stay separated after the backslash joins them;
# previously two lines were glued together as "...availableAnswer in about...".
# The final instruction now ends with a full stop and a blank line before the
# retrieved context.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
Answer the question using every piece of relevant context available. \
Answer in about 200-300 words. \
If you don't find answer in context, just say that you don't know. \
If two questions are asked together, answer them in different paragraphs.

{context}"""
# Message layout for the answering step: system instruction (which carries the
# retrieved context), the previous turns, then the user's input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Chain that feeds the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

### Statefully manage chat history ###
# In-memory registry mapping a session id to that session's message history.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``.

    A new, empty ``ChatMessageHistory`` is created and registered in ``store``
    the first time a session id is seen; later calls return the same object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = store[session_id] = ChatMessageHistory()
        return history


# Wrap the RAG chain with per-session message history. The wrapper is told
# which input key carries the user message ("input"), under which key the
# history is injected ("chat_history"), and which output key holds the reply
# to record ("answer").
history_keys = {
    "input_messages_key": "input",
    "history_messages_key": "chat_history",
    "output_messages_key": "answer",
}
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain, get_session_history, **history_keys
)

In [None]:
import textwrap

def process_llm_response(llm_response):
    """Print the chain's answer followed by the distinct sources behind it.

    Args:
        llm_response: Mapping with an ``'answer'`` string and a ``'context'``
            iterable of documents. Each document exposes a ``metadata`` dict,
            read for its ``'source'`` and ``'page'`` entries.

    Returns:
        None. The answer (wrapped at 100 columns) and one line per distinct
        (source, page) pair, in first-seen order, are written to stdout.
    """
    print(textwrap.fill(llm_response['answer'], 100))
    print('\n\nSources:')

    # De-duplicate on the (source, page) PAIR. Tracking sources and pages in
    # two separate sets wrongly hides a new pair such as (a.pdf, 2) once
    # a.pdf and page 2 have each been printed with other partners.
    seen = set()

    for doc in llm_response['context']:
        # Documents from loaders that do not record a page would otherwise
        # raise KeyError here, so fall back to a placeholder.
        source = doc.metadata.get('source', 'unknown')
        page = doc.metadata.get('page', 'n/a')

        key = (source, page)
        if key in seen:
            continue
        seen.add(key)
        print("pdf name:", source, "page no:", page)

In [None]:
# Ask a question under session id "abc123". The first call with this id
# creates the "abc123" entry in `store`.
llm_response = conversational_rag_chain.invoke(
    {"input": "what does traditional growth strategies focuses on?"},
    config={"configurable": {"session_id": "abc123"}},
)

# Pretty-print the answer together with its source pages.
process_llm_response(llm_response)