In [None]:
!pip install langchain[llms]
!pip install -U langchain-community
!pip install --upgrade --quiet docx2txt
!pip install --quiet langchain_experimental langchain_openai
!pip install faiss-cpu
!pip install openai
!pip install tiktoken
!pip install pypdf
!pip install chromadb

In [None]:
import os
from getpass import getpass
# Ask for the OpenAI key without echoing it, then export it through the
# OPENAI_API_KEY environment variable for the OpenAI-backed cells below.
# Surrounding whitespace (easy to pick up when pasting) is stripped, and an
# empty entry is rejected here instead of surfacing later as an opaque
# authentication error.
OPENAI_API_KEY = getpass("OPENAI_API_KEY").strip()
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY must not be empty")
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

OPENAI_API_KEY··········


In [None]:
# Make Google Drive available inside the Colab runtime under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader

In [None]:
# Folder (on the mounted Drive) that holds the files to index.
DOCS_DIR = "/content/drive/MyDrive/ColabNotebooks/docs/"

# Lower-cased file suffix -> loader class able to read that format.
LOADERS_BY_EXTENSION = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    ".docx": Docx2txtLoader,
}

documents = []
# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# document order stable from one run to the next.
for file in sorted(os.listdir(DOCS_DIR)):
    # Match the suffix case-insensitively so "REPORT.PDF" is not silently skipped.
    lowered_name = file.lower()
    loader_cls = next(
        (cls for suffix, cls in LOADERS_BY_EXTENSION.items() if lowered_name.endswith(suffix)),
        None,
    )
    if loader_cls is None:
        continue  # unsupported file type: ignore it, as before
    documents.extend(loader_cls(os.path.join(DOCS_DIR, file)).load())

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Text splitter: break the loaded documents into chunks with SemanticChunker,
# which uses OpenAI embeddings to decide where to split.
text_splitter = SemanticChunker(OpenAIEmbeddings())

# split_documents forwards each document's metadata to the chunks cut from it.
# Calling create_documents on the bare page_content strings dropped that
# metadata, so the chunks carried no information about their origin.
# NOTE: `documents` is rebound to the chunks, so re-running this cell alone
# would chunk the chunks again; re-run the loading cell first.
documents = text_splitter.split_documents(documents)

In [None]:
# Chroma is imported from langchain_community; the `langchain.vectorstores`
# path is a deprecated re-export.
from langchain_community.vectorstores import Chroma

# Vector DB: embed every chunk with OpenAIEmbeddings and store the vectors in a
# Chroma collection kept on disk under ./data.
# NOTE(review): presumably re-running this cell appends the same chunks to the
# existing ./data collection again -- confirm, and clear ./data if needed.
vectordb = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(),
    persist_directory='./data',
)
# No explicit vectordb.persist() call: it is deprecated because Chroma >= 0.4
# writes to persist_directory automatically.

In [None]:
from langchain.chains import RetrievalQA
# ChatOpenAI comes from langchain_openai (installed in the first cell); the
# `langchain.chat_models` path is a deprecated re-export.
from langchain_openai import ChatOpenAI

# Single-turn question answering over the vector store: fetch the 6 most
# similar chunks and let gpt-4o answer from them, returning the chunks too.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-4o'),
    retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True
)

# Smoke test: check that we can already run queries against our documents.
# `.invoke` replaces calling the chain object directly, which is deprecated.
result = qa_chain.invoke({'query': 'What is the protocol number?'})
print(result['result'])

In [None]:
from langchain.prompts import PromptTemplate
from langchain.prompts import SystemMessagePromptTemplate
from langchain.prompts.chat import (ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate)
from langchain.memory import ConversationBufferMemory

# System prompt: answer strictly from the retrieved context (substituted for
# {context}) and admit ignorance instead of inventing an answer.
general_system_template = r"""
Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----
{context}
----
"""
# User turn: the question is wrapped in triple backticks.
general_user_template = "Question:```{question}```"

# Chat prompt = system message followed by the human message.
messages = [
    template_cls.from_template(template_text)
    for template_cls, template_text in (
        (SystemMessagePromptTemplate, general_system_template),
        (HumanMessagePromptTemplate, general_user_template),
    )
]
qa_prompt = ChatPromptTemplate.from_messages(messages)

# Conversation memory: the history is exposed under 'chat_history' as message
# objects, and the 'answer' output is the one recorded for each turn.
memory_options = {
    'memory_key': 'chat_history',
    'return_messages': True,
    'output_key': 'answer',
}
memory = ConversationBufferMemory(**memory_options)

In [None]:
from langchain.chains import ConversationalRetrievalChain
# ChatOpenAI comes from langchain_openai (installed in the first cell); the
# `langchain.chat_models` path is a deprecated re-export.
from langchain_openai import ChatOpenAI

# Multi-turn retrieval chain: deterministic gpt-4o (temperature=0) over the 12
# most similar chunks, with the conversation memory attached.
chain = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0, model_name="gpt-4o"),
    vectordb.as_retriever(search_kwargs={'k': 12}),
    return_source_documents=True,
    memory=memory,
    verbose=True,
    # Pass our own prompt so we do not fall back to the default
    # ConversationalRetrievalChain prompt. It was kept close to the default on
    # purpose, to avoid experimenting too much.
    combine_docs_chain_kwargs={'prompt': qa_prompt},
)

In [None]:
import sys

# Commands (compared case-insensitively, ignoring surrounding blanks) that end the chat.
EXIT_COMMANDS = {"exit", "quit", "q"}

# Plain (question, answer) transcript of this session. The chain is not given
# this list: it was built with `memory=memory`, and that memory supplies the
# 'chat_history' input itself.
chat_history = []
while True:
    query = input('Prompt: ')
    command = query.strip().lower()

    if command in EXIT_COMMANDS:
        print('Exiting')
        # Leave the loop instead of calling sys.exit(), which raises SystemExit
        # inside the notebook kernel.
        break

    if not command:
        continue  # blank input: ask again without spending a retrieval/LLM call

    # `.invoke` replaces calling the chain object directly, which is deprecated.
    result = chain.invoke({'question': query})
    print('Answer: ' + result['answer'])

    chat_history.append((query, result['answer']))