<a href="https://colab.research.google.com/github/thanhhuyenpt/DocsBot-GPT/blob/main/DocsBot_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Print the version of the `python` executable found on the runtime's PATH
!python --version

Python 3.10.12


In [None]:
# install requirements
!pip install langchain==0.0.316 \
openai==0.28.1 \
pygpt4all==1.1.0 \
chromadb==0.3.23 \
urllib3==2.0.2 \
pdfminer.six==20221105 \
python-dotenv==1.0.0 \
unstructured==0.6.6 \
extract-msg==0.41.1 \
tabulate==0.9.0 \
pandoc==2.3 \
pypandoc==1.11 \
llama-cpp-python==0.1.50 \
gradio==3.50.2



In [None]:
!pip install pydantic==1.10.14



# Ingest data

In [None]:
import os
import glob
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Create the knowledge base folder and the source_documents folder that
# stores the uploaded .txt files. `exist_ok=True` makes the cell safe to
# re-run without a separate existence check.
for folder in ('/content/db', '/content/source_documents'):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Upload files from the local machine.
# Run the cell multiple times if you want to upload multiple files.
from google.colab import files
uploaded = files.upload()
# Relocate every uploaded file into the source_documents folder
for uploaded_name in uploaded:
    destination = os.path.join('/content/source_documents', uploaded_name)
    os.rename(uploaded_name, destination)

In [None]:
# Notebook-wide configuration
source_directory = '/content/source_documents'  # folder scanned for source documents
PERSIST_DIRECTORY = '/content/db'               # folder where the vector store is persisted
EMBEDDINGS_MODEL_NAME = 'all-MiniLM-L6-v2'      # model name handed to HuggingFaceEmbeddings

In [None]:
# Chroma client settings, shared by every cell that opens the vector store
from chromadb.config import Settings

CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIRECTORY,  # store lives in the knowledge base folder
    chroma_db_impl='duckdb+parquet',
    anonymized_telemetry=False,
)

In [None]:
# Registry of supported file types: each extension maps to a
# (loader class, keyword arguments passed to that loader) pair.
LOADER_MAPPING = {
    # Add more mappings for other file extensions and loaders as needed
    ".txt": (TextLoader, dict(encoding="utf8")),
}
def load_single_document(file_path: str):
    """Load one file with the loader registered for its extension.

    The extension is taken from the final path component and lower-cased, so
    ``notes.TXT`` is handled like ``notes.txt`` and a dot inside a directory
    name is never mistaken for an extension. Keys of ``LOADER_MAPPING`` are
    therefore expected to be lower-case.

    Args:
        file_path: Path of the file to load.

    Returns:
        The first document returned by the loader's ``load()`` call.

    Raises:
        ValueError: If no loader is registered for the file's extension.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in LOADER_MAPPING:
        raise ValueError(f"Unsupported file extension '{ext}'")

    loader_class, loader_args = LOADER_MAPPING[ext]
    # The loader is built from the path plus the registered keyword arguments.
    return loader_class(file_path, **loader_args).load()[0]

def load_documents(source_dir: str):
    """Load every supported file found under ``source_dir``, subfolders included.

    For each extension registered in ``LOADER_MAPPING`` the directory tree is
    searched recursively, and every match is loaded with
    ``load_single_document``. The loaded documents are returned as a list.
    """
    matching_paths = [
        path
        for extension in LOADER_MAPPING
        for path in glob.glob(os.path.join(source_dir, f"**/*{extension}"), recursive=True)
    ]
    return list(map(load_single_document, matching_paths))

# Chunking parameters handed to the text splitter
chunk_size = 500
chunk_overlap = 50

# Load the source files and split them into chunks
documents = load_documents(source_directory)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
texts = text_splitter.split_documents(documents)
print(f"Loaded {len(documents)} documents from {source_directory}")
print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")

# Stop here with an actionable message rather than handing an empty list to
# the vector store (e.g. when the upload cell was skipped).
if not texts:
    raise ValueError(
        f"Nothing to ingest: no text chunks were produced from {source_directory}. "
        "Upload at least one non-empty supported file and re-run this cell."
    )

# Create embeddings
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL_NAME)

# Create and store locally vectorstore
# NOTE(review): re-running this cell ingests the same chunks into the same
# persist directory again — presumably producing duplicates; confirm.
db = Chroma.from_documents(texts,
                           embeddings,
                           persist_directory=PERSIST_DIRECTORY,
                           client_settings=CHROMA_SETTINGS)
db.persist()
# Drop this handle once the store has been persisted
db = None

Loaded 2 documents from /content/source_documents
Split into 25 chunks of text (max. 500 characters each)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



# Ask questions about your documents using the Gradio UI

In [None]:
import openai
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import (StuffDocumentsChain,
                              LLMChain)
from langchain.schema import HumanMessage, AIMessage
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain.callbacks.manager import trace_as_chain_group

import gradio as gr

In [None]:
"""
Provide a LANGCHAIN_API_KEY in case you encounter the error:
langsmith.utils.LangSmithUserError: API key must be provided when using hosted LangSmith API
Create here: https://smith.langchain.com/
"""
# LangSmith tracing configuration.
# The API key is read from the environment or prompted for, so it is never
# written into the notebook. Tracing is only switched on when a key was
# actually supplied; otherwise it is explicitly turned off.
from getpass import getpass

langsmith_api_key = os.environ.get("LANGCHAIN_API_KEY") or getpass(
    "LangSmith API key (leave blank to disable tracing): "
)
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
if langsmith_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    os.environ["LANGCHAIN_TRACING_V2"] = "false"

In [None]:
# Set up our retriever: reopen the persisted vector store with the same
# embedding function and client settings, then expose it as a retriever.
db = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    client_settings=CHROMA_SETTINGS,
    embedding_function=embeddings,
)
retriever = db.as_retriever()



In [None]:
# Define llm
# The OpenAI key comes from the OPENAI_API_KEY environment variable when it is
# set; otherwise it is prompted for, so it is never stored in the notebook.
from getpass import getpass

llm = ChatOpenAI(
    temperature=0,
    openai_api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "),
)

In [None]:
"""
Set up our chain that can answer questions based on documents:
This controls how each document will be formatted. Specifically,
it will be passed to `format_document` - see that function for more details
"""
# Per-document prompt: each retrieved document is rendered as its page content.
document_prompt = PromptTemplate(
    template="{page_content}",
    input_variables=["page_content"],
)
document_variable_name = "context"
# The system prompt below must expose `document_variable_name` as an input variable.
prompt_template = """Use the following pieces of context to answer user questions.
If you don't know the answer, just say that can not found in knowledge base,
don't try to make up an answer.

--------------

{context}"""
system_prompt = SystemMessagePromptTemplate.from_template(prompt_template)
# Chat prompt layout: system instructions, prior turns, then the new question.
prompt = ChatPromptTemplate(
    messages=[
        system_prompt,
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template("{question}"),
    ]
)
llm_chain = LLMChain(prompt=prompt, llm=llm)
combine_docs_chain = StuffDocumentsChain(
    document_separator="---------",
    document_variable_name=document_variable_name,
    document_prompt=document_prompt,
    llm_chain=llm_chain,
)

# Chain that controls how the search query for the vectorstore is generated.
# Its prompt takes `chat_history` and `question` as input variables.
template = """Combine the chat history and follow up question into a a search query.

Chat History:

{chat_history}

Follow up question: {question}
"""
# `prompt` is rebound here; `llm_chain` above keeps the chat prompt it was built with.
prompt = PromptTemplate.from_template(template)
question_generator_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Chat callback used by the Gradio UI
def qa_response(message, history):
    """Answer `message` from the retrieved documents, using the chat history.

    Args:
        message: The user's latest message.
        history: Previous turns as a sequence of (human, assistant) pairs.

    Returns:
        The value returned by `combine_docs_chain.run` for the retrieved
        documents, the chat history and the message.
    """
    # Convert message history into LangChain format for the final response chain.
    messages = []
    for human, ai in history:
        messages.append(HumanMessage(content=human))
        messages.append(AIMessage(content=ai))

    # Wrap all actual calls to chains in a trace group.
    with trace_as_chain_group("qa_response") as group_manager:
        if history:
            # Fold the earlier turns and the follow-up into one search query.
            convo_string = "\n\n".join(
                f"Human: {h}\nAssistant: {a}" for h, a in history
            )
            search_query = question_generator_chain.run(
                question=message,
                chat_history=convo_string,
                callbacks=group_manager
            )
        else:
            # First turn: there is no history to combine, so the message is
            # used as the search query directly and no extra LLM call is made.
            search_query = message

        # Retrieve relevant docs.
        docs = retriever.get_relevant_documents(search_query, callbacks=group_manager)

        # Answer question.
        return combine_docs_chain.run(
            input_documents=docs,
            chat_history=messages,
            question=message,
            callbacks=group_manager
        )

# Start the app: wrap `qa_response` in a chat UI and serve it.
# share=True requests a public URL; debug=True keeps the cell running so that
# errors and logs stay visible.
chat_ui = gr.ChatInterface(qa_response)
chat_ui.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://226675851e29cf15b6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://226675851e29cf15b6.gradio.live


