<a href="https://colab.research.google.com/github/sultanasabiha/RAG/blob/main/QABot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


### **Introduction to the Retrieval-Augmented Generation (RAG) Interface**

In this notebook, a **Retrieval-Augmented Generation (RAG) backend** is augmented with an interactive interface for developing a QA Bot using **Streamlit**.

It allows users to upload files, index their contents, and ask questions about them. It also allows users to see the retrieved documents alongside the generated answer.

🚨 _Note that this code should be run on a GPU. If running on Google Colab, go to **Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4**. This should be included within the free tier of Colab._

---

We start by doing a `pip install` of all required libraries, and saving all the required secrets to the environment.

In [1]:
!pip install -qU\
    llama-index\
    llama_index-embeddings-huggingface\
    pinecone[grpc]\
    llama_index-vector_stores-pinecone\
    transformers\
    accelerate\
    bitsandbytes\
    llama_index-llms-huggingface\
    streamlit


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m97.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m115.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m402.8/402.8 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os

from google.colab import userdata

# Copy the secrets stored in Colab into the process environment; app.py,
# written by a later cell, reads both names from os.environ.
for secret_name in ('HF_TOKEN', 'PC_API_KEY'):
  os.environ[secret_name] = userdata.get(secret_name)


The Streamlit app is served through LocalTunnel, which makes a local server running on our machine publicly accessible. This means we can expose apps running on local ports (like 8501 for Streamlit) to the internet.

In [3]:
# Install the LocalTunnel npm package that the launch cell runs via `npx localtunnel`.
!npm install localtunnel

[K[?25h
added 22 packages, and audited 23 packages in 2s

3 packages are looking for funding
  run `npm fund` for details

2 [33m[1mmoderate[22m[39m severity vulnerabilities

To address all issues, run:
  npm audit fix

Run `npm audit` for details.


In [4]:

%%writefile app.py
import streamlit as st
import time
import os
import torch
from transformers import BitsAndBytesConfig
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import logging
from llama_index.core import (SimpleDirectoryReader, StorageContext, VectorStoreIndex, Settings)
from pinecone.grpc import PineconeGRPC
from pinecone import ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


# Credentials are read from the environment when the module loads; a missing
# variable raises KeyError immediately.
HF_TOKEN = os.environ["HF_TOKEN"]
PC_API_KEY = os.environ["PC_API_KEY"]


def init_page():
  """Set the browser page title and draw the main header and the sidebar title."""
  page_settings = {"page_title": "RAG Application with Sab"}
  st.set_page_config(**page_settings)

  st.header("RAG QA Bot")
  st.sidebar.title("Options")


def init_messages():
  """Draw the "Clear Conversation" sidebar button and (re)initialise the chat history.

  The history in ``st.session_state.messages`` is reset to an empty list when
  the button was pressed or when no history exists yet.
  """
  reset_requested = st.sidebar.button("Clear Conversation", key="clear")
  history_missing = "messages" not in st.session_state
  if reset_requested or history_missing:
    st.session_state.messages = []


def load_file(upload_dir="/content/sample_data"):
  """Let the user upload a file, save it to disk and parse it into documents.

  Args:
    upload_dir: Directory the uploaded file is written to. Defaults to the
      Colab sample-data folder; the directory is created when it is missing.

  Returns:
    Whatever ``SimpleDirectoryReader(input_files=[...]).load_data()`` yields
    for the saved file, or ``None`` while nothing has been uploaded.
  """
  uploaded_file = st.file_uploader("Upload a file")
  if uploaded_file is None:
    return None

  # The file name comes from the client; keep only its last path component so
  # the saved file always lands inside upload_dir.
  safe_name = os.path.basename(uploaded_file.name)
  os.makedirs(upload_dir, exist_ok=True)
  saved_path = os.path.join(upload_dir, safe_name)

  with open(saved_path, "wb") as f:
    f.write(uploaded_file.getbuffer())
  # Report success only once the file handle has been closed.
  st.success("Saved file :{} ".format(safe_name))

  # Chunking parameters applied globally through llama_index Settings.
  Settings.chunk_size = 512
  Settings.chunk_overlap = 20
  return SimpleDirectoryReader(input_files=[saved_path]).load_data()

def _documents_fingerprint(documents):
  """Return a SHA-256 hex digest computed over the content of ``documents``."""
  import hashlib  # function-scope import: only needed to derive the cache key

  digest = hashlib.sha256()
  for doc in documents:
    digest.update(doc.get_content().encode("utf-8"))
    digest.update(b"\x00")  # separator so document boundaries affect the digest
  return digest.hexdigest()


@st.cache_resource(show_spinner="Vector Store is being initialized...", max_entries=1)
def _build_vector_store(_documents, fingerprint):
  """Recreate the Pinecone index and load ``_documents`` into it.

  ``_documents`` starts with an underscore, so Streamlit leaves it out of the
  cache key; ``fingerprint`` is the only hashed argument and exists purely so
  that a different upload triggers a rebuild. ``max_entries=1`` keeps just the
  latest result, because every build reuses the same Pinecone index name and
  an older cached object would point at replaced data.
  """
  pc = PineconeGRPC(api_key=PC_API_KEY)
  index_name = "rag-application"

  # Start from an empty index, but only delete when it actually exists so the
  # very first run does not fail on a missing index.
  if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

  if index_name not in pc.list_indexes().names():
    # NOTE(review): dimension must equal the embedding size produced by
    # HuggingFaceEmbedding() below — presumably 384 for its default model; confirm.
    pc.create_index(
      name=index_name,
      dimension=384,
      metric="cosine",
      spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # NOTE(review): create_index is expected to wait until the index is ready
    # when no timeout is passed — confirm for the installed pinecone client;
    # otherwise poll pc.describe_index(index_name).status["ready"] here.

  index = pc.Index(index_name)
  vector_store = PineconeVectorStore(pinecone_index=index)
  embed_model = HuggingFaceEmbedding()

  context = StorageContext.from_defaults(vector_store=vector_store)
  return VectorStoreIndex.from_documents(
    _documents,
    storage_context=context,
    embed_model=embed_model,
  )


def init_vector_store(_documents):
  """Build the Pinecone-backed vector index for ``_documents``, cached per content.

  Args:
    _documents: Documents to embed and index (as returned by ``load_file``).

  Returns:
    The ``VectorStoreIndex`` built from the documents. Calls with identical
    document content reuse the cached index; different content rebuilds it.
  """
  return _build_vector_store(_documents, _documents_fingerprint(_documents))


@st.cache_resource(show_spinner="LLM is being initialized...")
def select_llm() -> HuggingFaceLLM:
  """Create the 4-bit quantized Mistral-7B-Instruct LLM (wrapped in st.cache_resource)."""
  model_id = "mistralai/Mistral-7B-Instruct-v0.3"

  # 4-bit NF4 quantization with double quantization to reduce memory use.
  four_bit_settings = dict(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
  )
  quantization_config = BitsAndBytesConfig(**four_bit_settings)

  # Only let transformers report errors.
  logging.set_verbosity_error()

  sampling_options = {"temperature": 0.7, "top_k": 10, "top_p": 0.95, "do_sample": True}
  llm = HuggingFaceLLM(
    model_name=model_id,
    tokenizer_name=model_id,
    context_window=3900,
    max_new_tokens=1024,
    model_kwargs={"quantization_config": quantization_config},
    generate_kwargs=sampling_options,
    device_map="auto",
  )
  return llm


def get_qa(llm, vs_index):
  """Replay the chat history, then answer a newly entered question.

  When the user submits a question, the retrieved document segments are
  listed in the sidebar, the answer is streamed into the chat and the
  (question, answer) pair is appended to ``st.session_state.messages``.

  Args:
    llm: Language model handed to the chat engine.
    vs_index: Index object providing ``as_chat_engine`` and ``as_retriever``.
  """
  # Replay the stored (question, answer) pairs; an empty history is a no-op.
  for query, res in st.session_state.messages:
    st.chat_message("user").write(query)
    st.chat_message("assistant").write(res)

  chat_engine = vs_index.as_chat_engine(chat_mode="context", llm=llm, streaming=True)
  retriever = vs_index.as_retriever()

  user_input = st.chat_input("Input your question!")
  if not user_input:
    return

  st.sidebar.title("Retrieved Document Segments for the current query:")
  for node in retriever.retrieve(user_input):
    st.sidebar.write(node.text)
    st.sidebar.write(node.metadata)

  st.chat_message("user").write(user_input)

  response = chat_engine.stream_chat(user_input)
  # Hand the generator itself to Streamlit so tokens are rendered as they
  # arrive instead of after the whole answer has been generated.
  streamed = st.chat_message("assistant").write_stream(response.response_gen)
  # write_stream gives back the full text for text-only streams; fall back to
  # joining the pieces in case it returns a list of streamed objects.
  if isinstance(streamed, str):
    answer = streamed
  else:
    answer = "".join(str(piece) for piece in streamed)

  st.session_state.messages.append((user_input, answer))


def main():
  """Set up the page and model, then run the Q&A chat once a file has been uploaded."""
  init_page()
  init_messages()
  llm = select_llm()

  documents = load_file()
  if documents is None:
    # Nothing uploaded yet: there is nothing to index or to ask about.
    return

  vs_index = init_vector_store(documents)
  st.success("File successfully uploaded and processed. You can now ask questions about its content.")
  get_qa(llm, vs_index)



# Run the app only when this file is executed as the main script, not when it is imported.
if __name__ == "__main__":
  main()


Writing app.py


To open the app through the tunnel, click on the generated `loca.lt` URL and, on the consent page, enter the IP address shown in the External URL as the password.

In [5]:
# Start the Streamlit server in the background (`&`), then expose port 8501 through LocalTunnel.
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
your url is: https://few-ghosts-throw.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.16.151.238:8501[0m
[0m
34.16.151.238





2024-10-18 20:40:45.136923: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-18 20:40:45.176328: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-18 20:40:45.188238: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLA