<a href="https://colab.research.google.com/github/vishnusureshperumbavoor/rag_apps/blob/main/rag_phi3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import Markdown, display

# Render the notebook title as a level-1 Markdown heading. A heading marker
# must be followed by a space in CommonMark; without it ("#VSP's ...") the
# line can be rendered as plain text instead of a title.
display(Markdown("# VSP's RAG app Phi3"))

#VSP's RAG app Phi3

# Install packages

In [None]:
!pip install -q pypdf
!pip install -q python-dotenv
!pip install llama-index==0.10.12
!pip install -q gradio
!pip install einops
!pip install accelerate

In [None]:
!pip install llama-index-llms-huggingface llama-index-embeddings-huggingface

In [None]:
pip install transformers -U

# Hugging Face login (API token)

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# Add your Hugging Face API token to the Colab secrets and allow this notebook to access it.
from huggingface_hub import notebook_login
notebook_login()

# Logging

In [None]:
import logging
import sys

# Send INFO-and-above log records to stdout through exactly ONE root handler.
# The previous version called basicConfig() and then added a second
# StreamHandler, so every record was printed twice. `force=True` also removes
# any handlers that are already attached to the root logger (for example from
# an earlier run of this cell), which keeps the cell idempotent.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Chunking

In [None]:
# Chunking means breaking a large input text into smaller pieces so that
# retrieval is efficient and each piece fits the embedding model's input size.
# This cell prepares the input for that step: it only reads the PDF files
# into `documents`.
from llama_index.core import SimpleDirectoryReader

# Create a folder called 'data' and upload the pdf into that folder
input_dir_path = "/content/data"

# Read every .pdf below the input folder, including sub-folders.
loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    recursive=True,
    required_exts=[".pdf"],
)
documents = loader.load_data()


# Models declaration

In [None]:
# Hugging Face Hub ids of the models used by the notebook.
# Embedding model used to vectorise the documents:
embedding_model = "BAAI/bge-small-en-v1.5"
# Generator model; its tokenizer is taken from the same checkpoint id.
llm_model = "microsoft/Phi-3-mini-4k-instruct"
tokenizer_model = "microsoft/Phi-3-mini-4k-instruct"

# Embeddings

In [None]:
# An embedding model converts document text into numerical vectors.
# Register it, together with the chunk size, on LlamaIndex's global Settings.
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

Settings.embed_model = HuggingFaceEmbedding(
    model_name=embedding_model,
    trust_remote_code=True,
)
# Chunk size used when documents are split (presumably counted in tokens - confirm).
Settings.chunk_size = 512

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Vector database (VectorStoreIndex)

In [None]:
# Vector databases are used for fast retrieval and similarity search; they can also support CRUD operations, metadata filtering, and horizontal scaling.
from llama_index.core import VectorStoreIndex

# Create the vector store index from the documents loaded earlier.
# NOTE(review): presumably the text is chunked and embedded here using the global Settings - confirm.
index = VectorStoreIndex.from_documents(documents)

# Tokenization (huggingface tokenizer)

In [None]:
# Hugging Face tokenizer for the generator model
from transformers import AutoTokenizer

# Load the tokenizer through `tokenizer_model`, the same variable that is later
# passed to HuggingFaceLLM as `tokenizer_name`, so both always agree.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

# Token ids at which generation should stop: the regular end-of-sequence token
# plus Phi-3's end-of-turn marker "<|end|>". The previous "<|eot_id|>" is the
# Llama-3 marker and is not the turn terminator used by Phi-3, so the model's
# real end-of-turn token was never treated as a stop condition.
stopping_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|end|>"),
]

tokenizer_config.json:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Prompt template

In [None]:
from llama_index.core import PromptTemplate

# Instruction text handed to the LLM as its system prompt.
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# This wraps the default prompts that are internal to llama-index before they
# are sent to the LLM. It follows the Phi-3 instruct chat format
# ("<|user|>\n...<|end|>\n<|assistant|>\n"); the previous
# "<|USER|>...<|ASSISTANT|>" markers belong to a different model family and
# are not special tokens for Phi-3.
query_wrapper_prompt = PromptTemplate("<|user|>\n{query_str}<|end|>\n<|assistant|>\n")

# LLM

In [None]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Register Phi-3 as the global LLM used by the query engine.
Settings.llm = HuggingFaceLLM(
    # Phi-3-mini-4k-instruct has a 4096-token context (matching the tokenizer
    # max_length below); advertising 8192 allowed prompts longer than the
    # model can handle.
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only applies when sampling, so it is no
    # longer passed together with do_sample=False.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=tokenizer_model,
    model_name=llm_model,
    device_map="auto",
    stopping_ids=stopping_ids,
    tokenizer_kwargs={"max_length": 4096},
    # The checkpoint ships custom model code. Opting in explicitly replaces the
    # interactive "Do you wish to run the custom code? [y/N]" prompt, which
    # otherwise blocks an unattended "Run all".
    model_kwargs={"torch_dtype": torch.float16, "trust_remote_code": True},
)

# Same value as configured in the Embeddings cell; repeated so this cell can be
# re-run on its own without changing the setting.
Settings.chunk_size = 512

The repository for microsoft/Phi-3-mini-4k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-4k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
The repository for microsoft/Phi-3-mini-4k-instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/microsoft/Phi-3-mini-4k-instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y




`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.




Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]



Some parameters are on the meta device device because they were offloaded to the cpu.
Some parameters are on the meta device device because they were offloaded to the cpu.






Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Query engine

In [None]:
# A query engine takes the query string, uses it to fetch relevant context, and
# sends both to the LLM as one prompt to produce the final natural-language
# answer. Responses are streamed; `similarity_top_k=4` presumably means the
# four best-matching chunks are retrieved per query - confirm.
query_engine = index.as_query_engine(
    similarity_top_k=4,
    streaming=True,
)

# Response check

In [None]:
# Smoke test: ask one question about the uploaded PDF.
# The query engine is created with streaming=True, so print the tokens as they
# are generated instead of print(), which shows nothing until the whole answer
# has been produced.
query_engine.query("What is this pdf is all about").print_response_stream()



You are not running the flash-attention implementation, expect numerical differences.
You are not running the flash-attention implementation, expect numerical differences.


Exception in thread Thread-15 (generate):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 1576, in generate
    result = self._greedy_search(
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2494, in _greedy_search
    outputs = self(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/

KeyboardInterrupt: 

In [None]:
# Simple console chat: read a question, print the answer, repeat.
# Submit an empty line (or type "exit" / "quit") to leave the loop; without an
# exit condition the cell never finishes and the cells below it cannot run.
while True:
  try:
    query = input().strip()
  except (EOFError, KeyboardInterrupt):
    # No more input available, or the user interrupted the cell: stop cleanly.
    break
  if query.lower() in {"", "exit", "quit"}:
    break
  print(query_engine.query(query))

# User Interface (gradio)

In [None]:
def predict(input, history):
  """Answer one chat message using the notebook's query engine.

  Args:
    input: The user's message; passed unchanged to `query_engine.query`.
    history: Second argument of the chat callback; not used here.

  Returns:
    The query engine's response converted to `str`.
  """
  return str(query_engine.query(input))

In [None]:
import gradio as gr

# Wrap `predict` in a Gradio chat interface and start it.
# NOTE(review): share=True presumably exposes the app through a public Gradio link - confirm before sharing sensitive documents.
gr.ChatInterface(predict).launch(share=True)