<a href="https://colab.research.google.com/github/vishnusureshperumbavoor/rag_apps/blob/main/rag_llama3_8b_instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import Markdown, display

# Render the notebook title as a level-1 Markdown heading.
# An ATX heading needs a space after the leading "#"; without it, strict
# Markdown renderers show the text literally instead of as a heading.
display(Markdown("# VSP's RAG app Llama3-8b-instruct"))

# VSP's RAG app Llama3-8b-instruct 

# Install packages

In [None]:
!pip install -q pypdf
!pip install -q python-dotenv
!pip install llama-index==0.10.12
!pip install -q gradio
!pip install einops
!pip install accelerate

In [None]:
!pip install llama-index-llms-huggingface llama-index-embeddings-fastembed fastembed

In [None]:
pip install transformers -U

# Hugging Face login

In [None]:
# Add your Hugging Face access token to Colab secrets and allow this notebook to access it.
# notebook_login() then authenticates this session with the Hugging Face Hub
# (presumably needed to download the Llama 3 model used later — confirm the model's access terms).
from huggingface_hub import notebook_login
notebook_login()

# Logging

In [None]:
import logging
import sys

# Send INFO-and-above records from the root logger to stdout so that library
# progress messages appear in the cell output.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger; without it basicConfig() silently does nothing when a handler exists,
# and the INFO level would never be applied.
# Exactly one stdout handler is installed: attaching a second StreamHandler on
# the same stream would print every log record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Document loading and chunking

In [None]:
from llama_index.core import SimpleDirectoryReader

# Chunking is the process of breaking a large input text into smaller pieces to
# improve retrieval efficiency and to make the text fit the input size of the
# embedding model. The documents loaded here are the raw input for that step.
#
# Before running: create a folder called 'data' under /content and upload the
# PDF into it.
documents = SimpleDirectoryReader(input_dir="/content/data").load_data()

# Models declaration

In [None]:
# Hugging Face model identifiers used throughout the notebook.

# Model that turns text into embedding vectors.
embedding_model = "BAAI/bge-small-en-v1.5"

# The LLM and its tokenizer are loaded from the same checkpoint.
llm_model = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer_model = llm_model

# Embeddings

In [None]:
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings

# Embedding is a technique for representing text as numerical vectors that can
# be fed into ML models. FastEmbed is the embedding backend that performs the
# text-to-vector conversion here, using the model named in `embedding_model`.
fastembed_model = FastEmbedEmbedding(model_name=embedding_model)
Settings.embed_model = fastembed_model

# Global chunk size used by LlamaIndex when splitting documents.
Settings.chunk_size = 512

# Vector database (VectorStoreIndex)

In [None]:
from llama_index.core import VectorStoreIndex

# A vector database provides fast retrieval and similarity search, and can
# support CRUD operations, metadata filtering, and horizontal scaling.
# VectorStoreIndex is used with its default store here: a simple in-memory
# vector store that is great for quick experimentation.

# Build the vector index from the loaded documents.
index = VectorStoreIndex.from_documents(documents=documents)

# Prompt template

In [None]:
from llama_index.core import PromptTemplate

# System-level instruction handed to the LLM.
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# Wrapper applied around the prompts that llama-index builds internally before
# they are passed to the LLM; {query_str} is replaced by that prompt text.
# NOTE(review): the <|USER|>/<|ASSISTANT|> markers do not look like the header
# tokens of the Llama 3 chat format — confirm against the model card.
query_wrapper_prompt = PromptTemplate(template="<|USER|>{query_str}<|ASSISTANT|>")

# Tokenization (Hugging Face tokenizer)

In [None]:
from transformers import AutoTokenizer

# Hugging Face tokenizer for the checkpoint named in `tokenizer_model`.
# (Alternative using tiktoken, not enabled here:
#   import tiktoken
#   Settings.tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo").encode)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

# Token ids passed to the LLM as stopping ids: the tokenizer's EOS token and
# the "<|eot_id|>" token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
stopping_ids = [tokenizer.eos_token_id, eot_token_id]

# LLM

In [None]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Register the Hugging Face model as the global LlamaIndex LLM, wired up with
# the system prompt, query wrapper, tokenizer name, and stopping ids defined in
# the earlier cells.
Settings.llm = HuggingFaceLLM(
    context_window=8192,
    max_new_tokens=256,
    # Greedy decoding. No "temperature" is passed: temperature only applies
    # when sampling, so combining it with do_sample=False is contradictory and
    # has no effect on the generated text.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=tokenizer_model,
    model_name=llm_model,
    device_map="auto",
    stopping_ids=stopping_ids,
    # NOTE(review): max_length (4096) is smaller than context_window (8192) —
    # confirm this difference is intentional.
    tokenizer_kwargs={"max_length": 4096},
    # Load the model weights in half precision.
    model_kwargs={"torch_dtype": torch.float16},
)

# Global chunk size used by LlamaIndex when splitting documents.
Settings.chunk_size = 512

# Query engine

In [None]:
# The query engine takes a query string, uses it to fetch relevant context from the index,
# and then sends both as a prompt to the LLM to generate the final natural-language response.
query_engine = index.as_query_engine()

# User Interface (gradio)

In [None]:
import gradio as gr

def predict(input, history):
    """Answer one chat message using the notebook's query engine.

    Args:
        input: The user's message; passed unchanged to ``query_engine.query``.
        history: Earlier chat turns supplied by the chat UI. Not used.

    Returns:
        The query engine's response converted to ``str``.
    """
    return str(query_engine.query(input))

# Wrap predict() in a Gradio chat UI and start it; share=True asks Gradio for
# a shareable link in addition to the local one.
chat_ui = gr.ChatInterface(fn=predict)
chat_ui.launch(share=True)