<a href="https://colab.research.google.com/github/yashsakhuja/Introduction-to-RAG-with-LLAMA3/blob/main/Introduction_to_RAGs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
#For Embedding
!pip install sentence_transformers

!pip install llama_index
!pip install llama-index-embeddings-langchain
!pip install llama-index-llms-huggingface

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt

In [3]:
# Load the files found in /content/data into `documents`.
reader = SimpleDirectoryReader(input_dir="/content/data")
documents = reader.load_data()

In [None]:
# Preview only the first few loaded documents; displaying the whole list
# floods the cell output and bloats the saved notebook.
documents[:3]

In [5]:
# Instruction text passed to the LLM as its system prompt.
# The leading and trailing blank lines are part of the string.
system_prompt = (
    "\n"
    "\n"
    "You are a Q&A assistant. Your goal is to answer questions as\n"
    "accurately as possible based on the instructions and context provided.\n"
    "\n"
)

In [6]:
# Template that wraps each query in <|USER|> ... <|ASSISTANT|> markers before
# it is sent to the LLM; {query_str} is replaced by the query text.
# NOTE(review): the model configured later in this notebook is
# Meta-Llama-3-8B-Instruct, and these markers do not look like the Llama 3
# chat format -- confirm this wrapper is what is intended.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
# Run the Hugging Face CLI login from the notebook shell.
# NOTE(review): presumably required so the meta-llama checkpoint loaded later
# can be downloaded with an authenticated account -- confirm.
!huggingface-cli login

In [8]:
import torch

In [None]:
# Build the LLM wrapper around Meta-Llama-3-8B-Instruct. The resulting `llm`
# is passed to the service context and used to generate answers.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only takes effect when sampling, so it is
    # not passed here: combining it with do_sample=False changes nothing in the
    # output and only triggers a transformers warning.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct",
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    # loading model in 8bit for reducing memory
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
# Import the adapter from the `llama-index-embeddings-langchain` integration
# installed at the top of the notebook, rather than from the `llama_index.legacy`
# namespace, so it matches the `llama_index.core` classes used elsewhere here.
from llama_index.embeddings.langchain import LangchainEmbedding

In [None]:
# Create the sentence-transformers embedding model through LangChain, then wrap
# it in the LlamaIndex adapter.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
embed_model = LangchainEmbedding(hf_embeddings)

In [None]:
# Bundle the LLM, the embedding model and a chunk size of 1024 into a single
# ServiceContext, which is handed to the index when it is built.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

In [None]:
# Display the service context as the cell output to inspect its configuration.
service_context

In [25]:
# Build a vector store index over the loaded documents using the LLM and
# embedding settings from `service_context`.
index = VectorStoreIndex.from_documents(
    documents=documents,
    service_context=service_context,
)

In [27]:
# Obtain a query engine from the index; it is used below to ask questions.
query_engine=index.as_query_engine()

In [None]:
# Ask the query engine a question about the loaded documents.
question = "What are the rules regarding hitting the ball twice?"
response = query_engine.query(question)

In [None]:
# Print the response returned by the query engine as text.
print(response)