# Installing Packages:

In [33]:
!pip install -q pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes

# for Embedding
!pip install -q sentence_transformers

#for use of llama2 from hugging face
!pip install -q llama-index-llms-huggingface

!pip install -q llama_index
!pip install -q llama-index-embeddings-huggingface
!pip install -q llama-index-embeddings-langchain


# Importing Packages:

In [34]:
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext,PromptTemplate
import torch
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding


# Loading all the documents:

In [19]:
documents=SimpleDirectoryReader("/content/drive/MyDrive/GenAI/LLama-index_RAGs/LLama-index_llama2/data").load_data()


# Preparing prompt for llama2:

In [23]:
system_prompt="""
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""
## Default format supportable by LLama2
query_wrapper_prompt="<|USER|>{query_str}<|ASSISTANT|>"

qa_template = PromptTemplate(query_wrapper_prompt)





# Hugging face login:

In [24]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store

# Loading llama2 model for our usecase:

In [29]:
llm = HuggingFaceLLM(
    context_window=4096,
    #output tokens we want
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.5, "do_sample": False},

    #passing the system prompt
    system_prompt=system_prompt,

    query_wrapper_prompt=qa_template,

    #tokenizer
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",

    #llama model
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Embedding model:

In [35]:
embed_model=LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Combining LLM, Emveddings, documents with service Context:

For querying documents we have to bundle LLM, embedding model and socuments together, for that purpose we will use service context.

In [36]:

service_context=ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)

  service_context=ServiceContext.from_defaults(


# Converting data to index:

In [37]:
index=VectorStoreIndex.from_documents(documents,service_context=service_context)


# Making query engine:
To ask questions from the index that are our documents we have to make a query engine using index.

In [38]:
query_engine=index.as_query_engine()


In [41]:
response=query_engine.query("what is vector?")




In [42]:
print(response)

A vector is an algebraic system consisting of a non-empty set V equipped with a binary operation + (vector addition) and an operation of scalar multiplication (a,v) ∈ K × V → av ∈ V, where K is a ﬁeld. In this context, vectors are represented by points in a geometric space, and the operations of vector addition and scalar multiplication are deﬁned based on the properties of the geometric space.
