# RAG

[![YouTube Video](https://img.youtube.com/vi/qppV3n3YlF8/0.jpg)](https://www.youtube.com/watch?v=qppV3n3YlF8)

More information: https://medium.com/@Nirodya_Pussadeniya/retrieval-augmented-generation-rag-a594873b9a96


In [6]:
!gdown https://drive.google.com/file/d/1-95Y5kMQtDUH6CkLLA5bmRMNgTP82e23/view?usp=drive_link

Downloading...
From: https://drive.google.com/file/d/1-95Y5kMQtDUH6CkLLA5bmRMNgTP82e23/view?usp=drive_link
To: /content/view?usp=drive_link
89.1kB [00:00, 2.41MB/s]


In [7]:
!pip install langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install unstructured
!pip install chromadb
!pip install llama-cpp-python



In [8]:
from torch import cuda
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

In [14]:
%cd /content
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader('data')
data = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

/content


In [15]:
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embed_model)

In [16]:
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
n_gpu_layers = 32  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [20]:
#llama = LlamaCppEmbeddings(model_path="/data/llama.cpp/models/llama-2-7b-chat/ggml-model-q4_0.bin")
llm = LlamaCpp(
    model_path="/content/ggml-model-q4_k_m.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=False,
)


ValidationError: 1 validation error for LlamaCpp
__root__
  Could not load Llama model from path: /content/ggml-model-q4_k_m.gguf. Received error Model path does not exist: /content/ggml-model-q4_k_m.gguf (type=value_error)

In [None]:
question = "what is pesticides?"
docs = vectorstore.similarity_search(question)
#result = llm_chain(docs)
docs

# Main RAG scripts

In [None]:
from langchain.chains import RetrievalQA
import json
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever()
)



In [None]:
out = rag_pipeline("what are the curriculum")
output = dict(out)
out_new = json.dumps(output)
out1 = json.loads(out_new)
final_response = out1['result']




# Deployment as an OpenAI Compatible API

## Install vLLM + Haystack

- we install vLLM using pip ([docs](https://docs.vllm.ai/en/latest/getting_started/installation.html))
- for production use cases, there are many other options, including Docker ([docs](https://docs.vllm.ai/en/latest/serving/deploying_with_docker.html))

In [None]:
!pip install vllm haystack-ai

In [None]:
# we prepend "nohup" and postpend "&" to make the Colab cell run in background
! nohup python -m vllm.entrypoints.openai.api_server \
                  --model /content/final_weights_new \
                  --dtype auto \
                  --max-model-len 2048 \
                  > vllm.log &

In [None]:
# we check the logs until the server has been started correctly
!while ! grep -q "Application startup complete" vllm.log; do tail -n 1 vllm.log; sleep 5; done

In [None]:
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret
import string
import random

# initializing size of string
N = 20

# using random.choices()
# generating random strings
res = ''.join(random.choices(string.ascii_uppercase +
                             string.digits, k=N))

generator = OpenAIChatGenerator(
    api_key=Secret.from_token(res),  # for compatibility with the OpenAI API, a placeholder api_key is needed
    model="/content/final_weights_new",
    api_base_url="http://localhost:8000/v1",
    generation_kwargs = {"max_tokens": 1024}
)
