# LLM + RAG

Learn how to build a very simple RAG retrieving information from a folder, with any LLM, depending on your computing power.

## Initialization

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from svsvllm.utils.nb import nb_init

nb_init()

INFO | nb_init | Set current dir to llm
INFO | nb_init | You are using Python 3.10.13 (main, Mar 19 2024, 11:05:39) [Clang 15.0.0 (clang-1500.3.9.4)]


In [3]:
import os, sys
import typing as ty

## RAG

In [4]:
from langchain_core.documents import Document
from langchain_community.document_loaders.directory import DirectoryLoader

loader = DirectoryLoader(
    path=os.path.join('res', 'documents'),
    glob="*.pdf",
    recursive=True,
)

In [5]:
docs: ty.List[Document] = loader.load()

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)

chunked_docs = splitter.split_documents(docs)

In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS

# For all model names, see: https://www.sbert.net/docs/pretrained_models.html
embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

db = FAISS.from_documents(chunked_docs, embedding=embedding)

In [8]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

## LLM

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

if torch.cuda.is_available():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        # llm_int8_enable_fp32_cpu_offload=True,
    )
else:
    bnb_config = None

# model_name = "HuggingFaceH4/zephyr-7b-beta"
# model_name = "mistralai/Mistral-7B-v0.1"
model_name = "Writer/palmyra-small" # Very small model, not sure this works well

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    token=os.environ['HUGGINGFACE_TOKEN'],
)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/551M [00:00<?, ?B/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ['HUGGINGFACE_TOKEN'])

tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

## Pipeline

### LLM without RAG

In [11]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

### LLM with RAG

In [12]:
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever()

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

## Question

Ask a question:

In [16]:
question = "Deep Reinforcement Learning (Deep RL) is increasingly used to cope with the open-world assumption in service-oriented systems. Is this true?"

The quality of the answers will depend on the model you chose.

## Answers

### Without RAG

In [17]:
llm_chain.invoke({"context": "", "question": question})

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'\n</s>\n<|user|>\nDeep Reinforcement Learning (Deep RL) is increasingly usedto  cope  with  the  open-world  assumption  in  service-oriented  systems. Is this true?\n</s>\n<|assistant|>\nDeep Reinforcement Learning (Deep RL) is increasingly usedto  cope  with  the  open-world  assumption  in  service-oriented  systems. Is this true?\n</s>\n<|user|>\nDeep Reinforcement Learning (Deep RL) is increasingly usedto  cope  with  the  open-world  assumption  in  service-oriented  systems. Is this true?\n</s>\n<|assistant|>\nDeep Reinforcement Learning (Deep RL) is increasingly usedto  cope  with  the  open-world  assumption  in  service-oriented  systems. Is this true?\n</s>\n<|user|>\nDeep Reinforcement Learning (Deep RL) is increasingly usedto  cope  with  the  open-world  assumption  in  service-oriented  systems. Is this true?\n</s>\n<|assistant|>\nDeep Reinforcement Learning (Deep RL) is increasingly usedto  cope  with  the  open-world  assumption  in  service-oriented  systems. Is this

### With RAG

In [18]:
rag_chain.invoke(question)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'  <s>\n    <s>\n      <s>\n        <s>\n          <s>\n            <s>\n              <s>\n                <s>\n                  <s>\n                    <s>\n                      <s>\n                        <s>\n                           <s>\n                            <s>\n                               <s>\n                                  <s>\n                                   <s>\n                                      <'