# Building a RAG pipeline with llama.cpp

## Imports

In [9]:
#import pypdf to handle pdf documents, and all needed libraries
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
import requests
from tqdm import tqdm
import time

In [4]:
import sys

In [10]:
sys.path

['/opt/homebrew/lib/python3.10/site-packages',
 '/Library/Frameworks/Python.framework/Versions/3.10/lib/python310.zip',
 '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10',
 '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/lib-dynload',
 '',
 '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages']

import sys
sys.path.append(0,"../") 

## Local LLM


Llama 2 7B quantized model, available from Hugging Face

**Below was run already!!**

model_path = "llama-2-7b-chat.Q4_K_M.gguf"
 
if not os.path.exists(model_path):
    print(f"Downloading {model_path}...")
    # You may want to replace the model URL by another of your choice
    model_url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
    response = requests.get(model_url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    
    with open(model_path, 'wb') as f:
        for data in tqdm(response.iter_content(chunk_size=1024), total=total_size//1024):
            f.write(data)
    print("Download complete!")

We now need to set up another major component in any RAG system: the document base. In this example, we will create a mechanism to read documents in multiple formats, including .doc and .txt, and for simplicity we will provide a default sample text document built on the fly, adding it to our newly created documents directory, docs. To try it yourself with an extra level of fun, make sure you load actual documents of your own.

In [14]:

os.makedirs("docs", exist_ok=True)
 
# Sample text for demonstration purposes
with open("docs/sample.txt", "w") as f:
    f.write("""
    Retrieval-Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches
    for natural language processing tasks. It involves retrieving relevant information from a knowledge base and then 
    using that information to generate more accurate and informed responses.
    
    RAG models first retrieve documents that are relevant to a given query, then use these documents as additional context
    for language generation. This approach helps to ground the model's responses in factual information and reduces hallucinations.
    
    The llama.cpp library is a C/C++ implementation of Meta's LLaMA model, optimized for CPU usage. It allows running LLaMA models
    on consumer hardware without requiring high-end GPUs.
    
    LocalAI is a framework that enables running AI models locally without relying on cloud services. It provides APIs compatible
    with OpenAI's interfaces, allowing developers to use their own models with the same code they would use for OpenAI services.
    """)
 
documents = []
for file in os.listdir("docs"):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join("docs", file))
        documents.extend(loader.load())
    elif file.endswith(".txt"):
        loader = TextLoader(os.path.join("docs", file))
        documents.extend(loader.load())
 
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
 
chunks = text_splitter.split_documents(documents)

Notice that after processing the documents, we split them into chunks, which is a common practice in RAG systems for enhancing retrieval accuracy and ensuring the LLM effectively processes manageable inputs within its context window.

Both LLMs and RAG systems need to handle numerical representations of text rather than raw text, therefore, we next build a vector store that contains embeddings of our text documents. Chroma is a lightweight, open-source vector database for efficiently storing and querying embeddings.

In [15]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db"
)


Now llama.cpp enters the scene for initializing our previously downloaded LLM. To do this, a LlamaCpp object is instantiated with the model path and other settings like model temperature, maximum context length, and so on.

In [16]:
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.7,
    max_tokens=2000,
    n_ctx=4096,
    verbose=False
)


llama_init_from_model: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h96           (not supported)
ggml_me

We are getting closer to the inference show, and just a few actors remain to appear on stage. One is the RAG prompt template, which is an elegant way to define how the retrieved context and user query are combined into a single, well-structured input for the LLM during inference.

In [17]:

template = """
Answer the question based on the following context:
 
{context}
 
Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [18]:
#put everything together to create our RAG pipeline based on llama.cpp.

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

Let’s review the building blocks of the RAG pipeline we just created for a better understanding:

llm: the LLM downloaded and then initialized using llama.cpp
chain_type: a method to specify how the retrieved documents in an RAG system are put together and sent to the LLM, with "stuff" meaning that all retrieved context is injected in the prompt.
retriever: initialized upon the vector store and configured to get the three most relevant document chunks.
return_source_documents=True: used to obtain information about which document chunks were used to answer the user’s question.
chain_type_kwargs={"prompt": prompt}: enables the use of our recently defined custom template to format the retrieval-augmented input into a presentable format for the LLM.
To finalize and see everything in action, we define and utilize a pipeline-driving function, ask_question(), that runs the RAG pipeline to answer the user’s questions.

In [19]:
def ask_question(question):
    start_time = time.time()
    result = rag_pipeline({"query": question})
    end_time = time.time()
    
    print(f"Question: {question}")
    print(f"Answer: {result['result']}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")
    print("\nSource documents:")
    for i, doc in enumerate(result["source_documents"]):
        print(f"Document {i+1}:")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"Content: {doc.page_content[:150]}...\n")
#Now let’s try out our pipeline with some specific questions.

ask_question("What is RAG and how does it work?")
ask_question("What is llama.cpp?")
ask_question("How does LocalAI relate to cloud AI services?")


  result = rag_pipeline({"query": question})


Question: What is RAG and how does it work?
Answer: RAG is a technique that combines retrieval-based and generation-based approaches for natural language processing tasks. It involves retrieving relevant information from a knowledge base and then using that information to generate more accurate and informed responses. The model first retrieves documents that are relevant to a given query, and then uses these documents as additional context for language generation. This approach helps to ground the model's responses in factual information and reduces hallucinations.
Time taken: 932.68 seconds

Source documents:
Document 1:
Source: docs/sample.txt
Content: Retrieval-Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches
    for natural language processing ...

Document 2:
Source: docs/sample.txt
Content: on consumer hardware without requiring high-end GPUs.
    
    LocalAI is a framework that enables running AI models locally without rely

# Implementing with webscraped data

In [22]:
import os
import pandas as pd
import numpy as np
import json

In [23]:
data_analyst_jobs_df = pd.read_csv('data_analyst_positions_2025-04-23')

In [40]:
data_analyst_jobs_df= data_analyst_jobs_df[:2]

In [41]:

os.makedirs("docs", exist_ok=True)
with open('docs/data.txt', 'w') as f:
    for row in data_analyst_jobs_df.to_dict('records'):
        f.write(json.dumps(row) + '\n')

In [44]:
documents = []
for file in os.listdir("docs"):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join("docs", file))
        documents.extend(loader.load())
    elif file.endswith(".txt"):
        loader = TextLoader(os.path.join("docs", file))
        documents.extend(loader.load())
 
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
 
chunks = text_splitter.split_documents(documents)

In [45]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
 
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db"
)
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.7,
    max_tokens=2000,
    n_ctx=4096,
    verbose=False
)

llama_init_from_model: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h96           (not supported)
ggml_me

In [46]:
template = """
Answer the question based on the following context:
 
{context}
 
Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [47]:
#put everything together to create our RAG pipeline based on llama.cpp.

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [48]:
def ask_question(question):
    start_time = time.time()
    result = rag_pipeline({"query": question})
    end_time = time.time()
    
    print(f"Question: {question}")
    print(f"Answer: {result['result']}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")
    print("\nSource documents:")
    for i, doc in enumerate(result["source_documents"]):
        print(f"Document {i+1}:")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"Content: {doc.page_content[:150]}...\n")
#Now let’s try out our pipeline with some specific questions.


In [49]:
ask_question("What kind of companies are hiring")

KeyboardInterrupt: 