# RAG using Meta AI Llama-3


<img src="./resources/rag_architecture.png" alt="RAG architecture diagram" width="800">

In [2]:
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Patch asyncio so that nested access to the event loop is allowed.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop that later calls need to re-enter — confirm.
nest_asyncio.apply()

In [4]:
# Directory that holds the source documents; add your files here (drag & drop works).
# load_data() reads this path recursively and only picks up files with a ".pdf" extension.
input_dir_path = './documents'

In [13]:

# setup llm & embedding model
def load_model(
    llm_name="hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:latest",
    embed_name="BAAI/bge-large-en-v1.5",
    request_timeout=120.0,
):
    """Create the LLM client and the embedding model used by the RAG pipeline.

    Calling ``load_model()`` with no arguments builds exactly the same objects
    as before; the parameters only exist so another model can be tried without
    editing this function.

    Args:
        llm_name: Value passed to ``Ollama(model=...)``.
        embed_name: Value passed to ``HuggingFaceEmbedding(model_name=...)``.
            ``"Snowflake/snowflake-arctic-embed-m"`` is the alternative that
            was previously kept here as a commented-out line.
        request_timeout: Value passed to ``Ollama(request_timeout=...)``.

    Returns:
        A ``(llm, embed_model)`` tuple.
    """
    llm = Ollama(model=llm_name, request_timeout=request_timeout)
    # NOTE(review): trust_remote_code=True is carried over unchanged from the
    # original call; confirm the selected embedding model actually requires it.
    embed_model = HuggingFaceEmbedding(model_name=embed_name, trust_remote_code=True)
    return llm, embed_model

In [16]:
# load data
def load_data(llm, embed_model, input_dir=None):
    """Index the PDF documents in a directory and return a query engine over them.

    Args:
        llm: Object assigned to ``Settings.llm`` before the query engine is built.
        embed_model: Object assigned to ``Settings.embed_model`` before the
            index is built.
        input_dir: Directory to read documents from. When ``None`` (the
            default) the notebook-level ``input_dir_path`` is used, which is
            the original behaviour.

    Returns:
        The query engine produced by ``index.as_query_engine()``, with its
        ``response_synthesizer:text_qa_template`` prompt replaced by the
        custom template defined below.

    Note:
        This function overwrites the global ``Settings.embed_model`` and
        ``Settings.llm``.
    """
    if input_dir is None:
        input_dir = input_dir_path

    # Only ".pdf" files are loaded; sub-directories are searched as well.
    loader = SimpleDirectoryReader(
        input_dir=input_dir,
        required_exts=[".pdf"],
        recursive=True,
    )
    docs = loader.load_data()

    # Build the vector index over the loaded documents.
    Settings.embed_model = embed_model
    index = VectorStoreIndex.from_documents(docs, show_progress=True)

    # Create the query engine with default settings (no reranker is configured).
    Settings.llm = llm
    query_engine = index.as_query_engine()

    # ====== Customise prompt template ======
    qa_prompt_tmpl_str = (
        "Context information is below.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given the context information above I want you to think step by step to answer the query in a crisp manner, in case you don't know the answer say 'I don't know!'.\n"
        "Query: {query_str}\n"
        "Answer: "
    )
    qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)

    query_engine.update_prompts(
        {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
    )
    return query_engine

# Generate the response

Parsing nodes: 100%|██████████| 2/2 [00:00<00:00, 56.26it/s]
Generating embeddings: 100%|██████████| 2/2 [00:06<00:00,  3.34s/it]


In [17]:
def inference(
    query_engine,
    query="Where is the overlap in experiences between Sumanth and Yash",
):
    """Run one question through the query engine and return the answer as Markdown.

    Args:
        query_engine: Object exposing ``query(str)``, e.g. the value returned
            by ``load_data``.
        query: Question to ask. Defaults to the question that was previously
            hard-coded here, so ``inference(query_engine)`` behaves as before.

    Returns:
        An ``IPython.display.Markdown`` wrapping ``str(response)``; when it is
        the last expression of a cell the notebook renders it.
    """
    response = query_engine.query(query)
    return Markdown(str(response))

After analyzing the experience sections of both resumes, I found that the overlap in experiences between Sumanth Kalluru and Yash V Saxena is:

* Software Engineering Intern roles:
	+ Sumanth at Wiweeki (June 2024 - August 2024) and Disruption Lab at Gies (August 2023 - June 2024)
	+ Yash at Super (May 2024 – August 2024) and Meta (May 2025 - August 2025)

Additionally, both Sumanth and Yash have experience as Software Engineers/Technology Leads/Product Managers, although the specific company names and roles are different.

I don't know if there's any further overlap in other areas.

In [7]:
# check GPU usage
# NOTE: this shells out to `nvidia-smi`; on the machine that produced the saved
# output the command was not found, so the cell printed a shell error instead
# of GPU statistics.

!nvidia-smi

zsh:1: command not found: nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
