In [None]:
# Make the project root importable so that the `src` package can be imported from this notebook
import sys
import os

# Absolute path of the parent of the current working directory (one level up from notebooks/)
project_root = os.path.abspath("..")

# Guard the append: re-running this cell must not keep adding duplicate entries to sys.path
if project_root not in sys.path:
    sys.path.append(project_root)

In [57]:
# Set up the models used by the notebook: a chat LLM and an embedding model
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from src.config import Config
from google.genai import types

# Single source of truth for the response-length cap, so both places below stay in sync
MAX_OUTPUT_TOKENS = 3000

llm = GoogleGenAI(
    model = Config.CHAT_LLM,
    api_key = Config.GOOGLE_API_KEY,
    generation_config = types.GenerateContentConfig(
        # thinking_budget = 0 is intended to switch the model's thinking phase off
        thinking_config = types.ThinkingConfig(thinking_budget = 0),
        temperature = 0.2,
        # NOTE(review): the cap is set here as well because the top-level `max_tokens`
        # may not be merged into an explicitly supplied generation_config, depending on
        # the integration version; confirm against the installed release.
        max_output_tokens = MAX_OUTPUT_TOKENS,
    ),
    # Kept for backward compatibility with the original call
    max_tokens = MAX_OUTPUT_TOKENS
)

# Embedding model; the model name comes from the project configuration
embed_model = HuggingFaceEmbedding(
    model_name = Config.EMBEDDING_MODEL
)

2025-12-11 10:21:14,418 - INFO - HTTP Request: GET https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite "HTTP/1.1 200 OK"
2025-12-11 10:21:14,421 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-small


In [51]:
# Simple RAG setup: load the local documents and build a vector index over them
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

docs_path = "../documents"

# 1) Load the files found under docs_path into a list of Document objects
#    (each one carries id_, metadata and text attributes).
directory_reader = SimpleDirectoryReader(input_dir=docs_path)
documents = directory_reader.load_data()

# 2) Build the index from these Document objects.
#    The documents are parsed into Node objects (text, embeddings, metadata, relationships);
#    one document may be split into several nodes, and the links between those nodes
#    are recorded on the nodes themselves.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 23/23 [00:00<00:00, 1279.43it/s]
Generating embeddings: 100%|██████████| 30/30 [00:01<00:00, 17.01it/s]


In [58]:
# nest_asyncio patches asyncio to permit nested event loops; presumably required here
# because the notebook kernel already runs one.
import nest_asyncio

nest_asyncio.apply()

# The question to answer from the indexed documents
user_query = "Tell me about attention block in LLMs briefly"

# 3) Wrap the index in a query engine that uses `llm` to generate answers.
query_engine = index.as_query_engine(llm=llm)

# 4) Run the user's question through the engine and show the answer.
response = query_engine.query(user_query)
print(response)

2025-12-11 10:21:23,926 - INFO - AFC is enabled with max remote calls: 10.
2025-12-11 10:21:26,089 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"


The Transformer model utilizes multi-head attention in three distinct ways. In "encoder-decoder attention" layers, queries originate from the preceding decoder layer, while keys and values are sourced from the encoder's output. This configuration enables every position in the decoder to examine all positions within the input sequence, mirroring conventional encoder-decoder attention mechanisms found in sequence-to-sequence models.

The encoder incorporates self-attention layers where keys, values, and queries all stem from the output of the encoder's prior layer. This allows each position in the encoder to attend to all positions in the preceding encoder layer.

Similarly, self-attention layers within the decoder permit each position to attend to all preceding positions in the decoder, including itself. To maintain the auto-regressive property, the model prevents information flow from right to left in the decoder by masking out (setting to negative infinity) values in the softmax input