In [None]:
# import yaml

# # load config
# with open("config.yaml", "r") as file:
#     config = yaml.safe_load(file)

: 

## extracting

In [None]:
import fitz  # PyMuPDF

def extract_text_with_layout(pdf_path):
    """Extract the text of every page of a PDF, block by block.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A single string containing the text of each block, in the order
        PyMuPDF yields pages and blocks, with a newline appended after
        every block.
    """
    parts = []
    # The context manager closes the document even if extraction fails
    # part-way through; the original left the file handle open.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Pull the text block by block to roughly preserve the page layout.
            for block in page.get_text("blocks"):
                # Element 4 of a block tuple is the block's text.
                parts.append(block[4] + "\n")
    # Join once instead of growing a string inside the loop.
    return "".join(parts)



In [None]:
import re

def clean_text(text, keep_newlines=False):
    """Normalise whitespace in extracted text and drop dot-leader page numbers.

    Args:
        text: Raw text to clean.
        keep_newlines: When False (the default) every run of whitespace,
            including line breaks, becomes a single space, so the result is
            one line. When True, only horizontal whitespace is collapsed and
            runs of line breaks are reduced to a single ``"\\n"``.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Strip page numbers such as "4037" that follow a run of leader dots at
    # the end of a line (table-of-contents style entries).
    text = re.sub(r'\.{3,}\s*\d+\n', '\n', text)

    if keep_newlines:
        # Collapse spaces/tabs/etc. but leave line breaks alone ...
        text = re.sub(r'[^\S\n]+', ' ', text)
        # ... then squeeze blank lines and the spaces around a line break.
        text = re.sub(r' ?\n[ \n]*', '\n', text)
    else:
        # Collapsing all whitespace already removes every newline, so the
        # former follow-up `\n+` substitution could never match and is gone.
        text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [30]:
# Extract the book's text and normalise it in one step, so `raw_text` is
# only ever bound to the cleaned version.
path = "doc/Chip Huyen - AI Engineering.pdf"
raw_text = clean_text(text=extract_text_with_layout(path))

In [31]:
# Show only the beginning of the text; displaying the whole book bloats the
# notebook file and buries the rest of the narrative.
raw_text[:1000]



## chunking

In [32]:
def chunk_text_by_words(text, max_words=50, overlap=10):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        max_words: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks.
            Must satisfy ``0 <= overlap < max_words``.

    Returns:
        A list of chunks, each the words of one window joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_words`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the window never advance.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_words
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the last word, stop: the next window would
        # contain only words already present in this chunk.
        if end >= len(words):
            break
    return chunks

In [33]:
# 500-word windows that share 100 words with their predecessor.
chunks = chunk_text_by_words(raw_text, max_words=500, overlap=100)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 434


In [34]:
# Spot-check five consecutive chunks (indices 100-104).
chunks[100:105]

['agreed that hallucination is the biggest blocker for many AI enterprise use cases. The Probabilistic Nature of AI The way AI models sample their responses makes them probabilistic. Let’s go over an example to see what being probabilistic means. Imagine that you want to know what’s the best cuisine in the world. If you ask your friend this question twice, a minute apart, your friend’s answers both times should be the same. If you ask an AI model the same question twice, its answer can change. If an AI model thinks that Vietnamese cuisine has a 70% chance of being the best cuisine in the world and Ital‐ ian cuisine has a 30% chance, it’ll answer “Vietnamese cuisine” 70% of the time and “Italian cuisine” 30% of the time. The opposite of probabilistic is deterministic, when the outcome can be determined without any random variation. This probabilistic nature can cause inconsistency and hallucinations. Inconsistency is when a model generates very different responses for the same or slight

## embedding

In [23]:
# from sentence_transformers import SentenceTransformer

# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# # Embed text
# embeddings = embedding_model.encode(chunks)
# print(f"Vector length: {len(embeddings[0])}")

In [35]:
import ollama

# One embedding vector per chunk, in the same order as `chunks`.
embeddings = [
    ollama.embeddings(model="nomic-embed-text", prompt=chunk).embedding
    for chunk in chunks
]

print(f"Number of embeddings: {len(embeddings)}")

Number of embeddings: 434


## store in db

### chroma local

In [46]:
import chromadb

# Client backed by the local ./vector_database directory.
chroma_client = chromadb.PersistentClient(path="./vector_database")

# Reuse the collection if it already exists, otherwise create it.
collection = chroma_client.get_or_create_collection(name="my_local_collection")

In [47]:
# Upsert rather than add: the ids are deterministic (doc_0, doc_1, ...), so
# re-running this cell against the persistent collection overwrites the
# entries stored under those ids instead of attempting duplicate inserts.
collection.upsert(
    ids=[f"doc_{i}" for i in range(len(chunks))],  # unique IDs
    embeddings=embeddings,  # one vector per chunk (list of lists)
    documents=chunks,  # chunk text, read back from query results
    metadatas=[{"source": "example"} for _ in chunks],  # placeholder metadata
)

## retrieve

In [69]:
query = "How to evaluate rag system?"

# Embed the query with the same model that embedded the chunks.
query_embedding = ollama.embeddings(model="nomic-embed-text", prompt=query)

# Search the local Chroma collection. The query vector is passed as an
# explicit batch of one, matching the list-of-lists shape used on insert.
results = collection.query(
    query_embeddings=[query_embedding.embedding],
    n_results=5,
)

# results['documents'] holds one list of documents per query vector;
# there is a single query here, so take the first (only) list.
contexts = results['documents'][0]
context_text = "\n".join(contexts)

In [70]:
# Inspect the retrieved chunks (the query above asks for at most 5).
contexts[:10]

['add knowledge.” outperformed RAG with finetuned models, as shown in Table 7-2. This finding indi‐ cates that while finetuning can enhance a model’s performance on a specific task, it may also lead to a decline in performance in other areas. Table 7-2. RAG outperforms finetuning on a question-answering task about current events, curated by Ovadia et al. (2024). FT-reg and FT-par refer to two different finetuning approaches the author used. Base model Base model + RAG FT-reg FT-par FT-reg + RAG FT-par + RAG Mistral-7B 0.481 0.875 0.504 0.588 0.810 0.830 Llama 2-7B 0.353 0.585 0.219 0.392 0.326 0.520 Orca 2-7B 0.456 0.876 0.511 0.566 0.820 0.826 On the other hand, if the model has behavioral issues, finetuning might help. One behavioral issue is when the model’s outputs are factually correct but irrelevant to the task. For example, you ask the model to generate technical specifications for a soft‐ ware project to provide to your engineering teams. While accurate, the generated specs lac

# call llm

In [72]:
# System prompt that restricts the model to the retrieved context and tells
# it what to say when the context is insufficient.
system_prompt = """You are a precise and reliable assistant that answers questions using only the provided context.

Instructions:
- Use the context text to answer the question as accurately as possible.
- If the context does not contain enough information, say: "I don't have enough information from the context."
- Do not include outside knowledge or assumptions.
- Keep your answer concise and factual.
- Use clear sentences.
"""

In [71]:
# Build the user prompt: the retrieved context followed by the question.
# (No model is called here; the prompt is sent in the cells below.)
prompt = f"""
Context:
{context_text}

Question:
{query}

Answer:"""

## llama

In [73]:
# Project-local wrapper; its behaviour is not visible in this notebook.
from package.ollama import OllamaChat

# Chat completion
chat = OllamaChat(model_name="llama3.2")
# Single-turn conversation: only the RAG prompt is sent as a user message.
# The commented entries show how earlier turns could be supplied.
messages = [
    # chat.UserMessage("Explain quantum computing"),
    # chat.AIMessage("Quantum computing uses..."),
    chat.UserMessage(prompt)
]
# NOTE(review): presumably returns the model's reply as printable text -
# confirm against package.ollama.
response = chat.run(system_prompt, messages)
print(response)

To evaluate a RAG (Retrieval-Augmentation-Generation) system, you can use metrics such as:

1. Context precision: Out of all the documents retrieved, what percentage is relevant to the query?
2. Context recall: Out of all the documents that are relevant to the query, what percentage is retrieved?

These metrics can be computed by curating an evaluation set with a list of test queries and a set of documents, annotating each document as relevant or not relevant to the query, and then computing precision and recall scores.

Additionally, if you care about the ranking of the retrieved documents, you can use metrics such as:

1. NDCG (Normalized Discounted Cumulative Gain)
2. MAP (Mean Average Precision)
3. MRR (Mean Reciprocal Rank)

These metrics evaluate how well the retriever ranks relevant documents in order of relevance.

It's also important to evaluate the quality of your embeddings, which can be done independently or by how well they work for specific tasks, such as the MTEB benchma

## gemini

In [None]:
import os

import google.generativeai as genai

# Resolve the API key. `config` only exists if the YAML-loading cell at the
# top of the notebook has been run; that cell is currently commented out, so
# on a fresh kernel the lookup raised NameError. Keep using `config` when it
# is defined and otherwise fall back to the GOOGLE_API_KEY environment variable.
# NOTE(review): the key is spelled 'genimi_api' - confirm it matches config.yaml.
try:
    gemini_api_key = config['genimi_api']
except NameError:
    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        raise RuntimeError(
            "No Gemini API key available: run the config-loading cell or "
            "set the GOOGLE_API_KEY environment variable."
        ) from None

# Configure the SDK
genai.configure(api_key=gemini_api_key)

# Create the Gemini 2.5 Flash model
gemini_model = genai.GenerativeModel("gemini-2.5-flash")

In [None]:
# Send the system prompt and the RAG prompt to Gemini as one concatenated
# string. Note that this rebinds `response`, previously the llama reply.
response = gemini_model.generate_content(system_prompt + prompt)

print(response.text)