<a href="https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/02-Basic_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages and Setup Variables


In [None]:
# Versions are pinned so the notebook keeps working as the libraries evolve.
# NOTE(review): cohere and tiktoken are installed here but not imported by any cell shown in this notebook.
!pip install -q openai==1.93.0 cohere==5.15.0 tiktoken==0.8.0 google-genai==1.23.0

In [None]:
import os

from google.colab import userdata

# Not running on Colab? Drop the google.colab import and the loop below, and set the keys by hand:
# os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
# os.environ["GOOGLE_API_KEY"] = "YOUR_GOOGLE_API_KEY"

# Copy each secret from Colab's user data into the process environment.
# OPENAI_API_KEY will be used by the OpenAI client later; GOOGLE_API_KEY is
# presumably what genai.Client() picks up (verify against the google-genai docs).
for _secret_name in ("OPENAI_API_KEY", "GOOGLE_API_KEY"):
    os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
# Controls where the chunk embeddings come from:
#   False: generate them for the dataset through the OpenAI endpoint (has an associated cost).
#   True:  load the dataset file that already contains the embedding vectors.
load_embedding = False

# Load Dataset


## Download Dataset (CSV)


The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model.


In [None]:
!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv
!wget https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles-with_embeddings.csv

## Read File


In [None]:
def split_into_chunks(text, chunk_size=1024):
    """Split ``text`` into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings in their original order. Every chunk holds
        ``chunk_size`` characters except possibly the last one. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole text,
    # and a zero step fails with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    return [text[start : start + chunk_size] for start in range(0, len(text), chunk_size)]

In [None]:
import csv

chunks = []

# The csv module documentation asks for files handed to csv.reader to be opened with
# newline="" so that line breaks inside quoted fields are passed through untouched.
with open("./mini-llama-articles.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)

    # Skip the header row; the None default keeps an empty file from raising StopIteration.
    next(csv_reader, None)

    for row in csv_reader:
        # Column 1 is the one that gets chunked — presumably the article body;
        # TODO confirm against the CSV header.
        chunks.extend(split_into_chunks(row[1]))

In [None]:
import pandas as pd

# One row per text chunk, held in a single "chunk" column.
df = pd.DataFrame({"chunk": chunks})

# Show the column labels as the cell output.
df.columns

# Generate Embedding


In [None]:
from openai import OpenAI

# Client shared by get_embedding. No key is passed explicitly; per the setup cell,
# the OPENAI_API_KEY environment variable is what the client uses.
client = OpenAI()


def get_embedding(text):
    """Return the embedding vector of ``text`` from OpenAI's text-embedding-3-small model.

    Args:
        text: The string to embed. Newlines are replaced with spaces before the request.

    Returns:
        The embedding taken from ``res.data[0].embedding``, or ``None`` if the
        preprocessing or the request fails (the error is printed).
    """
    try:
        # Remove newlines
        text = text.replace("\n", " ")
        res = client.embeddings.create(input=[text], model="text-embedding-3-small")

        return res.data[0].embedding

    # Catch Exception instead of using a bare except so that KeyboardInterrupt
    # (e.g. stopping the cell) still propagates.
    except Exception as e:
        # Report the failure: a silent None is hard to trace back from the later
        # cells that consume the embeddings.
        print(f"Embedding request failed: {e}")
        return None

In [None]:
import ast

from tqdm.notebook import tqdm
import numpy as np

# Generate embedding
if not load_embedding:
    print("Generating embeddings...")
    # Iterate over the column and pass the total explicitly: tqdm cannot size a bare
    # df.iterrows() generator, so it would show a counter instead of a progress bar.
    embeddings = [get_embedding(chunk) for chunk in tqdm(df["chunk"], total=len(df))]

    embeddings_values = pd.Series(embeddings, index=df.index)
    # Plain column assignment instead of df.insert(): insert() raises ValueError when
    # the column already exists, which made this cell fail when run a second time.
    df["embedding"] = embeddings_values

# Or, load the embedding from the file.
else:
    # Load the file as a CSV
    df = pd.read_csv("mini-llama-articles-with_embeddings.csv")
    # Convert the embedding column from its text form to arrays.
    # ast.literal_eval only accepts Python literals, so unlike eval() it cannot execute
    # code embedded in the downloaded file. The stray positional `0` previously passed
    # to apply() (it landed on the deprecated convert_dtype parameter) is dropped.
    df["embedding"] = df["embedding"].apply(lambda x: np.array(ast.literal_eval(x)))
    print("Loaded the embedding file.")

In [None]:
# Uncomment to write the dataframe to the file that the loading branch reads when
# load_embedding is True, so the embeddings do not have to be generated again.
# df.to_csv('mini-llama-articles-with_embeddings.csv')

# User Question


In [None]:
# Define the user question and convert it to an embedding with get_embedding.
QUESTION = "How many parameters LLaMA2 model has?"
QUESTION_emb = get_embedding(QUESTION)

# Show the length of the embedding vector.
# NOTE(review): get_embedding returns None on failure, in which case len() raises TypeError.
len(QUESTION_emb)

# Test Cosine Similarity


Comparing embedding vectors lets us find pieces of text that are semantically close to each other. The following example shows how the cosine similarity metric identifies which sentence is a plausible answer to the user's question: the related sentence should score noticeably higher than the unrelated one.


In [None]:
# Two candidate "sources" for the similarity demo: a sentence unrelated to the
# question, and a sentence that addresses it directly.
BAD_SOURCE_emb = get_embedding("The sky is blue.")
GOOD_SOURCE_emb = get_embedding("LLaMA2 model has a total of 2B parameters.")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Demonstrates that a relevant piece of text achieves a higher similarity score
# than a completely unrelated one. Each vector is wrapped in a list because
# cosine_similarity is called here with one list of vectors per side.
print("> Bad Response Score:", cosine_similarity([QUESTION_emb], [BAD_SOURCE_emb]))
print("> Good Response Score:", cosine_similarity([QUESTION_emb], [GOOD_SOURCE_emb]))

# Calculate Cosine Similarities


In [None]:
# Similarity between the question and every chunk embedding in the dataframe.
# The result is indexed as cosine_similarities[0] in the next cell, i.e. row 0
# holds the scores for our single question.
cosine_similarities = cosine_similarity([QUESTION_emb], df["embedding"].tolist())

print(cosine_similarities)

In [None]:
import numpy as np

number_of_chunks_to_retrieve = 3

# Index of the single highest score.
# NOTE(review): highest_index is not used by any cell shown in this notebook.
highest_index = np.argmax(cosine_similarities)

# Pick the N highest scored chunks: argsort orders ascending, so reverse it
# and keep the first N positions.
indices = np.argsort(cosine_similarities[0])[::-1][:number_of_chunks_to_retrieve]
print(indices)

In [None]:
# Look at the highest scored retrieved pieces of text.
# `indices` holds positions produced by np.argsort, so select with .iloc (positional)
# rather than the label-based df.chunk[indices], which only lines up while the
# dataframe index happens to be 0..n-1.
for rank, text in enumerate(df["chunk"].iloc[indices], start=1):
    print(f"> Chunk {rank}")
    print(text)
    print("----")

## Gemini client function

In [None]:
from google import genai
from google.genai import types

def gemini_response(system_prompt, prompt):
    """Send ``prompt`` to the gemini-2.5-flash model and return the generated text.

    Args:
        system_prompt: Passed to the request as ``system_instruction``.
        prompt: Passed to the request as ``contents``.

    Returns:
        ``response.text``, or ``None`` if creating the client or making the
        request fails (the error is printed).
    """
    try:
        # Client creation and the request live inside the try block: they are the
        # calls that can actually fail, and previously only `response.text` was guarded.
        client = genai.Client()

        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config=types.GenerateContentConfig(
                thinking_config=types.ThinkingConfig(thinking_budget=0),  # Disables thinking
                system_instruction=system_prompt,
            ),
        )

        return response.text
    except Exception as e:
        print(f"An error occurred: {e}")
        # Make the failure result explicit instead of falling off the end.
        return None

# Augment the Prompt


In [None]:
from google import genai
from google.genai import types

# Use the Gemini API to answer the question based on the retrieved pieces of text.
try:
    # Formulating the system prompt and condition the model to answer only AI-related questions.
    system_prompt = """You are an assistant and expert in answering questions from a chunks of content.
                      Only answer AI-related question, else say that you cannot answer this question."""

    # User prompt template: the first {} receives the retrieved context, the second the question.
    # The stray double quotes that used to trail the <END_OF_CONTEXT> and "Answer:" lines
    # were sent to the model verbatim and have been removed.
    prompt = """
        Read the following informations that might contain the context you require to answer the question.
        You can use the informations starting from the <START_OF_CONTEXT> tag and end with the <END_OF_CONTEXT> tag.
        Here is the content:\n\n<START_OF_CONTEXT>\n{}\n<END_OF_CONTEXT>\n\n
        Please provide an informative and accurate answer to the following question based on the avaiable context.
        Be concise and take your time. \nQuestion: {}\nAnswer:
      """

    # Add the retrieved pieces of text to the prompt. `indices` are positions from
    # np.argsort, hence the positional .iloc lookup.
    prompt = prompt.format("".join(df["chunk"].iloc[indices]), QUESTION)


    response = gemini_response(system_prompt,prompt)
    print(response)

except Exception as e:
    print(f"An error occurred: {e}")


## How Augmenting the Prompt can address knowledge cutoff limitations and hallucinations

In [None]:
# Treat the text below as if it were a chunk returned by the retrieval step.
# Source: https://ai.meta.com/blog/llama-4-multimodal-intelligence/

Example_chunk = """
# Llama 4 Technical Overview

Meta has introduced the Llama 4 model family, marking a significant advancement in open-weight multimodal AI with three distinct variants designed for different use cases. The Llama 4 Scout represents the more compact option with 17 billion active parameters utilizing 16 experts within a total parameter count of 109 billion, designed to fit on a single H100 GPU with Int4 quantization. The Llama 4 Maverick serves as the flagship model with 17 billion active parameters but employs 128 experts across 400 billion total parameters, fitting on a single H100 host and designed as the primary workhorse for general assistant and chat applications. The preview model Llama 4 Behemoth functions as a teacher model with 288 billion active parameters, 16 experts, and nearly 2 trillion total parameters, currently still in training but demonstrating state-of-the-art performance on mathematical and reasoning benchmarks.
The architecture represents Meta's first implementation of mixture-of-experts (MoE) design in the Llama series, fundamentally changing how the models process information by activating only a fraction of total parameters for each token. The MoE architecture alternates between dense and mixture-of-experts layers, with the Maverick model specifically using 128 routed experts alongside a shared expert, where each token is processed by the shared expert and routed to one of the 128 specialized experts. This design significantly improves compute efficiency during both training and inference while delivering higher quality outputs compared to dense models with equivalent computational budgets.
Native multimodality represents another breakthrough, achieved through early fusion architecture that seamlessly integrates text and vision tokens into a unified model backbone during pre-training. This approach enables joint training with massive amounts of unlabeled text, image, and video data, with the vision encoder based on MetaCLIP but trained separately in conjunction with a frozen Llama model to optimize adaptation to the language model. The models support up to 48 images during pre-training and demonstrate strong performance with up to 8 images in post-training scenarios, enabling sophisticated visual reasoning and understanding tasks across multiple input modalities.
The training infrastructure incorporates several technical innovations, including the MetaP technique for reliable hyperparameter optimization that transfers well across different model configurations and training scenarios. The models were trained on over 30 trillion tokens, representing more than double the training data of Llama 3, with support for 200 languages including over 100 languages with more than 1 billion tokens each. Training efficiency was maximized through FP8 precision without quality sacrifice, achieving 390 TFLOPs per GPU utilization on 32,000 GPUs during Behemoth pre-training, while a specialized mid-training phase enhanced core capabilities and enabled the industry-leading 10 million token context length for Scout.
Context length capabilities represent a dramatic advancement, with Llama 4 Scout supporting 10 million tokens compared to Llama 3's 128K limit through the innovative iRoPE architecture that employs interleaved attention layers without positional embeddings and inference-time temperature scaling of attention. This architecture enables superior length generalization and opens possibilities for multi-document summarization, extensive user activity parsing for personalization, and reasoning over vast codebases, with compelling performance demonstrated in retrieval tasks and cumulative negative log-likelihood evaluations over 10 million tokens of code.
The post-training pipeline underwent significant revision with a three-stage approach consisting of lightweight supervised fine-tuning, online reinforcement learning, and lightweight direct preference optimization. A critical insight involved removing over 50% of data tagged as easy using Llama models as judges, focusing training on harder examples to prevent over-constraining that could restrict exploration during online RL and lead to suboptimal performance in reasoning, coding, and mathematics. The continuous online RL strategy alternated between model training and adaptive data filtering to retain only medium-to-hard difficulty prompts, proving highly beneficial for compute and accuracy trade-offs.
Safety and bias mitigation received comprehensive attention through multi-layered approaches spanning pre-training data filtering, post-training policy conformance, and system-level safeguards. Meta introduced Generative Offensive Agent Testing (GOAT) to address traditional red-teaming limitations by simulating multi-turn interactions of medium-skilled adversarial actors, enabling more efficient vulnerability detection while allowing human experts to focus on novel adversarial areas. Particular emphasis was placed on addressing political and social bias inherent in internet training data, with Llama 4 demonstrating significant improvements over Llama 3 in presenting balanced perspectives on contentious issues without favoring particular viewpoints.
The distillation process for creating smaller models from Behemoth required novel approaches, including a dynamic loss function that weighted soft and hard targets throughout training, with codistillation during pre-training amortizing computational costs for the majority of training data. Post-training the 2 trillion parameter Behemoth model necessitated pruning 95% of supervised fine-tuning data compared to 50% for smaller models, followed by large-scale reinforcement learning focused on sampling hard prompts through pass@k analysis and constructing training curricula of increasing difficulty. The unprecedented scale required complete infrastructure overhaul, including optimized MoE parallelization and fully asynchronous online RL training framework that improved training efficiency by approximately 10x over previous generations through flexible GPU allocation and resource balancing across multiple models.
"""

In [None]:
QUESTION = "How many parameters LLaMA 4 model has?"

# Formulating the system prompt
system_prompt = """ You are an assistant and expert in answering questions from a chunks of content.
                    Only answer AI-related question, else say that you cannot answer this question."""

# User prompt template: the first {} receives the context, the second the question.
# The stray `".` that used to follow the <END_OF_CONTEXT> line was sent to the model
# verbatim and has been removed.
prompt = """Read the following informations that might contain the context you require to answer the question.
            You can use the informations starting from the <START_OF_CONTEXT> tag and end with the <END_OF_CONTEXT> tag.
            Here is the content:\n\n<START_OF_CONTEXT>\n{}\n<END_OF_CONTEXT>\n\n
            Please provide an informative and accurate answer to the following question based on the avaiable context.
            Be concise and take your time. \nQuestion: {}\nAnswer:
            """

# Use the hand-written Example_chunk as the context in place of retrieved chunks.
prompt = prompt.format(Example_chunk, QUESTION)

response = gemini_response(system_prompt,prompt)
print(response)

# Without Augmentation


In [None]:
QUESTION = "How many parameters LLaMA 4 model has?"

# Generic system prompt with no instruction to rely on supplied context.
system_prompt = "You are an assistant and expert in answering questions."

# User prompt: only the question is filled in, with no retrieved context.
prompt = "Be concise and take your time to answer the following question. \nQuestion: {}\nAnswer:".format(
    QUESTION
)

# Ask the model and show its answer.
response = gemini_response(system_prompt, prompt)
print(response)