In [10]:
!pip install pymilvus pymilvus[milvus_lite] datasets transformers sentence-transformers ragas

Collecting milvus-lite>=2.4.0 (from pymilvus[milvus_lite])
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl (55.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: milvus-lite
Successfully installed milvus-lite-2.5.1


In [1]:
# Load all required Libraries
import pandas as pd
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM

from sentence_transformers import SentenceTransformer

from datasets import Dataset

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Read Passages from the Datasets and Drop rows if they are NA or empty

In [2]:
passages = pd.read_parquet(
    "hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet"
)

print(passages.shape)
passages.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


# Do EDA on the passage dataset
- You can try to find the maximum and minimum length of the passages before indexing (just a direction)

In [3]:
# Analyze passage lengths
passages["length"] = passages["passage"].str.len()
print(f"Min length: {passages['length'].min()}")
print(f"Max length: {passages['length'].max()}")
print(f"Mean length: {passages['length'].mean():.2f}")
print(f"Median length: {passages['length'].median()}")

# Check for missing values
print(f"\nMissing values: {passages['passage'].isna().sum()}")

Min length: 1
Max length: 2515
Mean length: 389.85
Median length: 299.0

Missing values: 0


# Tokenize Text and Generate Embeddings using Sentence Transformers

# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [4]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode Text
embeddings = embedding_model.encode(
    passages["passage"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=64,
)

print(embeddings.shape)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

torch.Size([3200, 384])


In [5]:
# Define every column of your schema

id_ = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=False,
)

passage = FieldSchema(
    name="passage",
    dtype=DataType.VARCHAR,
    max_length=2600,
)
embedding = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=384,
)

In [8]:
schema = CollectionSchema(
    fields=[id_, passage, embedding],
    description="RAG Wikipedia passages",
    auto_id=False,
)

In [13]:
client = MilvusClient("rag_wikipedia_mini.db")

# Create the Collection with Collection Name = "rag_mini". Make sure you define the schema variable while creating the collection
client.create_collection(collection_name="rag_mini", schema=schema)

**Convert your Pandas Dataframe to a list of dictionaries**
- The Dictionary at least have 3 keys [id, passage, embedding]

In [29]:
rag_data = [
    {
        "id": idx,
        "passage": passages.iloc[idx]["passage"],
        "embedding": embeddings[idx].tolist()
    }
    for idx in range(len(passages))
]

In [30]:
# Code to insert the data to your DB
res = client.insert(collection_name="rag_mini", data=rag_data)

print(res)

{'insert_count': 3200, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

- Do a Sanity Check on your database

**Do not delete the below line during your submission**

In [None]:
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

In [32]:
import pandas as pd

queries = pd.read_parquet(
    "hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet"
)
queries

Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832
...,...,...
1710,Was Wilson president of the American Political...,Yes
1711,Did he not cast his ballot for John M. Palmer ...,Yes
1712,Did Wilson not spend 1914 through the beginnin...,Yes
1713,"Was Wilson , a staunch opponent of antisemitis...",Yes


In [36]:
query = queries.iloc[0]["question"]
print(query)

query_embedding = embedding_model.encode(query)

print(query_embedding.shape)

Was Abraham Lincoln the sixteenth President of the United States?
(384,)


#### Create Index on the embedding column on your DB

In [50]:
SEARCH_RESULTS_TO_FETCH = 1

In [43]:
index_params = MilvusClient.prepare_index_params()

# Add an index on the embedding field
index_params.add_index(
    field_name="embedding",
    index_type="FLAT",
    metric_type="COSINE",
)

# Create the index
try:
    client.create_index(
        collection_name="rag_mini",
        index_params=index_params,
    )
except Exception as e:
    print(f"Index creation result: {e}")

# Load collection into memory (required for search)
client.load_collection(collection_name="rag_mini")
print("Collection loaded into memory")

Collection loaded into memory


In [52]:
# Search the db with your query embedding
output_ = client.search(
    collection_name="rag_mini",
    data=[query_embedding.tolist()],
    output_fields=["passage", "id"],
    limit=SEARCH_RESULTS_TO_FETCH,
)

print(output_)

data: [[{'id': 2172, 'distance': 0.9814172387123108, 'entity': {'id': 2172, 'passage': 'President Wilson before Congress, announcing the break in official relations with Germany. February 3, 1917.'}}]]


## Now get the Context
- Initially use the first passage ONLY as your context
- In Later Experiments, you must try at least 2 different passage selection strategies (Top 3 / Top 5 / Top 10) and pass to your prompt

In [45]:
context = output_[0][0].entity.get("passage")
print(context)

Young Abraham Lincoln


**Develop your Prompt**

In [53]:
system_prompt = f"""
You are a helpful assistant answering questions using retrieved passages from the rag-mini-wikipedia dataset.

Rules:
1. Use the given passages as your main source. Combine info if multiple passages are relevant.
2. If passages lack enough info, say so. Only add general knowledge if you’re confident.
3. Be concise, clear, and factual. Avoid speculation or irrelevant details.
4. If a query is ambiguous, pick the meaning best supported by the passages.
5. Do not invent references or details not in the passages."
"""

In [54]:
def create_prompt(query, context):
    return f"{system_prompt} \n Context: {context}: \n Question: {query} "

# RAG Response for a Single Query

In [47]:
# Load the LLM Model you want to use
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [55]:
# Generate answer
prompt = create_prompt(query, context)
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs)

# Decode and extract answer.
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(answer)


['1917 was the year that Abraham Lincoln was born.']


# Generate Responses for all the Queries in the Dataset

In [59]:
from tqdm.auto import tqdm

results = []

for index, row in tqdm(queries.iterrows(), total=len(queries)): # Wrap the iterator with tqdm
    query = row['question']
    query_embedding = embedding_model.encode(query)

    search_results = client.search(
        collection_name="rag_mini",
        data=[query_embedding.tolist()],
        output_fields=["passage"],
        limit=SEARCH_RESULTS_TO_FETCH,
    )

    context = search_results[0][0].entity.get("passage")
    prompt = create_prompt(query, context)
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    results.append(
        {
            'question': query,
            'predicted_answer': answer,
            'ground_truth': row['answer']
        }
    )

  0%|          | 0/918 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Finding out the Basic QA Metrics (F1 score, EM score)

In [None]:
# Your code Here
...

# Advanced Evaluation using RAGAs

In [None]:
data = {
    "question": ...,  # Question
    "answer": ...,  # Generated Answer
    "contexts": ...,  # Context you pass in. You can just use top-1 here
    "ground_truths": ...,  # Reference Answer in the dataset (Human annotated)
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [None]:
# Pass the dataset above to the evaluate method in RAGAs
# Your code here
...