<a href="https://colab.research.google.com/github/syedmahmoodiagents/Agents/blob/main/Basic_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

In [None]:
import numpy as np

In [None]:
import faiss

In [None]:
import os, getpass

In [None]:
os.environ['HUGGINGFACE_TOKEN'] = getpass.getpass('Huggingface Token:')

Huggingface Token:··········


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    # print(inputs.keys())
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    embeddings = outputs.hidden_states[-1][:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)

    return embeddings

In [None]:
# retriever_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
documents = [
    "A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843.",
    "The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after \n"
    "the supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come.",
    "The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written."
]
query = "What is the significance of Christmas Eve in A Christmas Carol?"

In [None]:
query_embedding = get_embeddings([query])
document_embeddings = get_embeddings(documents)

In [None]:
document_embeddings.shape

torch.Size([3, 768])

In [None]:
# def cosine_similarity(a, b):
#     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def cosine_similarity(embedding1, embedding2):
    return torch.nn.functional.cosine_similarity(embedding1, embedding2)


In [None]:
for doc_embedding in document_embeddings:
  print(cosine_similarity(query_embedding, doc_embedding))

tensor([0.9757])
tensor([0.9959])
tensor([0.9959])


In [None]:
similarities = [cosine_similarity(query_embedding, doc_embedding) for doc_embedding in document_embeddings]

In [None]:
similarities

[tensor([0.9757]), tensor([0.9959]), tensor([0.9959])]

In [None]:
# documents

In [None]:
ranked_documents = sorted(zip(documents, similarities), key=lambda x: x[1], reverse=True)

In [None]:
ranked_documents

[("The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after \nthe supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come.",
  tensor([0.9959])),
 ('The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.',
  tensor([0.9959])),
 ('A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843.',
  tensor([0.9757]))]

In [None]:
top_documents = [doc for doc, _ in ranked_documents[:2]]
print(top_documents)

["The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after \nthe supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come.", 'The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.']


In [None]:
# Generation

In [None]:
query + " [SEP] " + " ".join(top_documents)

"What is the significance of Christmas Eve in A Christmas Carol? [SEP] The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after \nthe supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written."

In [None]:
augmented_input = query + " [SEP] " + " ".join(top_documents)

In [None]:
input_ids = tokenizer.encode(augmented_input, return_tensors="pt", padding=True, truncation=True)

In [None]:
input_ids

tensor([[ 2061,   318,   262, 12085,   286,  6786, 12882,   287,   317,  6786,
          5074,    30,   685,  5188,    47,    60,   383,  1621,  4952,   286,
         11348,   290, 24276,    88, 12119,  1734,  9107,  1446, 42407,   469,
           338, 15735,    11, 15028,    11,   290,  7016, 13389,   706,   220,
           198,  1169, 22239, 11864,   286, 12806,  1526,  1636,   290,   262,
         38389,   286,  6786, 11303,    11, 21662,    11,   290,  6430,   284,
          7911,    13,   383,   645,   303,  8466,  1138,   351,  9113,  1943,
           290,  4688, 21684,    13,   632,   318, 11987,   355,   530,   286,
           262,  6000,  6786,  3923,  1683,  3194,    13]])

In [None]:
outputs = model.generate(input_ids, max_length=150, num_beams=2, early_stopping=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
outputs

tensor([[ 2061,   318,   262, 12085,   286,  6786, 12882,   287,   317,  6786,
          5074,    30,   685,  5188,    47,    60,   383,  1621,  4952,   286,
         11348,   290, 24276,    88, 12119,  1734,  9107,  1446, 42407,   469,
           338, 15735,    11, 15028,    11,   290,  7016, 13389,   706,   220,
           198,  1169, 22239, 11864,   286, 12806,  1526,  1636,   290,   262,
         38389,   286,  6786, 11303,    11, 21662,    11,   290,  6430,   284,
          7911,    13,   383,   645,   303,  8466,  1138,   351,  9113,  1943,
           290,  4688, 21684,    13,   632,   318, 11987,   355,   530,   286,
           262,  6000,  6786,  3923,  1683,  3194,    13,   198,   198,    58,
          5188,    47,    60,   383,  1621,  4952,   286, 11348,   290, 24276,
            88, 12119,  1734,  9107,  1446, 42407,   469,   338, 15735,    11,
         15028,    11,   290,  7016, 13389,   706,   262, 22239, 11864,   286,
         12806,  1526,  1636,   290,   262, 38389,  

In [None]:
generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print(generated_response)

What is the significance of Christmas Eve in A Christmas Carol? [SEP] The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after 
the supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.

[SEP] The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after the supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come. The novella met with instant success and critical acclaim. It


**With Vectordb**

In [None]:
document_embeddings.shape

torch.Size([3, 768])

In [None]:
# Create FAISS index
index = faiss.IndexFlatL2(document_embeddings.shape[1])

In [None]:
index.is_trained

True

In [None]:
index.add(document_embeddings.numpy())

In [None]:
index.ntotal

3

In [None]:
# Retrieve information
query_embedding = get_embeddings([query])

In [None]:
distances, indices = index.search(query_embedding.detach().numpy(), k=5)

In [None]:
distances[0], indices[0]

(array([8.2615279e-03, 8.2615279e-03, 4.8689466e-02, 3.4028235e+38,
        3.4028235e+38], dtype=float32),
 array([ 1,  2,  0, -1, -1]))

In [None]:
# Get top documents
top_documents = [documents[i] for i in indices[0]]

In [None]:
top_documents

["The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after \nthe supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come.",
 'The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.',
 'A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843.',
 'The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.',
 'The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.']

In [None]:
augmented_input = query + " [SEP] " + " ".join(top_documents)

In [None]:
# Generate the response
input_ids = tokenizer.encode(augmented_input, return_tensors="pt", padding=True, truncation=True)

In [None]:
input_ids

tensor([[ 2061,   318,   262, 12085,   286,  6786, 12882,   287,   317,  6786,
          5074,    30,   685,  5188,    47,    60,   383,  1621,  4952,   286,
         11348,   290, 24276,    88, 12119,  1734,  9107,  1446, 42407,   469,
           338, 15735,    11, 15028,    11,   290,  7016, 13389,   706,   220,
           198,  1169, 22239, 11864,   286, 12806,  1526,  1636,   290,   262,
         38389,   286,  6786, 11303,    11, 21662,    11,   290,  6430,   284,
          7911,    13,   383,   645,   303,  8466,  1138,   351,  9113,  1943,
           290,  4688, 21684,    13,   632,   318, 11987,   355,   530,   286,
           262,  6000,  6786,  3923,  1683,  3194,    13,   317,  6786,  5074,
           318,   257,   645,   303,  8466,   416,  7516, 46167,    11,   717,
          3199,   287,  3576,   319,   678,  3426,  1248,  3559,    13,   383,
           645,   303,  8466,  1138,   351,  9113,  1943,   290,  4688, 21684,
            13,   632,   318, 11987,   355,   530,  

In [None]:
input_ids

tensor([[ 2061,   318,   262, 12085,   286,  6786, 12882,   287,   317,  6786,
          5074,    30,   685,  5188,    47,    60,   383,  1621,  4952,   286,
         11348,   290, 24276,    88, 12119,  1734,  9107,  1446, 42407,   469,
           338, 15735,    11, 15028,    11,   290,  7016, 13389,   706,   220,
           198,  1169, 22239, 11864,   286, 12806,  1526,  1636,   290,   262,
         38389,   286,  6786, 11303,    11, 21662,    11,   290,  6430,   284,
          7911,    13,   383,   645,   303,  8466,  1138,   351,  9113,  1943,
           290,  4688, 21684,    13,   632,   318, 11987,   355,   530,   286,
           262,  6000,  6786,  3923,  1683,  3194,    13,   317,  6786,  5074,
           318,   257,   645,   303,  8466,   416,  7516, 46167,    11,   717,
          3199,   287,  3576,   319,   678,  3426,  1248,  3559,    13,   383,
           645,   303,  8466,  1138,   351,  9113,  1943,   290,  4688, 21684,
            13,   632,   318, 11987,   355,   530,  

In [None]:
outputs = model.generate(input_ids, max_length=160, num_beams=2, early_stopping=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
generated_response

"What is the significance of Christmas Eve in A Christmas Carol? [SEP] The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after \nthe supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.\n"

**With Chunks of Data**

In [None]:
# embeddings
emb = np.random.random((10, 128)).astype('float32')

In [None]:
emb.shape

(10, 128)

In [None]:
index = faiss.IndexFlatL2(128)

In [None]:
index.add(emb)

In [None]:
# Query
quer = np.random.random((1, 128)).astype('float32')

In [None]:
# search
distances, indices = index.search(quer, k=13)

In [None]:
distances

array([[1.7885969e+01, 1.7996664e+01, 1.8398716e+01, 1.9504526e+01,
        1.9603886e+01, 2.0077065e+01, 2.0149769e+01, 2.0748947e+01,
        2.1160025e+01, 2.3337437e+01, 3.4028235e+38, 3.4028235e+38,
        3.4028235e+38]], dtype=float32)

In [None]:
indices

array([[ 8,  7,  2,  3,  0,  6,  4,  5,  1,  9, -1, -1, -1]])

In [None]:
stdoc = "The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after the supernatural visits of Jacob Marley and the Ghosts of Christmas Past, Present, and Yet to Come."

In [None]:
w = stdoc.split()

In [None]:
for i in range(0, len(w), 5):
  print(' '.join(w[i:i+5]))

The story tells of sour
and stingy Ebenezer Scrooge's ideological,
ethical, and emotional transformation after
the supernatural visits of Jacob
Marley and the Ghosts of
Christmas Past, Present, and Yet
to Come.


In [None]:
# Function to chunk text
def chunk_text(text, max_length=100):
    words = text.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    return chunks

In [None]:
chunks = []
for doc in documents:
    chunks.extend(chunk_text(doc, max_length=21))  # Adjust max_length as needed


In [None]:
chunks

['A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843.',
 "The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after the supernatural visits of Jacob Marley",
 'and the Ghosts of Christmas Past, Present, and Yet to Come.',
 'The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.']

In [None]:
# Get chunk embeddings
chunk_embeddings = get_embeddings(chunks)

In [None]:
index = faiss.IndexFlatL2(chunk_embeddings.shape[1])


In [None]:
index.add(chunk_embeddings.detach().numpy())

In [None]:
query_embedding = get_embeddings([query])
distances, indices = index.search(query_embedding.detach().numpy(), k=7)

In [None]:
indices

array([[ 1,  3,  2,  0, -1, -1, -1]])

In [None]:
# Get top chunks
top_chunks = [chunks[i] for i in indices[0]]
augmented_input = query + " [SEP] " + " ".join(top_chunks)
print("Augmented Input:", augmented_input)

Augmented Input: What is the significance of Christmas Eve in A Christmas Carol? [SEP] The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after the supernatural visits of Jacob Marley The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. and the Ghosts of Christmas Past, Present, and Yet to Come. A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written.


In [None]:
input_ids = tokenizer.encode(augmented_input, return_tensors="pt", padding=True, truncation=True)

In [None]:
outputs = model.generate(input_ids, max_length=190, num_beams=2, early_stopping=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
out = tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
out.split('[SEP]')[0]

'What is the significance of Christmas Eve in A Christmas Carol? '

In [None]:
out.split('[SEP]')[1]

" The story tells of sour and stingy Ebenezer Scrooge's ideological, ethical, and emotional transformation after the supernatural visits of Jacob Marley The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. and the Ghosts of Christmas Past, Present, and Yet to Come. A Christmas Carol is a novella by Charles Dickens, first published in London on 19 December 1843. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. The novella met with instant success and critical acclaim. It is regarded as one of the greatest Christmas stories ever written. The novella met with instant success"

In [None]:
# What is RAG, Why RAG, RAG Architecture, Why not fine tuning
# Update
# HyperLinks
# Diversified documents
# Metrics
# Hallucinations and wrong answers
# Updating the query
# Best practices
# RAG failure scenarios