<a href="https://colab.research.google.com/github/zamanmiraz/DSandML-Notebooks/blob/main/RAG/02_embedding_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the advanced-rag repository and install the dependencies it lists.
!git clone https://github.com/guyernest/advanced-rag.git
# Switch into the clone so the relative requirements.txt path below resolves.
%cd advanced-rag
!pip install --upgrade -r requirements.txt
# The next cell installs a compatible version of torchvision to address the nms error


In [None]:
# Pin torchvision to 0.18.0 (the nms-error workaround mentioned at the end of the previous cell)
!pip install torchvision==0.18.0
# -q keeps the output quiet; -U upgrades google-generativeai if it is already installed
!pip install -q -U google-generativeai

In [None]:
from rich.console import Console
from rich.style import Style
import pathlib
from rich_theme_manager import Theme, ThemeManager

def _bold(hex_color):
    """Return a bold rich Style with the given foreground colour."""
    return Style(color=hex_color, bold=True)


# Style overrides for rich's repr highlighting, one mapping per theme.
_DARK_STYLES = {
    "repr.own": _bold("#e87d3e"),             # class names
    "repr.tag_name": "dim cyan",              # tag names
    "repr.call": "bright_yellow",             # function calls and other symbols
    "repr.str": "bright_green",               # strings
    "repr.number": "bright_red",              # numbers
    "repr.none": "dim white",                 # None
    "repr.attrib_name": _bold("#e87d3e"),     # attribute names
    "repr.attrib_value": "bright_blue",       # attribute values
    "default": "bright_white on black",       # default text and background
}

_LIGHT_STYLES = {
    "repr.own": _bold("#22863a"),             # class names
    "repr.tag_name": _bold("#00bfff"),        # tag names
    "repr.call": _bold("#ffff00"),            # function calls and other symbols
    "repr.str": _bold("#008080"),             # strings
    "repr.number": _bold("#ff6347"),          # numbers
    "repr.none": _bold("#808080"),            # None
    "repr.attrib_name": _bold("#ffff00"),     # attribute names
    "repr.attrib_value": _bold("#008080"),    # attribute values
    "default": Style(color="#000000", bgcolor="#ffffff"),  # default text and background
}

THEMES = [
    Theme(
        name="dark",
        description="Dark mode theme",
        tags=["dark"],
        styles=_DARK_STYLES,
    ),
    Theme(
        name="light",
        description="Light mode theme",
        styles=_LIGHT_STYLES,
    ),
]

# Directory handed to the ThemeManager; created up front if it does not exist yet.
theme_dir = pathlib.Path("themes").expanduser()
theme_dir.mkdir(parents=True, exist_ok=True)

theme_manager = ThemeManager(theme_dir=theme_dir, themes=THEMES)
theme_manager.list_themes()

# Preview the dark theme in the cell output.
dark = theme_manager.get("dark")
theme_manager.preview_theme(dark)

In [None]:
from rich.console import Console

# Look up both registered themes; the shared console is built with the light one.
dark, light = (theme_manager.get(name) for name in ("dark", "light"))

console = Console(theme=light)

In [None]:
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

In [None]:
# Example sentence sent to the Gemini embedding call in the next cell
first_sentence = "I have no interest in politics"

In [None]:
# Gemini Embedding
import google.generativeai as genai
from google.colab import userdata

# Read the API key from the Colab secrets store and hand it to the Gemini client.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Embed the example sentence with the text-embedding-004 model.
embed_request = {
    "model": "models/text-embedding-004",
    "content": first_sentence,
}
result = genai.embed_content(**embed_request)

print(result["embedding"])

In [None]:
from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-embedding model by name
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Two sentences that both contain the word "interest", used in different senses.
first_sentence, second_sentence = (
    "I have no interest in politics",
    "The bank's interest rate is too high",
)

In [None]:
# Tokenize the first sentence as a one-item batch and show the raw tokenizer output.
tokenized_first_sentence = model.tokenize([first_sentence])
console.rule(first_sentence)
console.print(tokenized_first_sentence)

In [None]:
# Tokenize the second sentence as a one-item batch and show the raw tokenizer output.
tokenized_second_sentence = model.tokenize([second_sentence])
console.rule(second_sentence)
console.print(tokenized_second_sentence)

In [None]:
# Map each sentence's token ids back to token strings (first sentence, then second).
sentence_tokens = tuple(
    model.tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    for encoded in (tokenized_first_sentence, tokenized_second_sentence)
)
console.rule("Tokens")
console.print(sentence_tokens)

In [None]:
# (token, id) pairs of the tokenizer vocabulary; kept as a view so the next cell can sort it.
vocabulary = model.tokenizer.get_vocab().items()
console.rule("Vocabulary")
# Only the first 20 entries are shown to keep the output short.
console.print(list(vocabulary)[:20])

In [None]:
# Order the vocabulary by token id and keep just the token strings.
sorted_vocabulary = sorted(vocabulary, key=lambda item: item[1])
sorted_tokens = [token for token, _token_id in sorted_vocabulary]

# Show the tokens whose ids surround a single token of interest.
focused_token = "interest"
index = sorted_tokens.index(focused_token)
# Clamp the start so a token near the beginning of the vocabulary does not
# turn into a negative (wrap-around) slice start.
window_start = max(index - 10, 0)
console.print(sorted_tokens[window_start : index + 10])

In [None]:
# Print the SentenceTransformer's repr to inspect how it is put together
console.print(model)

In [None]:
# Keep a handle on the pipeline's first module and display the model it wraps (auto_model)
first_module = model._first_module()
console.print(first_module.auto_model)

In [None]:
# The embeddings sub-module of the wrapped model; later cells call its word_embeddings layer
embeddings = first_module.auto_model.embeddings
console.print(embeddings)

In [None]:
# Materialise the model's modules as a list; show how many there are and the first one.
ms = list(model)
len(ms), ms[0]

In [None]:
import torch

# Use the device that already holds the embedding weights. This keeps the token
# ids and the embedding table co-located whether the model was loaded on CPU,
# CUDA or MPS, and avoids the deprecated torch.has_mps flag.
device = embeddings.word_embeddings.weight.device

# Tokenize both texts (each as a one-item batch).
first_tokens = model.tokenize([first_sentence])
second_tokens = model.tokenize([second_sentence])

# Look up the input embeddings of every token; no gradients are needed.
with torch.no_grad():
    first_embeddings = embeddings.word_embeddings(
        first_tokens["input_ids"].to(device)
    )
    second_embeddings = embeddings.word_embeddings(
        second_tokens["input_ids"].to(device)
    )

console.print(first_embeddings.shape, second_embeddings.shape)

In [None]:
from sentence_transformers import util
import altair as alt
import pandas as pd

# Pairwise cosine similarity between the input token embeddings of both sentences,
# moved to a CPU NumPy array for plotting.
distances = (
    util.cos_sim(first_embeddings.squeeze(), second_embeddings.squeeze())
    .cpu()
    .numpy()
)

# Token strings used as axis labels (rows: first sentence, columns: second sentence).
y_labels = model.tokenizer.convert_ids_to_tokens(first_tokens["input_ids"][0])
x_labels = model.tokenizer.convert_ids_to_tokens(second_tokens["input_ids"][0])

# Long-form table for Altair: one row per (second-sentence token, first-sentence token) pair.
rows = [
    (col_token, row_token, distances[r, c])
    for r, row_token in enumerate(y_labels)
    for c, col_token in enumerate(x_labels)
]
data = pd.DataFrame(rows, columns=["x", "y", "similarity"])

# Heatmap layer.
x_axis = alt.X(
    "x:O",
    title="Second Sentence Tokens",
    axis=alt.Axis(labelAngle=-45),
    sort=x_labels,
)
y_axis = alt.Y("y:O", title="First Sentence Tokens", sort=y_labels)
fill = alt.Color("similarity:Q", scale=alt.Scale(scheme="yellowgreenblue"))
hover = ["x", "y", alt.Tooltip("similarity:Q", format=".2f")]

chart = (
    alt.Chart(data)
    .mark_rect()
    .encode(x=x_axis, y=y_axis, color=fill, tooltip=hover)
    .properties(width=500, height=400, title="Input Token Similarity Heatmap")
)

# Numeric labels on top of each cell: white above 0.5, black otherwise.
label_color = alt.condition(
    alt.datum.similarity > 0.5,
    alt.value("white"),
    alt.value("black"),
)
text = chart.mark_text(baseline="middle").encode(
    text=alt.Text("similarity:Q", format=".2f"),
    color=label_color,
)

# Layer both and display the result as the cell output.
final_chart = (chart + text).configure_title(fontSize=16)

final_chart


In [None]:
# Weight matrix of the word-embedding layer, detached and copied to a CPU NumPy array.
token_embeddings = (
    first_module.auto_model.embeddings.word_embeddings.weight
    .detach()
    .cpu()
    .numpy()
)

console.print(token_embeddings.shape)

In [None]:
from sklearn.manifold import TSNE

# Project the embedding matrix to 2-D with t-SNE using cosine distance; the seed is fixed.
tsne = TSNE(random_state=42, metric="cosine", n_components=2)
tsne_embeddings_2d = tsne.fit_transform(token_embeddings)
console.print(tsne_embeddings_2d.shape)

In [None]:
def _token_color(token):
    """Pick a plot colour for a vocabulary token from its surface form.

    Uses startswith/endswith rather than indexing so an empty token falls
    through to the default colour instead of raising IndexError.
    """
    if token.startswith("[") and token.endswith("]"):  # Control tokens
        return "red"
    if token.startswith("##"):                         # Suffix tokens
        return "blue"
    return "green"                                     # All word tokens


# One colour per token, in the same order as sorted_tokens.
token_colors = [_token_color(token) for token in sorted_tokens]

In [None]:
import altair as alt
import pandas as pd

# Enable VegaFusion data transformer to handle larger datasets
alt.data_transformers.enable("vegafusion")

# One row per token: 2-D t-SNE position, token string and its colour.
df = pd.DataFrame(
    dict(
        x=tsne_embeddings_2d[:, 0],
        y=tsne_embeddings_2d[:, 1],
        token=sorted_tokens,
        color=token_colors,
    )
)

# Interactive scatter plot; scale=None makes Altair use the colour strings as-is.
scatter = alt.Chart(df).mark_circle(size=30)
chart = (
    scatter
    .encode(
        x="x:Q",
        y="y:Q",
        color=alt.Color("color:N", scale=None),
        tooltip=["token:N"],
    )
    .properties(width=600, height=900, title="Token Embeddings")
    .interactive()
)

# Display the chart as the cell output.
chart

In [None]:
# Encode the sentence as a one-item batch and show the shape of the result
output_embedding = model.encode([first_sentence])
console.print(output_embedding.shape)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim — consider removing one of them.
output_embedding = model.encode([first_sentence])
console.print(output_embedding.shape)

In [None]:
# Request the per-token output vectors (output_value="token_embeddings") for the sentence.
output_token_embeddings = model.encode([first_sentence], output_value="token_embeddings")
console.print(output_token_embeddings[0].shape)

In [None]:
# Tokenize again so the heatmap labels built in the next cell match these sentences.
first_tokens = model.tokenize([first_sentence])
second_tokens = model.tokenize([second_sentence])

# Per-token output embeddings of each sentence (one-item batches).
with torch.no_grad():
    first_output_embeddings = model.encode(
        [first_sentence],
        output_value="token_embeddings"
    )
    second_output_embeddings = model.encode(
        [second_sentence],
        output_value="token_embeddings"
    )

# Calculate cosine similarity and bring it to a CPU NumPy array, as the
# input-embedding heatmap cell does. util.cos_sim returns a torch tensor,
# which may live on the GPU; the DataFrame built in the next cell needs
# plain numeric values rather than 0-d tensors.
distances = (
    util.cos_sim(first_output_embeddings[0], second_output_embeddings[0])
    .detach()
    .cpu()
    .numpy()
)

In [None]:
# Axis labels: token strings of the second sentence (x) and the first sentence (y).
x_labels, y_labels = (
    model.tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    for encoded in (second_tokens, first_tokens)
)

# Long-form table for Altair: one row per token pair with its similarity.
pair_rows = [
    (x_token, y_token, distances[row, col])
    for row, y_token in enumerate(y_labels)
    for col, x_token in enumerate(x_labels)
]
data = pd.DataFrame(pair_rows, columns=["x", "y", "similarity"])

# Heatmap layer; the colour scale is fixed to the 0..1 range.
heatmap_encoding = dict(
    x=alt.X(
        "x:O",
        title="Second Sentence Tokens",
        axis=alt.Axis(labelAngle=-45),
        sort=x_labels,
    ),
    y=alt.Y("y:O", title="First Sentence Tokens", sort=y_labels),
    color=alt.Color(
        "similarity:Q",
        scale=alt.Scale(scheme="yellowgreenblue", domain=[0, 1]),
    ),
    tooltip=["x", "y", alt.Tooltip("similarity:Q", format=".2f")],
)
chart = (
    alt.Chart(data)
    .mark_rect()
    .encode(**heatmap_encoding)
    .properties(width=500, height=400, title="Output Token Similarity Heatmap")
)

# Value labels drawn over the cells: white above 0.5, black otherwise.
text = chart.mark_text(baseline="middle").encode(
    text=alt.Text("similarity:Q", format=".2f"),
    color=alt.condition(
        alt.datum.similarity > 0.5,
        alt.value("white"),
        alt.value("black"),
    ),
)

# Layer heatmap and labels, then display the result as the cell output.
final_chart = (chart + text).configure_title(fontSize=16)

final_chart

In [None]:

# Calculate cosine distance between output embeddings
from sklearn.metrics.pairwise import cosine_distances
from rich.panel import Panel
from rich.table import Table

def calculate_sentence_similarity(first_sentence, second_sentence):
    """Print the cosine distance and cosine similarity of two sentences.

    Both sentences are encoded with the notebook-level ``model`` and the
    result is rendered on the notebook-level rich ``console``: a panel
    echoing the inputs, followed by a two-row results table.

    Args:
        first_sentence: Text of the first sentence.
        second_sentence: Text of the second sentence.

    Returns:
        None. The function only prints.
    """
    # Encode each sentence on its own and view the result as a single row vector.
    first_vector = model.encode([first_sentence]).reshape(1, -1)
    second_vector = model.encode([second_sentence]).reshape(1, -1)

    # Cosine distance is 1 - cosine similarity.
    cosine_distance = cosine_distances(first_vector, second_vector)[0][0]
    cosine_similarity = 1 - cosine_distance

    header = Panel(
        f"[cyan bold]First Sentence:[/cyan bold] {first_sentence}\n"
        f"[cyan bold]Second Sentence:[/cyan bold] {second_sentence}",
        title="[green bold]Similarity Calculation[/green bold]",
        expand=False,
        border_style="dim white",
    )
    console.print(header)

    results = Table(title="Results")
    for column_name in ("Metric", "Value"):
        results.add_column(column_name, style="bold")
    for label, value, row_style in (
        ("Cosine Distance", cosine_distance, "cyan"),
        ("Cosine Similarity", cosine_similarity, "bright_yellow"),
    ):
        results.add_row(label, f"{value:.4f}", style=row_style)

    console.print(results)

In [None]:
# Compare the two sentences that both contain the word "interest"
calculate_sentence_similarity(first_sentence, second_sentence)

In [None]:
# A sentence that shares no words with the second sentence
third_sentence = "Chase increased its lending fees"

# Compare it with the second sentence
calculate_sentence_similarity(second_sentence, third_sentence)

In [None]:
import transformers

# Load jxm/cde-small-v1. trust_remote_code=True lets transformers run Python code
# shipped in the model repository, so only use it with sources you trust.
improved_model = transformers.AutoModel.from_pretrained("jxm/cde-small-v1", trust_remote_code=True)
# The tokenizer is loaded separately, from the bert-base-uncased checkpoint
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Print the loaded model's repr to inspect its structure
console.print(improved_model)

In [None]:
from datasets import load_dataset

# Load the "corpus" and "queries" configurations of the BeIR/fiqa dataset.
fiqa_repo = "BeIR/fiqa"
corpus = load_dataset(fiqa_repo, "corpus")["corpus"]
queries = load_dataset(fiqa_repo, "queries")["queries"]

In [None]:
import pandas as pd
from tabulate import tabulate

console.rule("Corpus Sample")

# First 10 documents, with the text cut to its first 100 characters.
corpus_preview = (
    corpus
    .to_pandas()
    .head(10)
    .assign(text_start=lambda frame: frame["text"].str[:100])
    .drop(columns=["text", "title"])
)
print(tabulate(corpus_preview, headers="keys", tablefmt="github", showindex=False))

In [None]:
console.rule("Queries Sample")

# First 10 queries, with the text cut to its first 100 characters.
queries_preview = (
    queries
    .to_pandas()
    .head(10)
    .assign(text_start=lambda frame: frame["text"].str[:100])
    .drop(columns=["text", "title"])
)
print(tabulate(queries_preview, headers="keys", tablefmt="github", showindex=False))

In [None]:
# Prefixes prepended to every query / document string before tokenization.
query_prefix, document_prefix = "search_query: ", "search_document: "

In [None]:
import random

def process_ex_document(ex: dict) -> dict:
    """Return the example with its title prepended to its text.

    Args:
        ex: A record with ``title`` and ``text`` entries.

    Returns:
        A new dict with the same keys as ``ex`` whose ``text`` is
        ``"<title> <text>"``. The input record is not modified.
    """
    return {**ex, "text": f"{ex['title']} {ex['text']}"}


# Fixed seed so the sampled mini-corpus is the same on every Restart & Run All.
SAMPLING_SEED = 42

corpus_size = improved_model.config.transductive_corpus_size
console.print(f"Choosing {corpus_size} out of {len(corpus)} documents")

# Sample with replacement, so this also works when the corpus holds fewer
# than corpus_size documents.
sampled_indices = random.Random(SAMPLING_SEED).choices(range(len(corpus)), k=corpus_size)
minicorpus_texts = corpus.select(sampled_indices).map(process_ex_document)["text"]

# Tokenize the prefixed documents into one padded/truncated batch of tensors.
minicorpus_docs = tokenizer(
    [document_prefix + doc for doc in minicorpus_texts],
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

In [None]:
import torch
# Prefer CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# improved_model is the model that receives the tensors moved below, so it
# must live on the same device as they do.
improved_model.to(device)
model.to(device)
minicorpus_docs = minicorpus_docs.to(device)

In [None]:
import torch
from tqdm.autonotebook import tqdm

batch_size = 32

# First stage: embed the tokenized mini-corpus batch by batch, without gradients.
embedded_batches = []
with torch.no_grad():
    for start in tqdm(range(0, len(minicorpus_docs["input_ids"]), batch_size)):
        stop = start + batch_size
        minicorpus_docs_batch = {
            name: tensor[start:stop] for name, tensor in minicorpus_docs.items()
        }
        embedded_batches.append(
            improved_model.first_stage_model(**minicorpus_docs_batch)
        )

# Concatenate the per-batch outputs into a single tensor.
dataset_embeddings = torch.cat(embedded_batches)

In [None]:
# Second stage (documents): take the first 16 corpus entries, title prepended to text.
sample_docs = corpus.select(range(16)).map(process_ex_document)["text"]

prefixed_docs = [document_prefix + doc for doc in sample_docs]
docs_tokens = tokenizer(
    prefixed_docs,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
).to(device)

# Embed the documents, passing the first-stage dataset embeddings alongside the tokens.
with torch.no_grad():
    doc_embeddings = improved_model.second_stage_model(
        input_ids=docs_tokens["input_ids"],
        attention_mask=docs_tokens["attention_mask"],
        dataset_embeddings=dataset_embeddings,
    )

# Scale every row to unit L2 norm.
doc_embeddings = doc_embeddings / doc_embeddings.norm(p=2, dim=1, keepdim=True)

In [None]:
# Second stage (queries): take the text of the first 16 queries.
queries_sample = queries.select(range(16))["text"]

prefixed_queries = [query_prefix + query for query in queries_sample]
queries_tokens = tokenizer(
    prefixed_queries,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors="pt",
).to(device)

# Embed the queries, passing the first-stage dataset embeddings alongside the tokens.
with torch.no_grad():
    query_embeddings = improved_model.second_stage_model(
        input_ids=queries_tokens["input_ids"],
        attention_mask=queries_tokens["attention_mask"],
        dataset_embeddings=dataset_embeddings,
    )

# Scale every row to unit L2 norm.
query_embeddings = query_embeddings / query_embeddings.norm(p=2, dim=1, keepdim=True)

In [None]:
# Embed the same sample documents with the SentenceTransformer `model` for comparison.
# NOTE(review): encode presumably disables gradients itself — confirm before dropping no_grad.
with torch.no_grad():
  doc_basic_embeddings = model.encode(sample_docs)

In [None]:
# Embed the same sample queries with the SentenceTransformer `model` for comparison.
# NOTE(review): encode presumably disables gradients itself — confirm before dropping no_grad.
with torch.no_grad():
  queries_basic_embeddings = model.encode(queries_sample)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Document-by-query score matrices (rows: documents, columns: queries).
improved_scores = (doc_embeddings @ query_embeddings.T).cpu()
basic_scores = doc_basic_embeddings @ queries_basic_embeddings.T

# Draw both panels with the same colour map and 0..1 range so they are directly
# comparable, and label the axes so each figure can be read on its own.
panels = (
    (ax1, improved_scores, "Improved Model"),
    (ax2, basic_scores, "Basic Model"),
)
for ax, scores, title in panels:
    sns.heatmap(scores, cmap="jet", ax=ax, vmin=0, vmax=1)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel("Query index")
    ax.set_ylabel("Document index")

plt.tight_layout()
console.rule("Embedding Model Comparison")
plt.show()