# Tokenization and Embeddings

Import the libraries we are going to use.

In [None]:
# Preprocessing
from unstructured.partition.text import partition_text
from unstructured.cleaners.core import group_broken_paragraphs
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# tokenization and embedding
from sentence_transformers import SentenceTransformer

# Chroma
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# embedding projection
import umap.umap_ as umap
import numpy as np
from tqdm import tqdm

# visualization
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


## Pre-processing

It is a relatively large .txt file with impractical paragraph splitting. We group the broken paragraphs together into chunks of 1500 characters, which correspond to roughly 1 actual paragraph.

Additionally, we want the partition to later fit into the token splitter. The token splitter we will use has a max input length of 128 tokens. German has a token word ratio of roughly 2.1:1. The average German word has 6.3 characters.

128 / 2.1 * 6.3 = 384 characters

We are can increase

In [None]:
# Partition the raw book text: broken paragraphs are grouped back together and
# each partition is limited via max_partition=384 so it fits the token splitter.
elements = partition_text(
    'data/Stein.txt',
    paragraph_grouper=group_broken_paragraphs,
    max_partition=384,
)
element_strings = [str(element) for element in elements]

# Preview the first five partitions, then report basic statistics.
print("\n\n".join(element_strings[:5]))
print(f"The book has been split into {len(element_strings)} chunks.")
print(f"An element is {len(element_strings[0])} characters long.")

## Chunk refinement

We now make sure that each chunk fits into the input length of the model we will use to embed our vector database.

In [None]:
# Token-based splitter configured for 128 tokens per chunk without overlap.
token_splitter = SentenceTransformersTokenTextSplitter(
    model_name="paraphrase-multilingual-MiniLM-L12-v2",
    tokens_per_chunk=128,
    chunk_overlap=0,
)

# Re-split every partition and collect all pieces in one flat list.
token_split_texts = [
    piece
    for text in element_strings
    for piece in token_splitter.split_text(text)
]

print(f"\nTotal chunks: {len(token_split_texts)}")
print(token_split_texts[0])

## Language Model Analysis

### Does it matter what language your text has when deciding for an embedding model?

We tokenize the chunks now with the tokenizer of the embedding model we will use.

- The model uses SentencePiece tokenisation, which is a bit different from WordPiece or Byte Pair Encoding.
- We still see sub words. White spaces are highlighted with underscores.
- Sentence boundaries are marked with `<s>`
- Subwords and single characters are recognizable
- Is more on the language-agnostic side, as it does not rely on white spaces to separate words.

In [None]:
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

# Tokenize the first ten chunks and show the token strings for each of them.
tokenized_chunks = []
for i, text in enumerate(token_split_texts[:10]):
    encoded_input = model.tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    # The batch holds a single text, so row 0 carries its token ids.
    token_ids = encoded_input['input_ids'][0].tolist()
    tokens = model.tokenizer.convert_ids_to_tokens(token_ids)
    tokenized_chunks.append(tokens)
    print(f"Chunk {i}: {tokens}")



Try the same now with a model that uses a WordPiece tokenizer.

You should notice:
- White spaces have been removed
- Words that have been split into subwords can be reconnected via the `##` prefix
- First tries to determine word boundaries like byte-pair encoding.
- The start of a sentence is marked with `[CLS]`

In [None]:
# Model identifier for the tokenizer-comparison exercise described above.
# This cell only assigns the name; no model or tokenizer is loaded here.
model_name = "Sahajtomar/German-semantic"

## From Token to Embedding

Notice:
- our text snippet has 110 tokens
- the embedding has 384 dimensions
- When calculating the embedding, the embedding model first calculates the 384 dimensional embedding for each individual token
- depending on the model the individual vectors are then averaged, maxed or they take the embedding for the sentence boundary marker.
- This allows us to end up with just one rather than 110 384 dimensional vectors per chunk

In [None]:
# Raw text of the example chunk
sample_text = token_split_texts[10]
print(sample_text)

# Tokens
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)
encoded_sample = model.tokenizer(sample_text, padding=True, truncation=True, max_length=128, return_tensors='pt')
# Count the ids in the explicit 'input_ids' row. Integer-indexing the tokenizer
# output itself is only supported by fast tokenizers.
print("Number of tokens: ", len(encoded_sample['input_ids'][0]))

# Embedding: run the model once and reuse the result for both printouts.
embedding_function = SentenceTransformerEmbeddingFunction(model_name=model_name)
sample_embeddings = embedding_function([sample_text])
print(sample_embeddings)
print("Vector dimensions: ", len(sample_embeddings[0]))



## Building our Vector Store

We store all chunks in a Chroma collection that embeds them with the same model as above.

In [None]:
chroma_client = chromadb.Client()

embedding_function = SentenceTransformerEmbeddingFunction(model_name="paraphrase-multilingual-MiniLM-L12-v2")
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with this name already exists on the client.
chroma_collection = chroma_client.get_or_create_collection("Steint.txt", embedding_function=embedding_function)

# Chroma needs one string id per document; the chunk index is used here.
ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()

## Embedding Projections

We retrieve all embeddings from our chroma collection.

- UMAP (Uniform Manifold Approximation and Projection): reduces the dimensionality of a vector so that it can be projected into a lower-dimensional space, typically 2D or 3D for visualisations.


In [None]:

# Pull every stored embedding out of the collection ...
stored_records = chroma_collection.get(include=['embeddings'])
embeddings = stored_records['embeddings']

# ... and fit a seeded UMAP model on them (fit returns the fitted estimator).
umap_model = umap.UMAP(random_state=0, transform_seed=0)
umap_transform = umap_model.fit(embeddings)

### Function to apply the UMAP transformation to our data

We will need to transform multiple vectors

In [None]:
def project_embeddings(embeddings, umap_transform):
    """Project embeddings one at a time with a fitted transformer.

    Args:
        embeddings: Sized sequence of embedding vectors.
        umap_transform: Fitted object exposing ``transform``. Its
            ``n_components`` attribute, when present, sets the width of the
            output; otherwise the width falls back to 2, the value that was
            previously hard-coded.

    Returns:
        ``numpy.ndarray`` of shape ``(len(embeddings), n_components)`` whose
        row ``i`` holds the projection of ``embeddings[i]``.
    """
    n_components = getattr(umap_transform, "n_components", 2)
    umap_embeddings = np.empty((len(embeddings), n_components))
    for i, embedding in enumerate(tqdm(embeddings)):
        # transform is called with a one-element batch; keep its only row.
        umap_embeddings[i] = umap_transform.transform([embedding])[0]
    return umap_embeddings

Create a query.

In [None]:
# Search term; the following cells embed it and project it with UMAP.
query = "Hogwarts"

Embed and project the query into a 2-dimensional space.

In [None]:
# The embedding function takes a list of documents. A bare string can be
# treated as a sequence of single-character documents, which would produce one
# embedding per character instead of one for the whole query.
original_query_embedding = embedding_function([query])
project_original_query = project_embeddings(original_query_embedding, umap_transform)

Query the chroma_collection for documents related to "Hogwarts" and retrieve the top 5 results


- Extract the embeddings from the results

- Flatten the list of embeddings

- Project the result embeddings using the umap_transform

- Project the dataset embeddings using the umap_transform


In [None]:
# Retrieve the five closest chunks for the query defined above (instead of a
# duplicated string literal), including their stored embeddings.
results = chroma_collection.query(query_texts=[query], n_results=5, include=['documents', 'embeddings'])
print(results['documents'][0])

# The embeddings come back nested (one list per query text); flatten them.
result_embeddings = results['embeddings']
result_embeddings = [item for sublist in result_embeddings for item in sublist]

# Project both the retrieved results and the whole dataset.
projected_result_embeddings = project_embeddings(result_embeddings, umap_transform)
projected_dataset_embeddings = project_embeddings(embeddings, umap_transform)

In [None]:
def shorten_text(text, max_length=15):
    """Return text unchanged if it has at most max_length characters;
    otherwise return its first max_length characters followed by '...'."""
    if len(text) <= max_length:
        return text
    return text[:max_length] + '...'

plt.figure()

# Scatter plots: whole dataset, retrieved results, and the query itself.
plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray', label='Dataset')
plt.scatter(projected_result_embeddings[:, 0], projected_result_embeddings[:, 1], s=100, facecolors='none', edgecolors='g', label='Results')
plt.scatter(project_original_query[:, 0], project_original_query[:, 1], s=150, marker='X', color='r', label='Original Query')

# Label every retrieved chunk with the beginning of its text; zip stops at the
# shorter of the two sequences, so no explicit bounds check is needed.
for text, (x, y) in zip(results['documents'][0], projected_result_embeddings):
    plt.annotate(shorten_text(text), (x, y), fontsize=8)

# Label the query marker with the actual query rather than a placeholder.
original_query_text = query
plt.annotate(shorten_text(original_query_text), (project_original_query[0, 0], project_original_query[0, 1]), fontsize=8)

plt.gca().set_aspect('equal', 'datalim')
plt.title(query)
plt.legend()
plt.show()

### 3D-Projection

In [None]:

# Same embeddings as before, but UMAP is fitted with three output components.
stored_records = chroma_collection.get(include=['embeddings'])
embeddings = stored_records['embeddings']

umap_model_3d = umap.UMAP(n_components=3, random_state=0, transform_seed=0)
umap_transform_3d = umap_model_3d.fit(embeddings)

def project_embeddings_3d(embeddings, umap_transform):
    """Project each embedding separately and return a (len(embeddings), 3) array."""
    projected = np.empty((len(embeddings), 3))
    # Rows of `projected` are views, so writing into `row` fills the result.
    for vector, row in zip(tqdm(embeddings), projected):
        row[:] = umap_transform.transform([vector])[0]
    return projected

In [None]:
# Re-project query, results and dataset with the 3D transform. Note that the
# names used for the 2D projections are reused and therefore overwritten.
project_original_query = project_embeddings_3d(
    embeddings=original_query_embedding,
    umap_transform=umap_transform_3d,
)
projected_result_embeddings = project_embeddings_3d(
    embeddings=result_embeddings,
    umap_transform=umap_transform_3d,
)
projected_dataset_embeddings = project_embeddings_3d(
    embeddings=embeddings,
    umap_transform=umap_transform_3d,
)

In [None]:

# Explicit 3D aliases for the projections computed in the previous cell.
projected_dataset_embeddings_3d = projected_dataset_embeddings
projected_result_embeddings_3d = projected_result_embeddings
project_original_query_3d = project_original_query

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Scatter plots: transposing the (n, 3) arrays yields the x, y and z columns.
ax.scatter(*projected_dataset_embeddings_3d.T, s=10, color='gray', label='Dataset')
ax.scatter(*projected_result_embeddings_3d.T, s=100, facecolors='none', edgecolors='g', label='Results')
ax.scatter(*project_original_query_3d.T, s=150, marker='X', color='r', label='Original Query')

# Annotations: one label per retrieved chunk, then one for the query marker.
for text, (x, y, z) in zip(results['documents'][0], projected_result_embeddings_3d):
    ax.text(x, y, z, shorten_text(text), fontsize=8)

query_x, query_y, query_z = project_original_query_3d[0]
ax.text(query_x, query_y, query_z, shorten_text(original_query_text), fontsize=8)

ax.set_xlabel('X Axis')