# Embeddings: Semantics of the language

We have a few sentences and want to measure how semantically close they are to one another.

We use Python 3.9 for this.

In [None]:
# Install the notebook's third-party dependencies: sentence-transformers
# (imported further down to load the embedding model) and numpy (imported
# further down for the cosine-similarity maths).
# NOTE(review): versions are unpinned, so results may differ between
# installs - consider pinning once known-good versions are confirmed.
%pip install sentence-transformers
%pip install numpy

## 1. The data

In [6]:
# Example inputs for the similarity comparison.
sentences = [
    # The same scene three ways: plain English, a German translation, and a
    # loose English paraphrase that shares few words with the first sentence.
    'The dogs play with the ball on the grass',
    'Die Hunde spielen mit dem Ball auf dem Rasen',
    'The pack rolls around a round thing on the meadow',
    # An unrelated scene, phrased twice with different vocabulary.
    'The archer aimed carefully at the target',
    'The bowman focused intently on the bullseye',
]

## 2. Create embeddings using Hugging Face models

Suitable models can be found, for example, on the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [None]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to embed the sentences; the commented-out line is an
# alternative that can be swapped in.
model_name = 'intfloat/multilingual-e5-large'
#model_name = 'danielheinz/e5-base-sts-en-de'

# The E5 model card states that every input must start with 'query: ' or
# 'passage: ', and recommends 'query: ' for symmetric tasks such as semantic
# similarity; without the prefix the embedding quality degrades.
# NOTE(review): when switching to another checkpoint, confirm on its model
# card which prefix (if any) it expects.
input_prefix = 'query: '

embedding_model = SentenceTransformer(model_name)

# Encode in the same order as `sentences`, so embeddings[i] belongs to
# sentences[i]. The prefix is only added to the model input; `sentences`
# itself stays untouched for display.
embeddings = embedding_model.encode(
    [input_prefix + sentence for sentence in sentences]
)

# Sanity check: show the first vector, then the length of every vector.
print(embeddings[0])
for embedding in embeddings:
    print(len(embedding))


In [None]:
import numpy as np

def cosine_similarity(left, right):
    """Return the cosine of the angle between two vectors.

    The result is the dot product of ``left`` and ``right`` divided by the
    product of their Euclidean norms.
    """
    dot_product = np.dot(left, right)
    norm_product = np.linalg.norm(left) * np.linalg.norm(right)
    return dot_product / norm_product

# Print the similarity of every unordered sentence pair exactly once. The
# inner walk starts at position i, so each sentence is also compared with
# itself and no pair is repeated in reverse order.
for i, left in enumerate(sentences):
    for j, right in enumerate(sentences[i:], start=i):
        similarity = cosine_similarity(embeddings[i], embeddings[j])
        print(f'{similarity:.4f}: {left} <-> {right}')
