In [None]:
import marimo as mo

# Module 2: Practice 3 - Word Embeddings

In [None]:
import subprocess
# Download the large English spaCy model through uv. Output is captured so the
# notebook stays quiet when the download succeeds.
result = subprocess.run(
    ['bash', '-c', 'uv run python -m spacy download en_core_web_lg'],
    capture_output=True,
    text=True,
)
# Report a failed download right away; otherwise the failure goes unnoticed
# here and only shows up later when the model cannot be loaded.
if result.returncode != 0:
    print(f"spaCy model download failed (exit code {result.returncode}):\n{result.stderr}")

## Setup

First, we import the *spacy* library and load the large English model.

In [None]:
import spacy

# Load the large English model once; the cells below all reuse this `nlp` object.
_MODEL_NAME = "en_core_web_lg"
nlp = spacy.load(_MODEL_NAME)

Next, let's define a function to calculate word embeddings based on an input word:

In [None]:
def calculate_embedding(input_word):
    """Return the embedding vector for ``input_word``.

    The text is processed with the notebook-level ``nlp`` object and the
    ``vector`` attribute of the result is returned.
    """
    return nlp(input_word).vector

Let's try it with the word 'apple'. For brevity, only the first 10 elements of the embedding vector are displayed:

In [None]:
# Embed "apple" and display only the first ten components of its vector.
_apple_embedding = calculate_embedding("apple")
_apple_embedding[:10]

In [None]:
# Text box, pre-filled with "orange", for the word to embed.
word_input_ui = mo.ui.text(value="orange")

# Render the section heading and the prompt with the text box embedded inline.
mo.md(
    f'''
## More Practice with Word Embeddings

Type in a word to generate an embedding vector: {word_input_ui}
'''
)

In [None]:
# Embed whatever is currently entered in the text box.
word_embedding = calculate_embedding(word_input_ui.value)

# Display only the first ten components of the vector.
word_embedding[0:10]

## Similarity
Let's add a function to calculate the similarity between two words based on their embeddings:

In [None]:
def calculate_similarity(word1, word2):
    """Return the similarity score between ``word1`` and ``word2``.

    Both inputs are processed with the notebook-level ``nlp`` object, and the
    score is whatever the first result's ``similarity`` method reports for the
    second.
    """
    first_doc = nlp(word1)
    second_doc = nlp(word2)
    return first_doc.similarity(second_doc)

Compare the embeddings of the words 'apple' and 'car':

In [None]:
# Score how similar 'apple' and 'car' are; the value is shown as the cell output.
_apple_vs_car = calculate_similarity("apple", "car")
_apple_vs_car

In [None]:
# Two text boxes, pre-filled with "apple" and "orange", for the words to compare.
word1_input_ui = mo.ui.text(value="apple")
word2_input_ui = mo.ui.text(value="orange")

# Render the section heading and both text boxes side by side.
mo.md(
    f'''
## More Practice with Similarity

Enter any two words to generate the similarity measure between them: 

Word 1: {word1_input_ui} &nbsp; Word 2: {word2_input_ui}
'''
)

In [None]:
# Read the current contents of both text boxes and score their similarity.
_first_word = word1_input_ui.value
_second_word = word2_input_ui.value
calculate_similarity(_first_word, _second_word)

In [None]:
# Operands of the vector arithmetic: word 1 + (word 2 - word 3).
la_word1_input_ui = mo.ui.text(value='spain')
la_word2_input_ui = mo.ui.text(value='paris')
la_word3_input_ui = mo.ui.text(value='france')
# Word whose embedding the arithmetic result is compared against.
la_word4_input_ui = mo.ui.text(value='madrid')

# Render the explanation with all four text boxes embedded inline.
mo.md(
    f'''
We can even do linear algebra with the underlying vector representations. Enter any three words and calculate similarity with a fourth one, e.g.:

'woman' + 'king' - 'man' with 'queen'

OR

'spain' + 'paris' - 'france' with 'madrid': 

Word 1: {la_word1_input_ui} + (Word 2: {la_word2_input_ui} - Word 3: {la_word3_input_ui})

compared to

Word 4: {la_word4_input_ui}.
'''
)

In [None]:
# Look up each operand's embedding through the notebook's shared helper rather
# than repeating the `nlp(...).vector` lookup inline.
la_word1_embedding = calculate_embedding(la_word1_input_ui.value)
la_word2_embedding = calculate_embedding(la_word2_input_ui.value)
la_word3_embedding = calculate_embedding(la_word3_input_ui.value)

# Vector arithmetic: word 1 + (word 2 - word 3).
la_word = la_word1_embedding + (la_word2_embedding - la_word3_embedding)

# Embedding of the word the arithmetic result is compared against.
la_word4 = calculate_embedding(la_word4_input_ui.value)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Each vector is wrapped in a list to form a one-row input; the result is
# indexed with [0][0] to read the single score.
_similarity_matrix = cosine_similarity([la_word], [la_word4])
print("Cosine similarity: ", _similarity_matrix[0][0])

----
## Sentence Embeddings

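Finally, to calculate an embedding for a sentence, we can just average the embeddings of all the words in that sentence.  spaCy does this for us: the vector of a processed text defaults to the average of its token vectors, and `similarity` compares those vectors using cosine similarity.  We will again use `spacy` to calculate the sentence embeddings.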

```python
query = "What is the capital of France?"
info_1 = "The capital of France is Paris"
info_2 = "France is a beautiful country"
info_3 = "Today is very warm in New York City"
print("Response 1 Similarity: ", nlp(query).similarity(nlp(info_1)))
print("Response 2 Similarity: ", nlp(query).similarity(nlp(info_2)))
print("Response 3 Similarity: ", nlp(query).similarity(nlp(info_3)))
```

In [None]:
query = "What is the capital of France?"
info_1 = "The capital of France is Paris"
info_2 = "France is a beautiful country"
info_3 = "Today is very warm in New York City"

# Parse the query once and reuse the result for every comparison, instead of
# re-running the model on the same text for each candidate.
_query_doc = nlp(query)
for _label, _info in (
    ("Response 1 Similarity: ", info_1),
    ("Response 2 Similarity: ", info_2),
    ("Response 3 Similarity: ", info_3),
):
    print(_label, _query_doc.similarity(nlp(_info)))

Being able to quickly calculate similarities between a query and target information text is very powerful for Information Retrieval, especially when combined with Large Language Models trained for chat/question answering capabilities.