<a href="https://colab.research.google.com/github/vektor8891/llm/blob/main/projects/32_watsonx_embedding/32_watsonx_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# ! pip install -qq langchain_community
# ! pip install -qq ibm_watsonx_ai
# ! pip install -qq langchain_ibm

# Document Embedding with Watsonx

## Load data

In [4]:
from langchain_community.document_loaders import TextLoader

!wget "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/i5V3ACEyz6hnYpVq6MTSvg/state-of-the-union.txt"

loader = TextLoader("state-of-the-union.txt")
data = loader.load()
data

--2025-07-11 20:13:47--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/i5V3ACEyz6hnYpVq6MTSvg/state-of-the-union.txt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 39027 (38K) [text/plain]
Saving to: ‘state-of-the-union.txt’


2025-07-11 20:13:49 (365 KB/s) - ‘state-of-the-union.txt’ saved [39027/39027]



[Document(metadata={'source': 'state-of-the-union.txt'}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determin

## Split data

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

chunks = text_splitter.split_text(data[0].page_content)

len(chunks)

570

In [6]:
chunks

['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and',
 'of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.',
 'Last year COVID-19 kept us apart. This year we are finally together again.',
 'Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.',
 'With a duty to one another to the American people to the Constitution.',
 'And with an unwavering resolve that freedom will always triumph over tyranny.',
 'Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he',
 'world thinking he could make it bend to his menacing ways. But he badly miscalculated.',
 'He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of',
 'he met a wall of strength he never imagined.',
 'He met the Ukrainian people.',
 'From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their',
 'courage, th

## Watsonx embedding model

[model card](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-slate-125m-english-rtrvr-model-card.html)

In [13]:
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxEmbeddings
from google.colab import userdata

embed_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    url=userdata.get("WATSONX_URL"),
    project_id=userdata.get("WATSONX_PROJECT_ID"),
    params=embed_params,
    apikey=userdata.get('IBM_CLOUD_API_KEY')
)

### Query embeddings

In [15]:
query = "How are you?"

query_result = watsonx_embedding.embed_query(query)

len(query_result)

768

In [16]:
query_result[:5]

[-0.06722454, -0.023729993, 0.017487843, -0.013195328, -0.039584607]

### Document embeddings

In [17]:
doc_result = watsonx_embedding.embed_documents(chunks)

len(doc_result)

570

In [18]:
doc_result[0][:5]

[-0.009708214, -0.0016381955, -0.01340118, -0.005656508, -0.031278424]

In [19]:
len(doc_result[0])

768

## Hugging Face embedding model

In [22]:
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"

huggingface_embedding = HuggingFaceEmbeddings(model_name=model_name)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Query embeddings

In [23]:
query = "How are you?"

query_result = huggingface_embedding.embed_query(query)

query_result[:5]

[0.02710616961121559,
 0.01133184228092432,
 -0.0019523961236700416,
 -0.036951325833797455,
 0.01776495948433876]

### Document embeddings


In [24]:
doc_result = huggingface_embedding.embed_documents(chunks)
doc_result[0][:5]

[0.040284302085638046,
 0.030196329578757286,
 -0.00639386149123311,
 0.06266564875841141,
 -0.007770895957946777]

In [25]:
len(doc_result[0])

768

# Exercises

### Exercise 1 - Using another watsonx embedding model

In [26]:
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxEmbeddings

embed_params = {
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 3,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-30m-english-rtrvr",
    url=userdata.get("WATSONX_URL"),
    project_id=userdata.get("WATSONX_PROJECT_ID"),
    params=embed_params,
    apikey=userdata.get('IBM_CLOUD_API_KEY')
)

doc_result = watsonx_embedding.embed_documents(chunks)

doc_result[0][:5]

[0.020980379, 0.0008661887, 0.018803626, 0.028992167, -0.0032073553]

In [27]:
! pip freeze > requirements.txt