In [1]:
# Use the %pip magic rather than a `!pip` shell call so the package is
# installed into the environment of the running kernel.
%pip install -Uqqq sentence-transformers


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/255.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m153.6/255.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import plotly.express as px


In [62]:
from sentence_transformers import SentenceTransformer

# Instantiate the pre-trained sentence-embedding model by its checkpoint name.
model = SentenceTransformer("all-MiniLM-L6-v2")


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [63]:
# Example inputs: two programming-related sentences and one about the weather.
sentences = ["I love programming.", "The weather is nice today.", "coding is my life style"]

# One embedding per input sentence, in the same order as `sentences`.
embeddings = model.encode(sentences)

# Show each sentence next to the first ten components of its embedding.
for sentence, embedding in zip(sentences, embeddings):
    print(f"Sentence: {sentence}", f"Embedding: {embedding[:10]}\n", sep="\n")

Sentence: I love programming.
Embedding: [-1.9631904e-02 -1.7836422e-02  1.0179512e-02 -1.5698869e-02
  9.7629709e-05 -6.9564186e-02  6.6222072e-02  6.2824890e-02
  1.8769771e-02  4.8245110e-02]

Sentence: The weather is nice today.
Embedding: [-0.02945096  0.10326593  0.15032196  0.07372083  0.00080059 -0.03651824
  0.04306472 -0.08111952 -0.06354132  0.02024679]

Sentence: coding is my life style
Embedding: [-0.04929301  0.05627077 -0.00257218  0.00362889  0.01892747 -0.01958509
  0.07381872 -0.02014969 -0.02327397  0.04051569]



In [64]:
# Cosine similarity between the sentence embeddings, shown as a heatmap
# whose rows and columns are labelled with the sentences themselves.
cos_sim = cosine_similarity(embeddings)

px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)

In [81]:
from sentence_transformers import SentenceTransformer

# Switch to a multilingual checkpoint so Turkish input is supported.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Turkish test sentences.
sentences = [
    "Bu havada yürümek çok güzel.",
    "Yapay zeka ile programlama oldukça ilginç.",
    "Kodlamayı YZ ile yapmayı çok seviyorum",
    "Programlama işlerimi yapay zeka teknikleri kullanarak yapmak çok enteresan",
]

# Embed the sentences, then compare them with cosine similarity.
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

# Heatmap of the similarity matrix, labelled with the sentences on both axes.
px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)



`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



### Typos

In [67]:
# Rebind `model` to the all-MiniLM-L6-v2 checkpoint for the typo experiment.
model = SentenceTransformer("all-MiniLM-L6-v2")


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [68]:
# The same phrase spelled correctly and with a dropped letter.
sentences = ["Great accommodation", "Great acommodation"]

# Embed both spellings and compare them with cosine similarity.
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)

In [52]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the all-MiniLM-L6-v2 sentence-transformers checkpoint.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
text = "Great accommodation"

# Three views of the same text: the full encoding, the token strings,
# and the vocabulary ids of those token strings.
tokenized_text = tokenizer(text)
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("Tokenized text:", tokenized_text)
print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokenized text: {'input_ids': [101, 2307, 11366, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
Tokens: ['great', 'accommodation']
Token IDs: [2307, 11366]



`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [72]:
from transformers import RobertaTokenizer

# RoBERTa's BPE-based tokenizer, applied to the misspelled phrase.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
text = "Great acommodation"

# Three views of the same text: the full encoding, the token strings,
# and the vocabulary ids of those token strings.
tokenized_text = tokenizer(text)
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("Tokenized text:", tokenized_text)
print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokenized text: {'input_ids': [0, 19065, 10, 175, 14377, 1258, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Tokens: ['Great', 'Ġa', 'com', 'mod', 'ation']
Token IDs: [19065, 10, 175, 14377, 1258]


In [50]:
# Rebind `model` to the all-roberta-large-v1 sentence-embedding checkpoint.
model = SentenceTransformer("all-roberta-large-v1")


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



In [51]:
# The same phrase spelled correctly and with a dropped letter.
sentences = ["Great accommodation", "Great acommodation"]

# Embed both spellings with the currently loaded model and compare them.
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)

### Unknown Symbols


In [56]:
# Rebind `model` to the all-MiniLM-L6-v2 checkpoint for the sections that follow.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [76]:
# Encode with the currently loaded `model` instead of reading the leftover
# `embeddings` variable, so the reported (n_sentences, embedding_dim) shape
# does not depend on which encode cell happened to be executed last.
model.encode(sentences).shape

(2, 384)

In [78]:
# A plain-word sentence and a variant that swaps the word for an emoji.
sentences = ["I feel happy", "I feel 😊"]

# Embed both variants and compare them with cosine similarity.
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)

### Numerical values and date/time

In [79]:
# Price statements: digits vs. words, nearby and distant amounts, and a discount.
sentences = [
    "This shirt costs $55.",
    "This shirt costs fifty five dollars.",
    "This shirt costs $50.",
    "This shirt costs $559.",
    "This shirt has a 10% discount from $60.",
]

# Embed the statements and compare them with cosine similarity.
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)

In [82]:
# One calendar day written in two formats, followed by neighbouring days.
sentences = [
    "16th February 2024",
    "2024-02-16",
    "17th February 2024",
    "18th February 2024",
    "19th February 2024",
    "20th February 2024",
    "15th February 2024",
]

# Embed the date strings and compare them with cosine similarity.
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

fig = px.imshow(cos_sim, x=sentences, y=sentences, text_auto=True)

# Both axes are set to the "category" type (presumably so date-like labels
# are kept as plain text — TODO confirm) and the x tick labels are tilted.
(
    fig
    .update_layout(xaxis=dict(type="category"), yaxis=dict(type="category"))
    .update_xaxes(tickangle=-30)
)

In [21]:
# NOTE(review): two bare numeric literals with no assignment; only the last
# expression (2.99) would be echoed as the cell result. This looks like a
# leftover scratch cell — confirm whether it is still needed.
2.9
2.99