In [None]:
!pip install -Uqqq sentence-transformers


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import plotly.express as px


In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained all-MiniLM-L6-v2 checkpoint.
# `model` is a notebook-wide name: the encode cells below use whatever it is
# currently bound to, and later cells rebind it to other checkpoints.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [26]:
# Three short inputs: two about programming, one about the weather
sentences = ["I love programming.", "The weather is nice today.", "coding is my life style"]

# Encode all sentences in one call; the result is iterated in step with `sentences` below
embeddings = model.encode(sentences)

# Preview only the first 10 components of each vector to keep the output short
for sentence, embedding in zip(sentences, embeddings):
    print(f"Sentence: {sentence}", f"Embedding: {embedding[:10]}\n", sep="\n")

Sentence: I love programming.
Embedding: [-1.9631904e-02 -1.7836422e-02  1.0179512e-02 -1.5698869e-02
  9.7629709e-05 -6.9564186e-02  6.6222072e-02  6.2824890e-02
  1.8769771e-02  4.8245110e-02]

Sentence: The weather is nice today.
Embedding: [-0.02945096  0.10326593  0.15032196  0.07372083  0.00080059 -0.03651824
  0.04306472 -0.08111952 -0.06354132  0.02024679]

Sentence: coding is my life style
Embedding: [-0.04929301  0.05627077 -0.00257218  0.00362889  0.01892747 -0.01958509
  0.07381872 -0.02014969 -0.02327397  0.04051569]



In [27]:
# Inspect the dimensions of the embedding matrix produced by the encode call above
embeddings.shape

(3, 384)

In [28]:
# Pairwise cosine similarity between all sentence embeddings
cos_sim = cosine_similarity(embeddings)

# Show the similarity matrix as an annotated heatmap, with the sentences as axis labels
px.imshow(img=cos_sim, x=sentences, y=sentences, text_auto=True)

In [29]:
# NOTE(review): this import repeats the one in the first model-loading cell.
from sentence_transformers import SentenceTransformer

# Load a multilingual model that supports Turkish.
# This rebinds `model`, replacing the all-MiniLM-L6-v2 checkpoint loaded earlier.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')




In [32]:
# Turkish test sentences: the first is about walking in nice weather, the other
# three are paraphrases about programming with AI ("YZ" abbreviates "yapay zeka").
# The literals must contain real Turkish characters (ü, ç, ı, ş); mis-decoded
# text such as "y√ºr√ºmek" would be embedded as garbage and shown on the axes.
sentences = [
    "Bu havada yürümek çok güzel.",
    "Yapay zeka ile programlama oldukça ilginç.",
    "Kodlamayı YZ ile yapmayı çok seviyorum",
    "Programlama işlerimi yapay zeka teknikleri kullanarak yapmak çok enteresan"
]
# Generate sentence embeddings with the currently loaded (multilingual) model
embeddings = model.encode(sentences)

# Calculate the pairwise cosine similarity between all sentence embeddings
cos_sim = cosine_similarity(embeddings)

# Annotated heatmap of the similarity matrix, labelled with the sentences
px.imshow(
    cos_sim,
    x=sentences,
    y=sentences,
    text_auto=True,
)



### Typos

In [33]:
# Rebind `model` to all-MiniLM-L6-v2 for the typo experiments
# (the previous section left it bound to the multilingual checkpoint).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [36]:
# The same phrase twice: spelled correctly, and with one "c" dropped
sentences = ["Great accommodation", "Great acommodation"]

# Encode both spellings with the currently loaded model
embeddings = model.encode(sentences)

# Pairwise cosine similarity, shown as an annotated heatmap labelled by sentence
cos_sim = cosine_similarity(embeddings)
px.imshow(img=cos_sim, x=sentences, y=sentences, text_auto=True)

In [37]:
from transformers import AutoTokenizer

# Tokenizer loaded from the sentence-transformers/all-MiniLM-L6-v2 checkpoint
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
text = "Great accommodation"

# Build the three views of the input first, then report them in order
tokenized_text = tokenizer(text)                      # full encoding returned by calling the tokenizer
tokens = tokenizer.tokenize(text)                     # token strings only
token_ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary ids for those tokens

print("Tokenized text:", tokenized_text)
print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokenized text: {'input_ids': [101, 2307, 11366, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
Tokens: ['great', 'accommodation']
Token IDs: [2307, 11366]


In [None]:
from transformers import RobertaTokenizer

# Load RoBERTa's tokenizer (BPE-based).
# This rebinds `tokenizer`, replacing the all-MiniLM-L6-v2 tokenizer from the previous cell.
# NOTE(review): this cell shows no execution count (In [None]) while its
# neighbours do — confirm it was run before relying on the next cell's saved output.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



In [38]:
text = "Great acommodation"

# NOTE(review): the saved output of this cell shows `##`-prefixed pieces and the
# same 101/102 wrapper ids as the AutoTokenizer cell, which suggests it was
# produced before the RoBERTa tokenizer cell (In [None]) was run. Re-run top to
# bottom and confirm which tokenizer this cell is meant to demonstrate.

# Build the three views of the misspelled input first, then report them in order
tokenized_text = tokenizer(text)
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print("Tokenized text:", tokenized_text)
print("Tokens:", tokens)
print("Token IDs:", token_ids)

Tokenized text: {'input_ids': [101, 2307, 9353, 5358, 5302, 20207, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
Tokens: ['great', 'ac', '##om', '##mo', '##dation']
Token IDs: [2307, 9353, 5358, 5302, 20207]


In [39]:
# Rebind `model` to the all-roberta-large-v1 checkpoint; the next cell repeats
# the correct-vs-misspelled comparison with this model.
model = SentenceTransformer('all-roberta-large-v1')

In [40]:
# The same correct/misspelled pair as before, now encoded with the model loaded just above
sentences = ["Great accommodation", "Great acommodation"]

# Encode both spellings
embeddings = model.encode(sentences)

# Pairwise cosine similarity, shown as an annotated heatmap labelled by sentence
cos_sim = cosine_similarity(embeddings)
px.imshow(img=cos_sim, x=sentences, y=sentences, text_auto=True)

### Unknown Symbols


In [None]:
# Rebind `model` to all-MiniLM-L6-v2 (the previous section left it as all-roberta-large-v1).
# NOTE(review): this cell shows no execution count (In [None]) while the cells
# after it do, so their saved outputs may have been produced with the previous
# model — re-run top to bottom to confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# NOTE(review): `embeddings` has not been recomputed since the previous
# section's encode call, so this reports the shape of those older vectors,
# not of anything produced by the model loaded in the cell above.
embeddings.shape

In [41]:
# Compare a word with the emoji that expresses the same feeling.
# The second literal must contain the actual emoji character; mis-decoded
# bytes (e.g. "üòä") would test a string of accented Latin letters instead.
sentences = [
    "I feel happy",
    "I feel 😊"
]

# Generate sentence embeddings with the currently loaded model
embeddings = model.encode(sentences)

# Pairwise cosine similarity between the two embeddings
cos_sim = cosine_similarity(embeddings)

# Annotated heatmap of the similarity matrix, labelled with the sentences
px.imshow(
    cos_sim,
    x=sentences,
    y=sentences,
    text_auto=True,
)

### Numerical values and date/time

In [42]:
# Price statements: the same amount as digits and as words, nearby and distant
# amounts, and a sentence that mentions a discount from a different list price
sentences = [
    "This shirt costs $55.",
    "This shirt costs fifty five dollars.",
    "This shirt costs $50.",
    "This shirt costs $559.",
    "This shirt has a 10% discount from $60.",
]

# Encode every statement with the currently loaded model
embeddings = model.encode(sentences)

# Pairwise cosine similarity, shown as an annotated heatmap labelled by sentence
cos_sim = cosine_similarity(embeddings)
px.imshow(img=cos_sim, x=sentences, y=sentences, text_auto=True)

In [None]:
# One date written two ways (first two entries), followed by neighbouring days
# in the same written-out format
sentences = [
    "16th February 2024",
    "2024-02-16",
    "17th February 2024",
    "18th February 2024",
    "19th February 2024",
    "20th February 2024",
    "15th February 2024",
]

# Encode every date string, then compute all pairwise cosine similarities
embeddings = model.encode(sentences)
cos_sim = cosine_similarity(embeddings)

# Annotated heatmap of the similarity matrix, labelled with the date strings
fig = px.imshow(img=cos_sim, x=sentences, y=sentences, text_auto=True)

# Force both axes to categorical — presumably so the date-like labels are
# displayed verbatim instead of being interpreted as dates (TODO confirm)
fig.update_layout(xaxis_type="category", yaxis_type="category")

# Tilt the x tick labels; as the last expression this also displays the figure
fig.update_xaxes(tickangle=-30)