In [None]:
# Load the packages and the model
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.cluster import KMeans

# Pre-trained SBERT checkpoint to load (described by the author as fast & accurate).
MODEL_NAME = 'all-MiniLM-L6-v2'

# Instantiate the sentence-embedding model from that checkpoint name.
model = SentenceTransformer(MODEL_NAME)

In [None]:
###########  Semantic Textual Similarity ###################
# Compare the meaning of two sentences: embed each one, then score the pair
# with cosine similarity.

sentence1 = "A man is playing a guitar"
sentence2 = "A person is performing music"

# Encode sentences (convert_to_tensor=True yields torch tensors instead of numpy arrays)
emb1 = model.encode(sentence1, convert_to_tensor=True)
emb2 = model.encode(sentence2, convert_to_tensor=True)

# Compute cosine similarity.
# util.cos_sim is the documented entry point; util.pytorch_cos_sim is only a
# backward-compatibility alias that forwards to it.
similarity = util.cos_sim(emb1, emb2)

# The result holds a single score for this one pair; .item() unwraps it to a float.
print(f"Similarity: {similarity.item():.4f}")

In [None]:
######### Semantic Search (Information Retrieval)  ##################
# Rank a small list of documents against a query and print the best matches.
corpus = [
    "The cat sits outside",
    "A man is playing guitar",
    "I love pizza",
    "The new movie is amazing",
    "I have a cat who loves music"
]
query = "A person playing an instrument"

# Embed the documents and the query as tensors
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# Retrieve the 3 best matches; the result is indexed per query, and we
# issued a single query, so only entry 0 is relevant.
results_per_query = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)
hits = results_per_query[0]

# Each hit carries the index of the matched document and its score.
for hit in hits:
    matched_text = corpus[hit['corpus_id']]
    match_score = hit['score']
    print(f"{matched_text} (Score: {match_score:.4f})")

In [None]:
################## Clustering ##############
# Group similar sentences together by running KMeans over their embeddings.
sentences = [
    "A dog barks",
    "A puppy is barking loudly",
    "He plays football",
    "The soccer game was fun",
    "Cats are great pets",
    "The kitten is adorable"
]

# No convert_to_tensor here: the default encode() output is passed straight to scikit-learn.
embeddings = model.encode(sentences)

# Cluster into 3 groups.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and raises a FutureWarning about it in 1.2/1.3), so leaving it
# unset makes the clustering depend on the installed version.
# random_state fixes the centroid initialisation so re-runs give the same labels.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(embeddings)

# Show results: labels_ is aligned with the input order, so pair it with the sentences.
for label, text in zip(kmeans.labels_, sentences):
    print(f"Cluster {label}: {text}")

In [None]:
############  Zero-shot Text Classification (via semantic similarity) #############
# Assign one of several label descriptions to a sentence: embed the sentence
# and every label, then pick the label whose embedding is most similar.

sentence = "I just bought a new phone and I love it and going to watch football with it!"
labels = ["Technology", "Politics", "Food", "Sports"]

# Encode
sentence_emb = model.encode(sentence, convert_to_tensor=True)
label_embs = model.encode(labels, convert_to_tensor=True)

# Match.
# util.cos_sim is the documented entry point; util.pytorch_cos_sim is only a
# backward-compatibility alias that forwards to it.
# Row 0 of the result holds the scores of our single sentence against each label.
cos_scores = util.cos_sim(sentence_emb, label_embs)[0]

# Index of the best-scoring label, unwrapped to a plain int so it can index `labels`.
top_label_idx = cos_scores.argmax().item()
top_score = cos_scores[top_label_idx].item()  # plain float for formatting
print(f"Predicted label: {labels[top_label_idx]} (Score: {top_score:.4f})")

In [None]:
###########  Duplicate Detection / Paraphrase Mining  ################
# Identify paraphrases or duplicate questions by scoring sentence pairs
# against each other.
sentences = [
    "How can I learn Python?",
    "What’s the best way to study Python programming?",
    "Tips for improving Python coding skills",
    "Where can I find good pizza in NYC?",
    "Pizza places in New York City"
]

# Embed all sentences as one tensor, then mine candidate pairs from it.
embeddings = model.encode(sentences, convert_to_tensor=True)
pairs = util.paraphrase_mining_embeddings(embeddings)

# Show top pairs. Each entry unpacks to (score, index_a, index_b);
# presumably the list is ordered best-first, so the first three are the strongest matches.
top_pairs = pairs[:3]
for score, i, j in top_pairs:
    first, second = sentences[i], sentences[j]
    print(f"Score: {score:.4f} | '{first}' <--> '{second}'")