In [12]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache

In [11]:
# Instantiate the 'all-MiniLM-L6-v2' SentenceTransformer once, at notebook (global) scope.
# get_embedding() reads this `model` global rather than loading its own copy, so this
# cell must have been executed before get_embedding() is called.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [16]:
# Memoised embedding lookup: a text that was already encoded is served from the cache.
@lru_cache(maxsize=100)  # keeps at most 100 distinct texts; adjust as needed
def get_embedding(text):
    """
    Return the embedding of ``text`` as a tuple, memoised per input.

    ``lru_cache`` builds its key from the argument, so ``text`` must be
    hashable (a plain string is). The vector comes from the notebook-level
    ``model`` and is handed back as a tuple, so the value stored in the cache
    cannot be modified in place by a caller.
    """
    # The progress bar is explicitly left enabled for the encode call.
    return tuple(model.encode(text, show_progress_bar=True))

In [17]:
def compute_similarity(text1, text2):
    """
    Score two texts by the cosine similarity of their embeddings.

    Each text is embedded through ``get_embedding`` (so repeated texts reuse
    the cached vector), and the single pairwise score is returned.
    """
    # Embed in argument order: text1 first, then text2.
    first_vec, second_vec = get_embedding(text1), get_embedding(text2)

    # Each embedding is wrapped in a list so it is passed as a one-row batch;
    # the only entry of the resulting matrix, at [0][0], is the score.
    pairwise = cosine_similarity([first_vec], [second_vec])
    return pairwise[0][0]

### Sanity check: the paraphrase pair should score high, the unrelated pairs low

In [18]:
# Sample sentences for the sanity check: text1 and text2 describe the same clear
# blue sky in different words, while text3 is about safety standards and only
# shares the word "blue" with them.
text1, text2, text3 = (
    "The sky is blue and clear today.",
    "Today's sky is clear and has a blue color.",
    "Functional Safety Standards are complex, sometimes blue, documents",
)

In [20]:
# Score every pair of sample sentences, evaluated in the order (1,2), (1,3), (2,3).
# Note the numbering: similarity_2 is the 1-3 pair and similarity_3 is the 2-3 pair.
similarity_1, similarity_2, similarity_3 = (
    compute_similarity(text1, text2),
    compute_similarity(text1, text3),
    compute_similarity(text2, text3),
)

# One line per pair, each score rounded to four decimal places.
print(
    f"Cosine Similarity 1-2: {similarity_1:.4f}",
    f"Cosine Similarity 1-3: {similarity_2:.4f}",
    f"Cosine Similarity 2-3: {similarity_3:.4f}",
    sep="\n",
)

Cosine Similarity 1-2: 0.9523
Cosine Similarity 1-3: 0.1014
Cosine Similarity 2-3: 0.1451


CacheInfo(hits=9, misses=3, maxsize=100, currsize=3)