# Cosine Similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Two documents to compare: a full caption and a single word taken from it.
text1 = "a white corridor with a circular ceiling and a large window on the far wall that leads to another room with a skylight"
text2 = "white"

# Build the TF-IDF vocabulary from both texts and vectorize them in one step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([text1, text2])

# Row 0 of the matrix is text1 and row 1 is text2; comparing the two rows
# gives a 2-D result whose only entry is the similarity score.
first_row, second_row = tfidf_matrix[0], tfidf_matrix[1]
cosine_sim = cosine_similarity(first_row, second_row)

print("Cosine similarity:", cosine_sim[0][0])


# Jaccard Similarity

In [None]:
# Word-overlap (Jaccard) score for the same two example texts.
# NOTE: jaccard_similarity is defined in a later cell of this notebook, so
# that cell has to be executed before this one.
print("Jaccard similarity: ", jaccard_similarity(text1, text2))

----

In [None]:
# Free-text caption, plus a tag string whose entries are separated by " | ".
# Both are combined by preprocess() further down.
caption = "a white corridor with a circular ceiling and a large window on the far wall that leads to another room with a skylight"
tags = "building | room | skylight | hallway | space | light | ceiling | corridor | tunnel | window | wall | circular | white | round | large"

In [None]:
def jaccard_similarity(text1: str, text2: str) -> float:
    """Return the Jaccard similarity of the words in two texts.

    Each text is split on whitespace and reduced to its set of distinct
    words. The score is the number of words the two sets share divided by
    the number of distinct words across both, so repeated words count once.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float between 0.0 (no shared words) and 1.0 (identical word sets).
        When neither text contains any word there is nothing to compare and
        0.0 is returned instead of dividing by zero.
    """
    words1 = set(text1.split())
    words2 = set(text2.split())
    union = words1 | words2
    if not union:
        return 0.0
    return len(words1 & words2) / len(union)

In [None]:
def preprocess(caption: str, tags: str) -> str:
    """Merge a caption and a tag string into one space-separated tag string.

    Tags that also appear as words in the caption come first, in the order
    they occur in the caption. The remaining tags follow in their original
    order. Every tag appears exactly once in the result.

    Args:
        caption: Free text; it is split on whitespace into words.
        tags: Tags separated by "|". Whitespace around each tag is ignored
            and empty entries are dropped.

    Returns:
        The ordered, de-duplicated tags joined by single spaces.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    tag_list = list(
        dict.fromkeys(
            tag for tag in (part.strip() for part in tags.split("|")) if tag
        )
    )
    known_tags = set(tag_list)  # O(1) membership checks for caption words

    # Caption words that are tags, once each, in caption order.
    from_caption = dict.fromkeys(
        word for word in caption.split() if word in known_tags
    )
    remaining = [tag for tag in tag_list if tag not in from_caption]
    return " ".join([*from_caption, *remaining])

# Combine the caption and the tag string into one space-separated string.
better_tags = preprocess(caption, tags)

In [None]:
# Display the merged tag string produced by preprocess().
better_tags

In [None]:
# Search phrase that is scored against better_tags in the following cells.
query = "black room"

In [None]:
# Word-overlap (Jaccard) score between the query and the merged tag string.
print("Jaccard similarity: ", jaccard_similarity(query, better_tags))

In [None]:
# TF-IDF comparison of the query against the merged tag string.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([query, better_tags])

# Row 0 of the matrix is the query and row 1 is the tag string; comparing
# the two rows gives a 2-D result whose only entry is the similarity score.
query_row, tags_row = tfidf_matrix[0], tfidf_matrix[1]
cosine_sim = cosine_similarity(query_row, tags_row)

print("Cosine similarity:", cosine_sim[0][0])

----

In [1]:
import numpy as np
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

In [2]:
# Load the sentence-embedding model used for every encode() call below
# (it'll download itself if it is not available locally).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# Small dataset of captions that the query is searched against.
sentences = [
  "a white corridor with a circular ceiling and a large window on the far wall that leads to another room with a skylight",
  "a sunset over a body of water with the sun setting over the horizon",
  "a black and white photo of a statue of a man with a hood on his head holding his hands up in front of his face",
  "a fat man in a blue shirt laying on a chair with his feet up",
  "a white shirt and black pants with a white belt and black shoes",
  "a room with white curtains and large ceiling"
]

# Vector embeddings created from the dataset: all sentences are encoded in a
# single call, and the result is later iterated in step with `sentences`.
embeddings = model.encode(sentences)

In [5]:
# Inspect the dimensions of the embedding matrix.
embeddings.shape

(6, 384)

In [6]:
# Query vector embedding: the search phrase is encoded with the same model
# that produced the dataset embeddings, so the vectors are comparable.
query_embedding = model.encode("white room")

# Define our similarity metric (higher means more similar).
# NOTE: this definition replaces the cosine_similarity imported from
# sklearn.metrics.pairwise at the top of the notebook; cells that rely on the
# sklearn version need that import re-run first.
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First vector (array-like accepted by ``np.dot`` and ``norm``).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``np.dot(a, b) / (norm(a) * norm(b))``. If either vector has zero
        length the angle is undefined; 0.0 is returned in that case rather
        than letting the 0/0 division produce NaN and a runtime warning.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Semantic search: score every dataset sentence against the query embedding
# and print each sentence next to its score.
print("Query: white room")
for sentence, vector in zip(sentences, embeddings):
    score = cosine_similarity(vector, query_embedding)
    print(sentence, " -> similarity score = ", score)

Query: white room
a white corridor with a circular ceiling and a large window on the far wall that leads to another room with a skylight  -> similarity score =  0.5832046
a sunset over a body of water with the sun setting over the horizon  -> similarity score =  0.09988168
a black and white photo of a statue of a man with a hood on his head holding his hands up in front of his face  -> similarity score =  0.072189346
a fat man in a blue shirt laying on a chair with his feet up  -> similarity score =  0.06918598
a white shirt and black pants with a white belt and black shoes  -> similarity score =  0.31238416
a room with white curtains and large ceiling  -> similarity score =  0.6998396
