In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [8]:
import numpy as np
from math import isclose

In [2]:
# Cosine similarity between two vectors
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    The dot product of the inputs is divided by the product of their
    Euclidean norms, so the inputs do not have to be unit length.
    No zero-norm guard is applied: a zero vector makes the denominator zero.
    """
    inner_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner_product / norm_product

In [3]:
# Scale a vector to unit length
def normalize_vector(v):
    """Return ``v`` divided by its Euclidean norm.

    A vector whose norm is exactly zero has no direction to preserve, so it
    is handed back unchanged (the very same object) instead of being divided.
    """
    length = np.linalg.norm(v)
    return v / length if length != 0 else v

In [4]:
# Draw two 1000-dimensional random vectors; the fixed seed makes the draw repeatable
np.random.seed(0)
vector1, vector2 = (np.random.rand(1000) for _ in range(2))

# Unit-length versions of both vectors
norm_vector1, norm_vector2 = map(normalize_vector, (vector1, vector2))

In [5]:
# Show the norm of the raw vector, then of its normalized counterpart
for vec in (vector1, norm_vector1):
    print(np.linalg.norm(vec))

18.17651933044476
1.0


In [6]:
# Time one call of the full cosine-similarity formula on the raw vectors.
# %time measures a single execution, so microsecond-scale numbers are noisy.
%time score = cosine_similarity(vector1, vector2)

CPU times: user 87 µs, sys: 25 µs, total: 112 µs
Wall time: 98 µs


In [7]:
# Time one plain dot product on the pre-normalized vectors for comparison.
# %time measures a single execution, so microsecond-scale numbers are noisy.
%time score = np.dot(norm_vector1, norm_vector2)

CPU times: user 23 µs, sys: 8 µs, total: 31 µs
Wall time: 30.8 µs


In [9]:
# Both routes should agree up to floating-point rounding
score_from_raw = cosine_similarity(vector1, vector2)
score_from_normalized = np.dot(norm_vector1, norm_vector2)
print(score_from_raw)
print(score_from_normalized)

isclose(score_from_raw, score_from_normalized)

0.7463645813901374
0.7463645813901376


True

In [14]:
# Store the text file in Books folder
import os
import requests


def download_gutenberg_book(url: str, bookname: str, folder_name: str = "Books", timeout: float = 30.0) -> None:
    """Download a plain-text book and save it as ``<folder_name>/<bookname>.txt``.

    Args:
        url: Address of the text file to fetch.
        bookname: File name (without the ``.txt`` extension) to save under.
        folder_name: Directory to write into; created when it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTTP error page as if it were the book.
    response.raise_for_status()
    book_text = response.text

    # exist_ok avoids the race between a separate existence check and the creation.
    os.makedirs(folder_name, exist_ok=True)
    # Explicit UTF-8: the book text contains non-ASCII characters (e.g. curly
    # quotes) that the platform default encoding may be unable to write.
    with open(os.path.join(folder_name, f"{bookname}.txt"), "w", encoding="utf-8") as file:
        file.write(book_text)


In [15]:
# Fetch "The Hound of the Baskervilles" as plain text
download_gutenberg_book(
    url="https://www.gutenberg.org/cache/epub/2852/pg2852.txt",
    bookname="The Hound of the Baskervilles",
)

In [16]:
# Read every .txt file in the Books folder and concatenate the contents into `text`
import os

# Same folder download_gutenberg_book writes to, relative to the working
# directory ("../Books" pointed at a different directory than the download step).
folder_name = "Books"

# Sorted so the concatenation order does not depend on the order in which the
# operating system happens to list the directory.
book_files = sorted(name for name in os.listdir(folder_name) if name.endswith(".txt"))

book_texts = []
for filename in book_files:
    # Explicit UTF-8: the book text contains non-ASCII characters (e.g. curly
    # quotes) that the platform default encoding may decode incorrectly.
    with open(os.path.join(folder_name, filename), "r", encoding="utf-8") as file:
        book_texts.append(file.read())

# Join once at the end instead of growing the string inside the loop.
text = "".join(book_texts)

In [18]:
# Treat every line of the text as one "sentence"
sentences = text.split("\n")

# The first 95% of the lines become the index set; the remainder is held out as search queries
split_index = int(len(sentences) * 0.95)
index_sentences, search_sentences = sentences[:split_index], sentences[split_index:]

# Sizes of the index set and the search set
tuple(map(len, (index_sentences, search_sentences)))

(7347, 387)

In [22]:
# Preview five lines of the text, each followed by extra blank space
preview = sentences[100:105]
for line in preview:
    print(line)
    print("\n\n")





      “How did you know what I was doing? I believe you have eyes in



      the back of your head.”







      “I have, at least, a well-polished, silver-plated coffee-pot in





In [24]:
# Name of the sentence-embedding model used for every encode call in this notebook
MODEL_NAME = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(MODEL_NAME)

In [25]:
# Options shared by both encode calls: keep the model's raw, un-normalized output
raw_encode_options = dict(
    normalize_embeddings=False,  # Not normalized
    convert_to_numpy=True,
    show_progress_bar=True,
)
index_vecs = embedding_model.encode(sentences=index_sentences, **raw_encode_options)
search_vecs = embedding_model.encode(sentences=search_sentences, **raw_encode_options)

Batches:   0%|          | 0/230 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [26]:
# Inspect the un-normalized embedding matrix; NumPy abbreviates large arrays with "..." when printing
print(index_vecs)

[[ 0.18585834 -0.02780735 -0.37401932 ... -0.08716787 -0.11603957
  -0.26933447]
 [-0.52928126 -0.2696722  -0.21997276 ...  0.08029402  0.00908622
  -0.25519395]
 [ 0.15273277 -0.15258895 -0.44360036 ... -0.32564375 -0.2011139
  -0.3818391 ]
 ...
 [ 0.2466823  -0.12731965 -0.30290797 ... -0.32036534  0.06795593
  -0.1924995 ]
 [ 0.15273179 -0.61342275 -0.26124075 ... -0.1257215  -0.06204565
  -0.33400458]
 [ 0.16924407 -0.49414548 -0.27602398 ... -0.26493305  0.1770307
  -0.11719689]]


In [27]:
np.linalg.norm(index_vecs[0])  # Norm isn't 1: these embeddings were encoded with normalize_embeddings=False

6.11775

In [28]:
# Options shared by both encode calls: ask the model for unit-length embeddings
normalized_encode_options = dict(
    normalize_embeddings=True,  # Normalized
    convert_to_numpy=True,
    show_progress_bar=True,
)
index_vecs_norm = embedding_model.encode(sentences=index_sentences, **normalized_encode_options)
search_vecs_norm = embedding_model.encode(sentences=search_sentences, **normalized_encode_options)

Batches:   0%|          | 0/230 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

In [29]:
N = 200  # Number of search vectors used in the timing loops that follow

In [30]:
%%time
# Baseline: full cosine similarity (dot product plus two norms) of each of the
# first N search vectors against every index vector, on un-normalized embeddings.
# The results are discarded; only the elapsed time matters here.
for v1 in search_vecs[:N]:
    for v2 in index_vecs:
        cosine_similarity(v1, v2)

CPU times: user 4.65 s, sys: 5.07 ms, total: 4.65 s
Wall time: 4.72 s


In [31]:
%%time
# Comparison: a plain dot product of each of the first N search vectors against
# every index vector, on embeddings encoded with normalize_embeddings=True.
# The results are discarded; only the elapsed time matters here.
for v1 in search_vecs_norm[:N]:
    for v2 in index_vecs_norm:
        np.dot(v1, v2)

CPU times: user 920 ms, sys: 3.81 ms, total: 924 ms
Wall time: 937 ms
