# Semantic Search

## Import Libraries

In [1]:
import json
import os
import re

import torch
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Locate secrets.env (an error is raised if it cannot be found) and load it
_ = load_dotenv(
    dotenv_path=find_dotenv(filename="secrets.env", raise_error_if_not_found=True),
)

# Base directory used to build data paths; must be set in the environment (KeyError otherwise)
ROOT_DIR = os.environ["ROOT_DIR"]
print("all packaged imported")

  from .autonotebook import tqdm as notebook_tqdm


all packaged imported


## Load and Transform the dataset

In [2]:
# Load the dataset: one record per line, skipping the first line
# (presumably a header - confirm against the file)
with open(file=f"{ROOT_DIR}/Pinecone Building Applications with Vector Databases/files/datasetText.txt") as fh:
	dataset = fh.read().split("\n")[1:]

# Patterns are compiled once, outside the loops.
# TEXT_LIST_RE captures the contents of every "'text': [ ... ]" list on a line.
# QUOTED_RE captures each single- or double-quoted string inside such a list;
# the back-reference \1 makes the closing quote match the opening one.
TEXT_LIST_RE = re.compile(r"'text': \[([^\]]+)")
QUOTED_RE = re.compile(r'([\'"])(.*?)\1')

# Extract the questions from the dataset, in file order
questions = [
	question
	for line in dataset
	for text_list in TEXT_LIST_RE.findall(line)
	for _quote, question in QUOTED_RE.findall(text_list)
]

print("\n".join(questions[:5]))
print("-"*50)
print(f"Number of questions = {len(questions)}")

# Not exactly 100000 questions, but that should do it

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
--------------------------------------------------
Number of questions = 100041


## Instantiate the Model

In [3]:
# Pick the best available device: CUDA GPU first, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"  # must be the exact torch device string
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the model on the selected device (GPU when one is available, CPU otherwise)
modelName = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name_or_path=modelName, device=device)
print(f"Model: {modelName}\nRunning on: {device}")

Model: all-MiniLM-L6-v2
Running on: mps


## Encoding Queries

In [4]:
# Embed one example query; a single string comes back as a single vector
query = "what is the most populated city in the world?"
xq = model.encode(query)
print(f"Shape of xq: {xq.shape}\nType of xq: {type(xq)}")


Shape of xq: (384,)
Type of xq: <class 'numpy.ndarray'>


## Setting up Pinecone

In [6]:
# Connect to Pinecone with the API key read from the environment (KeyError if it is unset)
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index this notebook reads from and writes to
INDEX_NAME = "pinecone-deeplearning-ai"

# One-time setup, currently disabled: as written, this cell expects the index to exist already.
# Uncommenting the lines below drops the index (if found) and creates it again.
# NOTE(review): list_indexes() displays as a collection of index descriptions rather than
# bare names, so the membership test below may never match - confirm before re-enabling.
# if INDEX_NAME in pinecone.list_indexes():
# 	pinecone.delete_index(INDEX_NAME)

# # create the index (dimension taken from the embedding model, cosine similarity)
# pinecone.create_index(
# 	name=INDEX_NAME,
# 	dimension=model.get_sentence_embedding_dimension(),
# 	spec=ServerlessSpec(cloud="aws", region="us-west-2"), #TODO see if they have a region in Europe
# 	metric="cosine",
# )

# Handle used by the later cells to upsert into and query the index
index = pinecone.Index(INDEX_NAME)

# Last expression of the cell: display every index associated with this API key
pinecone.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'pinecone-deeplearning-ai-skrwyc8.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'pinecone-deeplearning-ai',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

## Create a vector representation of the questions and store them in the index

In [15]:
batch_size = 200
vector_limit = 10000
questionsVec = questions[:vector_limit]

# The slice can be shorter than vector_limit, so drive the loop and the
# final report from its real length rather than from the requested limit.
n_vectors = len(questionsVec)

for i in tqdm(range(0, n_vectors, batch_size)):

	# end of this batch (exclusive), clipped to the number of questions available
	iEnd = min(i + batch_size, n_vectors)
	batch = questionsVec[i:iEnd]

	# ids for this batch: the position of each question, as a string
	ids = [str(x) for x in range(i, iEnd)]

	# prepare the metadata (here: the questions)
	metadata = [{"text": text} for text in batch]

	# create the embeddings
	emb = model.encode(batch)

	# Build a concrete list of (id, values, metadata) tuples with plain float lists,
	# matching how runQuery sends its vector, instead of a one-shot zip iterator.
	package = list(zip(ids, emb.tolist(), metadata))

	# store this batch info into the index
	index.upsert(vectors=package)

print(f"{n_vectors} vectors have been stored into the index")
print("-"*50)

# see the details of our index
index.describe_index_stats()

10000 vectors have been stored into the index
--------------------------------------------------


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

## Run queries

In [16]:
# helper function to run queries
def runQuery(query, top_k = 10):
	"""Print the stored questions most similar to `query`.

	Encodes `query` with the notebook's `model`, asks `index` for the `top_k`
	closest matches (metadata included, vector values omitted) and prints one
	line per match as "<score rounded to 2 decimals>: <question text>".
	Returns None.
	"""
	query_vector = model.encode(query).tolist()
	response = index.query(
		vector=query_vector,
		top_k=top_k,
		include_metadata=True,
		include_values=False,
	)
	for match in response["matches"]:
		score = round(match['score'], 2)
		text = match['metadata']['text']
		print(f"{score}: {text}")

# Free-text question to look up in the index
query = "what is the best movie of all times?"

# Print the closest stored questions together with their similarity scores
runQuery(query=query)

0.94: What are the best movies of all time?
0.87: Which is the all time best Hollywood movie?
0.83: Which is best Hollywood movie ever? Why?
0.8: What is best movies till date?
0.79: What are the best Hollywood movies ever?
0.78: Which is the best hollywood movie you have seen?
0.78: What are some of the best movies of all times I should watch (animated are welcome)?
0.77: Which is the best movie ever in Hollywood(new)?
0.77: Which is the best movie ever in Hollywood(new)?
0.71: What are your favorite movies and why?
