# Semantic Search

## Import Libraries

In [1]:
import torch
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from sentence_transformers import SentenceTransformer
import os
from pinecone import Pinecone, ServerlessSpec

# Load environment variables from "secrets.env"; with raise_error_if_not_found=True,
# find_dotenv raises instead of silently continuing when the file cannot be located.
# The return value of load_dotenv is bound to `_` and not used.
_ = load_dotenv(find_dotenv("secrets.env", raise_error_if_not_found=True))

# Base directory used to build data-file paths; raises KeyError if ROOT_DIR is not set.
ROOT_DIR = os.environ["ROOT_DIR"]
print("all packaged imported")

  from .autonotebook import tqdm as notebook_tqdm


all packaged imported


## Load and Transform the dataset

In [None]:
import re

# Load the dataset: the file is read in one go and split on newlines.
# The first line is dropped (presumably a header row -- confirm against the file).
# The encoding is pinned so decoding does not depend on the platform default.
dataset_path = f"{ROOT_DIR}/Pinecone Building Applications with Vector Databases/files/datasetText.txt"
with open(file=dataset_path, encoding="utf-8") as file:
    dataset = file.read().split("\n")[1:]

# Patterns are compiled once, outside the loops.
# TEXT_LIST_RE captures everything after "'text': [" up to (not including) the next "]".
TEXT_LIST_RE = re.compile(r"'text': \[([^\]]+)")
# QUOTED_RE matches a single- or double-quoted string; group 1 is the quote
# character, group 2 is the quoted content (the question itself).
QUOTED_RE = re.compile(r'([\'"])(.*?)\1')

# Extract the questions from the dataset
questions = []
for data in dataset:
    for text_list in TEXT_LIST_RE.findall(data):
        # findall yields (quote_char, content) tuples; only the content is kept.
        questions.extend(
            question for _quote, question in QUOTED_RE.findall(text_list)
        )

print("\n".join(questions[:5]))
print("-"*50)
print(f"Number of questions = {len(questions)}")

# Not exactly 100000 questions, but that should do it

## Instantiate the Model

In [None]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS, else CPU.
# The value must be an exact torch device string ("cuda", not "cude"), because it is
# passed on as the `device` argument when the model is created.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Instantiate the sentence-embedding model on the selected device
# (which may be a GPU backend or the CPU) and report the configuration.
modelName = "all-MiniLM-L6-v2"
model = SentenceTransformer(
    model_name_or_path=modelName,
    device=device,
)
print(f"Model: {modelName}\nRunning on: {device}")

## Encoding Queries

In [None]:
# Sample natural-language query to embed.
query = "what is the most populated city in the world?"

# Encode the query into its embedding with the loaded model.
xq = model.encode(query)

# Show the shape and container type of the resulting embedding.
print(f"Shape of xq: {xq.shape}\nType of xq: {type(xq)}")


## Setting up Pinecone

In [2]:
# Read the Pinecone API key from the environment; raises KeyError if it is not set.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
# Create the Pinecone client object using that key.
pinecone = Pinecone(api_key=PINECONE_API_KEY)