In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils

import os
import time
import torch

In [3]:
from tqdm.auto import tqdm

##Load the data set

In [7]:
dataset = load_dataset('quora', split='train[240000:290000]')

In [8]:
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [9]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?


## We are using all-MiniLM-L6-v2 sentence-transformers model that maps sentences to a 384 dimension dense vector space


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('Sorry no cuda')
model = SentenceTransformer('all-MiniLM-L6-v2', device = device)

Sorry no cuda


In [11]:
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [12]:
print(xq)

[ 1.50860816e-01  1.29710687e-02 -5.08222915e-02  6.06876351e-02
 -3.35445802e-04 -2.90263370e-02  5.06248586e-02  2.99537759e-02
 -3.10023874e-02  5.96766174e-02  5.95133603e-02 -1.33624256e-01
  7.15946360e-03  4.92674299e-02  1.59945562e-02 -2.03522742e-02
  3.74022909e-02 -9.18012187e-02  7.44751617e-02 -5.41728698e-02
 -5.14636636e-02 -4.53633741e-02  6.31610602e-02  4.29923497e-02
  2.43183039e-02  2.65026409e-02  1.74598321e-02  8.24665278e-02
 -1.59858707e-02 -7.79033499e-03 -1.71448514e-02  7.63704404e-02
  1.10217586e-01 -2.29893699e-02  8.37783143e-03  7.16604386e-03
 -5.05503267e-03 -4.04071100e-02  3.40002216e-02  3.00653744e-02
  4.23340052e-02 -2.88223643e-02  3.75781246e-02 -4.33307774e-02
  2.17319559e-02  8.27986002e-03 -1.04310019e-02  7.60997087e-02
  3.55798937e-03  4.72234040e-02  3.57121490e-02  7.10471943e-02
 -2.74737459e-02  2.82487366e-03  1.41288498e-02 -3.47286314e-02
  4.96940338e-04 -3.59864160e-02  5.80318389e-04  1.07831005e-02
 -1.44697051e-03  4.60125

## Setting up Pinecone

In [13]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [17]:
pinecone = Pinecone(api_key = PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)
pinecone.create_index(name = INDEX_NAME,
                      dimension = model.get_sentence_embedding_dimension(),
                      metric ='cosine',
                      spec = ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)
print(index)
                      

dl-ai-lwhuo6rht3blbkfjqfz7dzqe3fex2vopanwu
<pinecone.data.index.Index object at 0x7f823319df40>


## Create Embeddings and upsert to pinecone

In [18]:
batch_size = 200
vector_limit = 10000

questions = question[:vector_limit]
import json

for i in tqdm(range(0,len(questions), batch_size)):
    #finding end of batch
    i_end = min(i+batch_size, len(questions))
    #create id's batch 
    ids = [str(x) for x in range(i, i_end)]
    #create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    #create embeddings
    xc = model.encode(questions[i:i_end])
    #create records list for upsert
    records = zip(ids, xc, metadatas)
    #upsert to pinecone
    index.upsert(vectors= records)

  0%|          | 0/50 [00:00<?, ?it/s]

In [19]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

## RUn your query

In [23]:
#small helper function so that we can repeat queries later
def run_query(query):
    embedding = model.encode(query).tolist()
    results = index.query(top_k = 10, vector = embedding, include_metadata= True, include_values= False)
    for result in results['matches']:
        print(f"{round(result['score'],2)}: {result['metadata']['text']}")

In [24]:
run_query("Which city has the highest population in the world?")

0.69: What is the most beautiful city in the world?
0.67: Which is the most urbanised city in India?
0.64: Which city has the most museums per capita?
0.58: What country has the most beautiful people?
0.56: Which is the most powerfull country in the world?
0.56: What percentage of the world's population lives in developing countries?
0.54: What's the highest mountain in the world?
0.53: Why is Uttar Pradesh the most populous state in India?
0.53: What are the world`s deadliest tourist destinations?
0.51: Which is the safest country in the world?


In [25]:
query = 'how do i make biryani?'
run_query(query)

0.41: What is a good substitute for bulgur?
0.41: What is a substitute for red wine in making bolognese?
0.4: What is the best spaghetti bolognese recipe?
0.4: How do I manufacture a bicycle in Brazil?
0.39: How do you make gravy without any dairy?
0.38: How do I make friendship bracelets?
0.38: How do you make love?
0.38: What is a good substitute for tarragon in a recipe?
0.38: How can I reduce bilirubin levels in my body?
0.36: How can I make a delicious cake?
