In [None]:
# Mount Google Drive into this Colab runtime at /content/drive so that the
# assignment files stored on Drive can be read and written by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show the runtime's current working directory.
!pwd

/content


In [None]:
# Root folder of the assignment on the mounted Drive. Later cells build file
# paths by plain string concatenation, so the trailing slash is required.
workspace = '/content/drive/MyDrive/Courses/CS410/MP4/'

In [None]:
import os
# Sanity check: evaluates to True when the workspace folder is reachable.
os.path.exists(workspace)

True

# Semantic Search using Language Models and Nearest Neighbor Indexes

Go ahead and install these libraries


In [None]:
 %%capture
 !pip install sentence-transformers faiss-cpu ir-measures pyserini torch 

Approximate Nearest Neighbor (ANN) search is a form of semantic search that uses vector-based representations of queries and documents to perform retrieval. ANN approaches have become incredibly popular for search because of the success of language models such as BERT.

Before we explore how they can be used for search, we will look at how language models can be used to form vector-based representations. We will be using the Sentence Transformers library, a language-model library that is optimized for representing sentences as vectors.

## Creating Sentence Representations

Import the needed libraries

In [None]:
from sentence_transformers import SentenceTransformer
import json
import numpy
from torch import nn
import torch

To create sentence representations we must select and load a model as shown below

In [None]:
# Load a pretrained sentence-embedding model by name; the cell output shows
# its files being downloaded.
model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Let's look at some sample sentences and compare their cosine similarity.

In [None]:
# Example texts to turn into vectors
sentences = [
    'CS410 is a computer science class focused on information retireval',
    'The Cat is in the hat',
    'Methods in search use semantic search',
]

# Encode every text in one call; the result holds one vector per text
embeddings = model.encode(sentences)

# Show each text next to the shape of its vector
for position, text in enumerate(sentences):
    print("Sentence:", text)
    print("Embedding:", embeddings[position].shape)
    print("")

Sentence: CS410 is a computer science class focused on information retireval
Embedding: (768,)

Sentence: The Cat is in the hat
Embedding: (768,)

Sentence: Methods in search use semantic search
Embedding: (768,)



As we can see below, sentence one, which deals with search, is closer to sentence three than to sentence two, but none of the pairs is very similar. Go ahead and explore some sentences to see how minor semantic variations can be understood. Each sentence has an extracted embedding, which is a projection of the input text into an N-dimensional space; in this case, 768 dimensions.

In [None]:
# Cosine similarity between two 1-D vectors (dim=0 is their only axis).
cos = nn.CosineSimilarity(dim=0, eps=1e-6)

# Human-readable label for each row of `embeddings`, in order.
labels = ["one", "two", "three"]

# Compare every unordered pair once: (one, two), (one, three), (two, three).
for first in range(len(labels)):
    for second in range(first + 1, len(labels)):
        output = cos(torch.tensor(embeddings[first]), torch.tensor(embeddings[second]))
        print("The similarity between sentence {} and {} is: {}".format(labels[first], labels[second], output))

The similarity between sentence one and two is: 0.24906383454799652
The similarity between sentence one and thee is: 0.2929505407810211
The similarity between sentence two and three is: 0.23856395483016968


## Using Sentence Representations for Search

Using these sentence representations can be an effective way of representing text, and it is also incredibly efficient when paired with a nearest neighbor index such as FAISS. Below is an example of how to load sentence embeddings into an index and how to search that index.

In [None]:
import faiss
model.max_seq_length = 512  # upper bound on the number of tokens encoded per input
# Take the vector size from the loaded model instead of hard-coding it, so the
# index cannot silently mismatch if a different model is loaded above
# (it is 768 for the model used in this notebook).
d = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(d)   # build a flat index that compares vectors by L2 distance
print(index.is_trained)  # whether the index is ready to accept vectors
print(index.ntotal)      # number of vectors currently stored

True
0


In [None]:
sentences = ['CS410 is a computer science class focused on information retireval',
             'The Cat is in the hat',
             'Romeo Romeo where are you']

# Empty the index first so this cell can be re-run safely. Without this, every
# re-run adds the same documents again and the search cell then receives
# positions that are out of range for `sentences`.
index.reset()

for sentence in sentences:
    embedding = model.encode([sentence])  # one-row 2-D array for the single input text
    index.add(embedding)  # vectors are numbered in insertion order, matching `sentences`

print("Our Index now has {} documents".format(index.ntotal))

Our Index now has 3 documents


Now that we have loaded each document into our index, we can go about searching semantically. Note that as our index only has 3 items, we will retrieve the whole index each time. In practice a FAISS index can hold billions of items and still work incredibly well. The values printed below are distances, so a smaller number means a closer match.

In [None]:
k = 3 # number of documents to retrieve per query

queries = ['lets learn about cats',
           'computer science',
           'Juliet misses you']

for query in queries:
    embedding = model.encode([query])
    # Both results have one row per query; we sent a single query, so use row 0.
    distances, indexes = index.search(embedding, k) # search
    for doc_position, distance in zip(indexes[0], distances[0]):
        if doc_position < 0:
            # FAISS pads the result with -1 when the index holds fewer than k
            # vectors; indexing `sentences` with it would print the wrong document.
            continue
        print("Document {} is {} far away from the query: {}".format(sentences[doc_position], distance, query))
    print()

Document The Cat is in the hat is 28.104284286499023 far away from the query: lets learn about cats
Document Romeo Romeo where are you is 45.666751861572266 far away from the query: lets learn about cats
Document CS410 is a computer science class focused on information retireval is 53.587955474853516 far away from the query: lets learn about cats

Document CS410 is a computer science class focused on information retireval is 32.00547409057617 far away from the query: computer science
Document The Cat is in the hat is 42.66061019897461 far away from the query: computer science
Document Romeo Romeo where are you is 48.577293395996094 far away from the query: computer science

Document Romeo Romeo where are you is 26.879138946533203 far away from the query: Juliet misses you
Document The Cat is in the hat is 49.631248474121094 far away from the query: Juliet misses you
Document CS410 is a computer science class focused on information retireval is 62.04816436767578 far away from the query: Juliet misses you

Using this same approach, we have gone ahead and created representations of the CS410 corpus using a few different language models. Feel free to add additional models if you wish.

In [None]:
import pickle
import numpy as np

In [None]:
# Models used to embed the corpus. Each name appears once: a repeated entry
# would re-encode the whole collection only to overwrite the same output file.
model_names = ['sentence-transformers/multi-qa-mpnet-base-dot-v1',
               'sentence-transformers/multi-qa-distilbert-cos-v1',
               'sentence-transformers/msmarco-distilbert-base-tas-b',
               'sentence-transformers/all-mpnet-base-v2',
               'sentence-transformers/sentence-t5-base',
               'sentence-transformers/all-distilroberta-v1',
               'sentence-transformers/msmarco-bert-base-dot-v5',
               'sentence-transformers/stsb-distilbert-base',
               'sentence-transformers/nq-distilbert-base-v1']

# Make sure the output folder exists before the (slow) encoding starts.
documents_dir = workspace + 'data/documents/'
os.makedirs(documents_dir, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `model` and `embeddings` names
# that the earlier example cells also use.
for model_name in dict.fromkeys(model_names):  # order-preserving guard against duplicates added later
    model = SentenceTransformer(model_name)
    embeddings = []
    with open(workspace + 'data/collection.jsonl', 'r', encoding='utf-8') as f:
        for l in f:
            if not l.strip():
                continue  # a blank line is not valid JSON; skip it instead of failing
            data = json.loads(l)
            # Encoded one document at a time, so each list entry is a one-row array.
            embedding = model.encode([data['contents']])
            embeddings.append(embedding)

    # The file name is 'documents' followed by the model name with '/' removed.
    with open(documents_dir + 'documents' + model_name.replace('/','') + '.pkl', 'wb') as f:
        pickle.dump(embeddings, f)  # store the object data to the file

In [None]:
# Models used to embed the queries. Each name appears once: a repeated entry
# would re-encode every query only to overwrite the same output file.
model_names = ['sentence-transformers/multi-qa-mpnet-base-dot-v1',
               'sentence-transformers/multi-qa-distilbert-cos-v1',
               'sentence-transformers/msmarco-distilbert-base-tas-b',
               'sentence-transformers/all-mpnet-base-v2',
               'sentence-transformers/sentence-t5-base',
               'sentence-transformers/all-distilroberta-v1',
               'sentence-transformers/msmarco-bert-base-dot-v5',
               'sentence-transformers/stsb-distilbert-base',
               'sentence-transformers/nq-distilbert-base-v1']

# Make sure the output folder exists before the encoding starts.
queries_dir = workspace + 'data/queries/'
os.makedirs(queries_dir, exist_ok=True)

# NOTE: this loop rebinds the notebook-level `model` and `embeddings` names
# that the earlier example cells also use.
for model_name in dict.fromkeys(model_names):  # order-preserving guard against duplicates added later
    model = SentenceTransformer(model_name)
    embeddings = []
    with open(workspace + 'data/queries.txt','r', encoding='utf-8') as f:
        for l in f:
            # Every line (even an empty one) yields exactly one embedding, so
            # entry i of the saved list corresponds to line i of queries.txt.
            l = l.strip()
            embedding = model.encode([l])
            embeddings.append(embedding)

    # The file name is 'queries' followed by the model name with '/' removed.
    with open(queries_dir + 'queries' + model_name.replace('/','') + '.pkl', 'wb') as f:
        pickle.dump(embeddings, f)  # store the object data to the file