In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

import pickle 
import time
import seaborn as sns

from nltk.tokenize import word_tokenize 

from scipy.spatial import distance

### The Dataset

The Different Attributes in the dataset are: 
- id : text 
- Artist : text 
- Album : text 
- Decade : text 
- Sales_total : number 
- Sales_US : number
- <b>Description : Text</b>

In [4]:
dataset = pd.read_csv('../data/preprocessed.csv') 
dataset.head(3)


Unnamed: 0,id,artist,album,decade,sales_total,sales_us,description,author
0,25-the-beatles-sgt-pepper-s-lonely-hearts-club...,The Beatles,Sgt. Pepper’s Lonely Hearts Club Band,60s,1800000,1100000,decade defined sweeping cultural paradigm shif...,Max Blau
1,24-pink-floyd-the-wall,Pink Floyd,The Wall,70s,1800000,1100000,legacy pink floyd cemented dark side moon . wa...,Clint Alwahab
2,23-michael-jackson-bad,Michael Jackson,Bad,80s,1900000,1000000,bizarre consider album five consecutive number...,Josh Jackson


In [5]:
description = list(dataset['description'])

### About Faiss 

Faiss using clustering techniques to map and group similar queries together. Using the embeddings created using sentence tranformers, the faiss index is created and stored. 

In [6]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [7]:
embeddings = model.encode(description, convert_to_tensor=True)
embeddings = embeddings.numpy()

In [8]:
import faiss
index = faiss.IndexFlatL2(embeddings.shape[1])
faiss.normalize_L2(embeddings)
index.add(embeddings)
faiss.write_index(index, '../data/music_album_similarity')


The code is implemented using the Index Flat L2, that measures the euclidean distance between all the points and the query that is given. 

In [9]:

''''
Calculating the similarity in queries for all the descriptions using Faiss 
'''

#Determines the cluster value to be used. 
top_3_queries = 3
similar_query = []
encoded_descriptions = [ model.encode([description[i]]) for i in range(len(description))] 
start = time.time()
for i in range(len(encoded_descriptions)):
    
    D,I = index.search(encoded_descriptions[i], top_3_queries)
    similar_query.append(I)

end = time.time()
print("Runtime for similar query finding using Faiss: ", (end-start), ' seconds')

Runtime for similar query finding using Faiss:  0.0003485679626464844  seconds


### Exploring for potential speedup in searching similar queries

While Flat Index L2, performs an exhaustive search in comparing the search query with every other query that is there in the dataset, it is possible to speed up this process by using IndexIVFFLat that converts the embeddings into Voronoi cells and compares the search queries only to the centroid of the cells. 

In [10]:
''' 
Using IndexIVFFlat in expectancy of faster results 
'''
no_of_cells = 2
new_index = faiss.IndexIVFFlat(index, embeddings.shape[1], no_of_cells)


Checking if the index is trained or not 

In [11]:
new_index.is_trained

False

In [12]:
new_index.train(embeddings)
print("Is the Index trained ? ", index.is_trained )
new_index.add(embeddings)
print("The total number of indexed embeddings: ", index.ntotal)

Is the Index trained ?  True
The total number of indexed embeddings:  2




Now the same search for similar queries is performed over the faiss flat embeddings 

In [13]:

''''
Calculating the similarity in queries for all the descriptions using Faiss 
'''

#Determines the cluster value to be used. 
top_3_queries = 3
similar_query = []
encoded_descriptions = [ model.encode([description[i]]) for i in range(len(description))] 
start = time.time()
for i in range(len(encoded_descriptions)):
    
    D,I = new_index.search(encoded_descriptions[i], top_3_queries)
    similar_query.append(I)

end = time.time()
print("Runtime for similar query finding using Faiss: ", (end-start), 'milliseconds')

Runtime for similar query finding using Faiss:  0.00032210350036621094 milliseconds


There was a speed up in the performance of sentence transformers by using Faiss