<a href="https://colab.research.google.com/github/ymoslem/Sentence-Similarity/blob/main/Semantic_Search_Faiss_Multilingual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multilingual Semantic Search with Faiss

This notebook demonstrates how to use Faiss and Sentence-Transformers to perform multilingual semantic search. We have a query in English ("education and schools in Germany") and we retrieve top results from documents in the Ukrainian language.

* Faiss: https://github.com/facebookresearch/faiss
* Faiss tutorial: https://www.pinecone.io/learn/series/faiss/faiss-tutorial/
* Sentence-Transformers: https://github.com/UKPLab/sentence-transformers
* Sentence-Transformers documentation: https://sbert.net/

* Notebooks for some other similarity techniques [here](https://github.com/ymoslem/Sentence-Similarity).

In [None]:
!pip3 install faiss-gpu sentence_transformers -q

In [None]:
# Download files
# Clone the repository whose output/ folder provides the JSON files read by the corpus-loading cell.
!git clone https://github.com/ymoslem/Notion-Scraper.git -q

# Change the kernel's working directory: later cells read the JSON files from "."
# and write the embeddings pickle there as well.
# NOTE: both commands are relative to the current directory, so re-running this cell
# acts from inside Notion-Scraper/output/ rather than from the original folder.
%cd Notion-Scraper/output/
!ls

In [None]:
import json
import os


def load_corpus(work_dir, json_files):
  """Build the search corpus from scraped JSON documents.

  Every file is expected to contain a list of items with "text", "topic" and
  "url" keys. Each item's text is split on newlines and only paragraphs with
  more than 10 whitespace-separated words are kept.

  Args:
    work_dir: Directory that contains the JSON files.
    json_files: Names of the ``*.json`` files (relative to ``work_dir``) to read.

  Returns:
    A list of unique ``(paragraph, source_name, topic, url)`` tuples in first-seen
    order, where ``source_name`` is the file name without its extension.
  """
  corpus = []
  # Set-based membership keeps de-duplication linear and also catches a
  # paragraph repeated inside the same item (assumes topic/url are hashable
  # JSON scalars such as strings).
  seen = set()

  for json_file in json_files:
    source_name = os.path.splitext(json_file)[0]
    # Explicit UTF-8: the documents are not ASCII and the platform default may differ
    with open(os.path.join(work_dir, json_file), encoding="utf-8") as json_input:
      json_content = json.load(json_input)

    for item in json_content:
      for para in item["text"].split("\n"):
        if len(para.split()) <= 10:
          continue  # too short to be a meaningful search unit
        record = (para, source_name, item["topic"], item["url"])
        if record not in seen:
          seen.add(record)
          corpus.append(record)

  return corpus


work_dir = "."
# Sorted so the corpus order (and therefore the corpus ids) is reproducible across runs
json_files = sorted(file_name for file_name in os.listdir(work_dir) if file_name.endswith(".json"))

corpus = load_corpus(work_dir, json_files)

corpus[0:5]

In [None]:
from sentence_transformers import SentenceTransformer

# The paragraph text is the first field of every corpus record
corpus_sentences = [item[0] for item in corpus]

# No explicit device: Sentence-Transformers uses a GPU when one is available and
# falls back to CPU otherwise, so this cell no longer fails on CPU-only runtimes.
embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Change the max length to 512
embedder.max_seq_length = 512

In [None]:
# Turn every corpus paragraph into an embedding vector.
# The result is requested as a NumPy array, which the index-building cell trains on.
corpus_embeddings = embedder.encode(
  corpus_sentences,
  show_progress_bar=True,
  convert_to_numpy=True,
)

In [None]:
# Persist the corpus records together with their embeddings so a later
# session can restore them instead of encoding everything again
import pickle

embeddings_payload = {"corpus": corpus, "embeddings": corpus_embeddings}

with open("corpus_embeddings_uk.pkl", "wb") as pickle_file:
  pickle.dump(embeddings_payload, pickle_file)

In [None]:
# Restore the corpus records and embeddings written by the save cell,
# skipping the encoding step.
# NOTE: unpickling can execute arbitrary code; only load a file this notebook produced.
import pickle

with open("corpus_embeddings_uk.pkl", "rb") as pickle_file:
  data = pickle.load(pickle_file)

corpus = data["corpus"]
corpus_sentences = [record[0] for record in corpus]
corpus_embeddings = data["embeddings"]

In [None]:
# Sanity check: expected to be (number of corpus paragraphs, embedding dimension)
corpus_embeddings.shape

In [None]:
import faiss

# Take the vector size from the data so the index always matches the encoder
# (384 for the model used in this notebook).
embedding_size = corpus_embeddings.shape[1]
n_clusters = 128
top_k_hits = 20

# Flat L2 index used as the coarse quantizer of the inverted-file (IVF) index
quantizer = faiss.IndexFlatL2(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters)

# Number of clusters to explore at search time.
# We will search for nearest neighbors in 8 clusters
index.nprobe = 8

### Create the FAISS index
print("Start creating FAISS index")

# Train the index to find a suitable clustering
index.train(corpus_embeddings)

# Add all embeddings to the index
index.add(corpus_embeddings)

print("Number of embeddings indexed:", index.ntotal)

In [None]:
from sentence_transformers import SentenceTransformer

queries = ["education and schools in Germany"]
# Must be the same model that produced corpus_embeddings, so that queries and
# paragraphs live in the same vector space
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

query_embeddings = model.encode(queries)

# Search in FAISS. It returns a matrix with distances and corpus ids.
distances, corpus_ids = index.search(query_embeddings, k=top_k_hits)

print(corpus_ids, "\n")

# FAISS pads the output with id -1 when the probed clusters hold fewer than k
# vectors. Drop those placeholders: used as a Python index, -1 would silently
# return the last corpus entry.
results = sorted(
  (distance, idx)
  for distance, idx in zip(distances.flatten(), corpus_ids.flatten())
  if idx != -1
)
print(results, "\n")

# Show each hit with its source file, topic and URL; a smaller distance means a closer match
for distance, idx in results:
  print(corpus_sentences[idx])
  print(f"Read more: {corpus[idx][1]} - {corpus[idx][2]}: {corpus[idx][3]}")
  print(f"Distance: {round(distance.item(), 2)}\n")

In [None]:
# Peek at the five closest hits as (distance, corpus id) pairs, smallest distance first
results[:5]

In [None]:
# Cross-encoder input: one (query, paragraph) pair per retrieved hit, kept in retrieval order
# [(query, paragraph), (query, paragraph), (query, paragraph), ...]

reranker_input = [
  (queries[0], corpus[corpus_id][0])
  for _distance, corpus_id in results
]
reranker_input[:5]

In [None]:
# [Optional] Reranking
# The retrieved top-k candidates are scored again, this time by a cross-encoder
# that reads the query and the paragraph together

from sentence_transformers import CrossEncoder

reranker_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
model = CrossEncoder(reranker_name, max_length=512)

reranker_scores = model.predict(reranker_input)

# Each row holds two values: label 0 = not relevant, label 1 = relevant
reranker_scores

In [None]:
# [Optional] Squash the raw logits into the (0, 1) range, which is easier to read
# and makes it possible to apply a threshold.
# NOTE: this overwrites reranker_scores with its own transform, so running the
# cell a second time applies the sigmoid again.

import numpy as np


def _sigmoid(logits):
  """Apply the logistic function element-wise."""
  return 1 / (1 + np.exp(-logits))


reranker_scores = [_sigmoid(score) for score in reranker_scores]
reranker_scores

In [None]:
# Attach the full corpus record (paragraph, source, topic, url) to every FAISS hit
full_hits = []
for hit_distance, hit_id in results:
  full_hits.append([hit_distance, corpus[hit_id]])

reranker_output = zip(reranker_scores, full_hits)

# Keep the label-1 ("relevant") score next to each record, then order best-first
scored_records = []
for label_scores, (_hit_distance, record) in reranker_output:
  scored_records.append((label_scores[1], record))

sorted_reranked_output = sorted(scored_records, reverse=True)

for relevance, (paragraph, source, topic, link) in sorted_reranked_output:
  print(f"{paragraph} \nRead more: {source} - {topic}: {link} \nScore: {round(relevance.item(), 2)}\n")

# Building the Faiss index on GPU

In [None]:
# Restore the corpus records and embeddings from the pickle file so the GPU
# section can start without re-encoding.
# NOTE: unpickling can execute arbitrary code; only load a file this notebook produced.
import pickle

with open("corpus_embeddings_uk.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

corpus = data["corpus"]
corpus_sentences = [record[0] for record in corpus]
corpus_embeddings = data["embeddings"]

In [None]:
import os

# CUDA configuration, applied in one go:
#  - CUDA_DEVICE_ORDER / CUDA_VISIBLE_DEVICES choose which GPU(s) are used
#    (set CUDA_VISIBLE_DEVICES to "0,1" for multiple GPUs)
#  - CUDA_LAUNCH_BLOCKING is for debugging CUDA errors
# NOTE(review): these settings are generally only honoured when set before CUDA
# is initialised in this process — restart the kernel if a GPU was already used.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "CUDA_LAUNCH_BLOCKING": "1",
})

In [None]:
# Print CUDA_VISIBLE_DEVICES from a shell to check the value set in the previous cell
!echo $CUDA_VISIBLE_DEVICES

In [None]:
# Single GPU
# Builds the IVF index on the CPU, copies it to GPU 0, then trains and fills it there.

import faiss

# Take the vector size from the data so the index always matches the encoder
# (384 for the model used in this notebook).
embedding_size = corpus_embeddings.shape[1]
n_clusters = 16
top_k_hits = 10

quantizer = faiss.IndexFlatL2(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters, faiss.METRIC_L2)

# Number of clusters to explore at search time.
# We will search for nearest neighbors in 8 clusters
index.nprobe = 8

ngpus = faiss.get_num_gpus()
print("Number of GPUs:", ngpus)
# Fail early with a readable message instead of an opaque error from the GPU transfer
if ngpus < 1:
    raise RuntimeError("No GPU is visible to Faiss; check the runtime type and CUDA_VISIBLE_DEVICES.")

res = faiss.StandardGpuResources()  # use a single GPU
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
print("Now the index in on GPU.")

# Train the index to find a suitable clustering
assert not gpu_index_flat.is_trained
gpu_index_flat.train(corpus_embeddings)
assert gpu_index_flat.is_trained
print("Training complete!")

gpu_index_flat.add(corpus_embeddings)  # add vectors to the index
print(gpu_index_flat.ntotal, "added.")

In [None]:
# Multiple GPUs
# Builds the IVF index on the CPU, copies it to every visible GPU, then trains and fills it there.

import faiss

# Take the vector size from the data so the index always matches the encoder
# (384 for the model used in this notebook).
embedding_size = corpus_embeddings.shape[1]
n_clusters = 64
top_k_hits = 10


quantizer = faiss.IndexFlatL2(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters)

# Number of clusters to explore at search time.
# We will search for nearest neighbors in 8 clusters
index.nprobe = 8


print("Moving index to gpu before training")

ngpus = faiss.get_num_gpus()
print("Number of GPUs:", ngpus)
# Fail early with a readable message instead of an opaque error from the GPU transfer
if ngpus < 1:
    raise RuntimeError("No GPU is visible to Faiss; check the runtime type and CUDA_VISIBLE_DEVICES.")

gpu_index_flat = faiss.index_cpu_to_all_gpus(index)
print("Now the index in on GPU.")

# Train the index to find a suitable clustering
assert not gpu_index_flat.is_trained
gpu_index_flat.train(corpus_embeddings)
assert gpu_index_flat.is_trained
print("Training complete!")

# Add vectors to the index
gpu_index_flat.add(corpus_embeddings)
print(gpu_index_flat.ntotal, "added.")