<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Retrieve-Fuzzy-Matches-Faiss-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create a bilingual dataset with fuzzy matches

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

# Load files

In [None]:
import os

# Dataset root (a Google Drive path inside Colab) and the sub-folder that
# holds the Spanish data
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# Work inside the dataset folder so the following cells can open files by
# their bare names; the last line displays the resulting working directory
os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [None]:
# Show the line count of every file in the working directory whose name starts with "all"
!wc -l all*

In [None]:
# Test files (FUZZY): source (es) first, target (en) second
source_file_name, target_file_name = (
    "all-filtered.es.fuzzy.test",
    "all-filtered.en.fuzzy.test",
)

# Apply the same steps to the training datasets

In [None]:
# Read the two files line by line; the same line number in both files is used
# later as a (source, target) pair. Decode explicitly as UTF-8 so accented
# characters do not depend on the runtime's default locale.
with open(source_file_name, encoding="utf-8") as source, \
     open(target_file_name, encoding="utf-8") as target:
  source_sentences = [sent.strip() for sent in source]
  target_sentences = [sent.strip() for sent in target]

# Files of different length would silently pair sentences with the wrong lines
assert len(source_sentences) == len(target_sentences), (
    f"Line counts differ: {len(source_sentences)} source vs "
    f"{len(target_sentences)} target")

# Show the first pair as a quick sanity check
print(source_sentences[0])
print(target_sentences[0])

# Indexing the dataset with Faiss

In [None]:
!pip3 install faiss-cpu sentence_transformers &> /dev/null

In [None]:
# Name of the model used to create the embeddings
# Make sure the model works for the required language

model_name = "microsoft/Multilingual-MiniLM-L12-H384"

# Other model options (uncomment one to use it instead)
# model_name = "paraphrase-multilingual-MiniLM-L12-v2"  # multilingual
# model_name = "all-MiniLM-L6-v2"  # English

In [None]:
import os
from google.colab import userdata

# Read the value stored in the Colab user data under the key "shared_drive"
# NOTE(review): presumably the path of a shared drive - confirm
shared_drive = userdata.get("shared_drive")

# Folder later passed to SentenceTransformer as its cache_folder
model_directory = os.path.join(shared_drive, "models")

# Switch to the dataset directory again and display it
os.chdir(directory)
os.getcwd()

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model, caching its files in model_directory.
# device="cuda" means this cell needs a GPU runtime.
embedder = SentenceTransformer(model_name,
                               cache_folder=model_directory,
                               device="cuda")

# change the max length to 512
# (queries must later be encoded with the same setting to stay comparable)
embedder.max_seq_length = 512

In [None]:
# Encode every source sentence into an embedding.
# The result is a NumPy array (convert_to_numpy=True) with one row per
# sentence; a progress bar is shown while encoding.

corpus_embeddings = embedder.encode(source_sentences,
                                    convert_to_numpy=True,
                                    show_progress_bar=True)

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

## Saving / Loading embeddings

In [None]:
# Save the embeddings for the TEST set
# (name of the pickle file that holds the sentences and their embeddings)
pkl_file_name = "medical-testset-embeddings-MS-Multilingual-MiniLM-L12-H384-spanish.pkl"

In [None]:
# Save corpus_embeddings to a file to be able to load later
import pickle

# One dictionary keeps the source sentences ("corpus"), the target sentences
# ("target") and the embeddings together, so they stay aligned when reloaded
with open(pkl_file_name, "wb") as embeddings_pkl:
  pickle.dump({"corpus": source_sentences,
               "target": target_sentences,
               "embeddings": corpus_embeddings,
               },
              embeddings_pkl)

In [None]:
# Restore the sentences and embeddings from the pickle file instead of
# encoding the corpus from scratch
import pickle

with open(pkl_file_name, "rb") as embeddings_pkl:
  data = pickle.load(embeddings_pkl)

# Unpack after the file is closed; the keys are those written when saving
source_sentences, target_sentences, corpus_embeddings = (
    data["corpus"],
    data["target"],
    data["embeddings"],
)

In [None]:
# Sanity check: (number of sentences, embedding size)
corpus_embeddings.shape

(50000, 384)

## Train a Faiss index

In [None]:
import faiss

# Embedding size: read it from the data so the index always matches the model
# that produced `corpus_embeddings` (384 for the embeddings created here)
embedding_size = corpus_embeddings.shape[1]

# Number of clusters used for faiss
# 4*sqrt(N) to 16*sqrt(N) where N is the size of the dataset
# NOTE(review): for the 50,000-sentence test set this rule gives roughly
# 900 to 3,600 clusters, so 4096 is above the range. It is kept unchanged
# because the saved index file name encodes it.
n_clusters = 4096

# The flat L2 index acts as the coarse quantizer that assigns vectors to clusters
quantizer = faiss.IndexFlatL2(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, n_clusters)

# Number of clusters to explore at search time.
# We will search for nearest neighbors in 32 clusters
index.nprobe = 32

### Create the FAISS index
print("Start creating FAISS index")

# Train the index to find a suitable clustering
index.train(corpus_embeddings)

# Add all embeddings to the index
index.add(corpus_embeddings)

print("Number of embeddings indexed:", index.ntotal)

Start creating FAISS index
Number of embeddings indexed: 50000


In [None]:
# Saving the index for the TEST set
# (the file name records the index type, the number of clusters and the model)
index_file_name = "medical-testset-embeddings-IndexIVFFlat-4096-MS-Multilingual-MiniLM-L12-H384-spanish.index"

# Write the trained and populated index to disk so it can be reloaded later
faiss.write_index(index, index_file_name)

## Search

In [None]:
from sentence_transformers import SentenceTransformer

# Number of nearest neighbours to retrieve per query
top_k_hits = 10


queries = ["Niños y adolescentes No se recomienda el uso de Telmisartán Teva en niños y adolescentes hasta 18 años."]


# Use the same model you used for embedding the dataset
model_name = "microsoft/Multilingual-MiniLM-L12-H384"

# model_name = "paraphrase-multilingual-MiniLM-L12-v2"  # multilingual
# model_name = "all-MiniLM-L6-v2"  # English

model = SentenceTransformer(model_name,
                            cache_folder="/content/drive/MyDrive/models",
                            device="cuda")

# Encode queries with the same maximum length as the corpus embeddings;
# otherwise long queries could be truncated differently from the corpus
model.max_seq_length = 512

query_embeddings = model.encode(queries)

# Search in FAISS. It returns a matrix with distances and corpus ids.
distances, corpus_ids = index.search(query_embeddings,
                                     k=top_k_hits)

print(corpus_ids, "\n")

# FAISS pads the result with id -1 when it finds fewer than k neighbours.
# Drop those entries: in Python, index -1 would silently select the LAST
# sentence of the corpus and show it as if it were a match.
results = sorted((distance, idx)
                 for distance, idx in zip(distances.flatten(), corpus_ids.flatten())
                 if idx != -1)
print(results, "\n")

print(queries[0], "\n")

# Print the retrieved sentences from the closest to the farthest
for distance, idx in results:
  print(source_sentences[idx])
  print(f"Distance: {round(distance.item(), 2)}\n")

# Use saved index

In [None]:
!pip3 install faiss-cpu sentence_transformers &> /dev/null

In [None]:
import os

# Dataset root (a Google Drive path inside Colab) and the sub-folder that
# holds the Spanish data
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# Work inside the dataset folder so the saved index, the pickle file and the
# test files can be opened by their bare names
os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [None]:
# Load the index
import faiss

# Load the saved index for the test set
# (same file name as the one used when the index was written)
saved_index = "medical-testset-embeddings-IndexIVFFlat-4096-MS-Multilingual-MiniLM-L12-H384-spanish.index"

index = faiss.read_index(saved_index)

In [None]:
# Load the data (and embeddings - not required for search)
import pickle

# Load the embeddings for the test set
pkl_file_name = "medical-testset-embeddings-MS-Multilingual-MiniLM-L12-H384-spanish.pkl"

with open(pkl_file_name, "rb") as embeddings_pkl:
  data = pickle.load(embeddings_pkl)

# Unpack after the file is closed: the sentences are needed to turn the ids
# returned by the index back into text
source_sentences, target_sentences, corpus_embeddings = (
    data["corpus"],
    data["target"],
    data["embeddings"],
)

In [None]:
# Test files (REAL): source (es) first, target (en) second
source_file_name, target_file_name = (
    "all-filtered.es.real.test",
    "all-filtered.en.real.test",
)

In [None]:
# Read the sentences that will be used as queries, together with their
# translations. Decode explicitly as UTF-8 so accented characters do not
# depend on the runtime's default locale.
with open(source_file_name, encoding="utf-8") as source, \
     open(target_file_name, encoding="utf-8") as target:
  online_source_sentences = [sent.strip() for sent in source]
  online_target_sentences = [sent.strip() for sent in target]

# Files of different length would silently pair sentences with the wrong lines
assert len(online_source_sentences) == len(online_target_sentences), (
    f"Line counts differ: {len(online_source_sentences)} source vs "
    f"{len(online_target_sentences)} target")

# Show the first pair as a quick sanity check
print(online_source_sentences[0])
print(online_target_sentences[0])

El consumo nocivo de alcohol es responsable por cerca de 3% de todas las muertes que ocurren en el planeta, incluyendo desde cirrosis y cáncer hepático hasta accidentes, caídas, intoxicaciones y homicidios.
The harmful use of alcohol is responsible for about 3% of all deaths that occur on the planet, ranging from liver cancer and cirrhosis to accidents, falls, poisoning and murder.


In [None]:
# Must be the same model that produced the embeddings stored in the index
model_name = "microsoft/Multilingual-MiniLM-L12-H384"

In [None]:
# TEST - Find fuzzies
# Retrieve and print the closest indexed sentence pairs for the first few
# queries, as a preview before processing the whole file.

from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

top_k_hits = 3  # it can be more or less; the results can be 'reranked' later

# Number of clusters to explore at search time
index.nprobe = 32

queries = online_source_sentences[:5]
queries_len = len(queries)

model = SentenceTransformer(model_name,
                            cache_folder="/content/drive/Shareddrives/adapt-yasmin/models/",
                            device="cuda")

# Encode queries with the same maximum length (512) that was set when the
# indexed corpus was embedded, so both sides are truncated the same way
model.max_seq_length = 512

query_embeddings = model.encode(queries)

# Search in FAISS for each query
distances, corpus_ids = index.search(query_embeddings,
                                     k=top_k_hits)

# Process results for each query
for query_idx, (dist, ids) in tqdm(enumerate(zip(distances, corpus_ids)), total=queries_len):
  print(f"\nQuery {query_idx + 1}: {queries[query_idx]}")

  # FAISS pads the result with id -1 when it finds fewer than k neighbours.
  # Drop those entries: index -1 would silently select the LAST sentence.
  results = sorted((distance, idx)
                   for distance, idx in zip(dist, ids)
                   if idx != -1)

  # Store the results of the current query: (distance, source, target)
  result_rows = [(distance, source_sentences[idx], target_sentences[idx])
                 for distance, idx in results]

  print(*result_rows, sep="\n")

In [None]:
# File that receives one line per query with its retrieved fuzzy match
output_file_name = "all-filtered.esen.ms-multi-12.online.test"

In [None]:
# Find fuzzies and save to file
# We will use top_k_hits 1 to get only one result
# If more top_k_hits retrieved, reranking can be used to get the best result
#
# Output format, one line per query and in the same order as the queries:
#   score ||| fuzzy source ||| new source ||| fuzzy target

from sentence_transformers import SentenceTransformer

top_k_hits = 1

# Number of clusters to explore at search time
index.nprobe = 32

queries = online_source_sentences
queries_len = len(queries)

model = SentenceTransformer(model_name,
                            cache_folder="/content/drive/Shareddrives/adapt-yasmin/models/",
                            device="cuda")

# Encode queries with the same maximum length (512) that was set when the
# indexed corpus was embedded, so both sides are truncated the same way
model.max_seq_length = 512

query_embeddings = model.encode(queries)

# Search in FAISS for each query
distances, corpus_ids = index.search(query_embeddings,
                                     k=top_k_hits)

# FAISS pads the result with id -1 when the probed clusters hold fewer than k
# vectors. Index -1 would silently select the LAST corpus sentence, so repeat
# the search for the affected queries over all clusters (exhaustive search).
no_hit = (corpus_ids == -1).any(axis=1)
if no_hit.any():
  index.nprobe = index.nlist
  retry_distances, retry_ids = index.search(query_embeddings[no_hit],
                                            k=top_k_hits)
  distances[no_hit] = retry_distances
  corpus_ids[no_hit] = retry_ids
  index.nprobe = 32

# Process results for each query and save to file
# UTF-8 is set explicitly so accented characters are written consistently
with open(output_file_name, "w", encoding="utf-8") as output_file:
  for query_idx, (dist, ids) in enumerate(zip(distances, corpus_ids)):
    # Best hit = smallest distance (padded entries carry a huge distance)
    score, idx = min(zip(dist, ids))

    new_src_sent = queries[query_idx]
    fuzzy_src_sent = source_sentences[idx]
    fuzzy_tgt_sent = target_sentences[idx]

    # Save the output to file
    output = f"{score} ||| {fuzzy_src_sent} ||| {new_src_sent} ||| {fuzzy_tgt_sent}"
    output_file.write(output + "\n")

    # Check the output for the first few segments
    if query_idx < 3:
      print(f"\nQuery {query_idx}: {queries[query_idx]}")
      print(output)

In [None]:
# Show the first 5 lines of the output file ($output_file_name is filled in from the Python variable)
!head -n 5 $output_file_name