In [1]:
from sentence_transformers import evaluation, SentenceTransformer

## Load models

In [2]:
# Loaded by name; the evaluation cells below report it under the label "stsb-xlm-r-multilingual".
model = SentenceTransformer("stsb-xlm-r-multilingual")

In [3]:
# Loaded by name; the evaluation cells below report it under the label "LaBSE".
model_labse = SentenceTransformer("LaBSE")

In [22]:
# The recorded run downloaded the weights on first load (progress bar shows 1.01G).
model_para_xlm_r = SentenceTransformer("paraphrase-xlm-r-multilingual-v1")

  0%|          | 0.00/1.01G [00:00<?, ?B/s]

In [45]:
# The recorded run downloaded 83.4M and warned that this model was created with
# version 1.2.0 while version 1.1.1 was installed.
model_para_minilm_l6 = SentenceTransformer('paraphrase-MiniLM-L6-v2')

  0%|          | 0.00/83.4M [00:00<?, ?B/s]

You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





## Get data

In [4]:
# Read the translated STS test file: one tab-separated record per non-empty line.
data_path = "/Users/clementow/repos/translate_selenium/data/sts-benchmark/sts-test-translate.csv"

# Decode explicitly as UTF-8: the file contains non-ASCII text and the platform
# default encoding is not guaranteed to be UTF-8.
with open(data_path, encoding="utf-8") as fopen:
    dataset = [line for line in fopen.read().split('\n') if line]
len(dataset)

1379

In [5]:
# Inspect one record: field 5 and field 6 hold the two sentences, field 4 the score.
sample_fields = dataset[2].split('\t')
display(sample_fields[5])
display(sample_fields[6])
sample_fields[4]

"One woman is measuring another woman's ankle."

'Seorang wanita mengukur buku lali wanita lain.'

'5.000'

In [62]:
# Split every record once, then pull out the three columns as parallel lists.
rows = [line.split('\t') for line in dataset]
sent1 = [row[5] for row in rows]
sent2 = [row[6] for row in rows]
scores = [row[4] for row in rows]  # kept as strings, exactly as read from the file

In [7]:
# The three lists are filled in lockstep, one entry per record, so the lengths should be equal.
display(len(sent1), len(sent2), len(scores))

1379

1379

1379

In [9]:
def prep_stsbenchmark_dataset(data_path="/Users/clementow/repos/translate_selenium/data/sts-benchmark/sts-test-translate.csv"):
    """
    Read a tab-separated STS benchmark file into three parallel lists.

    Every non-blank line is split on tab characters. Field 5 is taken as the
    first sentence, field 6 as the second sentence and field 4 as the
    similarity score (0-based field indices).

    :param data_path: Path of the tab-separated file to read (decoded as UTF-8)
    :return: Tuple ``(sent1, sent2, scores)``; scores are returned as the
        strings found in the file, not converted to numbers
    :raises IndexError: If a non-blank line has fewer than seven fields
    """
    sent1 = []
    sent2 = []
    scores = []

    # Decode explicitly as UTF-8 so non-ASCII sentences are read the same way
    # regardless of the platform's default encoding.
    with open(data_path, encoding="utf-8") as fopen:
        lines = fopen.read().split('\n')

    for line in lines:
        # Skip empty and whitespace-only lines (e.g. trailing blank lines)
        # instead of failing on them with an IndexError.
        if not line.strip():
            continue
        fields = line.split('\t')
        sent1.append(fields[5])
        sent2.append(fields[6])
        scores.append(fields[4])
    return sent1, sent2, scores

In [57]:
# Load the original English STS test file with the helper defined above.
# NOTE(review): `data_path` is reused and now points at a different file than in the first read cell.
data_path = "/Users/clementow/repos/translate_selenium/data/sts-benchmark/sts-test.csv"

sent1_en, sent2_en, scores_en = prep_stsbenchmark_dataset(data_path)

In [58]:
# Sizes of the three lists returned for the English file.
len(sent1_en), len(sent2_en), len(scores_en)

(1379, 1379, 1379)

In [59]:
# Sizes of the lists built from the translated file; EmbeddingSimilarityEval asserts that all three are equal.
len(sent1), len(sent2), len(scores)

(6895, 1379, 1379)

In [63]:
idx = len(sent1)

sent1.extend(sent2)
display(idx)
len(sent1[:idx])

1379

1379

In [65]:
sent1[idx:][0]

'Seorang gadis memberus rambutnya.'

## Eval

In principle we could use the library's built-in evaluator shown below, but it is mainly meant to be used during training, where it works with the current epoch and step numbers:
```python
sts_evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1=sent1, sentences2=sent2, scores=scores)

# Cannot use this, as it is only called during training.
# What I want is just a direct evaluation once the embeddings have been computed.
sts_evaluator.__call__(model) 
```

In [68]:
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sentence_transformers import SentenceTransformer
from typing import List
from enum import Enum
import numpy as np
import csv
import os


class SimilarityFunction(Enum):
    """Identifiers for the similarity measure an evaluation should report on."""
    # In this notebook only COSINE is acted on by EmbeddingSimilarityEval.run_eval;
    # the other members are accepted but trigger no output there.
    COSINE = 0
    EUCLIDEAN = 1
    MANHATTAN = 2
    DOT_PRODUCT = 3

class EmbeddingSimilarityEval:
    """
    Evaluate a sentence-embedding model on sentence pairs with gold similarity scores.

    Both sentence lists are encoded with the model, the cosine similarity of each
    pair is computed, and its Pearson and Spearman correlations with the gold
    scores are derived. The correlations are optionally appended to a CSV file
    and are printed when ``main_similarity`` is ``SimilarityFunction.COSINE``.
    """

    def __init__(
        self,
        model: SentenceTransformer,
        sentences1: List[str],
        sentences2: List[str],
        scores: List[float],
        batch_size: int = 16,
        main_similarity: SimilarityFunction = None,
        name: str = '',
        show_progress_bar: bool = False,
        write_csv: bool = True,
    ):
        """
        Constructs an evaluator for the dataset

        The labels need to indicate the similarity between the sentences.

        :param model: Model that you want to evaluate with
        :param sentences1: List with the first sentence in a pair
        :param sentences2: List with the second sentence in a pair
        :param scores: Similarity score between sentences1[i] and sentences2[i];
            each entry is converted with ``float()``, so numeric strings are accepted
        :param batch_size: Batch size passed to ``model.encode``
        :param main_similarity: Similarity function to report; only
            ``SimilarityFunction.COSINE`` produces printed output
        :param name: Optional label inserted into the CSV file name
        :param show_progress_bar: Show a progress bar while encoding. ``None``
            enables it when the logger's effective level is INFO or DEBUG
        :param write_csv: Write results to a CSV file
        """
        self.model = model
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.scores = [float(i) for i in scores]
        self.write_csv = write_csv

        # isinstance also rejects None and, unlike an exact type check, accepts subclasses.
        assert isinstance(model, SentenceTransformer), "model must be a SentenceTransformer"
        assert len(self.sentences1) == len(self.sentences2), "sentences1 and sentences2 differ in length"
        assert len(self.sentences1) == len(self.scores), "sentences1 and scores differ in length"

        self.main_similarity = main_similarity
        self.name = name

        self.batch_size = batch_size
        if show_progress_bar is None:
            # Imported here because the surrounding file does not import logging;
            # the original referenced an undefined `logger` and raised NameError.
            import logging
            level = logging.getLogger(__name__).getEffectiveLevel()
            show_progress_bar = level in (logging.INFO, logging.DEBUG)
        self.show_progress_bar = show_progress_bar

        self.csv_file = "similarity_evaluation"+("_"+name if name else '')+"_results.csv"
        self.csv_headers = ["cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"]

    def encode_embeddings(self):
        """
        Encode all sentences in a single ``model.encode`` call.

        The two sentence lists are concatenated (and kept on ``self.sentences``),
        encoded together, and the result is split back at the original boundary.

        :return: Tuple ``(embeddings1, embeddings2)`` for sentences1 and sentences2
        """
        # Remember where sentences1 ends so the result can be split again.
        sent1_end_idx = len(self.sentences1)
        self.sentences = list(self.sentences1) + list(self.sentences2)
        embeddings = self.model.encode(
            self.sentences,
            batch_size=self.batch_size,  # honour the configured batch size
            convert_to_numpy=True,
            show_progress_bar=self.show_progress_bar,
        )
        return embeddings[:sent1_end_idx], embeddings[sent1_end_idx:]

    def run_eval(self, output_path: str = None):
        """
        Compute the cosine-similarity correlations and report them.

        :param output_path: Directory for the CSV file. When it is not None and
            ``write_csv`` is set, one result row is appended to ``self.csv_file``
            in that directory (a header row is written first if the file does
            not exist yet). An empty string resolves to the current directory.
        :return: None
        """
        embeddings1, embeddings2 = self.encode_embeddings()
        labels = self.scores

        # Turn the paired cosine distances back into similarities.
        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            # newline='' lets the csv writer control line endings itself
            # (avoids blank rows between records on Windows).
            with open(csv_path, mode="a" if output_file_exists else 'w', newline='', encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                # Only the cosine metrics are computed; the remaining columns are
                # left blank so every row lines up with csv_headers.
                blanks = [''] * (len(self.csv_headers) - 2)
                writer.writerow([eval_pearson_cosine, eval_spearman_cosine] + blanks)

        if self.main_similarity == SimilarityFunction.COSINE:
            print("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: {:.4f}".format(
                eval_pearson_cosine, eval_spearman_cosine))
    
    


In [69]:

# English baseline: use the scores that were loaded together with sent1_en/sent2_en,
# not the ones read from the translated file.
sts_eval = EmbeddingSimilarityEval(
    model_para_minilm_l6,
    sent1_en,
    sent2_en,
    scores_en,
    main_similarity=SimilarityFunction.COSINE,
    name="paraphrase-MiniLM-L6-v2",
    show_progress_bar=True,
    write_csv=True,
)
sts_eval.run_eval(output_path='')

Batches:   0%|          | 0/87 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.8361	Spearman: 0.8412


**Sanity check:** the Spearman correlation above matches the score reported at https://www.sbert.net/docs/pretrained_models.html

In [47]:
# Same model, scored on the sentence pairs read from the translated file.
sts_eval = EmbeddingSimilarityEval(
    model=model_para_minilm_l6,
    sentences1=sent1,
    sentences2=sent2,
    scores=scores,
    main_similarity=SimilarityFunction.COSINE,
    name="paraphrase-MiniLM-L6-v2 with Malay",
    show_progress_bar=True,
    write_csv=True,
)
sts_eval.run_eval(output_path='')

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.2100	Spearman: 0.1696


In [27]:
# English baseline, with the result row appended to a CSV file in the working directory.
# Uses scores_en so that sentences and labels come from the same file.
sts_eval = EmbeddingSimilarityEval(
    model,
    sent1_en,
    sent2_en,
    scores_en,
    main_similarity=SimilarityFunction.COSINE,
    name="stsb-xlm-r-multilingual",
    show_progress_bar=True,
    write_csv=True,
)
sts_eval.run_eval(output_path='')

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.8379	Spearman: 0.8504


In [18]:
# English baseline, printed only (no output path, so no CSV row is written).
# Uses scores_en so that sentences and labels come from the same file.
sts_eval = EmbeddingSimilarityEval(
    model,
    sent1_en,
    sent2_en,
    scores_en,
    main_similarity=SimilarityFunction.COSINE,
    name="stsb-xlm-r-multilingual",
    show_progress_bar=True,
)
sts_eval.run_eval()

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.8379	Spearman: 0.8504


In [15]:
# Same model, scored on the sentence pairs read from the translated file.
sts_eval = EmbeddingSimilarityEval(
    model=model,
    sentences1=sent1,
    sentences2=sent2,
    scores=scores,
    main_similarity=SimilarityFunction.COSINE,
    name="stsb-xlm-r-multilingual with Malay data",
    show_progress_bar=True,
)
sts_eval.run_eval()

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.6067	Spearman: 0.6337


In [23]:
# English baseline: pair the English sentences with the scores loaded from the same file.
sts_eval = EmbeddingSimilarityEval(
    model_para_xlm_r,
    sent1_en,
    sent2_en,
    scores_en,
    main_similarity=SimilarityFunction.COSINE,
    name="Paraphrase XML-R",
    show_progress_bar=True,
)
sts_eval.run_eval()

# Same model, scored on the sentence pairs read from the translated file.
sts_eval = EmbeddingSimilarityEval(
    model_para_xlm_r,
    sent1,
    sent2,
    scores,
    main_similarity=SimilarityFunction.COSINE,
    name="Paraphrase XML-R with Malay data",
    show_progress_bar=True,
)
sts_eval.run_eval()

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.8355	Spearman: 0.8350


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.5571	Spearman: 0.6027


In [20]:
# English baseline: pair the English sentences with the scores loaded from the same file.
sts_eval = EmbeddingSimilarityEval(
    model_labse,
    sent1_en,
    sent2_en,
    scores_en,
    main_similarity=SimilarityFunction.COSINE,
    name="LaBSE",
    show_progress_bar=True,
)
sts_eval.run_eval()

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.7269	Spearman: 0.7225


In [21]:
# Same model, scored on the sentence pairs read from the translated file.
sts_eval = EmbeddingSimilarityEval(
    model=model_labse,
    sentences1=sent1,
    sentences2=sent2,
    scores=scores,
    main_similarity=SimilarityFunction.COSINE,
    name="LaBSE with Malay data",
    show_progress_bar=True,
)
sts_eval.run_eval()

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Cosine-Similarity :	Pearson: 0.4690	Spearman: 0.5319
