In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local .py files are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr


def cos_sim(a, b):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    Args:
        a: First vector (any input accepted by ``np.dot`` and
            ``np.linalg.norm``, e.g. a list or 1-D array).
        b: Second vector, same length as ``a``.

    Returns:
        The similarity value, nominally in ``[-1, 1]`` for 1-D inputs.

    Note:
        No guard is applied for zero-length vectors: a zero norm makes the
        denominator zero, so the result is not a meaningful similarity.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def compare_embeddings(embedding_one, embedding_two):
    """Compute cosine similarities between embeddings or batches of embeddings.

    An argument is treated as a *batch* when its first element is itself a
    list, tuple or ``np.ndarray``; otherwise it is treated as a single vector.

    Supported layouts:

    * batch vs. batch (equal length): pairs are compared position by position.
    * single vs. batch: ``embedding_one`` is compared with every entry of
      ``embedding_two``.
    * single vs. single: the two vectors are compared directly.

    Args:
        embedding_one: A single embedding or a batch of embeddings.
        embedding_two: A single embedding or a batch of embeddings.

    Returns:
        ``np.ndarray`` of similarities for the two batch layouts (one value
        per pair / per entry of ``embedding_two``), or a single similarity
        value for the single-vs-single layout.

    Raises:
        RuntimeError: If both arguments are batches of different length, or
            if only ``embedding_one`` is a batch.
    """
    # tuples are accepted too, so a batch given as a list of tuples is not
    # mistaken for a single vector
    batch_types = (list, tuple, np.ndarray)
    emb_one_is_list = isinstance(embedding_one[0], batch_types)
    emb_two_is_list = isinstance(embedding_two[0], batch_types)

    # pairwise compare all embeddings
    if emb_one_is_list and emb_two_is_list:
        if len(embedding_one) != len(embedding_two):
            raise RuntimeError(
                "embedding one and embedding two lists not of equal length"
            )
        return np.array([
            cos_sim(e1, e2)
            for e1, e2 in zip(embedding_one, embedding_two)
        ])

    # embedding one compared against all of embedding two
    if emb_two_is_list:
        # return an array (not a list) so both batch layouts have the same
        # return type and the result supports element-wise arithmetic
        return np.array([cos_sim(embedding_one, e2) for e2 in embedding_two])

    # compare embedding one against embedding two
    if not emb_one_is_list:
        return cos_sim(embedding_one, embedding_two)

    # embedding one is a list of embeddings but embedding two is one embedding
    raise RuntimeError(
        "Either Embedding one and two are lists of embeddings, else only embedding two."
    )


def rescale_numeric(cos_dist, cur_min=-1, cur_max=1, new_min=0, new_max=5):
    """Linearly map a measurement from one numeric range onto another.

    ``cur_min`` maps to ``new_min`` and ``cur_max`` maps to ``new_max``; values
    in between are interpolated linearly. With the defaults, a value on the
    ``[-1, 1]`` scale is mapped onto ``[0, 5]``.

    Args:
        cos_dist: Scalar, ``np.ndarray`` (or other object supporting
            element-wise arithmetic), or a plain list/tuple of numbers.
        cur_min: Lower bound of the current scale.
        cur_max: Upper bound of the current scale (must differ from
            ``cur_min``).
        new_min: Lower bound of the target scale.
        new_max: Upper bound of the target scale.

    Returns:
        The rescaled value(s); lists/tuples are returned as ``np.ndarray``.
    """
    # plain Python sequences do not support element-wise arithmetic
    if isinstance(cos_dist, (list, tuple)):
        cos_dist = np.asarray(cos_dist, dtype=float)

    # fraction of the way along the current scale
    cur_perc = (cos_dist - cur_min) / (cur_max - cur_min)

    # stretch by the width of the new range, then shift to its lower bound;
    # the offset must be added after the multiplication, not folded into it
    return cur_perc * (new_max - new_min) + new_min


def get_embedding_distances(s1_embeddings, s2_embeddings, scale=True):
    """Compare two sets of embeddings and optionally rescale the result.

    Args:
        s1_embeddings: First embedding(s), passed to ``compare_embeddings``.
        s2_embeddings: Second embedding(s), passed to ``compare_embeddings``.
        scale: When truthy, the similarities are passed through
            ``rescale_numeric`` with its default ranges; when falsy, the raw
            cosine similarities are returned.

    Returns:
        Whatever ``compare_embeddings`` returns, rescaled if requested.
    """
    similarities = compare_embeddings(s1_embeddings, s2_embeddings)
    return rescale_numeric(similarities) if scale else similarities


# todo: proper benchmark
def eval_embedding_correlation(distances, truth_vector):
    """Correlate predicted distances with reference scores.

    Runs ``pearsonr`` and then ``spearmanr`` on the two vectors.

    Args:
        distances: Predicted values, one per sample.
        truth_vector: Reference values, same length as ``distances``.

    Returns:
        ``pd.DataFrame`` with columns ``type``, ``corr`` and ``pval`` and two
        rows: ``"pearson"`` followed by ``"spearman"``.
    """
    labels, coefficients, p_values = [], [], []

    # keep this order: it fixes the row order of the resulting frame
    for label, correlate in (("pearson", pearsonr), ("spearman", spearmanr)):
        coefficient, p_value = correlate(distances, truth_vector)
        labels.append(label)
        coefficients.append(coefficient)
        p_values.append(p_value)

    return pd.DataFrame({
        "type": labels,
        "corr": coefficients,
        "pval": p_values,
    })

In [5]:
def benchmark_model(sts_df, encode_func):
    """Score an encoder against the reference similarity scores in ``sts_df``.

    Args:
        sts_df: Object exposing ``sent1``, ``sent2`` and ``score`` attributes,
            each with a ``.values`` array (e.g. a DataFrame with those
            columns).
        encode_func: Callable that turns an array of sentences into
            embeddings; it is called once for ``sent1`` and once for
            ``sent2``.

    Returns:
        The correlation table produced by ``eval_embedding_correlation`` for
        the (rescaled) embedding similarities versus ``sts_df.score``.
    """
    first_vectors = encode_func(sts_df.sent1.values)
    second_vectors = encode_func(sts_df.sent2.values)
    reference_scores = sts_df.score.values

    predicted_scores = get_embedding_distances(first_vectors, second_vectors)
    return eval_embedding_correlation(predicted_scores, reference_scores)

# benchmark_model(train_sentences, model.encode)

In [19]:
import os
import gzip
import csv

from sentence_transformers import util
from sentence_transformers.readers import *


def cache_sts_eval_data(dataset="stsbenchmark",
                        save_dir="../data/inference",
                        force=False):
    """Make sure a dataset archive is available locally and return its path.

    Args:
        dataset: Dataset name; the archive is ``<dataset>.tsv.gz``.
        save_dir: Directory in which the archive is stored.
        force: When truthy, download even if the archive already exists.

    Returns:
        The local path ``<save_dir>/<dataset>.tsv.gz``.
    """
    archive_path = "{}/{}.tsv.gz".format(save_dir, dataset)

    # reuse the cached copy unless a re-download was explicitly requested
    if os.path.exists(archive_path) and not force:
        print("dataset already exsits...")
        return archive_path

    print("fetching dataset...")
    source_url = "https://sbert.net/datasets/{}.tsv.gz".format(dataset)
    util.http_get(source_url, archive_path)
    return archive_path


def load_sent_trans_data(dataset_gz=None):
    """Read a gzipped, tab-separated sentence-pair file into per-split lists.

    Each row must provide ``sentence1``, ``sentence2`` and ``split`` columns.
    The label is the ``score`` column converted to float and divided by 5.0
    when that column exists (assumes scores on a 0-5 scale, giving 0-1);
    otherwise the raw ``label`` column value is used.

    Args:
        dataset_gz: Path to the ``.tsv.gz`` file. When ``None``, the path
            returned by ``cache_sts_eval_data()`` is used.

    Returns:
        Tuple ``(train, dev, test)`` of lists of ``InputExample``. Rows whose
        ``split`` is neither ``"test"`` nor ``"dev"`` go to the train list.

    Raises:
        AssertionError: If a row has neither a ``score`` column nor a
            ``label`` value.
    """
    if dataset_gz is None:
        dataset_gz = cache_sts_eval_data()

    samples_by_split = {"train": [], "dev": [], "test": []}

    with gzip.open(dataset_gz, "rt", encoding="utf8") as handle:
        rows = csv.DictReader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)

        for row in rows:
            if "score" in row:
                # rescale to 0 - 1
                label = float(row["score"]) / 5.0
            else:
                label = row.get("label")

            assert label is not None, "could not detect label"

            example = InputExample(
                texts=[row["sentence1"], row["sentence2"]],
                label=label,
            )

            # anything that is not explicitly test/dev counts as training data
            split = row["split"]
            target = split if split in ("test", "dev") else "train"
            samples_by_split[target].append(example)

    return (
        samples_by_split["train"],
        samples_by_split["dev"],
        samples_by_split["test"],
    )


## Inferencing

In [20]:
# fetch data: load the default dataset archive (downloaded first if it is not
# cached yet) and unpack the train / dev / test example lists
sts_train, sts_dev, sts_test = load_sent_trans_data()

dataset already exsits...


In [25]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator


def run_evaluator(model,
                  model_name,
                  dataset=None,
                  evaluator_name="sts-test",
                  outdir="../data/inference/sts"):
    """Run an ``EmbeddingSimilarityEvaluator`` on a model and load its results.

    Args:
        model: Model object handed to the evaluator.
        model_name: Name used for the per-model output sub-directory.
        dataset: Input examples to evaluate on. When ``None``, the test split
            returned by ``load_sent_trans_data()`` is used.
        evaluator_name: Name given to the evaluator; also part of the results
            file name.
        outdir: Base output directory; results go to ``<outdir>/<model_name>``.

    Returns:
        ``pd.DataFrame`` read from
        ``<outdir>/<model_name>/similarity_evaluation_<evaluator_name>_results.csv``.
    """
    if dataset is None:
        _, _, dataset = load_sent_trans_data()

    # build the evaluator from the resolved ``dataset`` so that a caller-supplied
    # dataset is honoured and no notebook-level global is required
    sts_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        dataset,
        name=evaluator_name
    )

    outdir = f"{outdir}/{model_name}"
    os.makedirs(outdir, exist_ok=True)

    # file the evaluator is expected to write into ``outdir``
    results_path = \
        f"{outdir}/similarity_evaluation_{evaluator_name}_results.csv"

    # drop results of a previous run so only the current run is read back
    # (presumably the evaluator appends to an existing file - TODO confirm)
    if os.path.exists(results_path):
        os.remove(results_path)

    print("evaluating model...")
    sts_evaluator(model, output_path=outdir)

    return pd.read_csv(results_path)


# Load the pretrained model identified by `model_name` and evaluate it with the
# default settings of `run_evaluator` (evaluator name "sts-test"); results are
# written under ../data/inference/sts/<model_name>.
model_name = "paraphrase-MiniLM-L6-v2"
sts_model = SentenceTransformer(model_name)
run_evaluator(sts_model, model_name)

dataset already exsits...
evaluating model...


Unnamed: 0,epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,-1,-1,0.836131,0.841234,0.808878,0.806445,0.806642,0.803589,0.639891,0.634872


In [26]:
# Repeat the same evaluation for a second model so the two result tables can be
# compared (presumably averaged GloVe word vectors, judging by the model name).
glove_model_name = "average_word_embeddings_glove.6B.300d"
glove_model = SentenceTransformer(glove_model_name)

run_evaluator(glove_model, glove_model_name)

Downloading: 100%|██████████| 690/690 [00:00<00:00, 719kB/s]
Downloading: 100%|██████████| 2.15k/2.15k [00:00<00:00, 1.10MB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 61.1kB/s]
Downloading: 100%|██████████| 248/248 [00:00<00:00, 61.3kB/s]
Downloading: 100%|██████████| 480M/480M [02:06<00:00, 3.79MB/s]
Downloading: 100%|██████████| 4.61M/4.61M [00:02<00:00, 1.83MB/s]
Downloading: 100%|██████████| 164/164 [00:00<00:00, 20.5kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 32.0kB/s]


dataset already exsits...
evaluating model...


Unnamed: 0,epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman
0,-1,-1,0.621784,0.615356,0.598526,0.614718,0.605307,0.617694,0.32855,0.321752
