In [96]:
#!pip install sentence-transformers transformers


In [97]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import RobertaModel, RobertaTokenizer, XLNetModel, XLNetTokenizer
import torch



In [109]:
reference_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step."
]

model_sentences = {
    "BERT": [
        "The fast brown fox jumps over the lazy dog.",
        "An expedition of a thousand kilometers starts with a single stride."
    ],
    "USE": [
        "A speedy brown fox leaps over the inactive canine.",
        "Embarking on a lengthy trek initiates with a solitary footfall."
    ],
    "Tf-Idf": [
        "The swift brown fox leaps over the lazy dog.",
        "Commencing a lengthy journey starts with one initial step."
    ],
    "RoBERTa": [
        "A nimble brown fox vaults over the indolent dog.",
        "Setting off on a lengthy quest begins with a lone footstep."
    ],
    "XLNet": [
        "The agile brown fox hops over the lethargic dog.",
        "Initiating a prolonged expedition starts with a single pace."
    ]
}


In [99]:

def get_sentence_embedding(sentence, model):
    if model == "BERT":
        model_name = "bert-base-uncased"
        embedder = SentenceTransformer(model_name)
        return embedder.encode(sentence, convert_to_tensor=True).numpy()

    elif model == "USE":
        model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
        embedder = SentenceTransformer(model_name)
        return embedder.encode(sentence, convert_to_tensor=True).numpy()
    elif model == "Tf-Idf":
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform([sentence] + model_sentences[model])
        return vectors.toarray()[0].reshape(1, -1)
    elif model == "RoBERTa":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaModel.from_pretrained('roberta-base')
        inputs = tokenizer(sentence, return_tensors="pt")
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()
    elif model == "XLNet":
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetModel.from_pretrained('xlnet-base-cased')
        inputs = tokenizer(sentence, return_tensors="pt")
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()
    else:
        raise ValueError(f"Model {model} not supported.")


In [130]:
cosine_similarities = {}
max_length = 100

for model, sentences in model_sentences.items():
    model_vectors = [get_sentence_embedding(sentence, model) for sentence in sentences + reference_sentences]
    flattened_model_vectors = [embedding.flatten()[:max_length] for embedding in model_vectors]
    flattened_model_vectors = [np.pad(embedding, (0, max_length - len(embedding)))[:max_length] for embedding in flattened_model_vectors]
    flattened_model_vectors_np = np.array(flattened_model_vectors)
    cosine_similarities[model] = cosine_similarity(flattened_model_vectors_np)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

In [131]:
cosine_similarity_table = pd.DataFrame({model: similarities.flatten() for model, similarities in cosine_similarities.items()})
print(cosine_similarity_table)

        BERT       USE    Tf-Idf   RoBERTa     XLNet
0   1.000000  1.000000  1.000000  1.000000  1.000000
1   0.282296  0.184619  0.000000  0.963622  0.334462
2   0.983279  0.902487  0.454021  0.993482  0.802075
3   0.316642  0.094034  0.553074  0.978428  0.226346
4   0.282296  0.184619  0.000000  0.963622  0.334462
5   1.000000  1.000000  1.000000  1.000000  1.000000
6   0.333670  0.178470  0.231880  0.957064  0.347713
7   0.861526  0.709835  0.136232  0.985813  0.488873
8   0.983279  0.902487  0.454021  0.993482  0.802075
9   0.333670  0.178470  0.231880  0.957064  0.347713
10  1.000000  1.000000  1.000000  1.000000  1.000000
11  0.351203  0.066233  0.493564  0.972598  0.322743
12  0.316642  0.094034  0.553074  0.978428  0.226346
13  0.861526  0.709835  0.136232  0.985813  0.488873
14  0.351203  0.066233  0.493564  0.972598  0.322743
15  1.000000  1.000000  1.000000  1.000000  1.000000


In [132]:

normalized_scores = {model: (sim + 1) / 2 for model, sim in cosine_similarities.items()}

normalized_scores_array = np.array(list(normalized_scores.values()))


In [133]:

ideal_solution = np.max(normalized_scores_array, axis=0)
anti_ideal_solution = np.min(normalized_scores_array, axis=0)


In [134]:

distance_to_ideal = np.linalg.norm(normalized_scores_array - ideal_solution, axis=1)
distance_to_anti_ideal = np.linalg.norm(normalized_scores_array - anti_ideal_solution, axis=1)


In [135]:

closeness_coefficient = np.divide(distance_to_anti_ideal, (distance_to_ideal + distance_to_anti_ideal),
                                  out=np.zeros_like(distance_to_anti_ideal), where=(distance_to_ideal + distance_to_anti_ideal) != 0)


In [136]:

ranked_models = sorted(zip(model_sentences.keys(), closeness_coefficient[:, 0]), key=lambda x: x[1], reverse=True)
df = pd.DataFrame(ranked_models, columns=["Model", "Closeness Coefficient"])

df["Rank"] = df["Closeness Coefficient"].rank(ascending=False).astype(int)

print( df.to_string(index=False))


  Model  Closeness Coefficient  Rank
RoBERTa               1.000000     1
   BERT               0.402476     2
  XLNet               0.333772     3
    USE               0.290922     4
 Tf-Idf               0.279481     5
