In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
model_names = [
    "facebook/bart-base",
    "allenai/longformer-base-4096",
    "google/electra-small-discriminator",
    "microsoft/mpnet-base",
    "squeezebert/squeezebert-uncased",
    "deepset/sentence_bert",
    "vinai/phobert-base",
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "sentence-transformers/paraphrase-MiniLM-L6-v2"
]

In [None]:
# Similarity and distance metrics
parameters = ["cosine_similarity", "euclidean_distance", "manhattan_distance", "minkowski_distance", "correlation_coefficient"]


In [None]:
data = []

# Sample paragraphs for comparison
paragraph1 = """
Natural language processing (NLP) is a field of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a way that is both meaningful and contextually relevant.
"""
paragraph2 = """
Machine learning is a subset of artificial intelligence that involves the development of algorithms and statistical models that enable computers to perform specific tasks without explicit programming. In the context of natural language processing, machine learning algorithms are often used to analyze and understand the structure and meaning of human language.
"""

In [None]:
for model_name in model_names:
    model = SentenceTransformer(model_name)

    # Generate sentence embeddings
    embedding1 = model.encode([paragraph1])
    embedding2 = model.encode([paragraph2])

    # Calculate similarity and distance metrics
    cosine_sim = cosine_similarity(embedding1, embedding2)[0][0]
    euclidean_dist = np.linalg.norm(embedding1 - embedding2)
    manhattan_dist = np.abs(embedding1 - embedding2).sum()
    minkowski_dist = np.power(np.power(np.abs(embedding1 - embedding2), 3).sum(), 1/3)
    correlation_coeff = np.corrcoef(embedding1[0], embedding2[0])[0, 1]

    # Store results
    parameter_values = [cosine_sim, euclidean_dist, manhattan_dist, minkowski_dist, correlation_coeff]
    data.append([model_name] + parameter_values)


Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create DataFrame
columns = ["Model"] + parameters
df = pd.DataFrame(data, columns=columns)

In [None]:
df_normalized = df.copy()
for param in parameters:
    df_normalized[param] = (df[param] - df[param].min()) / (df[param].max() - df[param].min())


In [None]:
criteria_weights = [1] * len(parameters)

In [None]:
# Compute weighted normalized matrix
weighted_normalized_matrix = df_normalized.iloc[:, 1:] * criteria_weights

In [None]:
# Compute positive and negative ideal solutions
positive_ideal_solution = weighted_normalized_matrix.max(axis=0)
negative_ideal_solution = weighted_normalized_matrix.min(axis=0)

In [None]:
df_normalized["TOPSIS_Score"] = np.linalg.norm(weighted_normalized_matrix - negative_ideal_solution, axis=1) / (
        np.linalg.norm(weighted_normalized_matrix - positive_ideal_solution, axis=1) +
        np.linalg.norm(weighted_normalized_matrix - negative_ideal_solution, axis=1))


In [None]:
df_ranked = df_normalized.sort_values(by="TOPSIS_Score", ascending=False).reset_index(drop=True)


In [None]:
df_ranked["Rank"] = df_ranked.index + 1  # Rank starts from 1


In [None]:
df_ranked.to_csv("topsis_results.csv", index=False)


In [None]:
print(df_ranked[["Rank", "Model", "TOPSIS_Score"]])


    Rank                                          Model  TOPSIS_Score
0      1                             facebook/bart-base      0.932619
1      2                          deepset/sentence_bert      0.750122
2      3                              bert-base-uncased      0.506996
3      4                        distilbert-base-uncased      0.484476
4      5                   allenai/longformer-base-4096      0.467023
5      6                                   roberta-base      0.463823
6      7             google/electra-small-discriminator      0.462087
7      8                             vinai/phobert-base      0.452892
8      9                           microsoft/mpnet-base      0.447087
9     10                squeezebert/squeezebert-uncased      0.417772
10    11  sentence-transformers/paraphrase-MiniLM-L6-v2      0.267881
