# Similarity Measures
Exponential Decay ![image-2.png](attachment:image-2.png)
Inverse Distance  ![image-3.png](attachment:image-3.png)
Gaussian Similarity ![image-4.png](attachment:image-4.png)

# Dependencies

In [2]:
import numpy as np
import pandas as pd
import torch
torch.device('cpu')
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance_matrix
# encoders BERT, RoBERTa, or DistilBERT

  from tqdm.autonotebook import tqdm, trange


# Initializing 

In [3]:
# df = pd.read_csv('submmited_forms.csv')
# Sentence-embedding model used below to turn each free-text answer
# into a vector.
encoder = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Toy survey data: one column per person, one row per feature.
#   1st row is hobby, 2nd row is favourite subject,
#   3rd row is preferred temperature
df = pd.DataFrame({
    'William'   :   ['video games',   'Physics ', 'warm'],
    'Sterling'  :   ['Skateboarding', 'Organic chemistry', 'cold'],
    'Stanford'  :   ['programming',   'Mathematics', 'hot'],
    'Daniel'    :   ['fishing',       'Biology', 'moderate'],
}, index=['hobby', 'fav. subject', 'preffered temperature'])

# Person names in column order; used later to label the matched pairs.
names = np.array(df.columns)
# Number of features (rows of df) answered by each person.
num_features = df.shape[0]

def normalize_L2(matrix):
    '''
    Normalizes row vectors in a matrix to unit Euclidean (L2) length.

    Parameters
    ----------
    matrix : 2-D array-like
        One vector per row.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose non-zero rows have L2 norm 1.
        All-zero rows are returned as zeros instead of NaN.
    '''
    matrix = np.asarray(matrix)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # A zero row has norm 0 and would produce NaN (0 / 0); dividing by 1
    # instead leaves such a row untouched.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return matrix / safe_norms


def exponential_similarity(dist_matrix, lambda_p = 0.01):
    '''
    Exponential-decay similarity: exp(-lambda_p * distance), element-wise.

    A distance of 0 maps to 1 and an infinite distance maps to 0
    (for a positive lambda_p).
    '''
    scaled_distance = -lambda_p * dist_matrix
    return np.exp(scaled_distance)


def inverse_distance(matrix):
    '''
    Inverse-distance similarity: 1 / (distance + 1), element-wise.

    Parameters
    ----------
    matrix : array-like
        Distances. Integer input is accepted; it is converted to float
        first because numpy refuses to raise integers to a negative power.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. A distance of 0 maps to 1 and an
        infinite distance maps to 0.
    '''
    distances = np.asarray(matrix, dtype=float)
    return 1.0 / (distances + 1.0)

def gaussian_similarity(matrix, num_features):
    '''
    Gaussian (RBF) similarity: exp(-gamma * distance**2), element-wise,
    with gamma = 1 / num_features.

    Parameters
    ----------
    matrix : array-like
        Distances.
    num_features : int or float
        Number of features; must be non-zero. It sets the kernel width
        through gamma.

    Returns
    -------
    numpy.ndarray
        Similarities; a distance of 0 maps to 1 and an infinite
        distance maps to 0.
    '''
    gamma = 1 / num_features
    quadratic_dist = np.power(matrix, 2)
    # gamma has to scale the squared distance; without it num_features
    # would have no effect on the result.
    return np.exp(-gamma * quadratic_dist)


def match_pairs(names, sim_matrix, dist_matrix):
    '''
    Pairs each person in the first half of `names` with their most
    similar candidate.

    Parameters
    ----------
    names : numpy.ndarray
        Person names, ordered like the rows/columns of the matrices.
    sim_matrix : numpy.ndarray
        Square similarity matrix; the best candidate for person j is the
        argmax of column j. Self-similarity on the diagonal should
        already be suppressed by the caller.
    dist_matrix : numpy.ndarray
        Matrix of the same shape whose entries are reported as the score
        of each pair.

    Returns
    -------
    numpy.ndarray
        One row per pair: (name, matched name, score). Empty with shape
        (0, 3) when there are fewer than two people.

    Notes
    -----
    Candidates are chosen independently per person, so the same
    candidate can be selected for more than one person.
    '''
    # number of pairs is half of the dimension
    dim_half = sim_matrix.shape[1] // 2
    if dim_half == 0:
        return np.empty((0, 3), dtype=object)
    best_indices = np.argmax(sim_matrix, axis=0)[:dim_half]
    scores = dist_matrix[np.arange(dim_half), best_indices]
    # Slice by dim_half (not a fixed 2) so the columns line up for any
    # population size.
    return np.column_stack((names[:dim_half], names[best_indices], scores))

def dist_matrices_embed(emb_features):
    '''
    Calculates a pairwise distance matrix for every feature
    (subject, hobby, ...) and sums them into one dist_matrix.

    Each element of `emb_features` holds the embeddings of one feature,
    one row per person.
    '''
    per_feature = [distance_matrix(feature, feature) for feature in emb_features]
    return np.asarray(per_feature).sum(axis=0)



# Main

In [304]:
# Flatten the answers row by row, so all answers for one feature are
# contiguous in the flat array.
matrix = df.to_numpy().reshape(-1)
# presumably returns one embedding row per input string -- TODO confirm
embeddings = encoder.encode(matrix)
# One equal-sized chunk of embeddings per feature.
emb_features = np.split(embeddings, num_features)

# Per-feature pairwise distances, summed into a single matrix.
dist_matrix = dist_matrices_embed(emb_features)
# deleting diagonal from considering: an infinite self-distance becomes
# a similarity of exp(-inf) = 0, so nobody is matched with themselves
np.fill_diagonal(dist_matrix, np.inf)
sim_matrix = exponential_similarity(dist_matrix)
# NOTE: despite its name, this holds the similarities rounded to 2 decimals;
# it is what match_pairs reports as the score of each pair.
rounded_dist = np.round(sim_matrix, 2)
result = match_pairs(names, sim_matrix, rounded_dist)
# Notebook display expression: shows the similarity matrix and the pairs.
sim_matrix, result

(array([[0.        , 0.80734731, 0.83624676, 0.81514666],
        [0.80734731, 0.        , 0.808115  , 0.81149584],
        [0.83624676, 0.808115  , 0.        , 0.82156425],
        [0.81514666, 0.81149584, 0.82156425, 0.        ]]),
 array([['William', 'Stanford', 0.84],
        ['Sterling', 'Daniel', 0.81]], dtype=object))

In [None]:
# Correct Results 
# df = pd.DataFrame({
#     'William':  ['video games', 'Physics '],
#     'Sterling': ['Skateboarding', 'Organic chemistry'],
#     'Stanford': ['programming', 'Mathematics'],
#     'Daniel':   ['fishing', 'Biology'],
# }, index=['hobby', 'fav. subject'])

# (torch.return_types.max(
#  values=tensor([0.8362, 0.8115, 0.8362, 0.8216], dtype=torch.float64),
#  indices=tensor([2, 3, 0, 2])),
#  tensor([[0.0000, 0.8073, 0.8362, 0.8151],
#          [0.8073, 0.0000, 0.8081, 0.8115],
#          [0.8362, 0.8081, 0.0000, 0.8216],
#          [0.8151, 0.8115, 0.8216, 0.0000]], dtype=torch.float64))