In [1]:
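# Compute intrinsic-evaluation scores for idiom embeddings via clustering:
# - homogeneity score of the predicted clusters against the meaning groups (H-Score ↑)
# - cosine-distance gap (CosDist ↑): mean inter-group minus mean intra-group cosine distance,
#   computed per idiom, averaged within each group, then averaged across groups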

In [2]:
import json
from sentence_transformers import SentenceTransformer, util
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
import random
import copy
from collections import Counter
import re
from nltk.tokenize import word_tokenize
# k-means and evaluate 
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE


In [3]:
def read_json_lines(path_to_file): 
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a stray empty line at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path_to_file: path of the file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The `with` block closes the file; no explicit close() is needed.
    with open(path_to_file) as f:
        return [json.loads(line) for line in f if line.strip()]

def read_json_file(path):
    """Decode the single JSON document stored at `path` and return it."""
    with open(path, 'r') as handle:
        parsed = json.load(handle)
    return parsed
    
def write_json_file(path, data):
    """Serialize `data` as JSON into the file at `path`, overwriting any existing content.

    Returns None.
    """
    with open(path, 'w') as handle:
        json.dump(data, handle)

In [4]:
# 1. Load the dictionaries
combined_dict_embed = read_json_file('./fusion_analysis/magpie_idiom2embed_dictionary_google_wiki_single_meaning_cleaner_by_bart.json')
print('Total Number of idioms: {}'.format(len(combined_dict_embed)))
# 2. load idiom groups
# Layout as parsed below: a header line "<word> <number>: <group name>" opens a
# group, and every following line without a colon is an idiom of that group.
path_to_idiom_groups = './fusion_analysis/idiom_groups_by_meaning_20.txt'
with open(path_to_idiom_groups) as f:  # the `with` block closes the file
    lines = f.readlines()

# Each entry is [idiom, zero-based group index, group name].
idiom_groups = []
# Reset explicitly so that re-running the cell can never reuse a group that was
# left in the kernel by an earlier execution.
cur_group_idx = None
cur_group_name = None
for l in lines:
    l = l.strip()
    if not l:
        # Skip empty and whitespace-only lines.
        continue
    if ':' in l:
        # Split on the first colon only, so a group name containing ':' is kept whole.
        header, _, name = l.partition(':')
        cur_group_idx = int(header.split(' ')[1]) - 1
        cur_group_name = name.strip()
    else:
        if cur_group_idx is None:
            raise ValueError(
                "Idiom {!r} appears before any group header in {}".format(l, path_to_idiom_groups)
            )
        idiom_groups.append([l, cur_group_idx, cur_group_name])
print("Number of idioms with groups:", len(idiom_groups))
# load idiom dictionary definitions
idiom_dict = read_json_file('./fusion_analysis/magpie_idiom_dictionary_google_wiki_single_meaning.json')
# Keep only the definitions of idioms that belong to a group
# (raises KeyError if a grouped idiom has no definition).
idiom_dict = {i[0]:idiom_dict[i[0]]  for i in idiom_groups}

Total Number of idioms: 1521
Number of idioms with groups: 189


## Clustering and computing scores

In [5]:
# Number of clusters to form; set to the number of groups
N = 20
# Idiom -> embedding file generated from EVALUATION_IdiomAdapterEmbeddingSimilarity
path_to_noncomp_dict = './generated_embeddings/idiom2embed-PIER.json'
noncompcombined_dict_embed = read_json_file(path_to_noncomp_dict)

In [6]:
# All idioms that have a generated embedding
idioms = list(noncompcombined_dict_embed)
# Grouped idioms restricted to those that have a generated embedding
test_idioms = [entry for entry in idiom_groups if entry[0] in noncompcombined_dict_embed]
# produce cluster labels (the group index of every retained idiom)
labels = np.array([entry[1] for entry in test_idioms])
len(labels)

134

In [7]:
# Compute Homogeneity score
# Grouped idioms that have a generated embedding; rows of embed_matrix and the
# entries of labels follow this order.
_grouped_with_embed = [k for k in idiom_groups if k[0] in noncompcombined_dict_embed]
embed_matrix = np.asarray(
    [noncompcombined_dict_embed[k[0]] for k in _grouped_with_embed], dtype='float64'
)
labels = np.array([k[1] for k in _grouped_with_embed])
# Pairwise cosine distances between all retained idiom embeddings
embed_distances = cosine_distances(embed_matrix, embed_matrix)
standardization = StandardScaler()  # instantiated but not applied in this cell
# Run clustering
X = embed_distances
# scikit-learn renamed `affinity` to `metric` (deprecated in 1.2, removed in 1.4).
# Try the current keyword first and fall back for older releases that reject it.
try:
    _clusterer = AgglomerativeClustering(metric='precomputed', linkage='complete', n_clusters=N)
except TypeError:
    _clusterer = AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=N)
clustering = _clusterer.fit(X)
preds = clustering.labels_.tolist()
print("Homogeneity score (H-score):")
print(homogeneity_score(labels, preds))

Homogeneity score (H-score):
0.6095213104253961


In [8]:
# Compute within group similarity
# For every idiom: mean cosine similarity to the other idioms of its own group
# ('in') minus mean cosine similarity to idioms of all other groups ('out').
# The per-idiom gaps are averaged inside each group, then across groups.
embed_sims = 1 - embed_distances
sims_dict = {}
for cur_idx in range(embed_sims.shape[0]): 
    same_group = labels == labels[cur_idx]
    in_mask = same_group.copy()
    in_mask[cur_idx] = False  # never compare an idiom with itself
    in_sims = list(embed_sims[cur_idx, in_mask])
    out_sims = list(embed_sims[cur_idx, ~same_group])
    if in_sims and out_sims:
        diff = np.mean(in_sims) - np.mean(out_sims)
    else:
        # The gap is undefined when the idiom is alone in its group or when
        # there is no other group; np.mean([]) would yield NaN with a warning.
        diff = np.nan
    sims_dict[cur_idx] = {'in': in_sims, 'out': out_sims, 'diff': diff}

# Collect the defined gaps per group. Idioms with an undefined gap are left out
# so that one singleton group cannot turn the final score into NaN.
_diffs_by_group = {}
for cur_idx, entry in sims_dict.items():
    if not entry['in'] or not entry['out']:
        continue
    _diffs_by_group.setdefault(labels[cur_idx], []).append(entry['diff'])
sim_diff_dict = {k: np.mean(v) for k, v in _diffs_by_group.items()}
# NaN only when no idiom has a defined gap at all
difference = np.mean([v for v in sim_diff_dict.values()]) if sim_diff_dict else np.nan
print("Within group similarity (CosDist):")
difference

Within group similarity (CosDist):


0.183753855786771