In [1]:
# Index (in PCI bus order) of the physical GPU this notebook is allowed to use.
AVAILABLE_GPU = 3

import os
# Enumerate devices by PCI bus id so AVAILABLE_GPU refers to a stable physical card.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= f"{AVAILABLE_GPU}" # ALWAYS look the one with 0% usage
# With CUDA_VISIBLE_DEVICES restricted to a single card, that card is exposed to the
# process as device 0, so the logical device name must not reuse the physical index.
tf_device = '/gpu:0'

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import re
import json
import unicodedata
from string import punctuation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import f1_score, silhouette_score
from sklearn.cluster import KMeans, AffinityPropagation
import numpy as np
from torch.nn.functional import normalize
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
def avg(*r):
    """Return the arithmetic mean of the positional arguments.

    Raises:
        ZeroDivisionError: if called without arguments.
    """
    return sum(r)/len(r)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# When True, the notebook prints per-lemma / per-example debugging output and plots.
SHOULD_PRINT = False
# Model identifier: passed to AutoModel.from_pretrained for 'pure' models, used as the
# file stem of ../output/{MODEL}.pt for 'trained' ones, and as the row key in f1.tsv.
MODEL = 'dccuchile/albert-base-spanish'
# 'trained' -> load a locally saved torch checkpoint; anything else -> load MODEL as-is.
MODEL_TYPE = 'pure'

In [4]:
# Pick the checkpoint the tokenizer is loaded from. For untrained ('pure') models it
# is the model itself; for locally trained models it is the base checkpoint inferred
# from the model name (BETO / mBERT, cased or uncased; anything else -> Spanish ALBERT).
if MODEL_TYPE != "trained":
    CHECKPOINT = MODEL
elif 'beto' in MODEL:
    CHECKPOINT = (
        'dccuchile/bert-base-spanish-wwm-uncased'
        if 'uncased' in MODEL
        else 'dccuchile/bert-base-spanish-wwm-cased'
    )
elif 'mbert' in MODEL:
    CHECKPOINT = (
        'bert-base-multilingual-uncased'
        if 'uncased' in MODEL
        else 'bert-base-multilingual-cased'
    )
else:
    CHECKPOINT = 'dccuchile/albert-base-spanish'

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [6]:
# The tokenizer always comes from the base checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
if MODEL_TYPE == "trained":
    # NOTE(security): torch.load unpickles the file; only load checkpoints you trust.
    # map_location remaps tensors saved on another device (e.g. a GPU index that is
    # hidden by CUDA_VISIBLE_DEVICES) onto the device used here.
    model = torch.load(f"../output/{MODEL}.pt", map_location=device).to(device)
else:
    model = AutoModel.from_pretrained(MODEL).to(device)
# Inference only: make sure dropout is disabled so embeddings are deterministic
# (a pickled model keeps whatever train/eval mode it was saved in).
model = model.eval()

In [7]:
# Usages (one row per lemma occurrence) and pairwise human judgments, both tab-separated.
df = pd.read_csv('../data/evaluation/dwug_es_uses.csv', sep='\t')
df_gold = pd.read_csv('../data/evaluation/dwug_es_judgments.csv', sep='\t')
# Binarise the gold judgments: 1 when judgment <= 2, else 0.
# NOTE(review): a judgment of 0 maps to 1 and a missing judgment maps to 0 here —
# confirm this is the intended treatment for the annotation scale in use.
df_gold['judgment_bin'] = np.where(df_gold['judgment'] <= 2, 1, 0)

df

Unnamed: 0,lemma,date,identifier,context,indexes_target_sentence
0,abundar,1858,old_corpus_spanish-17804-364,"Fueron concluidos en 1570. Hay, por fin, tres ...",27:186
1,abundar,1858,old_corpus_spanish-314696-1167,Rara vez sonaba en aquellos barrios el importu...,392:1045
2,abundar,1858,old_corpus_spanish-23288-235,—Hablaremos más despacio mañana… Puedes irte t...,111:165
3,abundar,1858,old_corpus_spanish-447700-231,El 30 se notaron violentísimos esfuerzos para ...,85:143
4,abundar,1858,old_corpus_spanish-622360-638,Y siempre que algún forastero de viso se prese...,234:450
...,...,...,...,...,...
3995,vuestro,2007,modern_corpus_spanish-790506-168,"Incluso algunas veces, existen expectativas cu...",69:109
3996,vuestro,2007,modern_corpus_spanish-868656-234,"Por turnos, la nueva señal de tráfico de Gary ...",53:175
3997,vuestro,2007,modern_corpus_spanish-633875-439,"""En cuanto a táctica política, también se ha m...",190:350
3998,vuestro,2007,modern_corpus_spanish-797048-553,"""Dije, """"si haces, de hecho, 'muéstrame mi bal...",202:322


In [15]:
def print_nice(input_ids, index, index_end=None, pad_token=None):
    """Print the tokens of `input_ids` with the target token(s) highlighted in blue.

    Args:
        input_ids: iterable of token ids, converted with the global `tokenizer`.
        index: position of the (first) token to highlight.
        index_end: optional position of the last token to highlight; when given,
            the colour starts at `index` and is reset after `index_end`.
        pad_token: optional id; ids equal to it are dropped before conversion.
    """
    blue, reset = '\033[94m', '\033[0m'
    ids = input_ids
    if pad_token is not None:
        ids = [tok for tok in input_ids if tok != pad_token]
    pieces = tokenizer.convert_ids_to_tokens(ids)
    if index_end is None:
        pieces[index] = blue + pieces[index] + reset
    else:
        pieces[index] = blue + pieces[index]
        pieces[index_end] = pieces[index_end] + reset
    print(' '.join(pieces))

def generate_substrings(word):
    """Return the proper prefixes of `word`, longest first, down to length 2.

    The full word and the single-character prefix are excluded, so words of
    length <= 2 yield an empty list.
    """
    return [word[:size] for size in range(len(word) - 1, 1, -1)]

def find_sub_list(sl,l): # not used because some examples have no exact coincidence
    """Locate the first occurrence of the list `sl` inside the list `l`.

    Returns a (start, end) tuple of inclusive positions shifted by one to account
    for a leading [CLS] token, or None (implicitly) when `sl` does not occur.
    """
    width = len(sl)
    for start, item in enumerate(l):
        # cheap first-element check before comparing the whole window
        if item == sl[0] and l[start:start + width] == sl:
            return start + 1, start + width # +1 for the [CLS] token

def remove_accents(word):
    """Strip combining diacritical marks from `word`.

    The text is decomposed (NFD) so accents become separate combining characters,
    which are then removed; the result is left in decomposed form.
    """
    return re.sub(r'[\u0300-\u036f]', '', unicodedata.normalize('NFD', word))

def isletter(character):
    """Return True when `character` contains at least one accepted letter.

    Accepted characters are Cyrillic and Latin letters, the À-ÿ range and
    combining diacritical marks; everything else is discarded before the check.
    """
    letters_only = re.sub(r'[^а-яА-Яa-zA-ZÀ-ÿёЁ\u0300-\u036f]', '', character)
    return len(letters_only) > 0

def extract_letters(input_string):
    """Clean a matched search string before it is tokenized.

    For every model except ALBERT / XLM-R, all characters other than letters
    (Cyrillic, Latin, the À-ÿ range, combining marks), square brackets and the
    apostrophe are removed. For ALBERT / XLM-R a much narrower pattern is used
    (see the note below), so the string is returned almost unchanged.
    """
    # if input_string == "":
    #     return ""
    # if not isletter(input_string[0]):
    #     input_string = input_string[1:]
    # if not isletter(input_string[-1]):
    #     input_string = input_string[:-1]

    if "albert" in MODEL or "xlm-r" in MODEL:
        punc = re.escape(punctuation)
        # Removes a single non-letter character only when it is followed by an em dash
        # plus one of "¡" / ASCII punctuation.
        # NOTE(review): the second lookahead alternative requires a double quote *after*
        # the end of the string, so it appears it can never match — confirm whether
        # "followed by punctuation or end of string" was the intended condition.
        regex_pattern = r"[^[\]а-яА-Яa-zA-ZÀ-ÿёЁ\u0300-\u036f'](?=—[¡" + punc + "]|$\")"
        return re.sub(regex_pattern, '', input_string)
    return re.sub(r"[^[\]а-яА-Яa-zA-ZÀ-ÿёЁ\u0300-\u036f']", '', input_string)

In [9]:
def find_word_containing_target(sentence, target_word):
    """Return the whitespace-delimited word of `sentence` that contains `target_word`.

    Only the first occurrence of `target_word` is considered. The word boundary is
    searched with the same definition of whitespace that is used to cut the word
    out (any whitespace, not just the space character), so a target preceded by a
    newline or tab yields the word that really contains it.

    Args:
        sentence: text to search in.
        target_word: substring to look for; it may end with a space to require
            that the match is followed by one.

    Returns:
        The containing word, with a trailing space appended when `target_word`
        ends with a space; None when `target_word` does not occur or when no word
        can be extracted at the match position.
    """
    index = sentence.find(target_word)
    if index == -1:
        return None

    # Walk left to the beginning of the word the match starts in.
    start_index = index
    while start_index > 0 and not sentence[start_index - 1].isspace():
        start_index -= 1

    # The first whitespace-delimited token from there on is the containing word.
    pieces = sentence[start_index:].split(None, 1)
    if not pieces:
        return None

    final_char = " " if target_word.endswith(" ") else ""
    return pieces[0] + final_char

def get_search(example, word, orth=None, print_search=False):
    """Find the surface form of `word` inside `example`.

    A prioritised list of candidate strings is built (exact word, orthographic
    form, their prefixes, and accent-stripped variants of all of these) and the
    first candidate that occurs in `example` — case-sensitively first, then
    case-insensitively — determines the result.

    Args:
        example: text in which the target is searched.
        word: target lemma.
        orth: optional orthographic form of the target.
        print_search: when True, print the candidate list.

    Returns:
        The whole word of `example` containing the first matching candidate,
        passed through `extract_letters`; the result of `extract_letters("")`
        when no candidate matches.
    """
    # append the words to search in the example, in the desired ORDER
    # 1 - the word (with an ending character), and the word itself
    search = [f"{word} ", f"{word},", f"{word}.", word]

    # 2 - the orthographic form (with an ending character), and the orthographic form itself
    if orth and orth != word:
        search += [f"{orth} ", f"{orth},", f"{orth}.", orth]

    # 3 - all substrings of the word (i.e. выходить -> ['выходит', 'выходи', 'выход', 'выхо', 'вых', 'вы'])
    search += generate_substrings(word)

    # 4 - all substrings of the orthographic form
    if orth:
        search.extend([i for i in generate_substrings(orth) if i not in search])

    # 5 - the word without accents (with an ending character), and the word without accents itself
    unicoded_word = remove_accents(word)
    if unicoded_word != word:
        search += [f"{unicoded_word} ", f"{unicoded_word},", f"{unicoded_word}.", unicoded_word]
    
    # 6 - the orthographic form without accents (with an ending character), and the orthographic form without accents itself
    if orth and orth != word:
        unicoded_orth = remove_accents(orth)
        if unicoded_orth != orth:
            search += [f"{unicoded_orth} ", f"{unicoded_orth},", f"{unicoded_orth}.", unicoded_orth]

    # 7 - all substrings of the word without accents
    if unicoded_word != word:
        search.extend([i for i in generate_substrings(unicoded_word) if i not in search])

    # 8 - all substrings of the orthographic form without accents
    # (unicoded_orth is only defined when `orth and orth != word`; the short-circuit protects the access)
    if orth and orth != word and unicoded_orth != orth:
        search.extend([i for i in generate_substrings(unicoded_orth) if i not in search])

    if print_search:
        print(f"Searching for: {search}")

    # FIND the first search-string that is within the example, if any (in upper or lowercase)
    for s in search:
        search_word = find_word_containing_target(example, s)
        if search_word:
            break
        # case-insensitive fallback: match on lowercased text, then recover the original casing
        search_word = find_word_containing_target(example.lower(), s.lower())
        if search_word:
            index = example.lower().find(search_word)
            if index == -1:
                # this should never happen
                raise Exception(f"Found '{search_word}' in '{example.lower()}', but then not found...")
            else:
                # NOTE(review): assumes lowercasing does not change string length, so
                # positions in example.lower() are valid in example — confirm for the data used
                search_word = example[index:index + len(search_word)]
            break
    else:
        # no candidate matched at all
        search_word = ""
    return extract_letters(search_word)

In [16]:
# Compute one contextual embedding per usage (row of `df`), keyed by usage identifier.
embeddings = dict()
word = ""
word_idx = 0
if SHOULD_PRINT: print(df.loc[0, "lemma"])

with tqdm(total=df['lemma'].nunique()) as pbar:
    for index, row in df.iterrows():
        # Rows are processed lemma by lemma: a change of lemma means the previous one is done.
        if word != "" and word != row['lemma']:
            if SHOULD_PRINT: print(f"\n{row['lemma']}")
            pbar.update(1)
            word_idx += 1

        should_print = SHOULD_PRINT
        word = row['lemma']          # target word
        sentence_indexes = row['indexes_target_sentence']
        identifier = row['identifier']

        # 1. Get the target word index in the example tokenized
        # Only the target sentence ("start:end" character offsets into the context) is
        # used; the full context is never tokenized, since that was wasted work and
        # produced the "sequence length is longer than ..." warning.
        bounds = sentence_indexes.split(':')
        example = row['context'][int(bounds[0]) : int(bounds[1])]    # usage example of the target word
        tokens = tokenizer.tokenize(example)

        search_word = get_search(example, word)

        if search_word == "":
            if len(example.split()) == 1:
                print(f"{index}. \033[91mNot found\033[0m {word} in '{example}' (taking only word in example)")
                target_index, target_index_end = 1, 1
            else:
                print(f"{index}. \033[91mNot found\033[0m {word} in '{example}' (taking [CLS] token)")
                target_index, target_index_end = 0, 0
        else:
            search_tokens = tokenizer.tokenize(search_word)
            # find_sub_list returns None when the tokens are not found; an empty token
            # list cannot be located either. Handle both explicitly instead of hiding
            # every possible error behind a bare `except`.
            span = find_sub_list(search_tokens, tokens) if search_tokens else None
            if span is None:
                # this should never happen
                raise ValueError(f"Error unpacking {search_tokens} in {tokens}")
            target_index, target_index_end = span

        inputs = tokenizer(example, return_tensors="pt", max_length=512, truncation=True, padding='max_length')

        if should_print:
            print_nice(inputs['input_ids'][0], target_index, target_index_end, pad_token=tokenizer.pad_token_id)

        # 2. Compute the embedding of the token
        with torch.no_grad():
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

        # The embedding of a usage is the last-layer vector of the first target sub-token.
        if MODEL_TYPE == "trained": 
            embedding = outputs.hidden_states[-1][0][target_index]
        else:
            embedding = outputs.last_hidden_state[0][target_index]

        embeddings[identifier] = embedding.cpu()

    # The loop only ticks the bar when the lemma changes, so count the last lemma here.
    if word != "":
        pbar.update(1)

# Check first: duplicated identifiers would silently collapse entries of the dict.
assert len(embeddings) == len(df), "Embeddings and dataframe have different lengths"
df['embedding'] = embeddings.values() # ids are in the same order

 29%|████████████▏                             | 29/100 [00:13<00:33,  2.11it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors
 99%|█████████████████████████████████████████▌| 99/100 [00:47<00:00,  2.08it/s]


# Measure SS

In [25]:
# Collects the weighted F1 score of every evaluated method, keyed by a short method name.
F1 = dict()

## Non-Clustering

### Cosine Similarity

In [23]:
# For every judged usage pair, derive two scores from the cosine similarity of the
# two usage embeddings: the cosine distance (1 - cs) and the inverted similarity (1 / cs).
cdl, prtl = [], []

id_pairs = zip(df_gold["identifier1"], df_gold["identifier2"])
for id1, id2 in tqdm(id_pairs, total=df_gold.shape[0], position=0, leave=True):
    emb1 = normalize(embeddings[id1], p=2, dim=-1).reshape(1, -1)
    emb2 = normalize(embeddings[id2], p=2, dim=-1).reshape(1, -1)
    cs = cosine_similarity(emb1, emb2)[0][0]
    cdl.append(1 - cs)
    prtl.append(1 / cs)

# The mean of each score is used as the decision threshold in the following cells.
cd_mean = sum(cdl)/len(cdl)
prt_mean = sum(prtl)/len(prtl)

100%|███████████████████████████████████| 62624/62624 [00:36<00:00, 1697.67it/s]


In [26]:
#df_gold["guess"] = [(min(int(number * 4) + 1, 4)) for number in normalized_csl]
#df_gold['guess_bin'] = np.where(df_gold['guess'] <= 2, 1, 0)
# Predict 1 for pairs whose cosine distance is above the mean distance, 0 otherwise,
# and score the predictions against the binarised gold judgments.
df_gold["guess_bin"] = [int(cd > cd_mean) for cd in cdl]
F1["CD"] = f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')
F1["CD"]

0.6372080088632686

### Inverted Similarity Over Word Prototype

In [27]:
# Same thresholding as for the cosine distance, applied to the inverted similarity (1 / cs).
df_gold["guess_bin"] = [int(prt > prt_mean) for prt in prtl]
F1["PRT"] = f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')
F1["PRT"]

0.6681654041047085

## Clustering

In [18]:
def get_silhouette(tensors, kmeans):
    """Silhouette score (euclidean) of a fitted clustering over `tensors`.

    Args:
        tensors: iterable of torch tensors, one per clustered sample.
        kmeans: fitted estimator exposing `n_clusters` and `labels_`.

    Returns:
        0 for a single-cluster solution (the silhouette is undefined there),
        otherwise the value returned by `silhouette_score`.
    """
    cluster_count = kmeans.n_clusters
    assignments = kmeans.labels_
    if cluster_count == 1:
        return 0 # doesn't allow 1-cluster solutions
    features = np.array([t.flatten().numpy() for t in tensors])
    return silhouette_score(features, labels=assignments, metric='euclidean')

def elbow_method(inertia_values):
    """Pick a number of clusters from the inertias of k = 1, 2, 3, ...

    The chosen k is the one reached by the largest single drop in inertia, i.e.
    for inertias [i1, i2, i3, ...] the k maximising i(k-1) - i(k). Ties resolve
    to the smallest such k.

    Args:
        inertia_values: list of inertias, where position 0 corresponds to k = 1.

    Returns:
        The selected k (>= 2), or 1 when fewer than two inertias are given and
        there is therefore no drop to measure.
    """
    if len(inertia_values) < 2:
        return 1
    deltas = [prev - cur for prev, cur in zip(inertia_values, inertia_values[1:])]
    max_curvature_index = deltas.index(max(deltas))
    # deltas[0] is the drop from k=1 to k=2, hence the +2
    return max_curvature_index + 2

def compute_metric(tensors, kmeans, method="silhouette"):
    """Score a fitted clustering with the requested method.

    Only "silhouette" is implemented; any other method name yields 0.
    """
    if method == "silhouette":
        return get_silhouette(tensors, kmeans)
    return 0

def KMeans_df(df, metric):
    """Cluster the usages of one lemma with KMeans and annotate them with their sense.

    KMeans is fitted for every n in [1, int(sqrt(len(df)))] and the best n is chosen
    either by the highest score ('silhouette' / 'compactness') or by the elbow of
    the inertia curve ('inertia').

    Args:
        df: DataFrame with an 'embedding' column of torch tensors (and a 'lemma'
            column, used only for debug printing). It is not modified.
        metric: 'silhouette', 'compactness' or 'inertia'.

    Returns:
        A copy of `df` with three extra columns: 'sense' (cluster label),
        'centroid' (mean embedding of the cluster) and 'intra_deviation'
        (mean absolute cosine distance of the cluster members to the centroid).

    Raises:
        KeyError: if no cluster count could be selected (e.g. unknown metric).
    """
    # Work on a copy: `df` is typically a groupby slice, and writing columns into it
    # mutates the caller's data and triggers SettingWithCopyWarning.
    df = df.copy()
    best_score, best_n, inertias = -1, 0, []

    min_clusters = 1
    max_clusters = int(len(df)**.5)+1

    # Convert the embeddings once instead of once per candidate n.
    points = [t.numpy() for t in df['embedding'].tolist()]
    labels_by_n = dict()

    for n in range(min_clusters, max_clusters):
        km = KMeans(n_clusters=n, random_state=0, n_init='auto')
        km.fit(points)
        labels_by_n[n] = km.labels_
        if (metric == 'inertia'):
            inertias.append(km.inertia_)
        else:
            score = compute_metric(df['embedding'], km, metric)
            if metric in ('silhouette', 'compactness') and score > best_score:
                best_score, best_n = score, n
    
    if metric == 'inertia':
        # With a single candidate there is no elbow to look for.
        best_n = elbow_method(inertias) if len(inertias) > 1 else min_clusters
        if SHOULD_PRINT:
            plt.plot(range(min_clusters, len(inertias) + 1), inertias, marker='o')
            plt.xlabel('Number of clusters')
            plt.ylabel('Inertia')
            plt.title('Inertia (old model)')
            plt.show()

    df['sense'] = labels_by_n[best_n]
    
    centroids = dict()
    intra_deviations = dict()
    # Iterate over the labels that actually occur (as AffinityP_df does), so a
    # cluster left without members cannot produce an empty mean.
    for label in np.unique(df['sense']):
        cluster_points = [e.numpy() for e in df[df['sense'] == label]['embedding']]
        centroid = np.mean(cluster_points, axis=0)
        similarities = cosine_similarity(cluster_points, [centroid])
        intra_deviation = np.mean(np.abs(1 - similarities))
        centroids[int(label)] = centroid
        intra_deviations[int(label)] = intra_deviation

    df['centroid'] = df['sense'].map(centroids)
    df['intra_deviation'] = df['sense'].map(intra_deviations)

    if SHOULD_PRINT:
        print(f"\n{df.iloc[0]['lemma']}, {best_n} clsuters [max {max_clusters}]")

    return df

def AffinityP_df(df, damping=0.5, affinity='precomputed'):
    """Cluster the usages of one lemma with Affinity Propagation.

    Args:
        df: DataFrame with an 'embedding' column of torch tensors (and a 'lemma'
            column, used only for debug printing). It is not modified.
        damping: damping factor forwarded to AffinityPropagation.
        affinity: 'precomputed' clusters on the pairwise cosine similarities of
            the embeddings; any other value is forwarded to AffinityPropagation
            and the raw embeddings are used as input.

    Returns:
        A copy of `df` with three extra columns: 'sense' (cluster label),
        'centroid' (mean embedding of the cluster) and 'intra_deviation'
        (mean absolute cosine distance of the cluster members to the centroid).
    """
    # Work on a copy: `df` is typically a groupby slice, and writing columns into it
    # mutates the caller's data and triggers SettingWithCopyWarning.
    df = df.copy()
    # Fixed seed for reproducible runs, consistent with the KMeans path.
    ap = AffinityPropagation(affinity=affinity, damping=damping, random_state=0)
    vectors = [t.numpy() for t in df['embedding'].tolist()]
    if affinity == 'precomputed': 
        # A precomputed affinity must be a *similarity* matrix (larger = more alike).
        # Passing cosine distances would make the most distant usages look the most
        # similar, so the cosine similarity is used instead.
        ap_input = cosine_similarity(vectors)
    else: 
        ap_input = vectors
    ap.fit(ap_input)

    df['sense'] = ap.labels_
    
    centroids = dict()
    intra_deviations = dict()
    for label in np.unique(ap.labels_):
        cluster_points = [e.numpy() for e in df[df['sense'] == label]['embedding']]
        centroid = np.mean(cluster_points, axis=0)
        similarities = cosine_similarity(cluster_points, [centroid])
        intra_deviation = np.mean(np.abs(1 - similarities))
        centroids[label] = centroid
        intra_deviations[label] = intra_deviation

    df['centroid'] = df['sense'].map(centroids)
    df['intra_deviation'] = df['sense'].map(intra_deviations)

    if SHOULD_PRINT:
        print(f"{df.iloc[0]['lemma']}, {len(ap.cluster_centers_indices_)} clusters")
    
    return df

def clustering(df, method, metric):
    """Dispatch to the requested clustering routine.

    "KM" runs `KMeans_df(df, metric)`; any other method runs `AffinityP_df`,
    in which case `metric` is passed on as the damping value.
    """
    if method == "KM":
        return KMeans_df(df, metric)
    return AffinityP_df(df, metric)

def cluster_df(df, method="AP", metric="silhouette"):
    """Cluster the usages of every lemma separately and stack the results.

    Args:
        df: DataFrame with at least the columns 'lemma' and 'embedding'.
        method: "KM" for KMeans, anything else for Affinity Propagation.
        metric: selection metric for KMeans ("silhouette" / "inertia"); for
            Affinity Propagation it is forwarded as the damping value, so a
            number must be passed in that case.

    Returns:
        One DataFrame with the clustered rows of all lemmas, re-indexed from 0
        (the previous positional index is kept as an 'index' column).
    """
    clustered_groups = []
    for word, group in tqdm(df.groupby('lemma')):
        group_cl = clustering(group, method, metric)
        # the message must use the loop variable `word`; an undefined name here would
        # turn a failed check into a NameError
        assert len(group) == len(group_cl), f"{len(group)} != {len(group_cl)} for word {word}"
        clustered_groups.append(group_cl)

    if not clustered_groups:
        # nothing to cluster: pd.concat refuses an empty list
        return pd.DataFrame().reset_index()

    # Concatenate once instead of growing a frame inside the loop (quadratic copying).
    return pd.concat(clustered_groups, ignore_index=True).reset_index()

### AP

In [None]:
CLUSTERING_METHOD = "AP" # "KM" or "AP"
CLUSTERING_METRIC = 0.975 # For KMeans: "silhouette" or "inertia". For AP: the damping value (between 0.5, 0.99)

# Cluster every lemma, then index the result by usage identifier for direct lookups.
result_df = cluster_df(df, CLUSTERING_METHOD, CLUSTERING_METRIC).set_index('identifier')

In [None]:
# A pair is predicted as 1 when its two usages fall in different sense clusters.
sense_by_id = result_df['sense']
id_pairs = zip(df_gold["identifier1"], df_gold["identifier2"])
guess = [
    int(sense_by_id.loc[id1] != sense_by_id.loc[id2])
    for id1, id2 in tqdm(id_pairs, total=df_gold.shape[0], position=0, leave=True)
]
# discarded alternative criterion:
#    (1 - avg(cosine_similarity(emb1, cent1)[0][0], cosine_similarity(emb2, cent2)[0][0])) > 3*res1['intra_deviation']

df_gold['guess_bin'] = guess
F1["AP"] = f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')
F1["AP"]

### KMeans

In [19]:
CLUSTERING_METHOD = "KM" # "KM" or "AP"
CLUSTERING_METRIC = "inertia" # For KMeans: "silhouette" or "inertia". For AP: the damping value (between 0.5, 0.99)

# Cluster every lemma, then index the result by usage identifier for direct lookups.
result_df = cluster_df(df, CLUSTERING_METHOD, CLUSTERING_METRIC).set_index('identifier')

100%|█████████████████████████████████████████| 100/100 [00:02<00:00, 48.23it/s]


In [None]:
# A pair is predicted as 1 when its two usages fall in different sense clusters.
sense_by_id = result_df['sense']
id_pairs = zip(df_gold["identifier1"], df_gold["identifier2"])
guess = [
    int(sense_by_id.loc[id1] != sense_by_id.loc[id2])
    for id1, id2 in tqdm(id_pairs, total=df_gold.shape[0], position=0, leave=True)
]

df_gold['guess_bin'] = guess
F1["KM-in"] = f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')
F1["KM-in"]

In [28]:
CLUSTERING_METHOD = "KM" # "KM" or "AP"
CLUSTERING_METRIC = "silhouette" # For KMeans: "silhouette" or "inertia". For AP: the damping value (between 0.5, 0.99)

# Cluster every lemma, then index the result by usage identifier for direct lookups.
result_df = cluster_df(df, CLUSTERING_METHOD, CLUSTERING_METRIC).set_index('identifier')

100%|█████████████████████████████████████████| 100/100 [00:10<00:00,  9.45it/s]


In [None]:
# Evaluate the silhouette-selected KMeans clustering: a pair is predicted as 1 when its
# two usages were assigned to different sense clusters. The commented-out blocks inside
# the loop are earlier decision rules (based on intra-cluster deviation and on centroid
# similarity) that are currently disabled.
guess = []

for i, row in tqdm(df_gold.iterrows(), total=df_gold.shape[0], position=0, leave=True):
    # row1, row2 = result_df.loc[row["identifier1"]], result_df.loc[row["identifier2"]]
    # emb1, emb2 = row1['embedding'].reshape(1, -1), row2['embedding'].reshape(1, -1)
    # ctr1, ctr2 = row1['centroid'].reshape(1, -1), row2['centroid'].reshape(1, -1)
    # dev1, dev2 = row1['intra_deviation'], row2['intra_deviation']
    # guess.append(int(
    #     cosine_similarity(emb1, emb2)[0][0] < avg(dev1, dev2)
    # ))
    
    # if row1['sense'] == row2['sense']:
    #     guess.append(0)
    # else:
    #     if cosine_similarity(emb1, emb2)[0][0] < cosine_similarity(ctr1, ctr2)[0][0]:
    #         guess.append(0)
    #     else:
    #         guess.append(1)
    
    guess.append(int(
        result_df.loc[row["identifier1"], 'sense'] != result_df.loc[row["identifier2"], 'sense']
    ))

df_gold['guess_bin'] = guess
F1["KM-silh"] = f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')
F1["KM-silh"]

### CD between centroids

In [29]:
# Same two scores as in the non-clustering section, but computed between the centroids
# of the clusters the two usages belong to (taken from the current `result_df`).
cdl, prtl = [], []

centroid_by_id = result_df["centroid"]
id_pairs = zip(df_gold["identifier1"], df_gold["identifier2"])
for id1, id2 in tqdm(id_pairs, total=df_gold.shape[0], position=0, leave=True):
    emb1 = centroid_by_id.loc[id1].reshape(1, -1)
    emb2 = centroid_by_id.loc[id2].reshape(1, -1)
    cs = cosine_similarity(emb1, emb2)[0][0]
    cdl.append(1 - cs)
    prtl.append(1 / cs)

# The mean of each score is used as the decision threshold in the following cells.
cd_mean = sum(cdl)/len(cdl)
prt_mean = sum(prtl)/len(prtl)

100%|███████████████████████████████████| 62624/62624 [00:32<00:00, 1950.11it/s]


In [30]:
# Threshold the centroid cosine distances at their mean; the score is only displayed,
# it is not stored in F1.
df_gold["guess_bin"] = [int(cd > cd_mean) for cd in cdl]
f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')

0.6118155526748179

In [31]:
# Threshold the centroid inverted similarities at their mean; the score is only
# displayed, it is not stored in F1.
df_gold["guess_bin"] = [int(prt > prt_mean) for prt in prtl]
f1_score(df_gold['judgment_bin'], df_gold['guess_bin'], average='weighted')

0.6261324913429491

In [None]:
# Average of all method scores. A previously stored "AVG" is left out so that
# re-running this cell gives the same value instead of averaging the average in.
F1["AVG"] = avg(*(score for name, score in F1.items() if name != "AVG"))
F1["AVG"]

In [None]:
# Persist the scores of this model as one row of ./f1.tsv (created on first use).
f1_results_path = "./f1.tsv"
# "AVG" first, the remaining methods in the order they were computed.
f1_keys = sorted(F1.keys(), key=lambda x: x != "AVG")

if not os.path.exists(f1_results_path):
    pd.DataFrame(columns=["model"] + f1_keys).to_csv(f1_results_path, sep='\t', index=False)

# A dedicated name is used so the usages DataFrame `df` is not overwritten.
f1_table = pd.read_csv(f1_results_path, sep='\t')

if MODEL in f1_table['model'].values:
    # Update the existing row. The values are looked up per key so they line up with
    # the column order of f1_keys (which differs from the insertion order of F1).
    f1_table.loc[f1_table['model'] == MODEL, f1_keys] = [F1[key] for key in f1_keys]
else:
    f1_table = pd.concat([f1_table, pd.DataFrame({"model": [MODEL], **F1})], ignore_index=True)

f1_table.to_csv(f1_results_path, sep='\t', index=False)

In [None]:
# Rebuild ./f1.tsv from ./f1-prev.tsv: add the mean of the three clustering scores,
# rename 'AVG' to 'AVG-all' and put the columns in their final order.
# NOTE(review): this overwrites ./f1.tsv with the contents derived from f1-prev.tsv —
# confirm that is intended when it runs after the results of the current model are saved.
f1 = (
    pd.read_csv("./f1-prev.tsv", sep="\t")
    .assign(**{'AVG-clustering': lambda table: table[['AP', 'KM-in', 'KM-silh']].mean(axis=1)})
    .rename(columns={'AVG': 'AVG-all'})
    [['model', 'AVG-clustering', 'AVG-all', 'AP', 'KM-in', 'KM-silh', 'CD', 'PRT']]
)

f1.to_csv("./f1.tsv", sep="\t", index=False)

f1