# Static models evaluation

## Environment preparation

### Imports

In [1]:
import os
from datetime import datetime

In [2]:
from tqdm import tqdm

In [3]:
from itertools import product

In [4]:
from joblib import Parallel, delayed, parallel_backend

In [5]:
from scipy.stats import pearsonr, spearmanr

In [6]:
import pandas as pd

In [7]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [8]:
from vstk.models import Word2Vec, GloVe, FastText

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from vstk.data import WordSimilarity, WordAnalogy, STS

### Constants

In [10]:
W2V_MODEL_PATH = '../resources/models/pre_trained/word_2_vec/'
GV_MODEL_PATH = '../resources/models/pre_trained/glove/'
FT_MODEL_PATH = '../resources/models/pre_trained/fast_text/'

In [11]:
WS_DATA_PATH = '../resources/data/raw/wordsim353'
WA_DATA_PATH = '../resources/data/raw/question-words'
STS_DATA_PATH = '../resources/data/raw/stsbenchmark'

In [12]:
if not os.path.exists('../experiments/'):
    os.mkdir('../experiments/')
if not os.path.exists('../experiments/static'):
    os.mkdir('../experiments/static')
RESULTS_FILE_PATH = f'../experiments/static/results_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}.csv'

### Global

In [13]:
w2v = Word2Vec.load(W2V_MODEL_PATH)
gv = GloVe.load(GV_MODEL_PATH)
ft = FastText.load(FT_MODEL_PATH)

2024-05-04 13:21:15 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 192MB/s]                                                                                     
2024-05-04 13:21:15 INFO: Downloaded file to /home/vincenzo/stanza_resources/resources.json
2024-05-04 13:21:15 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2024-05-04 13:21:15 INFO: Using device: cuda
2024-05-04 13:21:15 INFO: Loading: tokenize
2024-05-04 13:21:16 INFO: Loading: mwt
2024-05-04 13:21:16 INFO: Done loading processors!


In [14]:
models = {
    'Word2Vec': w2v, 
    'GloVe': gv,
    'fastText': ft
}

In [15]:
ws = WordSimilarity(WS_DATA_PATH)
wa = WordAnalogy(WA_DATA_PATH)
sts = {split: STS(STS_DATA_PATH, split=split) for split in ['train', 'validation', 'test']}

In [25]:
top_k = [1, 10, 100]
similarity_configs =  [
    {'metric': metric, 'dynamax_method': None, 'pooling': pooling, 'wmd_stopwords': False}
    for metric, pooling in product(['cos', 'l2', 'jaccard'], ['avg', 'max', 'sif'])
] + [
    {'metric': 'dynamax', 'dynamax_method': method, 'pooling': None, 'wmd_stopwords': False}
    for method in ['jaccard', 'otsuka', 'dice']
] + [
    {'metric': 'wmd', 'wmd_stopwords': sw, 'dynamax_method': None, 'pooling': None }
    for sw in [True, False]
]

In [26]:
results = list()

## Evaluation

### Word similarity

Evaluate on word similarity

#### Extract samples

In [27]:
token_a, token_b, score_true = [*zip(*[
    (sample['token_1'], sample['token_2'], sample['similarity_score']) 
    for sample in ws
])]

#### Correlation

In [28]:
for model_id, model in tqdm(models.items()):
    with parallel_backend('threading', n_jobs=-1):
        score_pred = model.token_similarity(token_a, token_b)
    spearman_corr = spearmanr(score_true, score_pred)
    pearson_corr = pearsonr(score_true, score_pred)
    
    results.append({
        'model': model_id, 
        'task': 'Word similarity',
        'metric': 'Spearman corr.',
        'value': spearman_corr.correlation,
        'pvalue': spearman_corr.pvalue,
        'measure': 'cos'
    })
    results.append({
        'model': model_id, 
        'task': 'Word similarity',
        'metric': 'Pearson corr.',
        'value': pearson_corr.correlation,
        'pvalue': pearson_corr.pvalue,
        'measure': 'cos'
    })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24.40it/s]


#### Accuracy@k

In [29]:
max_k =  max(top_k)

for model_id, model in tqdm(models.items()):
    with parallel_backend('threading', n_jobs=-1):
        most_similar = model.get_similar_tokens(token_a, k=max_k)
    
        for k in top_k:
            accuracy_score = sum(
                Parallel()(
                    delayed(lambda x, y: any(t.strip().lower() == y.strip().lower() for t, _ in x[:k]))(similar_tokens, target) 
                    for similar_tokens, target in zip(most_similar, token_b)
                ) 
            )  / len(token_a)
            
            results.append({
                'model': model_id, 
                'task': 'Word similarity',
                'metric': f'Acc@{k}',
                'value': accuracy_score,
                'measure': 'cos'
            })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.91it/s]


### Word analogy

Evaluate on question-words

#### Extract samples

In [30]:
tokens_src, token_tgt, = [*zip(*[
    ((sample['token_1_pair_1'], sample['token_2_pair_1'], sample['token_1_pair_2']), sample['token_2_pair_2']) 
    for sample in wa if len(sample) == 5
])]

#### Accuracy@k

In [31]:
max_k =  max(top_k)

for model_id, model in tqdm(models.items()):
    with parallel_backend('threading', n_jobs=-1):
        most_similar = model.solve_analogy(tokens_src, k=max_k)
        
        for k in top_k:
            accuracy_score = sum(
                Parallel()(
                    delayed(lambda x, y: any(t.strip().lower() == y.strip().lower() for t, _ in x[:k]))(similar_tokens, target) 
                    for similar_tokens, target in zip(most_similar, token_tgt)
                ) 
            )  / len(tokens_src)
            
            results.append({
                'model': model_id, 
                'task': 'Word analogies',
                'metric': f'Acc@{k}',
                'value': accuracy_score,
                'measure': 'cos'
            })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:28<00:00,  9.46s/it]


### Semantic Textual Similarity

Evaluate on the STS benchmark

#### Extract samples

In [32]:
sif_data = [sample[key] for sample in sts['train'] for key in ['sequence_a', 'sequence_b']]
sequences = {
    split: [*zip(*[(sample['sequence_a'], sample['sequence_b'], sample['similarity_score']) for sample in sts[split]])]
    for split in ['validation', 'test']
}

#### Correlation

In [33]:
for model_id, model in tqdm(models.items()):
    model.fit_sif_embedding(sif_data)
    for split, (sequence_a, sequence_b, score_true) in sequences.items():
        for config in similarity_configs:
            with parallel_backend('threading', n_jobs=-1):
                score_pred = model.sequence_similarity(sequence_a, sequence_b, **config)
            spearman_corr = spearmanr(score_true, score_pred)
            pearson_corr = pearsonr(score_true, score_pred)
    
            results.append({
                'model': model_id, 
                'task': 'Sematic textual similarity',
                'metric': 'Spearman corr.',
                'value': spearman_corr.correlation,
                'pvalue': spearman_corr.pvalue,
                'measure': config['metric'],
                'dynamax_method': config['dynamax_method'],
                'pooling': config['pooling'],
                'split': split
            })
            results.append({
                'model': model_id, 
                'task': 'Sematic textual similarity',
                'metric': 'Pearson corr.',
                'value': pearson_corr.correlation,
                'pvalue': pearson_corr.pvalue,
                'measure': config['metric'],
                'dynamax_method':  config['dynamax_method'],
                'pooling': config['pooling'],
                'split': split
            })

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [08:17<00:00, 165.87s/it]


## Results

In [34]:
results = pd.DataFrame.from_dict(results)
results

Unnamed: 0,model,task,metric,value,pvalue,measure,dynamax_method,pooling,split
0,Word2Vec,Word similarity,Spearman corr.,0.700017,2.868667e-53,cos,,,
1,Word2Vec,Word similarity,Pearson corr.,0.652535,3.373411e-44,cos,,,
2,GloVe,Word similarity,Spearman corr.,0.737944,6.523243e-62,cos,,,
3,GloVe,Word similarity,Pearson corr.,0.733025,1.043601e-60,cos,,,
4,fastText,Word similarity,Spearman corr.,0.780395,1.444605e-73,cos,,,
...,...,...,...,...,...,...,...,...,...
187,fastText,Sematic textual similarity,Pearson corr.,0.156155,5.564347e-09,dynamax,dice,,test
188,fastText,Sematic textual similarity,Spearman corr.,0.549791,9.331995e-110,wmd,,,test
189,fastText,Sematic textual similarity,Pearson corr.,0.564229,9.673081e-117,wmd,,,test
190,fastText,Sematic textual similarity,Spearman corr.,0.433388,3.076163e-64,wmd,,,test


In [35]:
results.to_csv(RESULTS_FILE_PATH, index=False)