# Results

## Environment preparation

### Imports

In [7]:
import numpy as np

In [8]:
import pandas as pd

In [9]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

### Constants

In [10]:
# Location of the aggregated results CSV, relative to this notebook.
# Alternative timestamped file name noted by the original author:
#   'results_<%Y_%m_%d_%H_%M_%S>.csv' (e.g. built with datetime.now().strftime(...)).
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
RESULTS_FILE_PATH = '../experiments/contextual/results.csv'

### Global

In [11]:
# Read the whole results table into memory; each row holds a single metric value
# for one corpus / task / embeddings / split combination.
results_df = pd.read_csv(filepath_or_buffer=RESULTS_FILE_PATH)
# Bare trailing expression so the notebook renders the frame.
results_df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
0,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy,8.124684e-01
1,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (macro),7.445450e-01
2,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (macro),6.819199e-01
3,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (macro),7.053761e-01
4,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (macro),9.752486e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


## Baselines

In [None]:
# Embedding models treated as baselines in this section.
baseline_embeddings = ['word_2_vec', 'glove', 'fast_text']
is_baseline = results_df['embeddings'].isin(baseline_embeddings)
baselines_df = results_df[is_baseline]

### Macro

In [35]:
# Rows for the macro table: plain accuracy, every '(macro)' metric, and any metric
# whose name contains 'corr.' (which also matches the 'corr. p' p-value rows).
macro_mask = baselines_df['metric'].apply(
    lambda name: name == 'accuracy' or any(tag in name for tag in ('(macro)', 'corr.'))
)
df = baselines_df[macro_mask]
df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
0,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy,8.124684e-01
1,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (macro),7.445450e-01
2,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (macro),6.819199e-01
3,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (macro),7.053761e-01
4,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (macro),9.752486e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


In [30]:
# Print LaTeX-ready table rows: one row per (task, embeddings, split), with every
# selected metric value scaled to a percentage and rounded to two decimals, in the
# frame's row order.
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar, so the headers print as `chunk` rather than `('chunk',)`.
for task, task_rows in df.groupby('task'):
    print()
    print(task)
    print()
    for embedding, embedding_rows in task_rows.groupby('embeddings'):
        print()
        print(embedding)
        print()
        for split, split_rows in embedding_rows.groupby('split'):
            # Iterate the column directly: no per-row Series from iterrows(), and no
            # nested same-quote f-string (which only parses on Python 3.12+).
            cells = ' & '.join(f'{value * 100:.2f}' for value in split_rows['value'])
            # '\\\\' prints two backslashes, the LaTeX row terminator.
            print(split, '--', cells, '\\\\')


('chunk',)


('fast_text',)

('test',) -- 92.95 & 76.60 & 50.02 & 54.92 & 97.08 \
('validation',) -- 93.37 & 63.00 & 51.31 & 54.59 & 97.53 \

('glove',)

('test',) -- 92.97 & 65.97 & 51.29 & 54.68 & 98.13 \
('validation',) -- 93.64 & 70.49 & 56.17 & 60.31 & 98.45 \

('word_2_vec',)

('test',) -- 81.78 & 60.88 & 48.39 & 52.22 & 95.56 \
('validation',) -- 84.57 & 66.64 & 50.21 & 53.80 & 95.55 \

('ner',)


('fast_text',)

('test',) -- 95.50 & 80.91 & 74.64 & 77.44 & 98.90 \
('validation',) -- 96.47 & 87.86 & 77.57 & 81.83 & 98.84 \

('glove',)

('test',) -- 95.98 & 81.90 & 79.93 & 80.84 & 99.09 \
('validation',) -- 97.01 & 88.82 & 82.49 & 85.41 & 99.23 \

('word_2_vec',)

('test',) -- 91.74 & 75.74 & 64.20 & 69.25 & 96.41 \
('validation',) -- 93.31 & 83.87 & 67.32 & 74.30 & 97.12 \

('pos',)


('fast_text',)

('test',) -- 91.20 & 89.57 & 81.84 & 84.10 & 99.25 \
('validation',) -- 91.98 & 83.96 & 78.96 & 80.79 & 99.18 \

('glove',)

('test',) -- 91.84 & 89.35 & 84.18 & 85.80 & 99.40 \
('

### Weighted

In [31]:
# Rows for the weighted table: plain accuracy, every '(weighted)' metric, and any
# metric whose name contains 'corr.' (which also matches the 'corr. p' p-value rows).
weighted_mask = baselines_df['metric'].apply(
    lambda name: name == 'accuracy' or any(tag in name for tag in ('(weighted)', 'corr.'))
)
df = baselines_df[weighted_mask]
df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
0,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy,8.124684e-01
5,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (weighted),8.134053e-01
6,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (weighted),8.124684e-01
7,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (weighted),8.096775e-01
8,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (weighted),9.811476e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


In [32]:
# Print LaTeX-ready table rows: one row per (task, embeddings, split), with every
# selected metric value scaled to a percentage and rounded to two decimals, in the
# frame's row order.
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar, so the headers print as `chunk` rather than `('chunk',)`.
for task, task_rows in df.groupby('task'):
    print()
    print(task)
    print()
    for embedding, embedding_rows in task_rows.groupby('embeddings'):
        print()
        print(embedding)
        print()
        for split, split_rows in embedding_rows.groupby('split'):
            # Iterate the column directly: no per-row Series from iterrows(), and no
            # nested same-quote f-string (which only parses on Python 3.12+).
            cells = ' & '.join(f'{value * 100:.2f}' for value in split_rows['value'])
            # '\\\\' prints two backslashes, the LaTeX row terminator.
            print(split, '--', cells, '\\\\')


('chunk',)


('fast_text',)

('test',) -- 92.95 & 92.76 & 92.95 & 92.75 & 99.07 \
('validation',) -- 93.37 & 93.16 & 93.37 & 93.20 & 99.27 \

('glove',)

('test',) -- 92.97 & 92.80 & 92.97 & 92.82 & 99.09 \
('validation',) -- 93.64 & 93.49 & 93.64 & 93.53 & 99.33 \

('word_2_vec',)

('test',) -- 81.78 & 82.59 & 81.78 & 81.94 & 96.35 \
('validation',) -- 84.57 & 84.90 & 84.57 & 84.60 & 97.43 \

('ner',)


('fast_text',)

('test',) -- 95.50 & 95.23 & 95.50 & 95.30 & 99.27 \
('validation',) -- 96.47 & 96.26 & 96.47 & 96.27 & 99.16 \

('glove',)

('test',) -- 95.98 & 95.90 & 95.98 & 95.93 & 99.43 \
('validation',) -- 97.01 & 96.89 & 97.01 & 96.93 & 99.48 \

('word_2_vec',)

('test',) -- 91.74 & 91.09 & 91.74 & 91.25 & 96.33 \
('validation',) -- 93.31 & 92.87 & 93.31 & 92.87 & 97.20 \

('pos',)


('fast_text',)

('test',) -- 91.20 & 91.17 & 91.20 & 91.10 & 99.23 \
('validation',) -- 91.98 & 91.92 & 91.98 & 91.88 & 99.36 \

('glove',)

('test',) -- 91.84 & 91.78 & 91.84 & 91.77 & 99.35 \
('

### Sample weight

In [33]:
# Rows for the sample-weight table: every '(sample weight)' metric plus any metric
# whose name contains 'corr.' (which also matches the 'corr. p' p-value rows).
sample_weight_mask = baselines_df['metric'].apply(
    lambda name: any(tag in name for tag in ('(sample weight)', 'corr.'))
)
df = baselines_df[sample_weight_mask]
df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
9,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy (sample weight),8.314087e-01
10,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (sample weight),8.513314e-01
11,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (sample weight),8.314087e-01
12,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (sample weight),8.371360e-01
13,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (sample weight),9.744865e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


In [34]:
# Print LaTeX-ready table rows: one row per (task, embeddings, split), with every
# selected metric value scaled to a percentage and rounded to two decimals, in the
# frame's row order.
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar, so the headers print as `chunk` rather than `('chunk',)`.
for task, task_rows in df.groupby('task'):
    print()
    print(task)
    print()
    for embedding, embedding_rows in task_rows.groupby('embeddings'):
        print()
        print(embedding)
        print()
        for split, split_rows in embedding_rows.groupby('split'):
            # Iterate the column directly: no per-row Series from iterrows(), and no
            # nested same-quote f-string (which only parses on Python 3.12+).
            cells = ' & '.join(f'{value * 100:.2f}' for value in split_rows['value'])
            # '\\\\' prints two backslashes, the LaTeX row terminator.
            print(split, '--', cells, '\\\\')


('chunk',)


('fast_text',)

('test',) -- 94.54 & 94.83 & 94.54 & 94.66 & 98.88 \
('validation',) -- 94.70 & 95.01 & 94.70 & 94.82 & 99.12 \

('glove',)

('test',) -- 94.23 & 94.59 & 94.23 & 94.38 & 98.86 \
('validation',) -- 94.85 & 95.24 & 94.85 & 95.01 & 99.20 \

('word_2_vec',)

('test',) -- 81.74 & 85.21 & 81.74 & 82.79 & 95.13 \
('validation',) -- 84.94 & 86.90 & 84.94 & 85.55 & 96.69 \

('ner',)


('fast_text',)

('test',) -- 99.05 & 99.59 & 99.05 & 99.29 & 99.42 \
('validation',) -- 99.45 & 99.70 & 99.45 & 99.56 & 99.40 \

('glove',)

('test',) -- 98.78 & 99.59 & 98.78 & 99.14 & 99.56 \
('validation',) -- 99.32 & 99.70 & 99.32 & 99.48 & 99.65 \

('word_2_vec',)

('test',) -- 97.87 & 99.26 & 97.87 & 98.49 & 96.52 \
('validation',) -- 98.67 & 99.39 & 98.67 & 98.98 & 97.53 \

('pos',)


('fast_text',)

('test',) -- 90.68 & 91.69 & 90.68 & 91.01 & 98.72 \
('validation',) -- 91.18 & 91.80 & 91.18 & 91.38 & 98.77 \

('glove',)

('test',) -- 91.47 & 92.32 & 91.47 & 91.77 & 98.94 \
('

## Def2Vec

In [None]:
# Keep only the rows whose embeddings name is exactly 'def_2_vec'.
# The previous predicate additionally required
# `not (x.endswith('') or x.endswith(''))`; every string ends with the empty
# string, so that clause was always False and the selection was always empty.
# An exact match already excludes any suffixed variant of the name.
def_2_vec_df = results_df[results_df['embeddings'] == 'def_2_vec']

### Macro

In [35]:
# Rows for the macro table: plain accuracy, every '(macro)' metric, and any metric
# whose name contains 'corr.' (which also matches the 'corr. p' p-value rows).
macro_mask = def_2_vec_df['metric'].apply(
    lambda name: name == 'accuracy' or any(tag in name for tag in ('(macro)', 'corr.'))
)
df = def_2_vec_df[macro_mask]
df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
0,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy,8.124684e-01
1,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (macro),7.445450e-01
2,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (macro),6.819199e-01
3,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (macro),7.053761e-01
4,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (macro),9.752486e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


In [30]:
# Print LaTeX-ready table rows: one row per (task, embeddings, split), with every
# selected metric value scaled to a percentage and rounded to two decimals, in the
# frame's row order.
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar, so the headers print as `chunk` rather than `('chunk',)`.
for task, task_rows in df.groupby('task'):
    print()
    print(task)
    print()
    for embedding, embedding_rows in task_rows.groupby('embeddings'):
        print()
        print(embedding)
        print()
        for split, split_rows in embedding_rows.groupby('split'):
            # Iterate the column directly: no per-row Series from iterrows(), and no
            # nested same-quote f-string (which only parses on Python 3.12+).
            cells = ' & '.join(f'{value * 100:.2f}' for value in split_rows['value'])
            # '\\\\' prints two backslashes, the LaTeX row terminator.
            print(split, '--', cells, '\\\\')


('chunk',)


('fast_text',)

('test',) -- 92.95 & 76.60 & 50.02 & 54.92 & 97.08 \
('validation',) -- 93.37 & 63.00 & 51.31 & 54.59 & 97.53 \

('glove',)

('test',) -- 92.97 & 65.97 & 51.29 & 54.68 & 98.13 \
('validation',) -- 93.64 & 70.49 & 56.17 & 60.31 & 98.45 \

('word_2_vec',)

('test',) -- 81.78 & 60.88 & 48.39 & 52.22 & 95.56 \
('validation',) -- 84.57 & 66.64 & 50.21 & 53.80 & 95.55 \

('ner',)


('fast_text',)

('test',) -- 95.50 & 80.91 & 74.64 & 77.44 & 98.90 \
('validation',) -- 96.47 & 87.86 & 77.57 & 81.83 & 98.84 \

('glove',)

('test',) -- 95.98 & 81.90 & 79.93 & 80.84 & 99.09 \
('validation',) -- 97.01 & 88.82 & 82.49 & 85.41 & 99.23 \

('word_2_vec',)

('test',) -- 91.74 & 75.74 & 64.20 & 69.25 & 96.41 \
('validation',) -- 93.31 & 83.87 & 67.32 & 74.30 & 97.12 \

('pos',)


('fast_text',)

('test',) -- 91.20 & 89.57 & 81.84 & 84.10 & 99.25 \
('validation',) -- 91.98 & 83.96 & 78.96 & 80.79 & 99.18 \

('glove',)

('test',) -- 91.84 & 89.35 & 84.18 & 85.80 & 99.40 \
('

### Weighted

In [31]:
# Rows for the weighted table: plain accuracy, every '(weighted)' metric, and any
# metric whose name contains 'corr.' (which also matches the 'corr. p' p-value rows).
weighted_mask = def_2_vec_df['metric'].apply(
    lambda name: name == 'accuracy' or any(tag in name for tag in ('(weighted)', 'corr.'))
)
df = def_2_vec_df[weighted_mask]
df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
0,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy,8.124684e-01
5,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (weighted),8.134053e-01
6,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (weighted),8.124684e-01
7,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (weighted),8.096775e-01
8,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (weighted),9.811476e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


In [32]:
# Print LaTeX-ready table rows: one row per (task, embeddings, split), with every
# selected metric value scaled to a percentage and rounded to two decimals, in the
# frame's row order.
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar, so the headers print as `chunk` rather than `('chunk',)`.
for task, task_rows in df.groupby('task'):
    print()
    print(task)
    print()
    for embedding, embedding_rows in task_rows.groupby('embeddings'):
        print()
        print(embedding)
        print()
        for split, split_rows in embedding_rows.groupby('split'):
            # Iterate the column directly: no per-row Series from iterrows(), and no
            # nested same-quote f-string (which only parses on Python 3.12+).
            cells = ' & '.join(f'{value * 100:.2f}' for value in split_rows['value'])
            # '\\\\' prints two backslashes, the LaTeX row terminator.
            print(split, '--', cells, '\\\\')


('chunk',)


('fast_text',)

('test',) -- 92.95 & 92.76 & 92.95 & 92.75 & 99.07 \
('validation',) -- 93.37 & 93.16 & 93.37 & 93.20 & 99.27 \

('glove',)

('test',) -- 92.97 & 92.80 & 92.97 & 92.82 & 99.09 \
('validation',) -- 93.64 & 93.49 & 93.64 & 93.53 & 99.33 \

('word_2_vec',)

('test',) -- 81.78 & 82.59 & 81.78 & 81.94 & 96.35 \
('validation',) -- 84.57 & 84.90 & 84.57 & 84.60 & 97.43 \

('ner',)


('fast_text',)

('test',) -- 95.50 & 95.23 & 95.50 & 95.30 & 99.27 \
('validation',) -- 96.47 & 96.26 & 96.47 & 96.27 & 99.16 \

('glove',)

('test',) -- 95.98 & 95.90 & 95.98 & 95.93 & 99.43 \
('validation',) -- 97.01 & 96.89 & 97.01 & 96.93 & 99.48 \

('word_2_vec',)

('test',) -- 91.74 & 91.09 & 91.74 & 91.25 & 96.33 \
('validation',) -- 93.31 & 92.87 & 93.31 & 92.87 & 97.20 \

('pos',)


('fast_text',)

('test',) -- 91.20 & 91.17 & 91.20 & 91.10 & 99.23 \
('validation',) -- 91.98 & 91.92 & 91.98 & 91.88 & 99.36 \

('glove',)

('test',) -- 91.84 & 91.78 & 91.84 & 91.77 & 99.35 \
('

### Sample weight

In [33]:
# Rows for the sample-weight table: every '(sample weight)' metric plus any metric
# whose name contains 'corr.' (which also matches the 'corr. p' p-value rows).
sample_weight_mask = def_2_vec_df['metric'].apply(
    lambda name: any(tag in name for tag in ('(sample weight)', 'corr.'))
)
df = def_2_vec_df[sample_weight_mask]
df

Unnamed: 0,corpus,task,embeddings,split,timestamp,metric,value
9,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,accuracy (sample weight),8.314087e-01
10,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,precision (sample weight),8.513314e-01
11,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,recall (sample weight),8.314087e-01
12,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,fscore (sample weight),8.371360e-01
13,CoNLL-2003,pos,word_2_vec,validation,2024_05_18_10_27_34,auc (sample weight),9.744865e-01
...,...,...,...,...,...,...,...
355,STS,similarity_score,fast_text,validation,2024_05_18_11_37_25,spearman corr. p,4.293119e-222
356,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr.,6.336248e-01
357,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,pearson corr. p,1.118321e-155
358,STS,similarity_score,fast_text,test,2024_05_18_11_37_25,spearman corr.,6.190004e-01


In [34]:
# Print LaTeX-ready table rows: one row per (task, embeddings, split), with every
# selected metric value scaled to a percentage and rounded to two decimals, in the
# frame's row order.
# Grouping by a plain column name (not a one-element list) keeps the group keys
# scalar, so the headers print as `chunk` rather than `('chunk',)`.
for task, task_rows in df.groupby('task'):
    print()
    print(task)
    print()
    for embedding, embedding_rows in task_rows.groupby('embeddings'):
        print()
        print(embedding)
        print()
        for split, split_rows in embedding_rows.groupby('split'):
            # Iterate the column directly: no per-row Series from iterrows(), and no
            # nested same-quote f-string (which only parses on Python 3.12+).
            cells = ' & '.join(f'{value * 100:.2f}' for value in split_rows['value'])
            # '\\\\' prints two backslashes, the LaTeX row terminator.
            print(split, '--', cells, '\\\\')


('chunk',)


('fast_text',)

('test',) -- 94.54 & 94.83 & 94.54 & 94.66 & 98.88 \
('validation',) -- 94.70 & 95.01 & 94.70 & 94.82 & 99.12 \

('glove',)

('test',) -- 94.23 & 94.59 & 94.23 & 94.38 & 98.86 \
('validation',) -- 94.85 & 95.24 & 94.85 & 95.01 & 99.20 \

('word_2_vec',)

('test',) -- 81.74 & 85.21 & 81.74 & 82.79 & 95.13 \
('validation',) -- 84.94 & 86.90 & 84.94 & 85.55 & 96.69 \

('ner',)


('fast_text',)

('test',) -- 99.05 & 99.59 & 99.05 & 99.29 & 99.42 \
('validation',) -- 99.45 & 99.70 & 99.45 & 99.56 & 99.40 \

('glove',)

('test',) -- 98.78 & 99.59 & 98.78 & 99.14 & 99.56 \
('validation',) -- 99.32 & 99.70 & 99.32 & 99.48 & 99.65 \

('word_2_vec',)

('test',) -- 97.87 & 99.26 & 97.87 & 98.49 & 96.52 \
('validation',) -- 98.67 & 99.39 & 98.67 & 98.98 & 97.53 \

('pos',)


('fast_text',)

('test',) -- 90.68 & 91.69 & 90.68 & 91.01 & 98.72 \
('validation',) -- 91.18 & 91.80 & 91.18 & 91.38 & 98.77 \

('glove',)

('test',) -- 91.47 & 92.32 & 91.47 & 91.77 & 98.94 \
('