# Evaluate
**module: src.run_evaluate**
```
!python -m src.run_evaluate --results_path $RESULTS_PATH --exp_names $EXP_NAMES
```
- results_path: directory with experiment results
- exp_names: comma-separated list of subdirectories in the results path; useful if only a few experiments need to be evaluated
see the main function of the module for more options

In [None]:
# Run src.run_evaluate on one result directory, restricted to the four
# melamud_* experiments named in --exp_names.
import src
!python -m src.run_evaluate \
--results_path=workdir/results/test-paper_roles_st_stopword\
--exp_names=melamud_add,melamud_baladd,melamud_mult,melamud_balmult

# Result Tables 

In [None]:
import os
import pandas as pd
import json
from src.result_analysis import create_precision_recall_plot, create_precision_plot
pd.set_option('display.max_colwidth', None)


def make_sorted_labels(precision_table):
    """Return the keys of the first row, sorted numerically, as strings.

    precision_table: non-empty sequence whose first element is a mapping with
        integer-like keys (e.g. '1', '5', '10').
    """
    # Sort as integers so that '10' comes after '5', then convert back to str.
    return [str(level) for level in sorted(int(key) for key in precision_table[0].keys())]


def make_precision_table(experiments, levels=[1,5], prefix='', round_signs=3):
    """Collect the precision results of several experiments into one DataFrame.

    Args:
        experiments: iterable of experiment directories. Each item is either a
            path string (the model name is then its last path component) or a
            ``(path, display_name)`` tuple.
        levels: precision levels to show as columns. ``None`` selects every
            level found in the first result file that could be loaded. The
            default list is only read, never mutated.
        prefix: optional file-name prefix; the file read from each directory
            is ``<prefix>_precision.json`` (``precision.json`` when empty).
        round_signs: number of decimals all values are rounded to.

    Returns:
        DataFrame with the columns ``model``, one column per level (named by
        the level as a string) and ``MAP``. Experiments whose result file
        cannot be read or parsed are skipped with a warning; if none can be
        loaded, an empty frame with the same columns is returned.
    """
    import warnings

    file_name = f'{prefix}_precision.json' if prefix != '' else 'precision.json'
    level_columns = None if levels is None else [str(lev) for lev in levels]

    rows = []
    for experiment in experiments:
        if isinstance(experiment, tuple):
            exp_path, exp_name = experiment
        else:
            exp_path = experiment
            # rstrip so that 'some/dir/' is named 'dir' rather than ''.
            exp_name = exp_path.rstrip('/').split('/')[-1]

        try:
            with open(os.path.join(exp_path, file_name)) as f:
                precision_dict = json.load(f)
        except (OSError, ValueError) as ex:
            # Missing or malformed result files are tolerated, but reported.
            warnings.warn(f'Skipping {exp_name!r}: cannot load {file_name} ({ex})')
            continue

        precisions = precision_dict['precisions_at_level']
        if level_columns is None:
            level_columns = [str(lev) for lev in sorted(int(e) for e in precisions.keys())]

        row = {'model': exp_name}
        row.update((level, round(prec, round_signs)) for level, prec in precisions.items())
        # MAP is recorded for every row, independent of how levels were chosen.
        row['MAP'] = round(precision_dict['map'], round_signs)
        rows.append(row)

    columns = ['model'] + (level_columns or []) + ['MAP']
    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows)[columns]


## Upperbound

In [None]:
# Upper-bound results: one (result directory, display name) pair per task.
# NOTE(review): 'paper_roles_st' is labelled 'LexicalUnit-Verbs' and
# 'paper_preds_st' is labelled 'SemanticRoles' -- this pairing may be swapped;
# confirm against the contents of the result directories.
result_dir = 'workdir/upperbound_results'
experiments = [
    (f'{result_dir}/paper_roles_st/', 'LexicalUnit-Verbs'),
    (f'{result_dir}/paper_nouns_st/', 'LexicalUnit-Nouns'),
    (f'{result_dir}/paper_preds_st/', 'SemanticRoles')
]

# Precision at levels 1 and 5 plus MAP, indexed by display name.
result_table = make_precision_table(experiments, levels=[1,5])
result_table=result_table.set_index("model")
result_table

In [None]:
print(result_table.to_latex())

## Verbs

In [None]:
!ls workdir/results

In [None]:
# Single-model results for the verbs test set.
result_dir = 'workdir/results'
sub_dir = 'test-paper_verbs_st_pattern_vocabfilter'

# (experiment directory, display name) pairs, in the row order of the table.
experiments = [

    (f'{result_dir}/{sub_dir}/glove_840B_nolem/', 'GloVe'),
    (f'{result_dir}/{sub_dir}/fasttext_cc_nolem/', 'fastText'),
    (f'{result_dir}/{sub_dir}/word2vec_googlenews_nolem/', 'word2vec'),

    (f'{result_dir}/{sub_dir}/dt_wiki_lem', 'DT wiki'),
    (f'{result_dir}/{sub_dir}/dt_59g_lem', 'DT 59g'),

    (f'{result_dir}/{sub_dir}/melamud_add', 'Melamud add'),
    (f'{result_dir}/{sub_dir}/melamud_baladd', 'Melamud balAdd'),
    (f'{result_dir}/{sub_dir}/melamud_mult', 'Melamud mult'),
    (f'{result_dir}/{sub_dir}/melamud_balmult', 'Melamud balMult'),

    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200', 'BERT large cased w/o mask'),
    (f'{result_dir}/{sub_dir}/gie_swv_test_semiPURExlnet', 'XLNet'),

    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200-Tand', 'BERT large cased Tand w/o mask'),
    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200-TandT', 'BERT large cased TandT w/o mask'),
    (f'{result_dir}/{sub_dir}/gie_swv_test_semiPUREBert_embs', 'BERT+embs (default)'),

    (f'{result_dir}/{sub_dir}/gie_swv_test_semiPURExlnet_embs', 'XLNet+embs (default)'),

    (f'{result_dir}/{sub_dir}/gie_swv_test_semiPURExlnet_embs_swvhypers', 'XLNet+embs (optimal)'),
]

# No file-name prefix is passed: `prefix` was never defined in this notebook,
# so the un-prefixed precision.json is read, as in the Nouns and Roles cells.
result_table = make_precision_table(experiments)
result_table=result_table.set_index("model")
result_table

In [None]:
# print(result_table.to_latex())

## Nouns

In [None]:
# Single-model results for the nouns test set.
result_dir = 'workdir/results'
sub_dir = 'test-paper_nouns_st_pattern_stopwords_nounfilter'
# sub_dir = 'test-paper_nouns_st_lemminflect_stopwords_nounfilter'

# (experiment directory, display name) pairs, in the row order of the table.
experiments = [

    (f'{result_dir}/{sub_dir}/glove_840B_nolem/', 'GloVe'),
    (f'{result_dir}/{sub_dir}/fasttext_cc_nolem/', 'fastText'),
    (f'{result_dir}/{sub_dir}/word2vec_googlenews_nolem/', 'word2vec'),

    (f'{result_dir}/{sub_dir}/dt_wiki_lem', 'DT wiki'),
    (f'{result_dir}/{sub_dir}/dt_59g_lem', 'DT 59g'),

    (f'{result_dir}/{sub_dir}/melamud_add', 'Melamud add'),
    (f'{result_dir}/{sub_dir}/melamud_baladd', 'Melamud balAdd'),
    (f'{result_dir}/{sub_dir}/melamud_mult', 'Melamud mult'),
    (f'{result_dir}/{sub_dir}/melamud_balmult', 'Melamud balMult'),

    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200', 'BERT large cased w/o mask'),
    (f'{result_dir}/{sub_dir}/gie_swn_test_semiPURExlnet', 'XLNet'),

    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200-Tand', 'BERT large cased Tand w/o mask'),
    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200-TandT', 'BERT large cased TandT w/o mask'),
    (f'{result_dir}/{sub_dir}/gie_swn_test_semiPUREBert_embs', 'BERT+embs (default)'),

    (f'{result_dir}/{sub_dir}/gie_swn_test_semiPURExlnet_embs', 'XLNet+embs (default)'),
    (f'{result_dir}/{sub_dir}/gie_swn_test_semiPURExlnet_embs_swnhypers', 'XLNet+embs (optimal)'),
]

# Show which result set the table below belongs to.
print(sub_dir)
result_table = make_precision_table(experiments)
result_table=result_table.set_index("model")
result_table

In [None]:
# print(result_table.to_latex())

## Roles

In [None]:
# Single-model results for the roles test set.
result_dir = 'workdir/results'
sub_dir = 'test-paper_roles_st_pattern'
# sub_dir = 'test-paper_roles_st_pattern_stopwords'

# (experiment directory, display name) pairs, in the row order of the table.
experiments = [

    (f'{result_dir}/{sub_dir}/glove_840B_nolem/', 'GloVe'),
    (f'{result_dir}/{sub_dir}/fasttext_cc_nolem/', 'fastText'),
    (f'{result_dir}/{sub_dir}/word2vec_googlenews_nolem/', 'word2vec'),

    (f'{result_dir}/{sub_dir}/dt_wiki_lem', 'DT wiki'),
    (f'{result_dir}/{sub_dir}/dt_59g_lem', 'DT 59g'),

    (f'{result_dir}/{sub_dir}/melamud_add', 'Melamud add'),
    (f'{result_dir}/{sub_dir}/melamud_baladd', 'Melamud balAdd'),
    (f'{result_dir}/{sub_dir}/melamud_mult', 'Melamud mult'),
    (f'{result_dir}/{sub_dir}/melamud_balmult', 'Melamud balMult'),

    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200', 'BERT large cased w/o mask'),
    (f'{result_dir}/{sub_dir}/gie_swr_test_semiPURExlnet', 'XLNet'),

    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200-Tand', 'BERT large cased Tand w/o mask'),
    (f'{result_dir}/{sub_dir}/blc-ntok1-nunits1-nomask-k200-TandT', 'BERT large cased TandT w/o mask'),

    (f'{result_dir}/{sub_dir}/gie_swr_test_semiPUREBert_embs', 'BERT+embs (default)'),

    (f'{result_dir}/{sub_dir}/gie_swr_test_semiPURExlnet_embs', 'XLNet+embs (default)'),
    (f'{result_dir}/{sub_dir}/gie_swr_test_semiPURExlnet_embs_swrhypers', 'XLNet+embs (optimal)'),

]

result_table = make_precision_table(experiments)
result_table=result_table.set_index("model")
result_table

In [None]:
# print(result_table.to_latex())

## Merged models

### Simplify names for merged models

In [None]:
def pretify(x):
    """Turn a '+'-joined merged-experiment name into a readable display name.

    Every known component name is replaced by its short label, the components
    are ordered by a fixed ranking and joined with ' + '. Components without
    a known label are kept unchanged and placed after the known ones instead
    of raising ``KeyError``.

    x: experiment name, components separated by '+'.
    Returns the display name as a string.
    """
    # Applied in this order: the longer '..._embs_sw?hypers' names must be
    # replaced before the plain XLNet names they start with.
    replacements = (
        ("gie_swv_test_semiPURExlnet_embs_swvhypers", "XLNet embs"),
        ("gie_swr_test_semiPURExlnet_embs_swrhypers", "XLNet embs"),
        ("gie_swn_test_semiPURExlnet_embs_swnhypers", "XLNet embs"),
        ("gie_swv_test_semiPURExlnet", "XLNet"),
        ("gie_swr_test_semiPURExlnet", "XLNet"),
        ("gie_swn_test_semiPURExlnet", "XLNet"),
        ("-ntok1-nunits1-nomask-k200", ""),
        ("-ntok1-nunits1-mask-k200", ""),
        ("blc", "BERT"),
        ("-TandT", " [TandT]"),
        ("glove_840B_nolem", "nc-emb"),
        ("fasttext_cc_nolem", "nc-emb"),
        ("dt_59g_lem", "DT"),
        ("dt_wiki_lem", "DT"),
        ("melamud_baladd", "Melamud balAdd"),
    )
    for old, new in replacements:
        x = x.replace(old, new)

    # Position of each label inside a combined name (smaller comes first).
    order = {
        "XLNet embs": 0,
        "XLNet": 1,
        "BERT [TandT]": 2,
        "BERT": 3,
        "Melamud balAdd": 4,
        "nc-emb": 5,
        "DT": 6,
    }

    components = [part.strip() for part in x.split("+")]
    # list.sort is stable, so unknown components (ranked last) keep their
    # original relative order.
    components.sort(key=lambda part: order.get(part, len(order)))
    return " + ".join(components)


In [None]:
# Row order for the merged-model tables, one model combination per line.
print_order = """nc-emb + DT                                 
Melamud balAdd + DT                      
Melamud balAdd + nc-emb                     
XLNet + nc-emb                              
XLNet + DT                               
XLNet + Melamud balAdd                   
XLNet+embs + nc-emb                         
XLNet+embs + DT                          
XLNet+embs + Melamud balAdd              
Melamud balAdd + nc-emb + DT                
XLNet + nc-emb + DT                         
XLNet + Melamud balAdd + nc-emb             
XLNet + Melamud balAdd + DT             
XLNet+embs + nc-emb + DT                    
XLNet+embs + Melamud balAdd + DT""".split("\n")
# Drop the padding and spell 'XLNet+embs' as 'XLNet embs', so that '+' only
# ever separates the components of a combination.
print_order = list(map(lambda entry: entry.strip().replace('+embs', ' embs'), print_order))
print_order

### Verbs

In [None]:
!ls workdir/results

In [None]:
# Merged-model results for verbs: every entry of `path` whose name contains
# no dot is treated as one experiment directory.
path = "workdir/results/test-paper_verbs_st_pattern_vocabfilter_merged"
experiments = sorted(f'{path}/{exp}' for exp in os.listdir(path) if "." not in exp)

result_table = make_precision_table(experiments)
result_table['model'] = result_table['model'].apply(pretify)

# Leave out every combination whose display name mentions BERT.
result_table = result_table.loc[
    result_table['model'].apply(lambda name: 'BERT' not in name)
].reset_index(drop=True)

# Combinations listed in print_order come first, in that order; any other
# combination gets rank 100 and therefore ends up at the bottom.
result_table['order'] = result_table['model'].apply(
    lambda name: print_order.index(name) if name in print_order else 100
)
result_table = (
    result_table
    .sort_values(by='order', ascending=True)
    .reset_index(drop=True)
    .set_index("model")
)
result_table

### Nouns

In [None]:
# Merged-model results for nouns: every entry of `path` whose name contains
# no dot is treated as one experiment directory.
path = "workdir/results/test-paper_nouns_st_pattern_stopwords_nounfilter_merged"
experiments = sorted(f'{path}/{exp}' for exp in os.listdir(path) if "." not in exp)

result_table = make_precision_table(experiments)
result_table['model'] = result_table['model'].apply(pretify)

# Leave out every combination whose display name mentions BERT.
result_table = result_table.loc[
    result_table['model'].apply(lambda name: 'BERT' not in name)
].reset_index(drop=True)

# Combinations listed in print_order come first, in that order; any other
# combination gets rank 100 and therefore ends up at the bottom.
result_table['order'] = result_table['model'].apply(
    lambda name: print_order.index(name) if name in print_order else 100
)
result_table = (
    result_table
    .sort_values(by='order', ascending=True)
    .reset_index(drop=True)
    .set_index("model")
)
result_table

In [None]:
print(result_table.to_latex())

### Roles

In [None]:
# Merged-model results for roles: every entry of `path` whose name contains
# no dot is treated as one experiment directory.
path = "workdir/results/test-paper_roles_st_pattern_merged"
# path = "workdir/results/test-paper_roles_st_pattern_stopwords_merged"
experiments = sorted(f'{path}/{exp}' for exp in os.listdir(path) if "." not in exp)

result_table2 = make_precision_table(experiments)
result_table2['model'] = result_table2['model'].apply(pretify)

# Leave out every combination whose display name mentions BERT.
result_table2 = result_table2.loc[
    result_table2['model'].apply(lambda name: 'BERT' not in name)
].reset_index(drop=True)

# Combinations listed in print_order come first, in that order; any other
# combination gets rank 100 and therefore ends up at the bottom.
result_table2['order'] = result_table2['model'].apply(
    lambda name: print_order.index(name) if name in print_order else 100
)
result_table2 = (
    result_table2
    .sort_values(by='order', ascending=True)
    .reset_index(drop=True)
    .set_index("model")
)
result_table2

# Precision Curves

In [None]:
def create_multiple_curves(experiments, score_file_name, prefixes=None):
    """Load one score CSV per experiment.

    Args:
        experiments: sequence of experiment directories; each item is either
            a path string (the curve is then named after its last path
            component) or a ``(path, display_name)`` tuple.
        score_file_name: name of the CSV file inside each directory.
        prefixes: optional sequence parallel to ``experiments``. A non-empty
            entry ``p`` selects the file ``p_<score_file_name>``; an empty
            string selects the un-prefixed file.

    Returns:
        dict mapping display name to the DataFrame read from the CSV, in the
        order of ``experiments``. Entries sharing a display name overwrite
        each other.
    """
    curves = {}
    for i, exp in enumerate(experiments):
        if isinstance(exp, tuple):
            exp, exp_name = exp
        else:
            # rstrip so that 'some/dir/' is named 'dir' rather than ''.
            exp_name = exp.rstrip('/').split('/')[-1]

        prefix = prefixes[i] if prefixes is not None else ''
        file_name = f'{prefix}_{score_file_name}' if prefix != '' else score_file_name
        curves[exp_name] = pd.read_csv(os.path.join(exp, file_name))
    return curves

def create_multiple_precision_recall_curves(experiments, prefixes):
    """Load the 'precision_recall.csv' file of every experiment.

    Arguments and return value are those of create_multiple_curves.
    """
    score_file = 'precision_recall.csv'
    return create_multiple_curves(experiments, score_file, prefixes=prefixes)


def create_multiple_precision_curves(experiments, prefixes):
    """Load the 'precs_all_hard.csv' file of every experiment.

    Arguments and return value are those of create_multiple_curves.
    """
    score_file = 'precs_all_hard.csv'
    return create_multiple_curves(experiments, score_file, prefixes=prefixes)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from typing import Dict, Tuple
from itertools import cycle
import numpy as np


#DEFAULT_COLORS = cycle(['b', 'r', 'g', 'y', 'black', 'brown', '#660066', 'orange'])
DEFAULT_COLORS = ['b', 'r', 'g', 'y', 'black', 'brown', '#660066', 'orange']
    
    
def create_precision_plot(precision_curves: Dict[str, pd.DataFrame], 
                          colors=DEFAULT_COLORS, output_path=None, step=True):
    """Plot precision against k for several experiments in one figure.

    Args:
        precision_curves: ``{exp_name: curve}``; each curve must expose a
            ``precs_all_hard`` column whose i-th value is drawn at k = i + 1.
        colors: one entry per curve, either a matplotlib color or a
            ``(color, marker)`` tuple. Entries are reused cyclically when
            there are more curves than colors.
        output_path: if not None, the figure is also saved to this path.
        step: draw step curves (``where='post'``) instead of plain lines.
    """
    plt.figure(dpi=200, figsize=(6.4, 4.8), facecolor='w', edgecolor='k')

    max_y = 0.0  # largest precision over all curves
    max_k = 0    # length of the longest curve
    for i, (exp_name, curve) in enumerate(precision_curves.items()):
        precision = pd.Series(curve.precs_all_hard)

        # Wrap around instead of raising IndexError when curves outnumber colors.
        col = colors[i % len(colors)]
        marker = ''
        if isinstance(col, tuple):
            col, marker = col

        ks = np.arange(1, len(precision) + 1)
        if step:
            graph = plt.step(ks, precision, color=col,
                             marker=marker, alpha=0.5, where='post', linewidth=1.)
        else:
            graph = plt.plot(ks, precision,
                             color=col, marker=marker, alpha=0.5, linewidth=1.)
        graph[0].set_label(exp_name)

        max_k = max(max_k, len(precision))
        curve_max = precision.max()
        if curve_max > max_y:
            max_y = curve_max

    plt.xlabel('k')
    plt.ylabel('Precision')
    plt.ylim([0.0, max_y + 0.025])
    # Use the longest curve so that no curve is cut off on the right; this
    # also works when there are no curves at all.
    plt.xlim([0.0, max_k + 1])
    plt.legend(prop={'size': 9})

    if output_path is not None:
        plt.savefig(output_path)
        
def create_precision_plots(precision_curves: Dict[str, pd.DataFrame], 
                           precision_curves2: Dict[str, pd.DataFrame], 
                           precision_curves3: Dict[str, pd.DataFrame],
                          colors=DEFAULT_COLORS, output_path=None, step=True):
    """Plot three sets of precision curves as vertically stacked panels.

    Args:
        precision_curves, precision_curves2, precision_curves3:
            ``{exp_name: curve}`` for the top, middle and bottom panel; each
            curve must expose a ``precs_all_hard`` column whose i-th value is
            drawn at k = i + 1.
        colors: one entry per curve of a panel, either a matplotlib color or
            a ``(color, marker)`` tuple; shared by all panels and reused
            cyclically when a panel has more curves than colors.
        output_path: if not None, the figure is also saved to this path.
        step: draw step curves (``where='post'``) instead of plain lines.

    The figure legend is built from the curves of the top panel only.
    """
    fig, axes = plt.subplots(3, 1, dpi=480, figsize=(10, 8))

    panels = (
        (precision_curves, 'Verb Lexical Unit'),
        (precision_curves2, 'Noun Lexical Unit'),
        (precision_curves3, 'Semantic Role'),
    )

    # max_y is carried over from panel to panel, so a lower panel never gets
    # a smaller y-range than the panels above it.
    max_y = 0.0
    for ax, (curves, title) in zip(axes, panels):
        max_y = _draw_precision_curves(ax, curves, colors, step, max_y)
        ax.set_ylabel('Precision')
        ax.set_ylim([0.0, max_y + 0.025])
        # Panel caption in axes coordinates (upper right corner).
        ax.text(.83, .98, title, ha='left', va='top', transform=ax.transAxes)
    axes[2].set_xlabel('k')

    lines, labels = axes[0].get_legend_handles_labels()
    fig.legend(lines, labels, loc='lower center', ncol=4, labelspacing=0.,
               bbox_to_anchor=(0.5, -0.001))

    if output_path is not None:
        fig.savefig(output_path)


def _draw_precision_curves(ax, precision_curves, colors, step, max_y):
    """Draw all curves of one panel on ``ax``; return ``max_y`` raised to the panel's largest precision."""
    for i, (exp_name, curve) in enumerate(precision_curves.items()):
        precision = pd.Series(curve.precs_all_hard)

        # Wrap around instead of raising IndexError when curves outnumber colors.
        col = colors[i % len(colors)]
        marker = ''
        if isinstance(col, tuple):
            col, marker = col

        ks = np.arange(1, len(precision) + 1)
        if step:
            graph = ax.step(ks, precision, color=col,
                            marker=marker, alpha=0.5, where='post', linewidth=1.)
        else:
            graph = ax.plot(ks, precision,
                            color=col, marker=marker, alpha=0.5, linewidth=1.)
        graph[0].set_label(exp_name)

        curve_max = precision.max()
        if curve_max > max_y:
            max_y = curve_max
    return max_y


### Verbs

In [None]:
# Precision curves for the verbs test set: one experiment directory read
# with four different score-file prefixes.
result_dir = '~/generative-ie/workdir/results'
res_dir = 'test-paper_verbs_st_pattern_vocabfilter'

# Each entry: (experiment directory, legend label, score-file prefix, line color).
experiments = [
    (f'{result_dir}/{res_dir}/gie_swv_test_semiPURExlnet_embs_swvhypers/', 'default', '','black'),
    (f'{result_dir}/{res_dir}/gie_swv_test_semiPURExlnet_embs_swvhypers/', 'mgs-5', 'mcs-5','green'),
    (f'{result_dir}/{res_dir}/gie_swv_test_semiPURExlnet_embs_swvhypers/', 'mgs-10', 'mcs-10', 'red'),
    (f'{result_dir}/{res_dir}/gie_swv_test_semiPURExlnet_embs_swvhypers/', 'mgs-15', 'mcs-15', 'purple'),
]

# Split the 4-tuples into the parallel lists the helpers expect; `experiments`
# is reduced to (directory, label) pairs last because the other two read it.
colors = [color for _path, _label, _prefix, color in experiments]
prefixes = [prefix for _path, _label, prefix, _color in experiments]
experiments = [(path, label) for path, label, _prefix, _color in experiments]

curves1 = create_multiple_precision_curves(experiments, prefixes=prefixes)
create_precision_plot(curves1, output_path=None, colors=colors, step=False)


### Nouns

In [None]:
# Precision curves for the nouns test set: one experiment directory read
# with four different score-file prefixes. `result_dir` is the value set in
# the Verbs cell of this section.
res_dir = 'test-paper_nouns_st_pattern_stopwords_nounfilter'
# Joined with a path separator and with '~' expanded so the path is usable by
# savefig. It is currently not used: the plot call below passes
# output_path=None.
# NOTE(review): the Roles cell builds the same file name -- confirm before
# enabling saving in both cells.
output_path = os.path.join(os.path.expanduser(result_dir), 'cluster_size_preds.png')

# Each entry: (experiment directory, legend label, score-file prefix, line color).
experiments = [
    (f'{result_dir}/{res_dir}/gie_swn_test_semiPURExlnet_embs_swnhypers/', 'default', '','black'),
    (f'{result_dir}/{res_dir}/gie_swn_test_semiPURExlnet_embs_swnhypers/', 'mgs-5', 'mcs-5','green'),
    (f'{result_dir}/{res_dir}/gie_swn_test_semiPURExlnet_embs_swnhypers/', 'mgs-10', 'mcs-10', 'red'),
    (f'{result_dir}/{res_dir}/gie_swn_test_semiPURExlnet_embs_swnhypers/', 'mgs-15', 'mcs-15', 'purple'),
]

# Split the 4-tuples into the parallel lists the helpers expect; `experiments`
# is reduced to (directory, label) pairs last because the other two read it.
colors = [color for _path, _label, _prefix, color in experiments]
prefixes = [prefix for _path, _label, prefix, _color in experiments]
experiments = [(path, label) for path, label, _prefix, _color in experiments]

curves2 = create_multiple_precision_curves(experiments, prefixes=prefixes)
create_precision_plot(curves2, output_path=None, colors=colors, step=False)


### Roles

In [None]:
# Precision curves for the roles test set: one experiment directory read
# with four different score-file prefixes. `result_dir` is the value set in
# the Verbs cell of this section.
res_dir = 'test-paper_roles_st_pattern'
# Joined with a path separator and with '~' expanded so the path is usable by
# savefig. It is currently not used: the plot call below passes
# output_path=None.
# NOTE(review): the Nouns cell builds the same file name -- confirm before
# enabling saving in both cells.
output_path = os.path.join(os.path.expanduser(result_dir), 'cluster_size_preds.png')

# Each entry: (experiment directory, legend label, score-file prefix, line color).
experiments = [
    (f'{result_dir}/{res_dir}/gie_swr_test_semiPURExlnet_embs_swrhypers/', 'default', '','black'),
    (f'{result_dir}/{res_dir}/gie_swr_test_semiPURExlnet_embs_swrhypers/', 'mgs-5', 'mcs-5','green'),
    (f'{result_dir}/{res_dir}/gie_swr_test_semiPURExlnet_embs_swrhypers/', 'mgs-10', 'mcs-10', 'red'),
    (f'{result_dir}/{res_dir}/gie_swr_test_semiPURExlnet_embs_swrhypers/', 'mgs-15', 'mcs-15', 'purple'),
]

# Split the 4-tuples into the parallel lists the helpers expect; `experiments`
# is reduced to (directory, label) pairs last because the other two read it.
colors = [color for _path, _label, _prefix, color in experiments]
prefixes = [prefix for _path, _label, prefix, _color in experiments]
experiments = [(path, label) for path, label, _prefix, _color in experiments]

curves3 = create_multiple_precision_curves(experiments, prefixes=prefixes)
create_precision_plot(curves3, output_path=None, colors=colors, step=False)


### Combine

In [None]:
# Three-panel figure from the verbs, nouns and roles curves loaded above.
# `colors` is whatever the most recently executed curve cell assigned.
create_precision_plots(curves1, curves2, curves3, output_path=None, colors=colors, step=False)
