### stats

In [None]:
from sesame.dataio import read_conll
from sesame.conll09 import CoNLL09Example, CoNLL09Element
from sesame.sentence import Sentence
import sys
from glob import glob
import statistics
import pandas as pd
pd.set_option('max_colwidth', None)

from lexsub.conll_helper import conll_to_sentence, get_frameName, get_luName
def get_example_stats(examples):
    """Count how many examples share each sentence, frame and lexical unit.

    Args:
        examples: Iterable of examples accepted by ``get_frameName``,
            ``get_luName`` and ``conll_to_sentence``.

    Returns:
        A tuple ``(examples_per_sent, examples_per_frame, examples_per_lu)``.
        Each element is a dict mapping the sentence / frame name / lexical
        unit name (as returned by the helpers above) to the number of
        examples that produced it. Keys appear in first-seen order.
    """
    examples_per_frame = {}
    examples_per_sent = {}
    examples_per_lu = {}

    for example in examples:
        frame = get_frameName(example)
        lu = get_luName(example)
        sent = conll_to_sentence(example)

        # dict.get(..., 0) covers both the first occurrence and later ones.
        examples_per_frame[frame] = examples_per_frame.get(frame, 0) + 1
        examples_per_sent[sent] = examples_per_sent.get(sent, 0) + 1
        examples_per_lu[lu] = examples_per_lu.get(lu, 0) + 1

    return examples_per_sent, examples_per_frame, examples_per_lu


In [None]:
# Base directory under which each experiment's *.conll files are globbed.
main_data_dir = 'data/open_sesame_v1_data/fn1.7'

In [None]:
# Training-set statistics for the baseline experiments: one table row per
# *train*.conll file found under each experiment directory.
exps = [
    'original',
    'verbs',
    'nExPerSent_verbs_randAllExps/01ExPerSent_verbs_rand01',
    'nouns',
    'nExPerSent_nouns_randAllExps/01ExPerSent_nouns_rand01',
]

# The min_* statistics are currently not part of the table.
columns = [
    'exp', 'examples', 'sents', 'frames', 'lexical units',
    'avg_examples_per_sent', 'avg_examples_per_frame', 'avg_examples_per_lexicalunit',
    'max_examples_per_sent', 'max_examples_per_frame', 'max_examples_per_lexicalunit',
]

df = pd.DataFrame(columns=columns)

for exp in exps:
    for model in glob(f'{main_data_dir}/{exp}/*train*.conll'):
        try:
            examples, __, __ = read_conll(model)
            examples_per_sent, examples_per_frame, examples_per_lu = get_example_stats(examples)
        except Exception as err:
            # Report the unreadable file and keep going; `Exception` (rather
            # than a bare except) lets KeyboardInterrupt stop the cell.
            print(f'{model}: {err!r}')
            continue

        if len(examples) == 0:
            # mean()/max() raise on empty data, so skip files without examples.
            print(f'{model}: no examples, skipped')
            continue

        exp_short = exp
        df.loc[len(df)] = [
            exp_short, len(examples),
            len(examples_per_sent), len(examples_per_frame), len(examples_per_lu),
            statistics.mean(examples_per_sent.values()),
            statistics.mean(examples_per_frame.values()),
            statistics.mean(examples_per_lu.values()),
            max(examples_per_sent.values()),
            max(examples_per_frame.values()),
            max(examples_per_lu.values()),
        ]

df

In [None]:
# Training-set statistics for the augmented ("expanded") variants of the
# verbs experiment: one table row per *train*.conll file found.
exps_dir = 'expanded_nExPerSent_verbs_randAllExps'
exps = [
    '01ExPerSent_verbs_rand01_expanded_lu',
    '01ExPerSent_verbs_rand01_expanded_roles',
    '01ExPerSent_verbs_rand01_expanded_nouns-10pc',
    '01ExPerSent_verbs_rand01_expanded_nouns-30pc',
    '01ExPerSent_verbs_rand01_expanded_nouns-50pc',
    '01ExPerSent_verbs_rand01_expanded_lu_roles_nouns-10pc',
    '01ExPerSent_verbs_rand01_expanded_lu_roles_nouns-30pc',
    '01ExPerSent_verbs_rand01_expanded_lu_roles_nouns-50pc',
]

# Alternative pipeline: 'lugold_rolegold_nltk_nolemma_role_stopwords_N2'
pipeline = 'lugold_rolegold_nltk_nolemma_N2'

# Alternative prediction model: 'xlnet_embs_hypers'
preds_model = 'bert'
exps = [f'{exps_dir}/{exp}/{preds_model}/{pipeline}' for exp in exps]

# Per-sentence averages and the min_*/max_* statistics are currently not
# part of this table.
columns = [
    'exp', 'examples', 'sents', 'frames', 'lexical units',
    'avg_examples_per_frame', 'avg_examples_per_lexicalunit',
]

df = pd.DataFrame(columns=columns)

for exp in exps:
    for model in glob(f'{main_data_dir}/{exp}/*train*.conll'):
        try:
            examples, __, __ = read_conll(model)
            examples_per_sent, examples_per_frame, examples_per_lu = get_example_stats(examples)
        except Exception as err:
            # Report the unreadable file and keep going; `Exception` (rather
            # than a bare except) lets KeyboardInterrupt stop the cell.
            print(f'{model}: {err!r}')
            continue

        if len(examples) == 0:
            # mean() raises on empty data, so skip files without examples.
            print(f'{model}: no examples, skipped')
            continue

        # exp is '<exps_dir>/<experiment>/<preds_model>/<pipeline>';
        # keep only the experiment component as the row label.
        exp_short = exp.split('/')[1]
        df.loc[len(df)] = [
            exp_short, len(examples),
            len(examples_per_sent), len(examples_per_frame), len(examples_per_lu),
            statistics.mean(examples_per_frame.values()),
            statistics.mean(examples_per_lu.values()),
        ]

df

In [None]:
# Training-set statistics for the augmented ("expanded") variants of the
# nouns experiment: one table row per *train*.conll file found.
exps_dir = 'expanded_nExPerSent_nouns_randAllExps'
exps = [
    '01ExPerSent_nouns_rand01_expanded_lu',
    '01ExPerSent_nouns_rand01_expanded_roles',
    '01ExPerSent_nouns_rand01_expanded_nouns-10pc',
    '01ExPerSent_nouns_rand01_expanded_nouns-30pc',
    '01ExPerSent_nouns_rand01_expanded_nouns-50pc',
    '01ExPerSent_nouns_rand01_expanded_lu_roles_nouns-10pc',
    '01ExPerSent_nouns_rand01_expanded_lu_roles_nouns-30pc',
    '01ExPerSent_nouns_rand01_expanded_lu_roles_nouns-50pc',
]

pipeline = 'lugold_rolegold_nltk_nolemma_role_stopwords_N2'

# Alternative prediction model: 'xlnet_embs_hypers'
preds_model = 'bert'
exps = [f'{exps_dir}/{exp}/{preds_model}/{pipeline}' for exp in exps]

# Per-sentence averages and the min_*/max_* statistics are currently not
# part of this table.
columns = [
    'exp', 'examples', 'sents', 'frames', 'lexical units',
    'avg_examples_per_frame', 'avg_examples_per_lexicalunit',
]

df = pd.DataFrame(columns=columns)

for exp in exps:
    for model in glob(f'{main_data_dir}/{exp}/*train*.conll'):
        try:
            examples, __, __ = read_conll(model)
            examples_per_sent, examples_per_frame, examples_per_lu = get_example_stats(examples)
        except Exception as err:
            # Report the unreadable file and keep going; `Exception` (rather
            # than a bare except) lets KeyboardInterrupt stop the cell.
            print(f'{model}: {err!r}')
            continue

        if len(examples) == 0:
            # mean() raises on empty data, so skip files without examples.
            print(f'{model}: no examples, skipped')
            continue

        # exp is '<exps_dir>/<experiment>/<preds_model>/<pipeline>';
        # keep only the experiment component as the row label.
        exp_short = exp.split('/')[1]
        df.loc[len(df)] = [
            exp_short, len(examples),
            len(examples_per_sent), len(examples_per_frame), len(examples_per_lu),
            statistics.mean(examples_per_frame.values()),
            statistics.mean(examples_per_lu.values()),
        ]

df

In [None]:
def prettify(x):
    """Turn an experiment identifier into a short, human-readable label.

    Args:
        x: Either a slash-separated experiment path, in which case its
            second component is used, or an already shortened experiment
            name that contains no ``/``.

    Returns:
        The label with the ``01ExPerSent_verbs_rand01_`` prefix removed,
        underscores turned into hyphens, ``expanded`` renamed to
        ``augmented`` and ``lu`` spelled out as ``lexical unit``.
    """
    parts = x.split("/")
    # Short names have no "/" and are used as-is; indexing [1]
    # unconditionally would raise IndexError for them.
    if len(parts) > 1:
        x = parts[1]
    x = x.replace("01ExPerSent_verbs_rand01_", "")
    x = x.replace("_", "-")
    x = x.replace("expanded", "augmented")
    x = x.replace("lu", "lexical unit")
    return x
    
    
# Replace every experiment identifier with its display label, then show
# only the label, example-count and lexical-unit columns.
df['exp'] = df['exp'].apply(prettify)
df[['exp', 'examples', 'lexical units']]

In [None]:
# Frame names observed in the train / test / dev files of every baseline
# experiment: one table row per experiment.
base_exps = [
    'original',
    'verbs',
    'nExPerSent_verbs_randAllExps/01ExPerSent_rand01',
    'nouns',
    'nExPerSent_nouns_randAllExps/01ExPerSent_rand01',
]

columns = ['exp', 'train_frames', 'test_frames', 'dev_frames']

df = pd.DataFrame(columns=columns)

for exp in base_exps:
    frames = [exp]
    # Same order as the train/test/dev columns declared above.
    for split in ('train', 'test', 'dev'):
        # A dict is used as an insertion-ordered set so that frames from
        # several files of the same split are merged without duplicates.
        split_frames = {}
        for model in glob(f'{main_data_dir}/{exp}/*{split}*.conll'):
            try:
                examples, __, __ = read_conll(model)
                examples_per_sent, examples_per_frame, examples_per_lu = get_example_stats(examples)
            except Exception as err:
                # Report the unreadable file and keep going; `Exception`
                # (rather than a bare except) lets KeyboardInterrupt through.
                print(f'{model}: {err!r}')
                continue
            split_frames.update(dict.fromkeys(examples_per_frame))
        # A split with no readable file contributes an empty list, so the
        # row always has exactly one entry per column.
        frames.append(list(split_frames))
    df.loc[len(df)] = frames

df

In [None]:
# Per experiment: number of test-split frames that never occur in the
# train split, next to the number of entries in the test-frame list.
df['test_not_in_train'] = df.apply(
    lambda row: len(set(row['test_frames']).difference(row['train_frames'])),
    axis=1,
)
df['test_frames_n'] = df['test_frames'].apply(len)
df

In [None]:
# Test-set statistics: one table row per *test*.conll file found under each
# listed experiment ('original' is currently disabled).
exps = [
    'verbs',
]
columns = [
    'exp', 'examples', 'sents', 'frames', 'lus',
    'avg_examples_per_sent', 'avg_examples_per_frame', 'avg_examples_per_lu',
]
df = pd.DataFrame(columns=columns)

for exp in exps:
    for model in glob(f'{main_data_dir}/{exp}/*test*.conll'):
        try:
            examples, __, __ = read_conll(model)
            examples_per_sent2, examples_per_frame2, examples_per_lu2 = get_example_stats(examples)
        except Exception as err:
            # Report the unreadable file and keep going; `Exception` (rather
            # than a bare except) lets KeyboardInterrupt stop the cell.
            print(f'{model}: {err!r}')
            continue

        if len(examples) == 0:
            # mean() raises on empty data, so skip files without examples.
            print(f'{model}: no examples, skipped')
            continue

        df.loc[len(df)] = [
            exp, len(examples),
            len(examples_per_sent2), len(examples_per_frame2), len(examples_per_lu2),
            statistics.mean(examples_per_sent2.values()),
            statistics.mean(examples_per_frame2.values()),
            statistics.mean(examples_per_lu2.values()),
        ]

df

In [None]:
# Training-set statistics for the nPc verbs experiments: one table row per
# *train*.conll file found under each experiment directory.
exps_dir = "nPc_verbs_randAllExps"
exps = [
    '010pc_verbs',
    '020pc_verbs',
    '030pc_verbs',
    '040pc_verbs',
    '050pc_verbs',
    '100pc_verbs',
]

exps = [f"{exps_dir}/{exp}" for exp in exps]

columns = [
    'exp', 'examples', 'sents', 'frames', 'lus',
    'avg_examples_per_sent', 'avg_examples_per_frame', 'avg_examples_per_lu',
]
df = pd.DataFrame(columns=columns)

for exp in exps:
    for model in glob(f'{main_data_dir}/{exp}/*train*.conll'):
        try:
            examples, __, __ = read_conll(model)
            examples_per_sent, examples_per_frame, examples_per_lu = get_example_stats(examples)
        except Exception as err:
            # Report the unreadable file and keep going; `Exception` (rather
            # than a bare except) lets KeyboardInterrupt stop the cell.
            print(f'{model}: {err!r}')
            continue

        if len(examples) == 0:
            # mean() raises on empty data, so skip files without examples.
            print(f'{model}: no examples, skipped')
            continue

        df.loc[len(df)] = [
            exp, len(examples),
            len(examples_per_sent), len(examples_per_frame), len(examples_per_lu),
            statistics.mean(examples_per_sent.values()),
            statistics.mean(examples_per_frame.values()),
            statistics.mean(examples_per_lu.values()),
        ]

df

In [None]:
# Training-set statistics for the nPc nouns experiments: one table row per
# *train*.conll file found under each experiment directory.
exps_dir = "nPc_nouns_randAllExps"
exps = [
    '010pc_nouns',
    '020pc_nouns',
    '030pc_nouns',
    '040pc_nouns',
    '050pc_nouns',
    '100pc_nouns',
]

exps = [f"{exps_dir}/{exp}" for exp in exps]

columns = [
    'exp', 'examples', 'sents', 'frames', 'lus',
    'avg_examples_per_sent', 'avg_examples_per_frame', 'avg_examples_per_lu',
]
df = pd.DataFrame(columns=columns)

for exp in exps:
    for model in glob(f'{main_data_dir}/{exp}/*train*.conll'):
        try:
            examples, __, __ = read_conll(model)
            examples_per_sent, examples_per_frame, examples_per_lu = get_example_stats(examples)
        except Exception as err:
            # Report the unreadable file and keep going; `Exception` (rather
            # than a bare except) lets KeyboardInterrupt stop the cell.
            print(f'{model}: {err!r}')
            continue

        if len(examples) == 0:
            # mean() raises on empty data, so skip files without examples.
            print(f'{model}: no examples, skipped')
            continue

        df.loc[len(df)] = [
            exp, len(examples),
            len(examples_per_sent), len(examples_per_frame), len(examples_per_lu),
            statistics.mean(examples_per_sent.values()),
            statistics.mean(examples_per_frame.values()),
            statistics.mean(examples_per_lu.values()),
        ]

df

In [None]:
# Display only the experiment label and its number of examples.
df[['exp', 'examples']]