In [26]:
import pandas as pd
import re
import plotly.express as px

### Load vocab

In [18]:
# Read the vocabulary table; its 'word' column supplies the entries used below.
common_vocab = pd.read_csv("../data/vocab_2000.csv")
# Lower-cased set built from the first 500 entries of the 'word' column.
common_500 = set(word.lower() for word in common_vocab['word'][:500])

### Load generations

In [None]:
# Earlier dataset version, kept for reference:
# df = pd.read_csv("../data/human_evals_3.csv").drop(['mechanics_andy', 'semantics_andy', 'context_andy'], axis=1)
df = pd.read_csv("../data/human_evals_4.csv")

# Map every '<metric>_<annotator>' column name to '<metric>_<index>', where
# the index is the annotator's position in the list below.
column_renamer = {
    f'{metric_name}_{annotator_name}': f'{metric_name}_{i}'
    for i, annotator_name in enumerate(['philip', 'emilie', 'william', 'andy'])
    for metric_name in ['mechanics', 'semantics', 'context']
}

df = df.rename(columns=column_renamer)
df.head()

### Eval: confirming target word is in generation

In [22]:
# note: only checks for substring. would be aided in accuracy by lemmatization
def check_containments(df, generation_column: str):
    """Flag, row by row, whether the target vocab word occurs in the generation.

    Both the row's 'vocab_word' value and its `generation_column` value are
    lower-cased, then compared with a plain substring test (no regex, no word
    boundaries), so 'art' is considered present in 'start'.

    Args:
        df: frame with a 'vocab_word' column and a `generation_column` column.
        generation_column: name of the column holding the generated text.

    Returns:
        A list of booleans, one per row, in row order.
    """
    records = df.to_dict(orient='records')
    return [
        record['vocab_word'].lower() in record[generation_column].lower()
        for record in records
    ]

def count_containments(df, generation_column):
    """Return the number of rows whose generation contains its target word.

    Sums the per-row flags produced by check_containments for
    `generation_column`.
    """
    containment_flags = check_containments(df, generation_column)
    return sum(containment_flags)

In [None]:
####################
# Running the eval #
####################
# Add a boolean column: does each 'baseline_prompt' generation contain its
# row's vocab word (case-insensitive substring match, see check_containments)?
df['contains_target'] = check_containments(df, 'baseline_prompt')
# Cell output: number of rows per True/False value.
df['contains_target'].value_counts()

# Disabled multi-column report. NOTE(review): `experiment_columns_names` is not
# defined in the cells visible here, so this would need it before re-enabling.
# print(f"# of rows: {len(df)}\n===\n# containing target vocab:")
# for col in experiment_columns_names:
#     print(col, count_containments(df, col))

### Eval: Counting out-of-vocabulary words

In [30]:
def clean_text(input: str):
    """Normalize text for word-level comparison.

    Lower-cases the text, replaces every run of whitespace and the punctuation
    characters , . ? ! ' " with a single space, and trims the ends.

    Args:
        input: raw text. (The name shadows the builtin but is kept so keyword
            callers keep working.)

    Returns:
        The normalized string; empty if the text held only whitespace and
        punctuation.
    """
    # Raw string literal: "\s" inside a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # The character class already includes \s, so each match is a maximal run
    # and no second whitespace-collapsing pass is needed afterwards.
    collapsed = re.sub(r"[\s,.?!'\"]+", " ", input.lower())
    return collapsed.strip()

def split_words(generation):
    """Split a generation into its normalized words.

    The text is first normalized with clean_text and then split on whitespace.

    Args:
        generation: raw generated text.

    Returns:
        A list of word strings; an empty list when the text contains no words
        (empty, or only whitespace/punctuation).
    """
    # str.split() with no separator returns [] for an empty string, whereas
    # re.split on "" returns [''], which counted an empty generation as
    # one (out-of-vocabulary) word.
    return clean_text(generation).split()

def count_out_of_vocab(generation, vocab):
    """Return how many words of `generation` are not members of `vocab`.

    The count is the length of the list returned by filter_out_of_vocab, so a
    word that occurs several times is counted once per occurrence.
    """
    unknown_words = filter_out_of_vocab(generation, vocab)
    return len(unknown_words)

# TODO: perhaps do simple tweak to account for (regular) plurals (if need is demonstrated)
# given more time/complexity, could preprocess all words by lemmatizing them
def filter_out_of_vocab(generation, vocab):
    """List the words of `generation` that are not in `vocab`.

    Words come from split_words; order and repeats are preserved. Membership is
    an exact match against `vocab` (no plural or lemma handling).
    """
    unknown = []
    for token in split_words(generation):
        if token in vocab:
            continue
        unknown.append(token)
    return unknown

def vocab_count_eval(df, generation_column: str, vocab_set):
    """Count out-of-vocabulary words in each row's generation.

    For every row, the row's own target word ('vocab_word', lower-cased) is
    treated as in-vocabulary in addition to `vocab_set`.

    Args:
        df: frame with a 'vocab_word' column and a `generation_column` column.
        generation_column: name of the column holding the generated text.
        vocab_set: set of allowed (lower-cased) words; it is not modified.

    Returns:
        A list with one out-of-vocabulary count per row, in row order.
    """
    out_of_vocab_counts = []
    for row in df.to_dict(orient='records'):
        target_word = row['vocab_word'].lower()
        # Build the allowed set for this row only. Rebinding `vocab_set` here
        # would carry earlier rows' target words over to later rows and
        # under-count their out-of-vocabulary words.
        row_vocab = vocab_set | {target_word}
        generation = row[generation_column].lower()
        out_of_vocab_counts.append(count_out_of_vocab(generation, row_vocab))
    return out_of_vocab_counts

def word_count(df, generation_column):
    """Return the number of words in each row's generation.

    Each value of `generation_column` is lower-cased and split with
    split_words; the result holds one length per row, in row order.
    """
    return [
        len(split_words(record[generation_column].lower()))
        for record in df.to_dict(orient='records')
    ]

In [31]:
####################
# Running the eval #
####################
# Per-row count of words in 'baseline_prompt' that are not in common_500
# (vocab_count_eval also treats the row's target word as in-vocabulary).
df['n_out_of_vocab'] = vocab_count_eval(df, 'baseline_prompt', common_500)
# Per-row number of words in the generation, as split by split_words.
df['generation_length'] = word_count(df, 'baseline_prompt')
# NOTE: despite the '%' in the column name this is a fraction
# (count / length); it is not multiplied by 100.
df['%_out_of_vocab'] = df['n_out_of_vocab'] / df['generation_length']

### Analysis of human evals

### Plotting