In [None]:
! huggingface-cli login

In [2]:
%%capture
! pip install transformers datasets evaluate
! pip install accelerate -U

In [25]:
import csv
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

from datasets import load_dataset
from tqdm.auto import tqdm
import pandas as pd

import plotly.express as px
import re

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): the torch.device object built here is discarded, so this line
# has no effect; `device` remains the plain string assigned above.
torch.device(device)

# 8B model
# `tokenizer` and `model` are notebook-wide globals used by the generation cells.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct").to(device)

# 1B model
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct").to(device)

In [None]:
# GenericsKB ("generics_kb_best" configuration); indexed by term in a later cell
# to provide alternative reference sentences for the ROUGE evaluation.
ds = load_dataset("community-datasets/generics_kb", "generics_kb_best")

In [None]:
print(ds)

In [7]:
# Index GenericsKB by lower-cased, single-word term:
#   term -> list of its distinct generic sentences, in first-seen order.
generic_sentences_dict = {}
for split_name, split_data in ds.items():
    for example in split_data:
        term = example['term']
        if term is None:
            continue
        word = term.lower()
        # Skip empty terms and multi-word expressions.
        if not word or " " in word:
            continue
        sentences = generic_sentences_dict.setdefault(word, [])
        sentence = example['generic_sentence']
        if sentence not in sentences:
            sentences.append(sentence)

In [None]:
# generic_sentences_dict['atheist']

In [8]:
def get_dataset():
  """Load the WordNet definitions dataset and index it by single-word headword.

  Every split of "marksverdhei/wordnet-definitions-en-2021" is scanned once.
  Rows whose `Word` is missing, empty, or contains a space are ignored.

  Returns:
    definitions_dict: lower-cased word -> list of its distinct definitions,
      in first-seen order.
    word_set: list of all indexed words, shuffled with a fixed seed (42).
    example_dict: lower-cased word -> list with one `Example` per dataset row.
      Unlike the definitions this list is NOT de-duplicated, so
      `example_dict[w][i]` is only guaranteed to belong to
      `definitions_dict[w][i]` while no earlier definition of `w` repeats.

  Side effects:
    Prints the number of indexed words and re-seeds the global `random` module.
  """
  ds = load_dataset("marksverdhei/wordnet-definitions-en-2021")

  definitions_dict = {}
  example_dict = {}
  # word -> set of definitions already stored; gives O(1) duplicate checks
  # instead of scanning the per-word list for every row.
  seen_definitions = {}

  # Single pass over the data: both dictionaries are filled together.
  for split_name, split_data in ds.items():
    for example in split_data:
      raw_word = example['Word']
      if raw_word is None:
        continue
      word = raw_word.lower()
      if not word or " " in word:
        continue

      if word not in definitions_dict:
        definitions_dict[word] = []
        example_dict[word] = []
        seen_definitions[word] = set()

      definition = example['Definition']
      if definition not in seen_definitions[word]:
        seen_definitions[word].add(definition)
        definitions_dict[word].append(definition)

      example_dict[word].append(example['Example'])

  word_set = list(definitions_dict.keys())
  seed = 42
  random.seed(seed)
  random.shuffle(word_set)
  print(len(word_set))

  return definitions_dict, word_set, example_dict

In [None]:
definitions_dict, word_set, example_dict = get_dataset()

In [None]:
# Build (low-context, high-context) example pairs for the first 200 shuffled words
# and save them to examples.csv.
word_set = word_set[:200]
output_data = []  # rows of (low_context_example, high_context_example)
print(len(word_set))

for vocab_word in word_set:
  word_definitions = definitions_dict.get(vocab_word, ["Word not found"])
  word_examples = example_dict.get(vocab_word, ["Word not found"])

  # Words with more than two definitions contribute their first two senses;
  # all other words contribute only the first.
  pair_count = 2 if len(word_definitions) > 2 else 1

  for sense in range(pair_count):
    low_context_example = word_examples[sense]
    # High context = definition followed by the example sentence.
    high_context_example = word_definitions[sense] + ', ' + low_context_example
    output_data.append((low_context_example, high_context_example))

file_path = 'examples.csv'
with open(file_path, mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(["low_context_example", "high_context_example"])
  writer.writerows(output_data)
print("DONE")

### Load common vocab list

In [None]:
# Vocabulary table; later cells read its 'word' column and take the first
# 200 / 500 rows as progressively larger "common word" sets.
# NOTE(review): presumably the rows are ordered by word frequency -- confirm.
common_vocab = pd.read_csv("data/vocab_2000.csv")
common_vocab.head()

In [19]:
# Lower-cased vocabulary sets of increasing size (first 200 rows, first 500 rows,
# and the whole list), used for fast membership tests.
vocab_words_lower = [w.lower() for w in common_vocab['word']]
common_200 = set(vocab_words_lower[:200])
common_500 = set(vocab_words_lower[:500])
common_all = set(vocab_words_lower)

### `lexical_boost` implementation and LLM wrangling

In [11]:
from transformers import ConstrainedBeamSearchScorer, BeamScorer, LogitsProcessor, LogitsProcessorList
from collections import defaultdict, deque

In [20]:
# Wraps a user message in the Llama 3.x chat markup.
def wrap_prompt(prompt):
    """Return `prompt` formatted as a single user turn followed by an open assistant turn.

    The Llama 3 chat format puts a blank line ("\\n\\n") after every role header
    and ends each finished message with `<|eot_id|>`. Without the end-of-turn
    marker the model is not told where the user message stops.

    `<|begin_of_text|>` is deliberately not added here: the prompt is later
    passed through the tokenizer, which is expected to prepend it itself
    (assumes the default `add_special_tokens=True` -- verify if that changes).
    """
    return (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        + prompt
        + "<|eot_id|>"
        + "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

In [38]:
class CustomLogitsProcessor(LogitsProcessor):
    """Additively boosts the logits of tokens that start or continue a "boost word".

    Each boost word (plus capitalised and space-padded variants) is tokenised
    once into a "booster": a list of token ids. At every generation step, and
    for every sequence in the batch, the longest suffix of the generated ids
    that equals a prefix of a booster is found, and the booster's next token
    receives `boost_coefficient` added to its score. A zero-length match always
    succeeds, so the first token of every booster is always boosted.

    Args:
        boost_words: iterable of words to encourage; `None` means no words.
        tokenizer: object with `encode(text, add_special_tokens=False)`;
            required whenever `boost_words` is non-empty.
        boost_coefficient: value added to the score of each boosted token.
    """

    def __init__(self, boost_words, tokenizer=None, boost_coefficient=4):
        self.boost_words = boost_words or []
        self.boosters = []
        self.tokenizer = tokenizer
        self.boost_coefficient = boost_coefficient

        self.initialize_boosters()

    def initialize_boosters(self):
        """Expand `boost_words` into surface variants and tokenise them into `boosters`."""
        words = set(self.boost_words)
        words |= {w.capitalize() for w in words}
        # Leading/trailing-space variants: tokenisers usually encode " word"
        # differently from "word".
        words |= {" " + w for w in words} | {w + " " for w in words}
        self.boost_words = list(words)

        # Rebuilt from scratch so a repeated call cannot accumulate duplicates.
        self.boosters = []
        seen = set()
        for word in self.boost_words:
            word_toks = tuple(self.tokenizer.encode(word, add_special_tokens=False))
            # Empty encodings can never boost anything, and identical encodings
            # would only repeat the same work at every step.
            if not word_toks or word_toks in seen:
                continue
            seen.add(word_toks)
            self.boosters.append(list(word_toks))

    def __call__(self, input_ids, scores):
        """Add the boost to `scores` in place and return it.

        `input_ids` is indexed as (row, position) and `scores` as (row, token id);
        each row is boosted according to its own history.
        """
        longest_booster = max((len(b) for b in self.boosters), default=0)
        if longest_booster == 0:
            return scores

        # A match can be at most len(booster) - 1 tokens long, so only that many
        # trailing ids are needed. Copy them to Python lists once, instead of
        # calling `.item()` (a device sync) for every single comparison.
        window = longest_booster - 1
        if window > 0:
            histories = input_ids[:, -window:].tolist()
        else:
            histories = [[] for _ in range(input_ids.shape[0])]

        for row, history in enumerate(histories):
            boosts = {}  # token_id -> boost
            for booster in self.boosters:
                # Never try to match more tokens than the history holds; a
                # longer match length would make the slice start wrap around.
                max_match = min(len(booster) - 1, len(history))
                for match_len in range(max_match, -1, -1):
                    if match_len == 0 or history[-match_len:] == booster[:match_len]:
                        next_tok = booster[match_len]
                        boosts[next_tok] = max(boosts.get(next_tok, 0), self.boost_coefficient)
                        break  # longest match found for this booster

            for tok, boost in boosts.items():
                scores[row, tok] += boost

        return scores



In [None]:
definitions_dict, word_set, example_dict = get_dataset()

# 0-200 already generated
word_set = word_set[100:200]
# word_set = word_set[:1]

# print(word_set)
# for word in word_set:
#   print(definitions_dict.get(word, ["Word not found"]))

# One generation pass, and one output_<coefficient>.csv, per boost strength.
for boost_coefficient in (0, 2, 4, 8, 12, 16):
    print(f"Boost coefficient: {boost_coefficient}")

    output_data = []
    for vocab_word in tqdm(list(word_set)[:30]):
        # definitions = definitions_dict.get(vocab_word, ["Word not found"])
        # NOTE: I picked the condition randomly
        definitions_temp = definitions_dict.get(vocab_word, ["Word not found"])

        if len(definitions_temp) > 2:
            definitions = [definitions_temp[0], definitions_temp[1]]
        else:
            definitions = [definitions_temp[0]]

        # The processors must be rebuilt for EVERY word and coefficient. They
        # used to be created only inside the `else` branch above, so a word
        # with more than two definitions silently reused the processor of an
        # earlier word (wrong target word, possibly wrong coefficient) or hit
        # a NameError when it was the first word processed.
        processor = CustomLogitsProcessor(
            boost_words=list(common_500 | {vocab_word}),
            tokenizer=tokenizer,
            boost_coefficient=boost_coefficient,
        )

        # Only needed by the (currently disabled) masked-vocab prompt.
        mask_processor = CustomLogitsProcessor(
            boost_words=list(common_500 | {vocab_word} | {"<vocab>"}),
            tokenizer=tokenizer,
        )
        logits_processors = LogitsProcessorList([processor])
        mask_logits_processors = LogitsProcessorList([mask_processor])

        # NOTE(review): the active prompt does not mention `definition`, so a
        # word with two selected definitions is prompted identically twice.
        for definition in definitions:
            baseline_prompt = f'Please write a sentence using the word "{vocab_word}", with enough context clues that someone can understand the meaning of the word. However, don\'t simply define the word. Respond with only the sentence, nothing else, no explanations.'
            # baseline_prompt = f"the german word 'mich' translates in english to: "
            # baseline_prompt = f"What is the classic animal that you see accompanying a witch?"
            # baseline_prompt = f"Please write a sentence using the word \"{vocab_word}\", with enough context clues that someone can understand the meaning of the word. However, don't simply define the word."
            # vocab_with_definition_prompt = f"Please write a sentence using the word \"{vocab_word}\", with the definition of \"{definition}\" with enough context clues that someone can understand the meaning of the word. However, don't simply define the word. Respond with only the sentence, nothing else, no explanations."
            # vocab_with_definition_prompt = f"Please write a sentence using the word \"{vocab_word}\", with the definition of \"{definition}\" with enough context clues that someone can understand the meaning of the word. However, don't simply define the word."
            # masked_vocab_word_prompt = f"Using <vocab> in place of a word with the definition \"{definition}\", write a sentence, using <vocab>, with enough context clues for someone to understand the meaning of the word without directly using the definition. Respond with only the sentence, nothing else, no explanations."
            # masked_vocab_word_prompt = f"Pretend the word \"<vocab>\" has the definition \"{definition}\". Now write a sentence, using \"<vocab>\", with enough context clues for someone to understand the meaning of the word without directly using the definition."
            # baseline_prompt = f"Given the vocab word \"{vocab_word}\", generate a sentence, using the word directly, with enough context clues for someone to understand the meaning of the word without directly using the definition. Respond with only the sentence, nothing else, no explanations."
            # vocab_with_definition_prompt = f"Given the vocab word \"{vocab_word}\" with the definition of \"{definition}\", generate a sentence with enough context clues for someone to understand the meaning of the word without directly using the definition. Respond with only the sentence, nothing else, no explanations."
            # masked_vocab_word_prompt = f"Using <vocab> in place of a word with the definition \"{definition}\", generate a sentence, using <vocab>, with enough context clues for someone to understand the meaning of the word without directly using the definition. Respond with only the sentence, nothing else, no explanations."
            # constrained_beam_search_prompt = "Generate a sentence containing enough context clues for someone to understand the meaning of the sentence entirely. Respond with only the sentence, nothing else, no explanations."

            # prompts = [baseline_prompt, vocab_with_definition_prompt, masked_vocab_word_prompt]
            prompts = [baseline_prompt]
            prompts = [wrap_prompt(p) for p in prompts]

            method_outputs = (
                []
            )  # baseline_prompt, vocab_with_definition_prompt, masked_vocab_word_prompt

            for i, prompt in enumerate(prompts):
                inputs = tokenizer(prompt, return_tensors="pt").to(device)
                # Length of the prompt, so only newly generated tokens are decoded.
                input_token_len = inputs.input_ids.shape[-1]

                #   logits_processors_to_use = mask_logits_processors if i == 2 else logits_processors
                #   output = model.generate(**inputs, max_new_tokens=50, no_repeat_ngram_size=3, logits_processor=logits_processors_to_use)
                output = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    no_repeat_ngram_size=3,
                    logits_processor=logits_processors,
                    pad_token_id=tokenizer.eos_token_id,
                )

                response = tokenizer.decode(
                    output[0][input_token_len:], skip_special_tokens=True
                )
                # Keep only the first sentence of the response.
                sentence = response.split(".")[0] + "."
                sentence = sentence.strip()
                # print(sentence)
                method_outputs.append(sentence)
            # method_outputs[2] = method_outputs[2].replace("<vocab>", vocab_word)
            # output_data.append((vocab_word, method_outputs[0], method_outputs[1], method_outputs[2]))
            output_data.append((vocab_word, method_outputs[0], boost_coefficient))
            # print(output_data)
    # output_dict = {word: [output1, output2, output3] for word, output1, output2, output3 in output_data}

    file_path = f"output_{boost_coefficient}.csv"
    with open(file_path, mode="w", newline="") as file:
        writer = csv.writer(file)
        #   writer.writerow(["vocab_word", "baseline_prompt", "vocab_with_definition_prompt", "masked_vocab_word_prompt"])
        writer.writerow(["vocab_word", "baseline_prompt", "boost_coefficient"])
        writer.writerows(output_data)
    print("DONE")

**EVALUATIONS**

In [None]:
%%capture
! pip install "git+https://github.com/AIPHES/DiscoScore.git"
! pip install bert_score
! pip install rouge_score

In [None]:
from disco_score import DiscoScorer
from evaluate import load
import csv
import nltk
import evaluate
nltk.download('punkt_tab')

In [None]:
# Smoke test of the DiscoScore metrics on one hand-written example.
disco_scorer = DiscoScorer(device='cuda:0', model_name='bert-base-uncased')

# system     : output generated by the model
# references : baseline texts the output is compared against
system = ["Her abject expression was a stark contrast to her confident demeanor earlier in the day."]
#system = ["The painting depicts a scene of utter despair, with the subject's face contorted in a twisted grimace, surrounded by the remnants of a life that has been ravaged by war. The once vibrant colors have faded to a dull, abject grey, reflecting the bleakness of"]

references = [["showing utter resignation or hopelessness, an abject apology", "of the most contemptible kind, abject cowardice", "most unfortunate or miserable, abject poverty", "showing utter resignation or hopelessness, abject surrender"]]

# Scorer methods, printed in this order for each system output.
# DS_Focus_NN is FocusDiff; DS_SENT_NN is SentGraph.
metric_names = ["EntityGraph", "LexicalChain", "RC", "LC", "DS_Focus_NN", "DS_SENT_NN"]

for s, refs in zip(system, references):
    s = s.lower()
    refs = [r.lower() for r in refs]
    for metric_name in metric_names:
        print(getattr(disco_scorer, metric_name)(s, refs))

In [None]:
# Score every generation in gen_data.csv against its high-context example with
# DiscoScore (FocusDiff, SentGraph), BERTScore (precision / recall / F1) and ROUGE-L.
disco_scorer = DiscoScorer(device='cuda:0', model_name='bert-base-uncased')
bertscore = load("bertscore")
rouge = evaluate.load('rouge')

# One row per vocab word: (vocab_word, baseline, with_definition, masked, low_context).
eval_scores_disco_scorer_FOCUS = []
eval_scores_disco_scorer_SENT = []
eval_scores_precision_BERTscore = []
eval_scores_recall_BERTscore = []
eval_scores_f1_BERTscore = []
eval_scores_ROUGE_L = []

file_path = 'gen_data.csv'
with open(file_path, mode='r') as file:
  csv_reader = csv.reader(file)

  # NOTE(review): every row is scored, including a header row if the file has one -- confirm.
  index = 0
  for row in csv_reader:
    vocab_word = row[0]
    baseline_prompt, vocab_with_definition_prompt, masked_vocab_word_prompt = row[1], row[2], row[3]
    low_context_example, high_context_example = row[4], row[5]
    outputs = [baseline_prompt, vocab_with_definition_prompt, masked_vocab_word_prompt, low_context_example]

    focus_scores, sent_scores, rouge_scores = [], [], []
    # DiscoScore is fed lower-cased text; ROUGE gets the text as stored.
    lowered_reference = [high_context_example.lower()]
    for output in outputs:
      candidate = output.lower()
      focus_scores.append(disco_scorer.DS_Focus_NN(candidate, lowered_reference))  # FocusDiff
      sent_scores.append(disco_scorer.DS_SENT_NN(candidate, lowered_reference))  # SentGraph

      rouge_result = rouge.compute(predictions=[output], references=[high_context_example])
      rouge_scores.append(rouge_result['rougeL'])

    # BERTScore handles all four outputs in one call, each against the same reference.
    high_context_references = [high_context_example] * len(outputs)
    bertscore_result = bertscore.compute(predictions=outputs, references=high_context_references, lang="en")

    bert_precision_scores = bertscore_result['precision']
    bert_recall_scores = bertscore_result['recall']
    bert_f1_scores = bertscore_result['f1']

    eval_scores_disco_scorer_FOCUS.append((vocab_word, *focus_scores[:4]))
    eval_scores_disco_scorer_SENT.append((vocab_word, *sent_scores[:4]))
    eval_scores_precision_BERTscore.append((vocab_word, *bert_precision_scores[:4]))
    eval_scores_recall_BERTscore.append((vocab_word, *bert_recall_scores[:4]))
    eval_scores_f1_BERTscore.append((vocab_word, *bert_f1_scores[:4]))
    eval_scores_ROUGE_L.append((vocab_word, *rouge_scores[:4]))

    # Progress indicator: index of the row just finished.
    print(index)
    index += 1

**RUNNING WITH ALTERNATIVE GENERIC EXAMPLE SENTENCES**

In [None]:
# ROUGE-L of every generation against the GenericsKB sentences for the same
# word, averaged over those sentences.
rouge = evaluate.load('rouge')

# Deliberately NOT named `eval_scores_ROUGE_L`: that list holds the ROUGE-L
# scores against the high-context examples and is exported to
# eval_scores_ROUGE_L.csv by a later cell. Reusing the name here would make
# that export contain these generic-sentence scores instead.
eval_scores_ROUGE_L_generic = []

file_path = 'gen_data.csv'
with open(file_path, mode='r') as file:
  csv_reader = csv.reader(file)

  # NOTE(review): every row is scored, including a header row if the file has one -- confirm.
  for row in csv_reader:
    vocab_word = row[0]
    baseline_prompt = row[1]
    vocab_with_definition_prompt = row[2]
    masked_vocab_word_prompt = row[3]
    low_context_example = row[4]
    outputs = [baseline_prompt, vocab_with_definition_prompt, masked_vocab_word_prompt, low_context_example]

    generic_sentences = generic_sentences_dict.get(vocab_word)
    if not generic_sentences:
      # No generic sentences for this word: keep the row with empty scores.
      eval_scores_ROUGE_L_generic.append((vocab_word, None, None, None, None))
      continue

    rouge_scores = []
    for output in outputs:
      # Average ROUGE-L of this output over all generic sentences of the word.
      g_scores_to_avg = []
      for s in generic_sentences:
        rouge_result = rouge.compute(predictions=[output], references=[s])
        g_scores_to_avg.append(rouge_result['rougeL'])
      rouge_scores.append(sum(g_scores_to_avg) / len(g_scores_to_avg))

    eval_scores_ROUGE_L_generic.append((vocab_word, rouge_scores[0], rouge_scores[1], rouge_scores[2], rouge_scores[3]))

# Written only after the input file has been read completely and closed.
file_path = 'eval_scores_ROUGE_L_generic_examples.csv'
with open(file_path, mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(["vocab_word", "baseline_prompt", "vocab_with_definition_prompt", "masked_vocab_word_prompt", "low_context_example"])
  writer.writerows(eval_scores_ROUGE_L_generic)

In [None]:
# Colab-only helper: `google.colab` is importable only inside a Colab runtime.
from google.colab import files
# Send the generic-example ROUGE-L scores written above to the browser as a download.
files.download('eval_scores_ROUGE_L_generic_examples.csv')

In [None]:
# Export every score table to its own CSV; all files share the same header.
score_header = ["vocab_word", "baseline_prompt", "vocab_with_definition_prompt", "masked_vocab_word_prompt", "low_context_example"]

# output file -> rows collected by the evaluation loop
score_tables = {
  'eval_scores_disco_scorer_FOCUS.csv': eval_scores_disco_scorer_FOCUS,
  'eval_scores_disco_scorer_SENT.csv': eval_scores_disco_scorer_SENT,
  'eval_scores_precision_BERTscore.csv': eval_scores_precision_BERTscore,
  'eval_scores_recall_BERTscore.csv': eval_scores_recall_BERTscore,
  'eval_scores_f1_BERTscore.csv': eval_scores_f1_BERTscore,
  'eval_scores_ROUGE_L.csv': eval_scores_ROUGE_L,
}

for file_path, score_rows in score_tables.items():
  with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(score_header)
    writer.writerows(score_rows)

**GRAPHS**

In [None]:
%matplotlib inline

In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def read_scores(file_path):
  """Read one of the eval_scores_*.csv files into parallel lists.

  The first row is treated as a header and skipped. Column 0 holds the vocab
  word; columns 1-4 hold the scores for the baseline prompt, the
  definition prompt, the masked prompt and the low-context example.

  Returns:
    (vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores,
     masked_vocab_word_prompt_scores, low_context_example_scores), where the
    four score lists contain floats and an empty cell becomes `np.nan`.
  """
  def to_score(cell):
    # Empty cells are rows that had no score for this column.
    return np.nan if cell == '' else float(cell)

  vocab_words = []
  raw_columns = [[], [], [], []]  # score columns 1..4, still as strings

  with open(file_path, mode='r') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # skip the header row
    for row in csv_reader:
      vocab_words.append(row[0])
      for offset, column in enumerate(raw_columns, start=1):
        column.append(row[offset])

  baseline_prompt_scores = [to_score(cell) for cell in raw_columns[0]]
  vocab_with_definition_prompt_scores = [to_score(cell) for cell in raw_columns[1]]
  masked_vocab_word_prompt_scores = [to_score(cell) for cell in raw_columns[2]]
  low_context_example_scores = [to_score(cell) for cell in raw_columns[3]]

  return vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores

In [None]:
# Scatter plots of the DiscoScore SentGraph scores, one panel per generation method.
file_name = 'eval_scores_disco_scorer_SENT.csv'
vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='disco_scorer_sent', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    ax.set_title(f'{series_name}_disco_scorer_SENT')
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of the DiscoScore FocusDiff scores, one panel per generation method.
file_name = 'eval_scores_disco_scorer_FOCUS.csv'
vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='disco_scorer_focus', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    ax.set_title(f'{series_name}_disco_scorer_FOCUS')
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of BERTScore precision, one panel per generation method.
file_name = 'eval_scores_precision_BERTscore.csv'
vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='precision_BERTscore', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    ax.set_title(f'{series_name}_precision_BERTscore')
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of BERTScore recall, one panel per generation method.
file_name = 'eval_scores_recall_BERTscore.csv'
vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='recall_BERTscore', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    ax.set_title(f'{series_name}_recall_BERTscore')
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of BERTScore F1, one panel per generation method.
file_name = 'eval_scores_f1_BERTscore.csv'
vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='f1_BERTscore', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    ax.set_title(f'{series_name}_f1_BERTscore')
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of ROUGE-L against the high-context examples, one panel per generation method.
file_name = 'eval_scores_ROUGE_L.csv'
vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='ROUGE_L', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    ax.set_title(f'{series_name}_ROUGE_L')
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of ROUGE-L against the GenericsKB sentences, one panel per generation method.
file_name = 'eval_scores_ROUGE_L_generic_examples.csv'

vocab_words, baseline_prompt_scores, vocab_with_definition_prompt_scores, masked_vocab_word_prompt_scores, low_context_example_scores = read_scores(file_name)

# (series name, scores, colour) per panel, in row-major subplot order.
panels = [
    ('baseline_prompt_scores', baseline_prompt_scores, 'blue'),
    ('vocab_with_definition_prompt_scores', vocab_with_definition_prompt_scores, 'orange'),
    ('masked_vocab_word_prompt_scores', masked_vocab_word_prompt_scores, 'green'),
    ('low_context_example_scores', low_context_example_scores, 'pink'),
]

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

for ax, (series_name, scores, color) in zip(axs.flat, panels):
    ax.scatter(range(len(vocab_words)), scores, label='ROUGE_L_generic_examples', s=30, color=color, alpha=0.6)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, len(vocab_words))
    # Smaller font: these titles are long.
    ax.set_title(f'{series_name}_ROUGE_L_generic_examples', fontsize=10)
    # Label each panel's own x-axis (previously only the top-left panel was labelled).
    ax.set_xlabel('vocab words')
    ax.legend()

plt.tight_layout()
plt.show()

---
---
---
# William evals

In [None]:
# Generations to evaluate: one row per vocab word, one column per prompting method.
# NOTE(review): the generation cell in this notebook writes output_<coefficient>.csv
# files containing only the baseline_prompt column; "output.csv" with all three
# columns listed below is presumably produced elsewhere -- confirm.
# df = pd.read_csv("data/0-200 output.csv")
df = pd.read_csv("output.csv")
# Columns of `df` that hold generated sentences.
experiment_columns_names = ["baseline_prompt", "vocab_with_definition_prompt", "masked_vocab_word_prompt"]

In [None]:
df.head()

### Eval: confirming target word is in generation

In [None]:
# def check_containment(row, vocab_co)

def check_containments(df, generation_column: str):
    """Flag, row by row, whether the target vocab word occurs in a generation.

    Both the row's 'vocab_word' value and its `generation_column` text are
    lower-cased, then compared with a plain substring test (so the word also
    matches when embedded in a longer word).

    Args:
        df: DataFrame with a 'vocab_word' column and a `generation_column` column.
        generation_column: name of the column holding the generated text.

    Returns:
        A list of booleans, one per row of `df`, in row order.
    """
    return [
        record['vocab_word'].lower() in record[generation_column].lower()
        for record in df.to_dict(orient='records')
    ]

def count_containments(df, generation_column):
    """Return how many rows of `df` have their vocab word inside `generation_column`.

    Delegates the per-row test to check_containments and counts the True flags.
    """
    flags = check_containments(df, generation_column)
    return flags.count(True)

# Quick check on the baseline column: prints "<rows containing the vocab word>/<total rows>".
print(f"{count_containments(df, 'baseline_prompt')}/{len(df)}")

In [None]:
####################
# Running the eval #
####################
# Print the total row count, then for each experiment column the number of
# generations that contain their row's target vocab word (substring match,
# case-insensitive, via count_containments).
print(f"# of rows: {len(df)}\n===\n# containing target vocab:")
for col in experiment_columns_names:
    print(col, count_containments(df, col))

In [None]:
# The masked technique still fails the containment check for some rows.
# List each failing row's position, target vocab word, and generation to see where.
for i,b in enumerate(check_containments(df, "masked_vocab_word_prompt")):
    # NOTE(review): df[...][i] looks rows up by index label; it lines up with the
    # enumerate position only while df keeps its default 0..n-1 index -- confirm.
    if not b: print(i, df['vocab_word'][i], df['masked_vocab_word_prompt'][i])
# Observation: in the failing rows the generation seems to mangle the special marker.

### Eval: Counting out-of-vocabulary words

In [None]:
def clean_text(input: str):
    """Normalise text for word-level comparison.

    Lower-cases the text, replaces every run of whitespace and of the
    punctuation characters , . ? ! ' " with a single space, and strips
    leading/trailing spaces.

    Args:
        input: the raw text (the name shadows the builtin but is kept so
            keyword callers keep working).

    Returns:
        The normalised string; "" for empty or all-punctuation input.
    """
    input = input.lower()
    # Raw string literal: "\s" inside a plain literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
    # Whitespace is part of the character class, so each run collapses to one
    # space and no separate whitespace-squeezing pass is needed afterwards.
    input = re.sub(r"""[\s,.?!'"]+""", " ", input)
    return input.strip()

In [None]:
def split_words(generation):
    """Split text into normalised words.

    The text is first passed through clean_text, then split on whitespace.

    Args:
        generation: the raw text to tokenise.

    Returns:
        A list of word strings. Text that cleans down to "" yields [""]
        (a single empty word), not an empty list.
    """
    generation = clean_text(generation)
    # Raw string literal: "\s+" in a plain literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
    words = re.split(r"\s+", generation)
    return words

def count_out_of_vocab(generation, vocab):
    """Return the number of words in `generation` that are not in `vocab`.

    Repeated out-of-vocab words are counted each time they occur; the word
    list comes from filter_out_of_vocab.
    """
    unknown_words = filter_out_of_vocab(generation, vocab)
    return len(unknown_words)

# TODO: perhaps do simple tweak to account for (regular) plurals (if need is demonstrated)
def filter_out_of_vocab(generation, vocab):
    """Return the words of `generation` that are not members of `vocab`.

    Words come from split_words, so they are lower-cased and stripped of the
    punctuation that clean_text removes. Order and duplicates are preserved.
    """
    unknown_words = []
    for word in split_words(generation):
        if word in vocab:
            continue
        unknown_words.append(word)
    return unknown_words

# Worked example: a sentence with mixed casing and punctuation, checked against a tiny vocabulary.
example = "I love to eat apples. Ah, look, this is my friend, Apples!"
example_vocab = {"i", "eat", "love", "my", "friend"}
# NOTE(review): the result of this bare call is discarded (it is not the cell's last expression).
count_out_of_vocab(example, example_vocab)
print(filter_out_of_vocab(example, example_vocab))
# Prints "<out-of-vocab word count>/<total word count>" for the example.
print(f"{count_out_of_vocab(example, example_vocab)}/{len(split_words(example))}")

In [None]:
# Out-of-vocab count for the example against the lower-cased 'word' column of
# common_vocab (common_vocab is defined outside this section).
count_out_of_vocab(example, {w.lower() for w in common_vocab['word']})

In [None]:
####################
# Running the eval #
####################
out_of_vocab_set = set()
print(f"# of rows: {len(df)}\n===\n% words out-of-vocab:")
for col in experiment_columns_names:
    counts = []
    for row in df.to_dict(orient='records'):
        # counts.append(count_out_of_vocab(row[col], common_all)/len(split_words(row[col])))
        preferred_vocab = common_500 | {row['vocab_word'.lower()]}
        out_of_vocab_set |= {w for w in filter_out_of_vocab(row[col], preferred_vocab)}
        out_of_vocab = count_out_of_vocab(row[col], preferred_vocab)
        fraction_out_of_vocab = out_of_vocab / len(split_words(row[col]))
        counts.append(fraction_out_of_vocab)
    fig = px.histogram(counts, title=f"{col}: #generations, bucketed by %words out of vocab").update_layout(xaxis={'range':(0,1)}).show()
    # fig.
    # print(counts)
    print(sum(counts)/len(counts))
    # print(col, count_containments(df, col))

In [None]:
# Display common_vocab (defined outside this section) for inspection.
common_vocab

In [None]:
# Display every out-of-vocab word collected by the eval loop above.
out_of_vocab_set

In [None]:
# Set of the first 500 entries of common_vocab['word']
# (presumably how common_500 is built -- confirm against its definition).
{w for w in common_vocab['word'][:500]}

In [None]:
# Running total of common_vocab['freq'] in row order, then divided by the grand
# total so entry x is the share of total frequency covered by the first x+1 rows.
# NOTE(review): itertools and operator are not in the notebook's first import
# cell -- confirm they are imported before this cell runs.
# NOTE(review): the plot title assumes rows are ordered most-frequent first -- confirm.
frequency_accum = list(itertools.accumulate(common_vocab['freq'], operator.add))
frequency_accum = [f/frequency_accum[-1] for f in frequency_accum]
px.line(frequency_accum, title="Fraction of words in corpus covered by vocab of size x")
# Based on this plot, reasonable prefix sizes seem to be 1000, 500, 200, maybe 100. Just go with 500 for now.