In [None]:
!pip install minicons
!pip install nltk
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install transformers
!pip install huggingface_hub

In [None]:
# Import "scorer" module from the minicons library
from minicons import scorer

# Load helper functions from the file helper_functions.py
from helper_functions import *

In [None]:
# Model identifier handed to both the scorer and `initialize_bos` below.
model_name = "HuggingFaceTB/SmolLM2-135M"

# Load the specified model. IncrementalLMScorer is a wrapper around any causal
# (autoregressive) language model accessible via Hugging Face’s transformers.
lmScorer = scorer.IncrementalLMScorer(model_name)

# BOS is produced by the project helper `initialize_bos` for this model; it is
# passed as `bos_token` whenever a sentence is scored.
BOS = initialize_bos(model_name)

In [None]:
# Word-level tokenizer used for calculating word-level surprisals: the `tokenize`
# method of an NLTK TweetTokenizer, supplied as `tokenize_function` when scoring.
from nltk.tokenize import TweetTokenizer
word_tokenizer = TweetTokenizer().tokenize

In [None]:
# 2x2 stimulus set built from one shared frame: the noun inside the "to the ..."
# phrase is singular or plural (sg / pl), and the verb either agrees with the
# singular subject "key" ("was", gram) or does not ("were", ungram).
_SENTENCE_FRAME = "The key to the {noun} {verb} rusty from many years of disuse."

gram_sg = _SENTENCE_FRAME.format(noun="cabinet", verb="was")
gram_pl = _SENTENCE_FRAME.format(noun="cabinets", verb="was")
ungram_sg = _SENTENCE_FRAME.format(noun="cabinet", verb="were")
ungram_pl = _SENTENCE_FRAME.format(noun="cabinets", verb="were")

#### Generate word-by-word surprisals for the `Agreement Attraction` experiment in English

In [None]:
def _word_surprisals(sentence):
    """Return word-by-word surprisals for ``sentence`` from ``lmScorer``.

    Thin wrapper around ``lmScorer.word_score_tokenized`` that fixes the
    settings shared by all four conditions in one place, so they cannot drift
    apart: ``BOS`` as the BOS token, ``word_tokenizer`` as the tokenize
    function, ``surprisal=True`` and ``bow_correction=True``.

    Args:
        sentence: The sentence (string) to score.

    Returns:
        The value returned by ``word_score_tokenized`` for this sentence; it is
        handed unchanged to ``plot_word_by_word_surprisals`` below.
    """
    return lmScorer.word_score_tokenized(
        sentence,
        bos_token=BOS,
        tokenize_function=word_tokenizer,
        surprisal=True,
        bow_correction=True,
    )


# Score and plot each condition in turn (same order as the stimulus definitions).
word_surprisal_gram_sg = _word_surprisals(gram_sg)
plot_word_by_word_surprisals(word_surprisal_gram_sg)

word_surprisal_gram_pl = _word_surprisals(gram_pl)
plot_word_by_word_surprisals(word_surprisal_gram_pl)

word_surprisal_ungram_sg = _word_surprisals(ungram_sg)
plot_word_by_word_surprisals(word_surprisal_ungram_sg)

word_surprisal_ungram_pl = _word_surprisals(ungram_pl)
plot_word_by_word_surprisals(word_surprisal_ungram_pl)

In [None]:
from helper_functions import get_word_surprisal

# Pair each stimulus sentence with the verb whose surprisal we want to inspect.
# The sentences themselves (gram_sg, gram_pl, ungram_sg, ungram_pl) are the ones
# defined in the stimulus cell earlier in the notebook, so there is a single
# source of truth for the materials.
sentence_target_pairs = [
    (gram_sg, "was"),
    (gram_pl, "was"),
    (ungram_sg, "were"),
    (ungram_pl, "were"),
]

# Look up and report the surprisal of the target verb in each sentence.
# `target_word` and `word_surp` remain bound to the last pair after the loop.
for sentence, target_word in sentence_target_pairs:
    word_surp = get_word_surprisal(scorer=lmScorer, BOS=BOS, sentence=sentence, target=target_word)
    print(f"Surprisal for '{target_word}' in '{sentence}'\n{word_surp}\n")