In [None]:

from transformers import GPT2Model, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and the (PyTorch) GPT-2 base model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Two passages about the Rényi entropy, named as the complex and the simple variant.
text_complex = "In information theory, the Rényi entropy is a quantity that generalizes various notions of entropy, including Hartley entropy, Shannon entropy, collision entropy, and min-entropy. The Rényi entropy is named after Alfréd Rényi, who looked for the most general way to quantify information while preserving additivity for independent events. In the context of fractal dimension estimation, the Rényi entropy forms the basis of the concept of generalized dimensions. As α approaches zero, the Rényi entropy increasingly weighs all events with nonzero probability more equally, regardless of their probabilities. In the limit for α → 0, the Rényi entropy is just the logarithm of the size of the support of X. The limit for α → 1 is the Shannon entropy. As α approaches infinity, the Rényi entropy is increasingly determined by the events of highest probability."
text_simple = "The Rényi entropy is a type of entropy that measures information. It is named after Alfréd Rényi, who wanted to find a way to measure information that was general and worked for different types of events. The Rényi entropy is used in fractal dimension estimation. As the value of alpha changes, the Rényi entropy values different events differently. When alpha is very small, the Rényi entropy values all events equally. When alpha is very large, the Rényi entropy values the events with the highest probability the most. When alpha is 1, the Rényi entropy is the same as the Shannon entropy."

# GPT2Model is the PyTorch implementation, so request PyTorch ("pt") tensors;
# TensorFlow ("tf") tensors would need TensorFlow installed and could not be
# passed to this model.
tok_complex = tokenizer.encode(text_complex, return_tensors='pt')
tok_simple = tokenizer.encode(text_simple, return_tensors='pt')

In [None]:
from transformers import pipeline
import numpy as np
import tqdm

# Fill-mask pipeline backed by the 'bert-large-uncased-whole-word-masking' checkpoint.
unmasker = pipeline('fill-mask', model='bert-large-uncased-whole-word-masking')

# Two passages about the Rényi entropy, named as the complex and the simple variant.
# They are defined in this cell so the cell does not rely on earlier cells for them.
text_complex = "In information theory, the Rényi entropy is a quantity that generalizes various notions of entropy, including Hartley entropy, Shannon entropy, collision entropy, and min-entropy. The Rényi entropy is named after Alfréd Rényi, who looked for the most general way to quantify information while preserving additivity for independent events. In the context of fractal dimension estimation, the Rényi entropy forms the basis of the concept of generalized dimensions. As α approaches zero, the Rényi entropy increasingly weighs all events with nonzero probability more equally, regardless of their probabilities. In the limit for α → 0, the Rényi entropy is just the logarithm of the size of the support of X. The limit for α → 1 is the Shannon entropy. As α approaches infinity, the Rényi entropy is increasingly determined by the events of highest probability."
text_simple = "The Rényi entropy is a type of entropy that measures information. It is named after Alfréd Rényi, who wanted to find a way to measure information that was general and worked for different types of events. The Rényi entropy is used in fractal dimension estimation. As the value of alpha changes, the Rényi entropy values different events differently. When alpha is very small, the Rényi entropy values all events equally. When alpha is very large, the Rényi entropy values the events with the highest probability the most. When alpha is 1, the Rényi entropy is the same as the Shannon entropy."

# Disabled example of calling the pipeline directly on a single masked sentence:
# out = unmasker("Hello I'm a [MASK] model.")
# print(out)

def compute_complexity_single(text, word):
    """Score a single masked position with the fill-mask pipeline.

    Args:
        text: Passage in which one word has been replaced by ``[MASK]``.
        word: The original word that the mask replaced.

    Returns:
        A ``(prob_top, matched)`` tuple. ``prob_top`` is the score of the
        first candidate returned by ``unmasker`` (the pipeline lists its
        candidates best-first). ``matched`` is the number of returned
        candidates whose token equals ``word`` after normalisation.
    """
    import string

    out = unmasker(text)

    # ``word`` comes from whitespace splitting, so it may carry attached
    # punctuation ("theory,") and original casing ("In"). The candidates
    # come from an uncased model and carry neither, so compare
    # case-insensitively with surrounding punctuation removed. A word that
    # consists only of punctuation is kept as-is.
    target = (word.strip(string.punctuation) or word).casefold()
    out_matching = [
        x for x in out if x["token_str"].strip().casefold() == target
    ]

    prob_top = out[0]["score"]
    return prob_top, len(out_matching)

def compute_complexity(text):
    """Print masked-word predictability statistics for ``text``.

    The text is split on single spaces. Each word in turn is replaced by
    ``[MASK]`` and the masked passage is scored with
    ``compute_complexity_single``. Two summary lines are printed:

    * ``Prob``: mean top-candidate score over all masked positions.
    * ``Matched``: mean match count over all masked positions, formatted
      as a percentage.

    Args:
        text: The passage to analyse.

    Returns:
        None. The results are printed.
    """
    words = text.split(" ")
    prob_top_all = []
    matched_all = []
    for i in tqdm.tqdm(range(len(words))):
        # Rebuild the passage with only the i-th word masked.
        masked_text = " ".join(words[:i] + ["[MASK]"] + words[i + 1:])
        prob_top, matched = compute_complexity_single(masked_text, words[i])
        prob_top_all.append(prob_top)
        matched_all.append(matched)
    print(f"Prob: {np.average(prob_top_all):.2f}")
    # Average over every position; using the loop variable ``matched`` here
    # would report only the last word's result.
    print(f"Matched: {np.average(matched_all):.5%}")

# Score the complex passage first, then the simple one, so the printed
# summaries can be compared.
for passage in (text_complex, text_simple):
    compute_complexity(passage)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and the causal language model from the same
# "bigscience/bloom-1b7" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b7")
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7")

# Placeholder prompt used to try out generation.
input_text = "Your existing text here."

# Convert the prompt into a PyTorch tensor of token ids.
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate exactly one new token and ask for the per-step scores to be
# returned alongside it. No other decoding options are passed.
output = model.generate(
    input_ids=input_ids,
    max_new_tokens=1,
    pad_token_id=tokenizer.eos_token_id,
    return_dict_in_generate=True,
    output_scores=True,
)
print(output["scores"])

# Get token scores
# token_scores = output[0][0].tolist()
# decoded_tokens = tokenizer.decode(input_ids[0])

In [None]:
def compute_complexity_2(text):
    """Print the causal LM's confidence in its next-token prediction.

    ``text`` is encoded with the notebook-level ``tokenizer`` and a single
    new token is generated with the notebook-level ``model``. The scores
    for that one step are printed, followed by the probability of the most
    likely next token.

    Args:
        text: The passage used as the generation prompt.

    Returns:
        None. The results are printed.
    """
    input_ids = tokenizer.encode(text, return_tensors="pt")
    output = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=1,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # One generation step and one input sequence: index the first step,
    # then the first sequence, to get the per-vocabulary-token scores.
    next_token_scores = output['scores'][0][0]
    print(next_token_scores)

    # The generation scores are pre-softmax values, so their plain mean is
    # not a probability. Normalise with softmax and report the probability
    # of the top candidate, in line with ``prob_top`` in compute_complexity.
    top_prob = torch.softmax(next_token_scores.float(), dim=-1).max().item()
    print(f"Prob: {top_prob:.2f}")

# Run the next-token check on the complex passage, then on the simple one.
for passage in (text_complex, text_simple):
    compute_complexity_2(passage)