# Measuring Entropy in Text with a Quick Demonstration Script

## Introduction

Because Large Language Models (LLMs) fundamentally predict the next token, they can be used to measure how "shocking" (i.e. surprising) each next token is in pre-generated text.

## Model Selection

[Microsoft's Phi 1.5](https://huggingface.co/microsoft/phi-1_5) model will be used. It was not chosen for any particular reason, but it is conveniently large enough to do the job.

In [3]:
import numpy as np
import pandas as pd
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import matplotlib.pyplot as plt
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the sample passage that every later cell analyses.
# The encoding is pinned so the text (and therefore its tokenization) does not
# depend on the platform's default locale encoding.
CHATGPT_TEXT = Path("../samples/chatgpt-narrative-economics.txt").read_text(encoding="utf-8")

In [None]:
# Create every new tensor (model weights included) on the GPU by default.
torch.set_default_device("cuda")

# Notebook cells get re-run: only load the heavy objects when they are not
# already defined in this session.
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository; keep it only for sources you trust.
if "tokenizer" not in locals():
    tokenizer = AutoTokenizer.from_pretrained(
        "microsoft/phi-1_5",
        trust_remote_code=True,
    )
if "model" not in locals():
    # Hand cached GPU blocks back before allocating the model weights.
    torch.cuda.empty_cache()
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-1_5",
        torch_dtype="auto",
        trust_remote_code=True,
    )

In [None]:
# Tokenize the whole sample into PyTorch tensors; no attention mask is requested.
inputs = tokenizer(CHATGPT_TEXT, return_tensors="pt", return_attention_mask=False)
inputs  # last expression of the cell, so the notebook displays it

In [None]:
from math import log
from torch.amp import autocast

# Drop any stored gradients (set them to None) to reduce memory usage.
model.zero_grad(set_to_none=True)

def measure_losses(input_txt: str) -> tuple[torch.Tensor, list[float]]:
    """
    Measures the per-token surprisal (negative log-probability) of a piece of text.

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model`` once. For every token after the first, the
    probability the model assigned to that token *given the preceding tokens*
    is looked up and converted to ``-ln(p)`` (natural log, i.e. nats).

    Args:
        input_txt: The text to score.

    Returns:
        A ``(input_ids, surprisals)`` pair. ``input_ids`` is the tokenizer's
        id tensor for the text. ``surprisals`` has one entry per token, so
        index ``i`` lines up with token ``i``. Entry 0 is ``0.0`` because the
        model has no preceding context from which to predict the first token.
        Tokens whose probability is at or below ``1e-8`` are reported as
        ``100.0`` instead of their true (very large) surprisal.
    """
    min_prob = 1e-8          # probabilities at or below this are capped
    capped_surprisal = 100.0  # value reported for such tokens

    inputs = tokenizer(
        input_txt,
        return_tensors="pt",
        return_attention_mask=True
    )
    input_ids = inputs["input_ids"]
    num_tokens = input_ids.shape[1]
    if num_tokens == 0:
        # Nothing to score; avoid running the model on an empty sequence.
        return input_ids, []

    # Run on whatever device the model weights live on.
    device = next(model.parameters()).device
    inputs = inputs.to(device)
    input_ids = inputs["input_ids"]

    # Inference only: no autograd graph is needed, which saves memory.
    # autocast requires an explicit device type.
    with torch.no_grad(), torch.autocast(device_type=device.type):
        logits = model(**inputs).logits

    # The logits at position i score the token at position i + 1, so drop the
    # last position and compare against the ids shifted left by one.
    # Normalise over the vocabulary axis (the last one), in float32 for
    # numerical stability.
    log_probs = torch.log_softmax(logits[0, :-1].float(), dim=-1)
    next_ids = input_ids[0, 1:].unsqueeze(-1)
    token_log_probs = log_probs.gather(-1, next_ids).squeeze(-1).tolist()

    log_min_prob = log(min_prob)
    return_array = [0.0]  # first token: no context, see docstring
    for token_log_prob in token_log_probs:
        if token_log_prob > log_min_prob:
            return_array.append(-token_log_prob)
        else:
            return_array.append(capped_surprisal)

    return input_ids, return_array

# Release cached GPU memory before the forward pass.
torch.cuda.empty_cache()
# Score the first 1,000 characters of the sample. A slice is required here:
# plain indexing with [1_000] would pass a single character.
chat_tokens, chat_entropy = measure_losses(CHATGPT_TEXT[:1_000])

In [None]:
def sum_over_interval(entropy: list, window_size: int = 30) -> list:
    """
    Calculates the sum of entropy over consecutive, non-overlapping token windows
    (likely to be much more stable than per-token values).

    Args:
        entropy: Per-token entropy values, in token order.
        window_size: Number of tokens per window; must be at least 1.

    Returns:
        One sum per *complete* window: element ``k`` is the sum of
        ``entropy[k * window_size : (k + 1) * window_size]``. A trailing
        partial window (fewer than ``window_size`` values) is dropped.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    # Only whole windows are reported, so stop at the last full boundary.
    covered = (len(entropy) // window_size) * window_size
    return [
        sum(entropy[start:start + window_size])
        for start in range(0, covered, window_size)
    ]
    
# Collapse the per-token values into window totals (default window size).
chat_entropy_totals = sum_over_interval(chat_entropy)

# One point per window, plotted against the window index.
fig, ax = plt.subplots()
window_indices = range(len(chat_entropy_totals))
ax.plot(window_indices, chat_entropy_totals, label="chat")
ax.legend()
# Start the y-axis at zero and leave 10% headroom above the largest total.
ax.set_ylim(0, max(chat_entropy_totals) * 1.1)
ax.set_title("Entropy of a passage I wrote over time")
plt.show()

In [None]:
# running a fft on the information
from scipy.fft import fft

# Discrete Fourier transform of the per-token entropy sequence.
fft_result = fft(chat_entropy)

# Frequency axis. With a sample spacing of 1/N the bins come out as whole
# numbers of cycles across the full sequence.
n_bins = len(fft_result)
freq = np.fft.fftfreq(n_bins, 1 / len(chat_entropy))

# Top panel: the raw signal against token index.
plt.subplot(2, 1, 1)
plt.plot(range(len(chat_entropy)), chat_entropy)
plt.title("Original Signal")

# Bottom panel: magnitude of each frequency component.
plt.subplot(2, 1, 2)
plt.plot(freq, np.abs(fft_result))
plt.title("FFT of the Signal")

plt.show()

In [None]:
# Look up the ids of the marker tokens: the first id the tokenizer
# produces for a lone newline and for a lone period.
ids = tokenizer("\n", return_tensors="pt", return_attention_mask=True).input_ids
new_line = ids[0, 0]

ids = tokenizer(".", return_tensors="pt", return_attention_mask=True).input_ids
period = ids[0, 0]

# Token ids of the complete sample text, used later to locate the markers.
tokenized_first_sent = tokenizer(
    CHATGPT_TEXT, return_tensors="pt", return_attention_mask=True
).input_ids

new_line  # last expression of the cell, so the notebook displays it

In [None]:
# taking a rolling average of the data

kernel = np.ones(18) / 18

# mode='valid' keeps only positions where the kernel fully overlaps the data,
# so the result is len(kernel) - 1 samples shorter than chat_entropy.
entropy_smoothed = np.convolve(chat_entropy, kernel, mode='valid')

# Compare plain Python ints rather than tensors inside the loop.
newline_id = int(new_line)
period_id = int(period)

# Only the first line of each kind gets a legend entry; '_nolegend_' keeps the
# remaining lines out of the legend so it is not flooded with duplicates.
newline_label = 'newline'
period_label = 'period'
for j, token_id in enumerate(tokenized_first_sent[0].tolist()):
    if token_id == newline_id:
        plt.axvline(x=j, color='r', linestyle='--', label=newline_label)
        newline_label = '_nolegend_'
    if token_id == period_id:
        plt.axvline(x=j, color='b', linestyle='--', label=period_label)
        period_label = '_nolegend_'


plt.plot(range(len(entropy_smoothed)), entropy_smoothed, label='rolling mean (18 tokens)')
plt.xlim(0, 200)
# Every plotted artist above is labelled, so the legend is no longer empty.
plt.legend()
plt.title('Original Signal')
plt.show()

print("Example text:", CHATGPT_TEXT[:len(CHATGPT_TEXT) // 2])

In [None]:
# Tokenize the second quarter of the sample text.
inputs = tokenizer(
    CHATGPT_TEXT[len(CHATGPT_TEXT) // 4:len(CHATGPT_TEXT) // 2],
    return_tensors="pt",
    return_attention_mask=True
)

# Decode two overlapping token ranges; each label states the slice it prints.
print("180-240:", tokenizer.decode(inputs.input_ids[0, 180:240]))
print("200-260:", tokenizer.decode(inputs.input_ids[0, 200:260]))

In [None]:
# Display the shape of the one-hot encoding of the current token ids
# (51,200 classes); only the shape is shown, the tensor is not kept.
torch.nn.functional.one_hot(
    inputs["input_ids"],
    num_classes=51_200,
).shape

## Measuring Entropy Using ZSTD

This will be measured using the compression ratio, or how much the text can be compressed.

- This could be inaccurate: longer pieces of text tend to compress better (and so appear to have lower entropy) by default, and there may be quantization effects, since zstd is being treated here as a black-box algorithm.

In [1]:
import zstd

def print_compression_ratio(input_str: str) -> None:
    """Print the fraction of bytes saved by compressing *input_str* with zstd.

    The ratio is ``1 - compressed_bytes / original_bytes``, where both sizes
    are measured on the UTF-8 encoding of the text, so non-ASCII characters
    (which occupy several bytes each) do not skew the result.

    Args:
        input_str: The text to compress.
    """
    raw = input_str.encode("utf-8")
    if not raw:
        # No bytes to compress: the ratio would be a division by zero.
        print("Compression ratio undefined (empty input)")
        return

    compressed_result = zstd.compress(raw)

    print(f"Compression ratio {1 - len(compressed_result) / len(raw)}")

print_compression_ratio("I have some sample text which I want to compress with zstd compresoin" * 20)
# NOTE: CHATGPT_TEXT comes from the file-loading cell earlier in the notebook;
# that cell must have been run in the current session before this line.
print_compression_ratio(CHATGPT_TEXT)

Compression ratio 0.9398550724637681


NameError: name 'CHATGPT_TEXT' is not defined