In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer
# Both are fetched by Hugging Face model id and stored under the local
# ".cache" directory; `tokenizer` and `model` are the notebook-level globals
# that the helper functions in later cells read directly.
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=".cache")
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=".cache")
# Switch to evaluation mode. eval() returns the model, so as the last
# expression of the cell it also displays the module tree.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [2]:
def top_k_token(prompt, k=5):
    """Return the ``k`` highest-scoring next tokens for ``prompt`` with their logits.

    Uses the notebook-level ``tokenizer`` and ``model`` globals.

    Args:
        prompt: Text whose continuation is being predicted.
        k: Number of candidate tokens to return.

    Returns:
        A tuple ``(top_tokens, top_logits)``: the decoded candidate strings in
        descending score order, and a 1-D tensor with the matching raw logits
        (not probabilities).
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # Scores for the position that follows the last prompt token.
    logits = outputs.logits[:, -1, :]
    # Use the function's own ``k``. Reading the notebook-global ``top_k`` here
    # would raise NameError on a fresh kernel and silently ignore the argument
    # once some later cell had defined that global.
    topk_values, topk_indices = torch.topk(logits, k, dim=-1)
    # Passing a 1-D tensor of ids to batch_decode decodes every id on its own,
    # giving one string per candidate token.
    top_tokens = tokenizer.batch_decode(
        topk_indices[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    return top_tokens, topk_values[0]

import torch
import torch.nn.functional as F

def top_k_token_prop(prompt, k=5):
    """Return the ``k`` best next tokens for ``prompt`` with probabilities over those ``k``.

    Relies on the notebook-level ``tokenizer`` and ``model``. Along the way it
    prints the encoded prompt, the prompt split into its decoded tokens, and
    the raw model output.

    Args:
        prompt: Text whose continuation is being predicted.
        k: Number of candidate tokens to keep.

    Returns:
        ``(top_tokens, top_probs)`` where ``top_probs`` is a 1-D tensor obtained
        by a softmax over the ``k`` kept logits only, so it always sums to 1
        and is not the probability under the full vocabulary.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    print(encoded)
    prompt_pieces = tokenizer.batch_decode(
        encoded['input_ids'][0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(prompt_pieces)

    with torch.no_grad():
        result = model(**encoded)
    print(result)

    # Keep only the k largest scores at the position after the last prompt token.
    best = torch.topk(result.logits[:, -1, :], k, dim=-1)

    candidate_tokens = tokenizer.batch_decode(
        best.indices[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # Normalise across the kept logits only.
    candidate_probs = F.softmax(best.values, dim=-1)
    return candidate_tokens, candidate_probs[0]

In [3]:
prompt = "How can you"
top_k = 10

# top_k_token_prop returns softmax probabilities normalised over the k kept
# logits, so the values are labelled as probabilities; printing them as
# "logit" misreports what the numbers are.
top_tokens, top_values = top_k_token_prop(prompt, k=top_k)
for i, (token, score) in enumerate(zip(top_tokens, top_values)):
    print(f"{i+1}. {token} (prob: {score.item():.4f})")
# Sum everything that was returned instead of a hard-coded first-10 slice, so
# the check stays meaningful when top_k is changed.
print(f"Sum: {top_values.sum().item():.4f}")

{'input_ids': tensor([[4340,  646,  498]]), 'attention_mask': tensor([[1, 1, 1]])}
['How', ' can', ' you']
CausalLMOutputWithPast(loss=None, logits=tensor([[[ 8.2367,  6.6111,  4.5391,  ..., -4.0680, -4.0680, -4.0681],
         [ 4.1064,  5.5320,  4.0867,  ..., -3.4824, -3.4814, -3.4826],
         [ 6.9169,  8.1698,  9.0604,  ..., -3.9949, -3.9945, -3.9951]]]), past_key_values=<transformers.cache_utils.DynamicCache object at 0x76550ce6cfe0>, hidden_states=None, attentions=None)
1.  determine (logit: 0.2247)
2.  use (logit: 0.1334)
3.  find (logit: 0.1147)
4.  modify (logit: 0.1048)
5.  simplify (logit: 0.0876)
6.  calculate (logit: 0.0757)
7.  solve (logit: 0.0678)
8.  express (logit: 0.0650)
9.  prove (logit: 0.0634)
10.  create (logit: 0.0630)
Sum: 1.0000


In [15]:
import json
import os
import numpy as np

from arithmetic_coder import ArithmeticEncoder, BitInputStream, BitOutputStream


def gen_rank(probs, next_token):
    """Return the rank of each true next token within its predicted distribution.

    Rank 0 means the token had the highest probability in its row.

    Args:
        probs: Tensor of shape (batch, vocab) holding one distribution per row.
        next_token: Tensor of shape (batch,) with the token id that actually
            followed each context.

    Returns:
        1-D integer tensor of length ``batch`` with the 0-based ranks.
    """
    # order[b, r] is the token id that holds rank r in row b. A stable sort
    # keeps tied probabilities in ascending id order, so ranks are deterministic.
    _, order = torch.sort(probs, dim=-1, descending=True, stable=True)
    # Every id occurs exactly once per row, so the broadcast comparison marks a
    # single column per row. torch.where lists the hits row by row, and the
    # column index of each hit is the rank. This one expression covers every
    # batch size, so no per-row loop or single-row special case is needed.
    return torch.where(order == next_token.unsqueeze(-1))[-1]


def read_bitstream(bitin):
    """Read every bit from ``bitin`` and drop the zeros after the last 1 bit.

    Args:
        bitin: Object whose ``read()`` returns the next bit (0 or 1) and -1
            once the stream is exhausted.

    Returns:
        1-D integer numpy array with the bits up to and including the final 1.
        The array is empty when the stream is empty or contains no 1 bit at
        all; indexing the last 1 unconditionally would raise IndexError there.
    """
    bits = []
    while True:
        bit = bitin.read()
        if bit == -1:
            break
        bits.append(bit)
    bit_arr = np.array(bits, dtype=int)

    one_positions = np.flatnonzero(bit_arr == 1)
    if one_positions.size == 0:
        # Nothing but padding (or nothing at all): no payload bits to report.
        return bit_arr[:0]

    # Keep everything up to and including the last set bit.
    return bit_arr[:one_positions[-1] + 1]


def print_tokens(tokens):
    """Print at most the first 1000 entries of ``tokens``, decoded and '|'-separated.

    Decoding uses the notebook-level ``tokenizer``; each entry of ``tokens`` is
    decoded on its own so token boundaries stay visible in the output.
    """
    pieces = tokenizer.batch_decode(
        tokens[:1000],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(*pieces, sep="|")


def print_probs(probs):
    """Print each value of ``probs`` with four decimals, separated by '|'."""
    formatted = map("{:.4f}".format, probs)
    print("|".join(formatted))


class LLMzip_encode:
    """Compress a text file by arithmetic-coding its tokens under a language model.

    Every token after the first ``win_size`` ones is coded with the
    distribution the model predicts from the ``win_size`` tokens before it.

    Files written next to ``filename``:
        * ``<filename>_llmzip_ac.txt``     - the arithmetic-coded bit stream
        * ``<filename>_tokens.npy``        - cached tokenisation of the input
        * ``<filename>_starter_tokens.npy``- the first ``win_size`` tokens
        * ``<filename>_metrics.json``      - compression statistics
    """

    # Probabilities are multiplied by this factor to build the cumulative
    # frequency table handed to the arithmetic coder.
    FREQ_SCALE = 10000000
    # Only this many leading tokens of the input are compressed.
    MAX_TOKENS = 1000
    # Number of context windows sent through the model per forward pass.
    BATCH_SIZE = 2048

    def __init__(self, model, tokenizer, filename):
        """Store the model/tokenizer and open the compressed output stream.

        Args:
            model: Causal language model; calling it on a (batch, seq) tensor
                of token ids must return an object with a ``logits`` attribute.
            tokenizer: Tokenizer providing ``encode``, ``decode``.
            filename: Path of the text file to compress; also the prefix of
                every output file.
        """
        self.model = model
        self.tokenizer = tokenizer

        self.filename = filename
        # The output file is opened here and closed at the end of encode().
        self.file_out = open(self.filename+'_llmzip_ac.txt', 'wb')
        self.bitout = BitOutputStream(self.file_out)
        # NOTE(review): 32 is presumably the coder's state width in bits -
        # confirm against arithmetic_coder.ArithmeticEncoder.
        self.AC_encoder = ArithmeticEncoder(32, self.bitout)

    def encode_batch(self, prompt_tokens):
        """Arithmetic-code the last token of every row given the tokens before it.

        Args:
            prompt_tokens: Integer array of shape (batch, win_size + 1). The
                last column is the token to encode, the rest is its context.

        Returns:
            ``(ranks, probs_tok)``: per row, the rank of the true token in the
            predicted distribution and the probability assigned to it.
        """
        print_tokens(prompt_tokens)
        bsz = prompt_tokens.shape[0]
        prompt_size = prompt_tokens.shape[1]

        # The rows are complete windows, so no padding buffer is needed (and
        # the tokenizer does not have to define a pad token).
        tokens = torch.as_tensor(prompt_tokens).long()
        print(tokens)

        cur_pos = prompt_size - 1

        # Inference only: without no_grad every batch would keep an autograd
        # graph alive for no purpose.
        with torch.no_grad():
            logits = self.model(tokens[:, :cur_pos]).logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        rank = gen_rank(probs, next_token=tokens[:, cur_pos])

        probs_np2 = probs.cpu().detach().numpy()
        tokens_np2 = tokens[:, cur_pos].cpu().numpy()
        ranks_np2 = rank.cpu().numpy()

        # Probability the model gave to each true token.
        probs_tok = probs_np2[np.arange(bsz), tokens_np2]
        print_tokens(tokens_np2)
        print_probs(probs_tok)

        # Size the table from the distribution itself instead of relying on a
        # ``vocab_size`` attribute of the model object.
        cumul = np.zeros(probs_np2.shape[-1] + 1, dtype=np.uint64)
        for j in range(bsz):
            # The arithmetic is left exactly as it was (same dtype, same
            # scale): any decoder has to rebuild an identical table.
            # The +1 presumably keeps every symbol's interval non-empty.
            cumul[1:] = np.cumsum(probs_np2[j] * self.FREQ_SCALE + 1)
            self.AC_encoder.write(cumul, tokens_np2[j])

        return ranks_np2, probs_tok

    def encode(self, win_size: int):
        """Compress the input file and write the compression metrics.

        Args:
            win_size: Number of preceding tokens used as context for each
                encoded token.

        Raises:
            ValueError: If the (truncated) input has fewer than
                ``win_size + 1`` tokens, i.e. there is nothing to encode.
        """
        try:
            tokens_encoded, probs_tok_full = self._encode_windows(win_size)
            self.AC_encoder.finish()
        finally:
            # Release the output file even when encoding fails half-way.
            try:
                self.bitout.close()
            finally:
                self.file_out.close()

        self.compute_compression_ratio(tokens_encoded, probs_tok_full)

    def _load_tokens(self):
        """Return the token ids of the input file, tokenising it on first use."""
        cache_path = self.filename + '_tokens.npy'
        if os.path.exists(cache_path):
            return np.load(cache_path)

        # Read as UTF-8 explicitly so non-ASCII text does not depend on the
        # platform's default encoding.
        with open(self.filename, 'r', encoding='utf-8') as f_in:
            text_input = f_in.read()

        # Use the tokenizer given to this instance, not the notebook global.
        tokens_full = np.array(self.tokenizer.encode(text_input))
        np.save(cache_path, tokens_full)
        return tokens_full

    def _encode_windows(self, win_size):
        """Feed every sliding window through encode_batch; return (tokens, probs)."""
        tokens_full = self._load_tokens()[:self.MAX_TOKENS]
        print_tokens(tokens_full)

        # One extra position carries the true token next to its context.
        win_size_enc = win_size + 1
        n_runs = tokens_full.size - win_size_enc + 1
        if n_runs <= 0:
            raise ValueError(
                f"Input has {tokens_full.size} tokens; at least {win_size_enc} "
                f"are required for win_size={win_size}"
            )

        tokens_encoded = tokens_full[win_size:win_size+n_runs]
        print(win_size)
        # The first win_size tokens are never coded; they are stored verbatim.
        np.save(self.filename + '_starter_tokens.npy', tokens_full[:win_size])

        bsz = self.BATCH_SIZE
        n_batches = int(np.ceil(n_runs / bsz))

        probs_tok_list = []
        last_decile = -1
        for b_ind in range(n_batches):
            batch_start = b_ind * bsz
            batch_stop = min(n_runs, batch_start + bsz)
            tokens_batch = np.array(
                [tokens_full[i: i + win_size_enc] for i in range(batch_start, batch_stop)]
            )
            _, probs_tok = self.encode_batch(tokens_batch)
            probs_tok_list.append(probs_tok)

            # Progress is measured in batches. Multiplying by the batch size
            # as well produced values far above 100.
            decile = (b_ind * 10) // n_batches
            if decile != last_decile:
                last_decile = decile
                print(f'Encoder: Completed {decile * 10} %')

        probs_tok_full = np.concatenate(probs_tok_list, 0).squeeze()
        return tokens_encoded, probs_tok_full

    def compute_compression_ratio(self, tokens_encoded, probs_tok):
        """Print compression statistics and save them to ``<filename>_metrics.json``.

        Args:
            tokens_encoded: Array of the token ids that were arithmetic-coded.
            probs_tok: Probability the model assigned to each of those tokens.
        """
        text_encoded = self.tokenizer.decode(tokens_encoded.squeeze().tolist())

        N_T = tokens_encoded.size
        N_C = len(text_encoded)

        df_out = {'characters': N_C, 'tokens': N_T}

        # Total -log2(p) of the encoded tokens, per character of decoded text.
        entropy_val = np.sum(-np.log2(probs_tok)) / N_C
        df_out['entropy'] = [f"{entropy_val:.4f}"]

        with open(self.filename+"_llmzip_ac.txt", 'rb') as file_in:
            compressed_bits = read_bitstream(BitInputStream(file_in))
        rho_AC = compressed_bits.size/N_C
        print(f'Compression Ratio for Arithmetic Coding :  {rho_AC} bits/char')

        # The key name is kept unchanged for whatever reads the metrics file,
        # regardless of which model produced the probabilities.
        df_out['Llama+AC compressed file size'] = compressed_bits.size
        df_out['bits per character'] = rho_AC

        print(df_out)

        with open(self.filename+'_metrics.json', 'w') as file_metrics:
            json.dump(df_out, file_metrics)


# Small end-to-end run on '../test.txt'. Constructing the encoder already opens
# '../test.txt_llmzip_ac.txt' for writing.
Encoder = LLMzip_encode(model, tokenizer, filename='../test.txt')

# Each token is coded from the 13 tokens before it; the first 13 tokens are
# saved uncompressed as the starter context.
Encoder.encode(win_size=13)

Alice| was| beginning| to| get| very| tired| of| sitting| by| her| sister| on| the| bank|.
13
Alice was beginning to get very tired of sitting by her sister on the| was beginning to get very tired of sitting by her sister on the bank| beginning to get very tired of sitting by her sister on the bank.
tensor([[61686,   572,  7167,   311,   633,  1602, 19227,   315, 11699,   553,
          1059, 12923,   389,   279],
        [  572,  7167,   311,   633,  1602, 19227,   315, 11699,   553,  1059,
         12923,   389,   279,  6073],
        [ 7167,   311,   633,  1602, 19227,   315, 11699,   553,  1059, 12923,
           389,   279,  6073,    13]])
 the| bank|.
0.9598|0.0177|0.2120
Encoder: Completed 0 %
Compression Ratio for Arithmetic Coding :  0.7 bits/char
{'characters': 10, 'tokens': 3, 'entropy': ['0.8113'], 'Llama+AC compressed file size': 7, 'bits per character': 0.7}


In [17]:
# Same pipeline on '../text8.txt' with a longer context. This rebinds the
# notebook-level `Encoder` and opens '../text8.txt_llmzip_ac.txt' for writing.
Encoder = LLMzip_encode(model, tokenizer, filename='../text8.txt')

# Each token is coded from the 100 tokens before it.
Encoder.encode(win_size=100)

 anarch|ism| originated| as| a| term| of| abuse| first| used| against| early| working| class| radicals| including| the| dig|gers| of| the| english| revolution| and| the| sans| cul|ottes| of| the| french| revolution| whilst| the| term| is| still| used| in| a| pe|j|orative| way| to| describe| any| act| that| used| violent| means| to| destroy| the| organization| of| society| it| has| also| been| taken| up| as| a| positive| label| by| self| defined| anarchists| the| word| anarch|ism| is| derived| from| the| g|reek| without| arch|ons| ruler| chief| king| anarch|ism| as| a| political| philosophy| is| the| belief| that| rulers| are| unnecessary| and| should| be| abolished| although| there| are| differing| interpretations| of| what| this| means| anarch|ism| also| refers| to| related| social| movements| that| advocate| the| elimination| of| authoritarian| institutions| particularly| the| state| the| word| an|archy| as| most| anarchists| use| it| does| not| imply| chaos| nihil|ism| or| anom|ie| 

In [None]:
# Alternative sample inputs (Chinese and French); uncomment one to try it.
# string_data = "1. 子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？」"
# string_data = "蓋聞天地之數，有十二萬九千六百歲為一元。將一元分為十二會，乃子、丑、寅、卯、辰、巳、午、未、申、酉、戌、亥之十二支也。"
string_data = "Alice was beginning to get very tired of sitting by her sister on the bank"
# string_data = "En 1815, M. Charles-François-Bienvenu Myriel était évêque de Digne. C'était un vieillard d'environ soixante-quinze ans; il occupait le siège de Digne depuis 1806."
top_k = 10

# Greedy rank coding: walk through string_data and, whenever one of the model's
# top_k next-token guesses matches the upcoming text, store that guess's rank
# (an int); otherwise store the next character itself (a str).
stored_data = []
prompt = ""
i = 0
while i < len(string_data):
    print(f"\n\nCurrent index: {i}")
    print(f"Prompt ({i}): {prompt}")
    print(f"stored_data ({i}): {stored_data}")
    if prompt == "":
        # The model needs a non-empty prompt, so the first character is
        # always stored literally.
        prompt = string_data[i]
        stored_data.append(string_data[i])
        i += 1
        continue

    top_tokens, top_values = top_k_token(prompt, k=top_k)
    for j, (token, score) in enumerate(zip(top_tokens, top_values)):
        print(f"{j+1}. '{token}' (logit: {score.item():.4f})")
        token_len = len(token)
        print(f"\tToken length: {token_len}")
        print(f"\tComparing with string_data[{i}:{i+token_len}] = '{string_data[i:i+token_len]}'")
        # An empty candidate (e.g. a special token decoded to "") equals the
        # empty slice but consumes no input; accepting it would leave both
        # `i` and `prompt` unchanged and the loop would never terminate.
        if token_len > 0 and token == string_data[i:i+token_len]:
            stored_data.append(j)
            prompt += token
            i += token_len
            break
    else:
        # No candidate matched: fall back to a single literal character.
        print(f"Character '{string_data[i]}' not found in top {top_k} tokens.")
        stored_data.append(string_data[i])
        prompt += string_data[i]
        i += 1

            




Current index: 0
Prompt (0): 
stored_data (0): []


Current index: 1
Prompt (1): A
stored_data (1): ['A']
1. ' high' (logit: 12.8682)
	Token length: 5
	Comparing with string_data[1:6] = 'lice '
2. ' ' (logit: 12.5429)
	Token length: 1
	Comparing with string_data[1:2] = 'l'
3. ' certain' (logit: 12.3691)
	Token length: 8
	Comparing with string_data[1:9] = 'lice was'
4. ' middle' (logit: 12.3153)
	Token length: 7
	Comparing with string_data[1:8] = 'lice wa'
5. ' company' (logit: 12.0925)
	Token length: 8
	Comparing with string_data[1:9] = 'lice was'
6. ' group' (logit: 12.0796)
	Token length: 6
	Comparing with string_data[1:7] = 'lice w'
7. ' poly' (logit: 11.8678)
	Token length: 5
	Comparing with string_data[1:6] = 'lice '
8. ' circle' (logit: 11.8444)
	Token length: 7
	Comparing with string_data[1:8] = 'lice wa'
9. ' rectangular' (logit: 11.7547)
	Token length: 12
	Comparing with string_data[1:13] = 'lice was beg'
10. ' square' (logit: 11.7440)
	Token length: 7
	Comparing with string

In [None]:
# Show the mixed rank/character encoding next to the text it was built from.
print(
    f"\nFinal store_data: {stored_data}",
    f"original string_data: {string_data}",
    sep="\n",
)


Final store_data: ['A', 'l', 'i', 'c', 'e', 7, 'w', 'a', 's', ' ', 'b', 'e', 'g', 9, 'n', 1, 'i', 5, 0, ' ', 'g', 'e', 't', 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]
original string_data: Alice was beginning to get very tired of sitting by her sister on the bank
