In [1]:
!pip install ipywidgets



In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

print("Installing ipywidgets...")

# Load model and tokenizer
print("Loading model and tokenizer...")
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=".cache")
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=".cache")
model.eval()

Installing ipywidgets...
Loading model and tokenizer...


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [2]:
def top_k_token(prompt, k=5):
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits[:, -1, :]
    topk_values, topk_indices = torch.topk(logits, top_k, dim=-1)
    top_tokens = tokenizer.batch_decode(topk_indices[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # print(topk_indices[0][:k])
    
    return top_tokens, topk_values[0]

import torch
import torch.nn.functional as F

def top_k_token_prop(prompt, k=5):
    inputs = tokenizer(prompt, return_tensors="pt")
    print(inputs)
    print(tokenizer.batch_decode(
        inputs['input_ids'][0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    ))

    with torch.no_grad():
        outputs = model(**inputs)

    print(outputs)
    logits = outputs.logits[:, -1, :]
    topk_values, topk_indices = torch.topk(logits, k, dim=-1)

    # Apply softmax to top-k logits to get probabilities
    topk_probs = F.softmax(topk_values, dim=-1)

    top_tokens = tokenizer.batch_decode(
        topk_indices[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )

    return top_tokens, topk_probs[0]

In [4]:
prompt = "How can you"
top_k = 10

top_tokens, top_values = top_k_token_prop(prompt, k=top_k)
for i, (token, score) in enumerate(zip(top_tokens, top_values)):
    print(f"{i+1}. {token} (logit: {score.item():.4f})")
print(f"Sum: {top_values[0:10].sum().item():.4f}")

{'input_ids': tensor([[4340,  646,  498]]), 'attention_mask': tensor([[1, 1, 1]])}
['How', ' can', ' you']


CausalLMOutputWithPast(loss=None, logits=tensor([[[ 8.2367,  6.6111,  4.5391,  ..., -4.0680, -4.0680, -4.0681],
         [ 4.1064,  5.5320,  4.0867,  ..., -3.4824, -3.4814, -3.4826],
         [ 6.9169,  8.1698,  9.0604,  ..., -3.9949, -3.9945, -3.9951]]]), past_key_values=<transformers.cache_utils.DynamicCache object at 0x7fb1fd963c20>, hidden_states=None, attentions=None)
1.  determine (logit: 0.2247)
2.  use (logit: 0.1334)
3.  find (logit: 0.1147)
4.  modify (logit: 0.1048)
5.  simplify (logit: 0.0876)
6.  calculate (logit: 0.0757)
7.  solve (logit: 0.0678)
8.  express (logit: 0.0650)
9.  prove (logit: 0.0634)
10.  create (logit: 0.0630)
Sum: 1.0000


In [2]:
import json
import os
import numpy as np
import time
from arithmetic_coder import ArithmeticDecoder, ArithmeticEncoder, BitInputStream, BitOutputStream


def gen_rank(probs, next_token):
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True, stable=True)
    rank_list = []
    if next_token.shape[0] > 1:
        for i in range(next_token.shape[0]):
            rank_list += [torch.where(probs_idx[i:i+1, :] == next_token[i])[-1]]
        rank = torch.squeeze(torch.stack(rank_list))
    else:
        rank = torch.where(probs_idx == next_token)[-1]
    return rank


def read_bitstream(bitin):
    temp_list = []
    while True:
        temp = bitin.read()
        if temp == -1:
            break
        temp_list += [temp]
    temp_arr = np.array(temp_list)
    final_ind = (np.where(temp_arr == 1)[0][-1]).astype(int)
    final_arr = temp_arr[:final_ind+1]

    return final_arr


def print_tokens(tokens):
    decoded_tokens = tokenizer.batch_decode(
        tokens[0:1000],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    print("|".join(decoded_tokens))


def print_probs(probs):
    print("|".join([f"{p:.4f}" for p in probs]))


class LLMzip_encode:
    def __init__(self, model, tokenizer, filename, extra_string=None, batch_size=32, win_size=100, max_tokens=10_000):
        self.model = model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)
        self.tokenizer = tokenizer

        self.filename = filename
        self.file_out = open(self.filename+'_llmzip_ac.txt', 'wb')
        self.bitout = BitOutputStream(self.file_out)
        self.AC_encoder = ArithmeticEncoder(32, self.bitout)

        self.AC_encoder = ArithmeticEncoder(32, self.bitout)

        self.alphabet_size = self.model.config.vocab_size

        self.token_length = 0
        self.starter_tokens = []
        self.extra_tokens = np.array(tokenizer.encode(extra_string)) if extra_string is not None else None

        self.batch_size = batch_size
        self.win_size = win_size
        self.max_tokens = max_tokens

        self.total_time = 0
        self.gpu_time = 0

    def encode_batch(self, prompt_tokens):
        #print_tokens(prompt_tokens)
        bsz = prompt_tokens.shape[0]

        prompt_size = prompt_tokens.shape[1]

        start = time.time()
        tokens = torch.full((bsz, prompt_size), 0).long() # self.tokenizer.pad_token_id (instead of 0)
        tokens[:bsz, : prompt_size] = torch.tensor(prompt_tokens).long()
        tokens = tokens.to(self.device)

        cur_pos = prompt_size-1
        prev_pos = 0

        logits = self.model.forward(tokens[:, prev_pos:cur_pos]).logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        rank = gen_rank(probs, next_token=tokens[:, cur_pos])

        probs_np2 = probs.cpu().detach().numpy()
        tokens_np2 = tokens[:, cur_pos].cpu().numpy()
        ranks_np2 = rank.cpu().numpy()
        self.gpu_time = time.time() - start

        probs_tok = probs_np2[np.arange(bsz), tokens_np2]
        #print_tokens(tokens_np2)
        #print_probs(probs_tok)

        cumul = np.zeros(self.model.config.vocab_size+1, dtype=np.uint64)
        for j in range(bsz):
            prob1 = probs_np2[j]
            cumul[1:] = np.cumsum(prob1*10000000 + 1)
            self.AC_encoder.write(cumul, tokens_np2[j])

        return ranks_np2, probs_tok

    def encode(self):
        if not os.path.exists(self.filename + '_tokens.npy'):
            with open(self.filename, 'r') as f_in:
                text_input = f_in.read()

            tokens_full = np.array(self.tokenizer.encode(text_input))
            print(f"max token: {np.max(tokens_full)}")
            np.save(self.filename + '_tokens.npy', tokens_full)
        else:
            tokens_full = np.load(self.filename + '_tokens.npy')

        start = time.time()
        tokens_full = tokens_full[0:self.max_tokens]
        #print_tokens(tokens_full)

        win_size_enc = self.win_size + 1  # additional 1 is to pass the true token apart from the context of win_size

        ranks_list = []
        probs_tok_list = []

        n_runs = tokens_full.size-win_size_enc+1

        tokens_encoded = tokens_full[self.win_size:self.win_size+n_runs]
        #print(self.win_size)
        self.starter_tokens = tokens_full[:self.win_size]
        np.save(self.filename + '_starter_tokens.npy', self.starter_tokens)

        n_batches = np.ceil(n_runs/self.batch_size).astype(int)

        for b_ind in range(n_batches):
            batch_range_start = b_ind*self.batch_size
            batch_range_stop = np.minimum(n_runs, (b_ind+1) * self.batch_size)
            # tokens_batch = np.array([np.concatenate(([tokens_full[0]], tokens_full[i:i+win_size_enc])) for i in range(batch_range_start, batch_range_stop)])
            if self.extra_tokens is None:
                tokens_batch = np.array([tokens_full[i: i + win_size_enc] for i in range(batch_range_start, batch_range_stop)])
            else:
                tokens_batch = np.array([np.concatenate((self.extra_tokens, tokens_full[i: i + win_size_enc])) for i in range(batch_range_start, batch_range_stop)])
            ranks, probs_tok = self.encode_batch(tokens_batch)
            ranks_list += [ranks]
            probs_tok_list += [probs_tok]

            if (b_ind * 100 / n_batches) % 10 == 0:
                print(f'Encoder: Completed {int(b_ind * 100 / n_batches)} %')

        self.total_time = time.time() - start

        ranks_full = np.concatenate(ranks_list, 0).squeeze()
        probs_tok_full = np.concatenate(probs_tok_list, 0).squeeze()

        self.token_length = len(tokens_encoded)

        self.AC_encoder.finish()
        self.bitout.close()
        self.file_out.close()

        self.compute_compression_ratio(tokens_encoded, probs_tok_full)

    def decode(self):
        # Open the compressed bit-stream
        with open(self.filename + '_llmzip_ac.txt', 'rb') as f_in:
            bitin   = BitInputStream(f_in)
            decoder = ArithmeticDecoder(32, bitin)

            # Load initial window
            decoded = list(self.starter_tokens[:self.win_size])
            n_to_decode = self.token_length

            # Loop through tokens
            for _ in range(n_to_decode):
                # build context from win_size tokens
                ctx = torch.tensor([decoded[-self.win_size:]])
                ctx = ctx.to(self.device)
                with torch.no_grad():
                    logits = self.model(ctx).logits[:, -1, :]
                    probs  = torch.softmax(logits, dim=-1).cpu().numpy()[0]

                # rebuild distribution for the next token
                cumul = np.zeros(probs.shape[0] + 1, dtype=np.uint64)
                cumul[1:] = np.cumsum(probs * 10_000_000 + 1)

                # pull one symbol out of the bit-stream
                sym = decoder.read(cumul, alphabet_size=self.alphabet_size)
                decoded.append(int(sym))

        # convert to text
        return self.tokenizer.decode(decoded)

    def compute_compression_ratio(self, tokens_encoded, probs_tok):
        text_encoded = self.tokenizer.decode(tokens_encoded.squeeze().tolist())

        N_T = tokens_encoded.size
        N_C = len(text_encoded)

        df_out = {}
        df_out['characters'] = N_C
        df_out['tokens'] = N_T

        entropy_val = np.sum(-np.log2(probs_tok)) / N_C
        df_out['entropy'] = [f"{entropy_val:.4f}"]

        file_in = open(self.filename+"_llmzip_ac.txt", 'rb')
        bitin = BitInputStream(file_in)
        compressed_bits = read_bitstream(bitin)
        rho_AC = compressed_bits.size/N_C
        print(f'Compression Ratio for Arithmetic Coding :  {rho_AC} bits/char')
        file_in.close()

        df_out['Llama+AC compressed file size'] = compressed_bits.size
        df_out['bits per character'] = rho_AC
        df_out['throughput (bits/s)'] = N_C / self.total_time
        print(f'Throughput: {int(N_C / self.total_time)} bytes/s')
        print(f'total time: {self.total_time}, gpu time: {self.gpu_time}')

        print(df_out)

        with open(self.filename+'_metrics.json', 'w') as file_metrics:
            json.dump(df_out, file_metrics)



In [None]:

Encoder = LLMzip_encode(model, tokenizer, filename='../test.txt', win_size=13)
Encoder.encode()
#recovered_text = Encoder.decode()
#print("Recovered text:", recovered_text)

Encoder = LLMzip_encode(model, tokenizer, filename='../test.txt', extra_string='bank, ', win_size=13)
Encoder.encode()


In [3]:
Encoder = LLMzip_encode(model, tokenizer, filename='../text8.txt', win_size=100)
Encoder.encode()


Encoder: Completed 0 %
Encoder: Completed 10 %
Encoder: Completed 20 %
Encoder: Completed 30 %
Encoder: Completed 40 %
Encoder: Completed 50 %
Encoder: Completed 60 %
Encoder: Completed 70 %
Encoder: Completed 80 %
Encoder: Completed 90 %
Compression Ratio for Arithmetic Coding :  0.9711058908565233 bits/char
Throughput: 1444 bytes/s
total time: 38.31452965736389, gpu time: 0.04167461395263672
{'characters': 55340, 'tokens': 9900, 'entropy': ['0.9681'], 'Llama+AC compressed file size': 53741, 'bits per character': 0.9711058908565233, 'throughput (bits/s)': 1444.3606771345003}


In [4]:
Encoder = LLMzip_encode(model, tokenizer, filename='../text8.txt', batch_size=4, win_size=1000)
Encoder.encode()

Encoder: Completed 0 %
Encoder: Completed 10 %
Encoder: Completed 20 %
Encoder: Completed 30 %
Encoder: Completed 40 %
Encoder: Completed 50 %
Encoder: Completed 60 %
Encoder: Completed 70 %
Encoder: Completed 80 %
Encoder: Completed 90 %
Compression Ratio for Arithmetic Coding :  0.9043212328223051 bits/char
Throughput: 169 bytes/s
total time: 297.659095287323, gpu time: 0.12717437744140625
{'characters': 50356, 'tokens': 9000, 'entropy': ['0.9012'], 'Llama+AC compressed file size': 45538, 'bits per character': 0.9043212328223051, 'throughput (bits/s)': 169.1733959998521}


In [6]:
Encoder = LLMzip_encode(model, tokenizer, filename='../text8.txt', batch_size=64, win_size=10)
Encoder.encode()


Encoder: Completed 0 %
Compression Ratio for Arithmetic Coding :  1.3040557486295725 bits/char
Throughput: 3962 bytes/s
total time: 14.08653974533081, gpu time: 0.02246546745300293
{'characters': 55822, 'tokens': 9990, 'entropy': ['1.3045'], 'Llama+AC compressed file size': 72795, 'bits per character': 1.3040557486295725, 'throughput (bits/s)': 3962.7900825327256}


In [36]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model2 = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer2 = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
model2.eval()


GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [None]:
Encoder = LLMzip_encode(model3, tokenizer3, filename='../text8_3.txt', batch_size=1, win_size=10, max_tokens=100_000)
Encoder.encode()

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model3 = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer3 = AutoTokenizer.from_pretrained("distilgpt2")

model3.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
Encoder = LLMzip_encode(model3, tokenizer3, filename='../text8_3.txt', batch_size=256, win_size=100, max_tokens=100_000)

Encoder.encode()

Encoder: Completed 0 %
Compression Ratio for Arithmetic Coding :  1.3245215160296426 bits/char
Throughput: 5692 bytes/s
total time: 92.03052592277527, gpu time: 0.05153179168701172
{'characters': 523842, 'tokens': 99900, 'entropy': ['1.3243'], 'Llama+AC compressed file size': 693840, 'bits per character': 1.3245215160296426, 'throughput (bits/s)': 5692.046141728742}


In [6]:
Encoder = LLMzip_encode(model3, tokenizer3, filename='../text8_3.txt', batch_size=128, win_size=10, max_tokens=100_000)

Encoder.encode()

Encoder: Completed 0 %
Encoder: Completed 50 %
Compression Ratio for Arithmetic Coding :  1.5966639457442995 bits/char
Throughput: 13922 bytes/s
total time: 37.660134077072144, gpu time: 0.00895071029663086
{'characters': 524332, 'tokens': 99990, 'entropy': ['1.5985'], 'Llama+AC compressed file size': 837182, 'bits per character': 1.5966639457442995, 'throughput (bits/s)': 13922.733225722062}


In [7]:
Encoder = LLMzip_encode(model3, tokenizer3, filename='../text8_3.txt', batch_size=2048, win_size=10, max_tokens=100_000)

Encoder.encode()

Encoder: Completed 0 %
Compression Ratio for Arithmetic Coding :  1.5966868320072016 bits/char
Throughput: 12032 bytes/s
total time: 43.57472252845764, gpu time: 0.3364715576171875
{'characters': 524332, 'tokens': 99990, 'entropy': ['1.5985'], 'Llama+AC compressed file size': 837194, 'bits per character': 1.5966868320072016, 'throughput (bits/s)': 12032.939501968622}


In [8]:
# string_data = "1. 子曰：「學而時習之，不亦說乎？有朋自遠方來，不亦樂乎？人不知而不慍，不亦君子乎？」"
# string_data = "蓋聞天地之數，有十二萬九千六百歲為一元。將一元分為十二會，乃子、丑、寅、卯、辰、巳、午、未、申、酉、戌、亥之十二支也。"
string_data = "Alice was beginning to get very tired of sitting by her sister on the bank"
# string_data = "En 1815, M. Charles-François-Bienvenu Myriel était évêque de Digne. C'était un vieillard d'environ soixante-quinze ans; il occupait le siège de Digne depuis 1806."
top_k = 10

stored_data = []
prompt = ""
i = 0
while i < len(string_data):
    print(f"\n\nCurrent index: {i}")
    print(f"Prompt ({i}): {prompt}")
    print(f"stored_data ({i}): {stored_data}")
    if prompt == "":
        prompt = string_data[i]
        stored_data.append(string_data[i])
        i += 1
    else:
        top_tokens, top_values = top_k_token(prompt, k=top_k)
        for j, (token, score) in enumerate(zip(top_tokens, top_values)):
            print(f"{j+1}. '{token}' (logit: {score.item():.4f})")
            token_len = len(token)
            print(f"\tToken length: {token_len}")
            print(f"\tComparing with string_data[{i}:{i+token_len}] = '{string_data[i:i+token_len]}'")
            if token == string_data[i:i+token_len]:
                stored_data.append(j)
                prompt += token
                i += token_len - 1
                break
            elif j == top_k - 1:
                print(f"Character '{string_data[i]}' not found in top {top_k} tokens.")
                stored_data.append(string_data[i])
                prompt += string_data[i]
        i += 1
        # if i == 6:
        #     break

            




Current index: 0
Prompt (0): 
stored_data (0): []


Current index: 1
Prompt (1): A
stored_data (1): ['A']


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
print(f"\nFinal store_data: {stored_data}")
print(f"original string_data: {string_data}")