In [11]:
from collections import Counter
import tabulate
import pickle
import re

# with open('enwik9', encoding='utf8') as f:
#     string_content = f.read()

# # Count occurrences of each character
# char_counter = Counter(string_content)

# # Calculate total number of characters for relative frequency
# total_chars = sum(char_counter.values())

# # Prepare data for tabulation
# table_data = []
# for char, count in char_counter.items():
#     relative_frequency = count / total_chars
#     bit_number = len(char.encode('utf-8')) * 8
#     table_data.append([char, count, bit_number])

# table_data.sort(key=lambda x: -x[1])
# table_data[0]

# with open("frequency_table.txt", 'w', encoding='utf8') as f:
#     f.write(tabulate.tabulate(table_data, headers=['Character', 'Char count', 'Bit number'], tablefmt="pipe"))
    
# with open('frequency_table.pkl', 'wb') as f:
#     pickle.dump(table_data, f)

In [None]:
with open('frequency_table.pkl', 'rb') as f:
    table_data = pickle.load(f)

In [51]:
def summarize_table(table_data):
    total_bytes = sum(x[1] * x[2] for x in table_data)
    total_chars = sum(x[1] for x in table_data)
    bits_per_char = 8 * total_bytes / total_chars

    summary_data = [
        ["Total Bytes", total_bytes],
        ["Total Characters", total_chars],
        ["Bits per Character", f"{bits_per_char:.2f}"]
    ]

    return tabulate.tabulate(summary_data, tablefmt="pipe", headers=["Metric", "Value"])

print(summarize_table(table_data))
print(summarize_table(table_data[:256]))


| Metric             |        Value |
|:-------------------|-------------:|
| Total Bytes        |  8e+09       |
| Total Characters   |  9.97521e+08 |
| Bits per Character | 64.16        |
| Metric             |        Value |
|:-------------------|-------------:|
| Total Bytes        |  7.98801e+09 |
| Total Characters   |  9.96961e+08 |
| Bits per Character | 64.1         |


In [68]:

def summarize_table_row(table_data, label):
    total_bits = sum(x[1] * x[2] for x in table_data)
    total_chars = sum(x[1] for x in table_data)
    bits_per_char = total_bits / total_chars
    
    # Return a row of data including the label
    return [label, total_bits, f"{total_bits/8e9:.3f}", 
                   total_chars, f"{total_chars/997520891:.3f}", 
            f"{bits_per_char:.2f}"]

# Assuming table_data is available and properly defined
# Generate rows for the specified segments and the entire table
rows = [summarize_table_row(table_data[:x], f"{x}") for x in [8, 16, 32, 64, 128, 256]]
rows.append(summarize_table_row(table_data, "entire text"))

# Define headers for the table
headers = ["Number of chars", "bit", "relative bit", "char", "relative char", "bit/char"]

# Generate and print the table
print(tabulate.tabulate(rows, headers=headers, tablefmt="pipe"))

| Number of chars   |        bit |   relative bit |      char |   relative char |   bit/char |
|:------------------|-----------:|---------------:|----------:|----------------:|-----------:|
| 8                 | 4144629008 |          0.518 | 518078626 |           0.519 |       8    |
| 16                | 5737195088 |          0.717 | 717149386 |           0.719 |       8    |
| 32                | 7011014872 |          0.876 | 876376859 |           0.879 |       8    |
| 64                | 7800719456 |          0.975 | 975089932 |           0.978 |       8    |
| 128               | 7978126872 |          0.997 | 996393559 |           0.999 |       8.01 |
| 256               | 7988014448 |          0.999 | 996961394 |           0.999 |       8.01 |
| entire text       | 8000000000 |          1     | 997520891 |           1     |       8.02 |


In [42]:
996961394/997520891 

0.9994391124987476

In [74]:
def summarize_world_record(bytes):
    compression_ratio = 1e9/bytes
    bit_per_char = bytes*8/997520891
    print(compression_ratio)
    print(bit_per_char)

summarize_world_record(114156155)

8.759930640621175
0.9155189111723576


In [44]:
print(tabulate.tabulate(table_data[:256], headers=['Character', 'Char count', 'Bit number'], tablefmt="pipe"))

| Character   |   Char count |   Bit number |
|:------------|-------------:|-------------:|
|             |    139132610 |            8 |
| e           |     77130764 |            8 |
| t           |     57589780 |            8 |
| a           |     55297286 |            8 |
| i           |     50051692 |            8 |
| o           |     49514893 |            8 |
| n           |     46064994 |            8 |
| r           |     43296607 |            8 |
| s           |     40844869 |            8 |
| l           |     28424211 |            8 |
| h           |     26177831 |            8 |
| d           |     22976495 |            8 |
| c           |     20385009 |            8 |
| ]           |     20216224 |            8 |
| [           |     20214205 |            8 |
| u           |     19831916 |            8 |
| m           |     19597417 |            8 |
| p           |     15486987 |            8 |
| g           |     14109043 |            8 |
|             |     13147025 |    

In [29]:
# For a character within the BMP (e.g., 'A')
char_bmp = 'A'
print(f"'{char_bmp}' Unicode representation: U+{ord(char_bmp):04X}")

# For a character outside the BMP (e.g., a supplementary character like an emoji)
char_supplementary = '🔥'  # Fire emoji
print(f"'{char_supplementary}' Unicode representation: U+{ord(char_supplementary):06X}")

'A' Unicode representation: U+0041
'🔥' Unicode representation: U+01F525


In [39]:
whitespace_entries = [[f'U+{ord(entry[0]):04X}', entry[1], entry[2]] for entry in table_data if re.match(r'\s', entry[0])]
print(tabulate.tabulate(whitespace_entries, headers=['Character', 'Char count', 'Bit number'], tablefmt="pipe"))

| Character   |   Char count |   Bit number |
|:------------|-------------:|-------------:|
| U+0020      |    139132610 |            8 |
| U+000A      |     13147025 |            8 |
| U+0009      |        36693 |            8 |
| U+00A0      |          368 |           16 |
| U+3000      |          301 |           24 |
| U+2008      |           27 |           24 |
| U+2002      |           14 |           24 |
| U+200A      |           12 |           24 |
| U+2028      |            8 |           24 |
| U+2005      |            6 |           24 |
| U+2003      |            6 |           24 |
| U+1680      |            2 |           24 |


In [None]:
from transformers import GPT2TokenizerFast

model_id = 'gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

with open('enwik9', encoding='utf8') as f:
    text = f.read()

# Tokenize the input text and count the number of tokens
tokens = tokenizer.tokenize(text)
num_tokens = len(tokens)

print(f"Number of tokens: {num_tokens}")