In [1]:
import pandas as pd
import tiktoken

from tokenizers import ByteLevelBPETokenizer

In [2]:
from llm_lang.utils import get_dataset, get_tokens, get_token_stats

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds = get_dataset(lang="all")

# Ensure that all languages have the same number of sentences
lens = [len(ds[col]) for col in ds.column_names if col.startswith("sentence_")]

assert min(lens) == max(lens)

In [27]:
tokens = get_tokens(ds, "eng_Latn", model_name="gpt-4")

In [31]:
pd.Series([len(s) for s in tokens]).describe()

count    997.000000
mean      25.878636
std        8.518484
min        7.000000
25%       20.000000
50%       25.000000
75%       31.000000
max       69.000000
dtype: float64

In [4]:
token_stats = get_token_stats(ds, "eng_Latn", model_name="gpt-4")
token_stats

{'lang': 'eng_Latn',
 'model_name': 'gpt-4',
 'n_tokens_count': 997.0,
 'n_tokens_mean': 25.87863590772317,
 'n_tokens_std': 8.518484210103676,
 'n_tokens_min': 7.0,
 'n_tokens_25%': 20.0,
 'n_tokens_50%': 25.0,
 'n_tokens_75%': 31.0,
 'n_tokens_max': 69.0,
 'n_tokens': [49,
  41,
  36,
  14,
  12,
  16,
  18,
  35,
  19,
  35,
  54,
  16,
  11,
  21,
  36,
  36,
  22,
  23,
  22,
  26,
  19,
  16,
  19,
  17,
  24,
  23,
  51,
  29,
  27,
  16,
  16,
  30,
  24,
  16,
  28,
  14,
  24,
  28,
  23,
  20,
  44,
  31,
  35,
  22,
  39,
  31,
  29,
  37,
  29,
  49,
  24,
  45,
  36,
  16,
  33,
  19,
  29,
  29,
  57,
  47,
  22,
  28,
  28,
  27,
  35,
  24,
  40,
  37,
  47,
  30,
  31,
  42,
  39,
  66,
  22,
  39,
  34,
  19,
  22,
  21,
  16,
  30,
  29,
  22,
  17,
  14,
  30,
  39,
  10,
  39,
  28,
  27,
  22,
  19,
  18,
  7,
  35,
  25,
  29,
  30,
  34,
  19,
  22,
  25,
  32,
  32,
  46,
  44,
  14,
  33,
  41,
  42,
  32,
  38,
  10,
  31,
  22,
  26,
  26,
  22,
  37,
  27,

In [5]:
token_stats = get_token_stats(ds, "hin_Deva", model_name="gpt-4")
token_stats

{'lang': 'hin_Deva',
 'model_name': 'gpt-4',
 'n_tokens_count': 997.0,
 'n_tokens_mean': 124.03911735205617,
 'n_tokens_std': 42.648317556321125,
 'n_tokens_min': 32.0,
 'n_tokens_25%': 95.0,
 'n_tokens_50%': 120.0,
 'n_tokens_75%': 148.0,
 'n_tokens_max': 346.0,
 'n_tokens': [304,
  202,
  160,
  61,
  95,
  60,
  82,
  162,
  97,
  155,
  266,
  46,
  52,
  74,
  150,
  147,
  139,
  151,
  129,
  164,
  71,
  83,
  99,
  59,
  82,
  86,
  199,
  104,
  126,
  68,
  102,
  112,
  113,
  61,
  88,
  69,
  109,
  134,
  100,
  104,
  179,
  162,
  175,
  94,
  187,
  116,
  126,
  169,
  121,
  230,
  112,
  204,
  222,
  75,
  181,
  124,
  125,
  113,
  271,
  205,
  97,
  127,
  138,
  153,
  155,
  130,
  171,
  193,
  238,
  137,
  177,
  156,
  197,
  346,
  132,
  201,
  162,
  99,
  89,
  72,
  60,
  92,
  120,
  91,
  80,
  72,
  87,
  178,
  77,
  179,
  145,
  114,
  122,
  104,
  80,
  40,
  119,
  134,
  139,
  132,
  109,
  94,
  116,
  112,
  152,
  134,
  199,
  196,
  