In [1]:
import os
import pickle
from typing import List

import pandas as pd
import yaml
from bidict import bidict
from datasets import load_dataset
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.pre_tokenizers import Sequence
from tqdm import tqdm
from transformers import AutoTokenizer

from DPSplitter import DPSplitter
from main import DPTokenizer
from main import DPTokenizer2

# LlamaTokenizer experiments

In [2]:
# Directory holding the local LLaMA tokenizer files. Set the
# LLAMA_TOKENIZER_PATH environment variable to point elsewhere; the default is
# the path this notebook was originally run with.
LLAMA_TOKENIZER_PATH = os.environ.get('LLAMA_TOKENIZER_PATH', '/Users/suyashkr/tokenization/llama')

# Load the slow (use_fast=False) tokenizer with add_bos_token disabled.
llamatokenizer = AutoTokenizer.from_pretrained(LLAMA_TOKENIZER_PATH, use_fast=False,
                                               add_bos_token=False)
# Token string -> id mapping.
vocab = llamatokenizer.get_vocab()

In [3]:
# Two-way view of the vocabulary so ids can be mapped back to token strings
# through `.inverse`.
vocab_bidict = bidict(vocab)

In [4]:
# Token string stored under id 12.
vocab_bidict.inverse[12]

'<0x09>'

In [5]:
# Display the token -> id mapping.
# NOTE(review): this renders the whole vocabulary; showing a small slice would
# keep the saved output readable.
vocab

{'<unk>': 0,
 '<s>': 1,
 '</s>': 2,
 '<0x00>': 3,
 '<0x01>': 4,
 '<0x02>': 5,
 '<0x03>': 6,
 '<0x04>': 7,
 '<0x05>': 8,
 '<0x06>': 9,
 '<0x07>': 10,
 '<0x08>': 11,
 '<0x09>': 12,
 '<0x0A>': 13,
 '<0x0B>': 14,
 '<0x0C>': 15,
 '<0x0D>': 16,
 '<0x0E>': 17,
 '<0x0F>': 18,
 '<0x10>': 19,
 '<0x11>': 20,
 '<0x12>': 21,
 '<0x13>': 22,
 '<0x14>': 23,
 '<0x15>': 24,
 '<0x16>': 25,
 '<0x17>': 26,
 '<0x18>': 27,
 '<0x19>': 28,
 '<0x1A>': 29,
 '<0x1B>': 30,
 '<0x1C>': 31,
 '<0x1D>': 32,
 '<0x1E>': 33,
 '<0x1F>': 34,
 '<0x20>': 35,
 '<0x21>': 36,
 '<0x22>': 37,
 '<0x23>': 38,
 '<0x24>': 39,
 '<0x25>': 40,
 '<0x26>': 41,
 '<0x27>': 42,
 '<0x28>': 43,
 '<0x29>': 44,
 '<0x2A>': 45,
 '<0x2B>': 46,
 '<0x2C>': 47,
 '<0x2D>': 48,
 '<0x2E>': 49,
 '<0x2F>': 50,
 '<0x30>': 51,
 '<0x31>': 52,
 '<0x32>': 53,
 '<0x33>': 54,
 '<0x34>': 55,
 '<0x35>': 56,
 '<0x36>': 57,
 '<0x37>': 58,
 '<0x38>': 59,
 '<0x39>': 60,
 '<0x3A>': 61,
 '<0x3B>': 62,
 '<0x3C>': 63,
 '<0x3D>': 64,
 '<0x3E>': 65,
 '<0x3F>': 66,
 '<0x40>': 

In [6]:
# Encode a lone tab character to see which ids it produces (id 12 is the
# '<0x09>' entry looked up above).
llamatokenizer.encode('\t')

[29871, 12]

# End of LlamaTokenizer experiments

# Create a new tokenizer that uses the DP splitter

In [7]:
# Fetch the token -> id mapping again (same call as in the exploratory section).
vocab = llamatokenizer.get_vocab()

# DPTokenizer2 (from main) is constructed from the vocabulary and the LLaMA
# tokenizer itself.
dp_tokenizer = DPTokenizer2(vocab, llamatokenizer)

# Test split of the MBPP dataset; later cells read each row's "code" field.
mbpp_dataset = load_dataset('mbpp', split="test")

# Evaluate results

In [8]:
from main import get_tokens_fn

# get_tokens_fn wraps a tokenizer in a callable that is applied to one dataset
# row at a time; build one such callable per tokenizer.
original_tokenizer_mapper = get_tokens_fn(llamatokenizer)
new_tokenizer_mapper = get_tokens_fn(dp_tokenizer)

original_tokens = []
new_tokens = []

# Run every MBPP row through both mappers; the two lists stay aligned by position.
for row in tqdm(mbpp_dataset):
    original_tokens.append(original_tokenizer_mapper(row))
    new_tokens.append(new_tokenizer_mapper(row))

# Keep only rows whose token counts differ, as (position, percentage change)
# pairs. The percentage is relative to the original count, so a positive value
# means the DP tokenizer produced fewer tokens.
indices_with_difference = [
    (position, 100.0 * float(len(baseline_ids) - len(dp_ids)) / len(baseline_ids))
    for position, (baseline_ids, dp_ids) in enumerate(zip(original_tokens, new_tokens))
    if len(baseline_ids) != len(dp_ids)
]

100%|██████████| 500/500 [00:02<00:00, 204.96it/s]


In [9]:
# Number of MBPP rows whose token count differs between the two tokenizers.
len(indices_with_difference)

42

In [10]:
# (row position, percentage change in token count) for each differing row.
indices_with_difference

[(5, 1.36986301369863),
 (17, 3.8461538461538463),
 (32, 1.4492753623188406),
 (41, 8.0),
 (53, 1.4285714285714286),
 (96, 0.6944444444444444),
 (101, 8.0),
 (122, 6.382978723404255),
 (127, 2.2222222222222223),
 (137, 0.6578947368421053),
 (140, 1.639344262295082),
 (141, 0.4608294930875576),
 (161, 2.6315789473684212),
 (164, 2.803738317757009),
 (167, 1.6666666666666667),
 (174, 1.7543859649122806),
 (175, 1.694915254237288),
 (196, 0.5555555555555556),
 (235, 3.2967032967032965),
 (263, 3.0303030303030303),
 (275, 0.6451612903225806),
 (316, 2.1739130434782608),
 (328, 3.125),
 (331, 1.6129032258064515),
 (334, 2.1739130434782608),
 (363, 0.6578947368421053),
 (378, 4.225352112676056),
 (384, 0.8547008547008547),
 (392, 0.5681818181818182),
 (403, 0.9345794392523364),
 (419, 2.127659574468085),
 (423, 1.639344262295082),
 (440, 3.125),
 (445, 3.0303030303030303),
 (451, 1.3888888888888888),
 (469, 0.847457627118644),
 (472, 2.7777777777777777),
 (474, 1.4492753623188406),
 (478, 1.

In [12]:
# Keep just the row positions, dropping the percentage from each pair.
indices = [row_position for row_position, _ in indices_with_difference]

In [13]:
# Pull the "code" field for only the rows whose token counts differ.
text_inputs = mbpp_dataset[indices]["code"]

In [14]:
import pandas as pd

# One row per differing example: its position in the MBPP test split and the
# percentage change in token count (positive = fewer tokens with the DP tokenizer).
df = pd.DataFrame(data=indices_with_difference, columns=["mbpp_test_index", "tokenization_percentage_improvement"])

In [15]:
# Attach the source snippets. Both df and text_inputs derive from
# indices_with_difference, so they are presumably in the same row order — confirm.
df['text_inputs'] = text_inputs

In [18]:
# Store the per-row token lists from each tokenizer, selected by the same
# positions used to build df.
df["original_tokens"] = [original_tokens[idx] for idx in indices]
df["dp_tokens"] = [new_tokens[idx] for idx in indices]


In [24]:
# Decode both token lists back to text so the round trip can be checked
# against text_inputs.
df['original_reconstructed'] = df['original_tokens'].apply(llamatokenizer.decode)
# NOTE(review): the DP token lists are decoded with llamatokenizer as well —
# presumably fine because dp_tokenizer was built from the same vocab; confirm.
df['dp_reconstructed'] = df['dp_tokens'].apply(llamatokenizer.decode)

In [26]:
# Show the comparison table.
# NOTE(review): the saved output has 151 rows, including negative improvements,
# while the list displayed earlier has 42 entries — the saved outputs appear to
# come from different runs; re-run top to bottom before relying on them.
df

Unnamed: 0,mbpp_test_index,tokenization_percentage_improvement,text_inputs,original_tokens,dp_tokens,original_reconstructed,dp_reconstructed
0,1,-4.166667,"def sort_matrix(M):\r\n result = sorted(M, ...","[822, 2656, 29918, 5344, 29898, 29924, 1125, 3...","[822, 2656, 29918, 5344, 29898, 29924, 1125, 3...","def sort_matrix(M):\r\n result = sorted(M, ...","def sort_matrix(M):\r\n result = sorted(M, ..."
1,2,-5.660377,from collections import Counter\r\ndef count_c...,"[515, 16250, 1053, 315, 5336, 30004, 13, 1753,...","[515, 16250, 1053, 29871, 17779, 30004, 13, 17...",from collections import Counter\r\ndef count_c...,from collections import Counter\r\ndef count_c...
2,5,1.369863,import re\r\ndef text_lowercase_underscore(tex...,"[1053, 337, 30004, 13, 1753, 1426, 29918, 1360...","[1053, 337, 30004, 13, 1753, 1426, 29918, 1360...",import re\r\ndef text_lowercase_underscore(tex...,import re\r\ndef text_lowercase_underscore(tex...
3,11,-1.020408,def find_first_duplicate(nums):\r\n num_set...,"[822, 1284, 29918, 4102, 29918, 20908, 5926, 2...","[822, 1284, 29918, 4102, 29918, 20908, 5926, 2...",def find_first_duplicate(nums):\r\n num_set...,def find_first_duplicate(nums):\r\n num_set...
4,17,3.846154,"def binomial_Coeff(n,k): \r\n if k > n : \r...","[822, 9016, 7615, 29918, 29907, 7297, 600, 298...","[822, 9016, 7615, 29918, 7967, 12352, 29898, 2...","def binomial_Coeff(n,k): \r\n if k > n : \r...","def binomial_Coeff(n,k): \r\n if k > n : \r..."
...,...,...,...,...,...,...,...
146,486,-3.921569,"import math\r\ndef surfacearea_cone(r,h):\r\n ...","[1053, 5844, 30004, 13, 1753, 7101, 6203, 2991...","[1053, 5844, 30004, 13, 1753, 7101, 6203, 2991...","import math\r\ndef surfacearea_cone(r,h):\r\n ...","import math\r\ndef surfacearea_cone(r,h):\r\n ..."
147,490,-2.339181,"def ngcd(x,y):\r\n i=1\r\n while(i<=x an...","[822, 8736, 2252, 29898, 29916, 29892, 29891, ...","[822, 8736, 2252, 29898, 29916, 29892, 29891, ...","def ngcd(x,y):\r\n i=1\r\n while(i<=x an...","def ngcd(x,y):\r\n i=1\r\n while(i<=x an..."
148,492,2.127660,def add_consecutive_nums(nums):\r\n result ...,"[822, 788, 29918, 535, 3471, 11067, 29918, 194...","[822, 788, 29918, 535, 3471, 11067, 29918, 298...",def add_consecutive_nums(nums):\r\n result ...,def add_consecutive_nums(nums):\r\n result ...
149,496,-2.127660,"def remove_words(list1, removewords):\r\n f...","[822, 3349, 29918, 9303, 29898, 1761, 29896, 2...","[822, 3349, 29918, 9303, 29898, 1761, 29896, 2...","def remove_words(list1, removewords):\r\n f...","def remove_words(list1, removewords):\r\n f..."


In [27]:
# Per-row check that decoding the original tokens reproduces the source text.
# Comparing the two columns directly yields the same boolean Series as a
# row-wise apply, without a Python-level call per row.
df['text_inputs'] == df['original_reconstructed']

0      True
1      True
2      True
3      True
4      True
       ... 
146    True
147    True
148    True
149    True
150    True
Length: 151, dtype: bool

In [28]:
# Flag rows where decoding the original tokenizer's output gives back the exact
# source text (vectorised column comparison instead of a row-wise apply).
df['original_reconstructed_compare_original'] = df['text_inputs'] == df['original_reconstructed']

In [29]:
# Flag rows where decoding the DP tokenizer's output gives back the exact
# source text (vectorised column comparison instead of a row-wise apply).
df['dp_reconstructed_compare_original'] = df['text_inputs'] == df['dp_reconstructed']

In [31]:
# Distinct values of the DP round-trip flag; a lone True means every DP token
# list decodes back to its source text.
df['dp_reconstructed_compare_original'].unique()

array([ True])

In [34]:
# Token count for row 0 under the original tokenizer.
len(df.loc[0, 'original_tokens'])

24

In [35]:
# Token count for row 0 under the DP tokenizer.
len(df.loc[0, 'dp_tokens'])

25

In [36]:
# Token ids for row 0 under the original tokenizer.
df.loc[0, 'original_tokens']

[822,
 2656,
 29918,
 5344,
 29898,
 29924,
 1125,
 30004,
 13,
 1678,
 1121,
 353,
 12705,
 29898,
 29924,
 29892,
 1820,
 29922,
 2083,
 8443,
 13,
 1678,
 736,
 1121]

In [37]:
# Token ids for row 0 under the DP tokenizer, for comparison with the list above.
df.loc[0, 'dp_tokens']

[822,
 2656,
 29918,
 5344,
 29898,
 29924,
 1125,
 30004,
 13,
 1678,
 1121,
 353,
 12705,
 29898,
 29924,
 29892,
 1820,
 29922,
 2083,
 29897,
 30004,
 13,
 1678,
 736,
 1121]

In [40]:
# Id 30004 occurs in both token lists above; look up its token string.
vocab_bidict.inverse[30004]

'\r'

In [42]:
# Full record for row 0, all columns.
df.loc[0]

mbpp_test_index                                                                            1
tokenization_percentage_improvement                                                -4.166667
text_inputs                                def sort_matrix(M):\r\n    result = sorted(M, ...
original_tokens                            [822, 2656, 29918, 5344, 29898, 29924, 1125, 3...
dp_tokens                                  [822, 2656, 29918, 5344, 29898, 29924, 1125, 3...
original_reconstructed                     def sort_matrix(M):\r\n    result = sorted(M, ...
dp_reconstructed                           def sort_matrix(M):\r\n    result = sorted(M, ...
original_reconstructed_compare_original                                                 True
dp_reconstructed_compare_original                                                       True
Name: 0, dtype: object

In [42]:
# Token list of the first MBPP row (dataset position 0, not df row 0) under the
# original tokenizer.
# NOTE(review): `tokens` is not referenced by any later cell visible here.
tokens = original_tokens[0]

In [48]:
def show_separated(tokenizer, tokens):
    """Decode every token on its own and glue the pieces together with '||'.

    Args:
        tokenizer: object exposing ``decode``; it is called once per token.
        tokens: iterable of tokens, each passed individually to ``decode``.

    Returns:
        One string with the decoded pieces separated by ``"||"``.
    """
    decoded_pieces = map(tokenizer.decode, tokens)
    return "||".join(decoded_pieces)

In [None]:
# NOTE(review): repeats the "original_tokens" assignment already made in an
# earlier cell with the same expression; redundant on a top-to-bottom run.
df["original_tokens"] = [original_tokens[idx] for idx in indices]

In [54]:
# NOTE(review): repeats the "dp_tokens" assignment already made in an earlier
# cell with the same expression; redundant on a top-to-bottom run.
df["dp_tokens"] = [new_tokens[idx] for idx in indices]


Unnamed: 0,mbpp_test_index,tokenization_percentage_improvement,text_inputs,original_tokenization,dp_tokenization
0,17,3.092784,"def binomial_Coeff(n,k): \r\n if k > n : \r...","def|| bin||omial||_||C||oe||ff||(||n||,||k||):...","def|| bin||omial||_||Co||eff||(||n||,||k||):||..."
1,29,2.222222,from collections import Counter\r\nfrom iterto...,from|| collections|| import|| Counter||\r||\n|...,from|| collections|| import|| Counter||\r||\n|...
2,41,4.166667,"def parallelogram_area(b,h):\r\n area=b*h\r\n...","def|| paralle||log||ram||_||area||(||b||,||h||...","def|| parallel||ogram||_||area||(||b||,||h||):..."
3,85,2.040816,def divisor(n):\r\n for i in range(n):\r\n ...,def|| div||is||or||(||n||):||\r||\n|| || for||...,def|| di||visor||(||n||):||\r||\n|| || for|| i...
4,96,0.429185,"def count_Hexadecimal(L,R) : \r\n count = ...","def|| count||_||H||ex||ade||c||imal||(||L||,||...","def|| count||_||He||xa||dec||imal||(||L||,||R|..."
5,102,1.204819,def check_integer(text):\r\n text = text.strip...,def|| check||_||integer||(||text||):||\r||\n||...,def|| check||_||integer||(||text||):||\r||\n||...
6,116,0.877193,"def multiply_int(x, y):\r\n if y < 0:\r\n ...","def|| multiply||_||int||(||x||,|| y||):||\r||\...","def|| multiply||_||int||(||x||,|| y||):||\r||\..."
7,122,12.0,def sum_negativenum(nums):\r\n sum_negativenu...,def|| sum||_||neg||at||iven||um||(||n||ums||):...,def|| sum||_||negative||num||(||n||ums||):||\r...
8,135,2.631579,def ascii_value_string(str1):\r\n for i in ra...,def|| as||ci||i||_||value||_||string||(||str||...,def|| asc||ii||_||value||_||string||(||str||1|...
9,137,0.540541,def sum_digits_single(x) : \r\n ans = 0\r\n...,def|| sum||_||dig||its||_||single||(||x||)|| :...,def|| sum||_||dig||its||_||single||(||x||)|| :...


In [None]:
# Render each token list as '||'-separated decoded pieces so the two
# tokenizations can be compared by eye. The row lookup in the next cell reads
# these columns, so they must be assigned for a fresh top-to-bottom run to work.
df["dp_tokenization"] = [show_separated(llamatokenizer, new_tokens[idx]) for idx in indices]
df["original_tokenization"] = [show_separated(llamatokenizer, original_tokens[idx]) for idx in indices]

In [57]:
# Side-by-side '||'-separated tokenizations for row 7 (requires the
# `original_tokenization` and `dp_tokenization` columns to exist on df).
df.loc[7, ['original_tokenization', 'dp_tokenization']]

original_tokenization    def|| sum||_||neg||at||iven||um||(||n||ums||):...
dp_tokenization          def|| sum||_||negative||num||(||n||ums||):||\r...
Name: 7, dtype: object

In [32]:
# Write the comparison table to results.xlsx in the working directory.
# NOTE(review): this cell's execution count (32) is lower than that of cells
# above it, so the saved file may predate columns added later — re-run in order.
df.to_excel("results.xlsx")