In [1]:
import pandas as pd
import yaml
import pickle
from transformers import AutoTokenizer
from datasets import load_dataset
from tokenizers.pre_tokenizers import Sequence
from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer
from DPSplitter import DPSplitter
from typing import List

# LlamaTokenizer experiments

In [4]:
# NOTE(review): hard-coded, user-specific absolute path — this cell only runs on a
# machine that has this directory. Consider taking the location from configuration.
llamatokenizer = AutoTokenizer.from_pretrained('/Users/suyashkr/tokenization/llama')

In [5]:
# Show the loaded tokenizer's repr (class, vocab size, special tokens) as the cell output.
llamatokenizer

LlamaTokenizerFast(name_or_path='/Users/suyashkr/tokenization/llama', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# End of LlamaTokenizer experiments

In [9]:
# Load the notebook settings; later cells read the 'tokenizer_path' and 'mbpp_path' keys.
# The encoding is explicit because open() would otherwise use the platform's locale
# encoding, which can mis-decode a UTF-8 YAML file on some systems.
with open("config.yaml", encoding="utf-8") as f:
    # NOTE(review): FullLoader resolves more tags than the safe loader. If this file can
    # ever come from an untrusted source, switch to yaml.safe_load.
    config = yaml.load(f, Loader=yaml.FullLoader)

In [10]:
def llama_tokenizer(tokenizer_path):
    """Load a tokenizer through ``AutoTokenizer.from_pretrained``.

    Args:
        tokenizer_path: Value forwarded unchanged to ``from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for that path.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return loaded_tokenizer

In [13]:
class CustomPreTokenizer:
    """Pre-tokenization component that delegates splitting to a ``DPSplitter``.

    In this notebook an instance is wrapped with ``PreTokenizer.custom(...)`` and
    placed in a tokenizer's pre-tokenizer pipeline.
    """

    def __init__(self, vocab=None):
        """Remember the vocabulary and build the splitter from it.

        Args:
            vocab: Handed as-is to ``DPSplitter``. Callers are expected to pass
                only the vocabulary keys (the token strings), not a full
                token-to-id mapping.
        """
        self.vocab = vocab
        self.dp_splitter = DPSplitter(vocab)

    def dp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Callback given to ``PreTokenizedString.split``.

        Args:
            i: First argument supplied by the ``split`` callback; not used here.
            normalized_string: The piece of text to be split.

        Returns:
            The result of ``self.dp_splitter.split(normalized_string)``.
        """
        return self.dp_splitter.split(normalized_string)

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` using ``dp_split``; nothing is returned."""
        pretok.split(self.dp_split)


def get_tokens_fn(tokenizer):
    """Build a row-level encoder bound to ``tokenizer``.

    Args:
        tokenizer: Object exposing ``encode``.

    Returns:
        A one-argument function that takes a mapping with a ``"code"`` entry
        and returns ``tokenizer.encode`` applied to that entry.
    """

    def _encode_row(record):
        source_text = record["code"]
        return tokenizer.encode(source_text)

    return _encode_row

# Create a new tokenizer that uses the DP splitter

In [14]:
# Baseline tokenizer, kept unmodified so its output can be compared with the DP variant.
original_tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_path'])

# Loaded a second time from the same path because its backend pre-tokenizer is replaced below.
dp_tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_path'])
original_pre_tokenizer = dp_tokenizer.backend_tokenizer.pre_tokenizer
# The DP-splitting step is built from the vocabulary's token strings only (the keys).
dp_pre_tokenizer = PreTokenizer.custom(
    CustomPreTokenizer(vocab=dp_tokenizer.get_vocab().keys()))
if original_pre_tokenizer is None:
    # A tokenizer may ship without a pre-tokenizer; Sequence cannot contain None,
    # so the DP step is installed on its own in that case.
    dp_tokenizer.backend_tokenizer.pre_tokenizer = dp_pre_tokenizer
else:
    # Run the stock pre-tokenizer first, then split its output with the DP splitter.
    dp_tokenizer.backend_tokenizer.pre_tokenizer = Sequence(
        [original_pre_tokenizer, dp_pre_tokenizer])

# Test split of the dataset configured under 'mbpp_path'.
mbpp_dataset = load_dataset(config['mbpp_path'], split="test")

# Evaluate results

In [15]:
# Row-level encoders for the baseline tokenizer and the DP-splitting tokenizer.
original_tokenizer_mapper = get_tokens_fn(original_tokenizer)
new_tokenizer_mapper = get_tokens_fn(dp_tokenizer)

# Encode every sample with both tokenizers; the two lists stay index-aligned.
original_tokens = []
new_tokens = []

for row in mbpp_dataset:
    original_tokens.append(original_tokenizer_mapper(row))
    new_tokens.append(new_tokenizer_mapper(row))

# (sample index, percentage change in token count relative to the baseline) for every
# sample whose two encodings differ in length. Positive means the DP tokenizer
# produced fewer tokens.
indices_with_difference = [
    (i, 100.0 * (len(baseline_ids) - len(dp_ids)) / len(baseline_ids))
    for i, (baseline_ids, dp_ids) in enumerate(zip(original_tokens, new_tokens))
    if len(baseline_ids) != len(dp_ids)
]

In [19]:
# Display every (sample index, percentage difference) pair computed above.
indices_with_difference

[(17, 3.0927835051546393),
 (29, 2.2222222222222223),
 (41, 4.166666666666667),
 (85, 2.0408163265306123),
 (96, 0.4291845493562232),
 (102, 1.2048192771084338),
 (116, 0.8771929824561403),
 (122, 12.0),
 (135, 2.6315789473684212),
 (137, 0.5405405405405406),
 (140, 1.408450704225352),
 (146, 2.3255813953488373),
 (161, 1.2195121951219512),
 (164, 1.1363636363636365),
 (171, 1.5151515151515151),
 (181, 0.8064516129032258),
 (197, 1.408450704225352),
 (204, 0.8333333333333334),
 (222, 12.820512820512821),
 (244, 2.2222222222222223),
 (255, 6.896551724137931),
 (258, 4.3478260869565215),
 (263, 2.7777777777777777),
 (275, 0.6578947368421053),
 (280, 0.8),
 (284, 3.5714285714285716),
 (296, 9.23076923076923),
 (306, 0.8771929824561403),
 (328, 2.272727272727273),
 (331, 1.1682242990654206),
 (332, 1.0204081632653061),
 (334, 2.0408163265306123),
 (358, 5.405405405405405),
 (363, 0.47393364928909953),
 (378, 4.411764705882353),
 (387, 2.380952380952381),
 (394, 6.382978723404255),
 (408, 3

In [20]:
# Keep only the sample index from each (index, percentage) pair.
indices = [sample_index for sample_index, _ in indices_with_difference]

In [30]:
# Index the dataset with the list of differing sample indices, then take the "code"
# field of the result.
# NOTE(review): presumably this yields one source string per index, in the same
# order as `indices` — confirm against the datasets indexing behaviour.
text_inputs = mbpp_dataset[indices]["code"]

In [36]:
# One row per sample whose token count differs between the two tokenizers.
# pandas is already imported as `pd` in the notebook's first cell, so the duplicate
# mid-notebook import has been dropped.
df = pd.DataFrame(
    data=indices_with_difference,
    columns=["mbpp_test_index", "tokenization_percentage_improvement"],
)

In [38]:
# Attach the source snippets as a column; `text_inputs` is ordered like the rows of `df`.
df = df.assign(text_inputs=text_inputs)

In [42]:
# Baseline encoding of the first sample. NOTE(review): no later cell visible here reads
# this module-level name (show_separated takes its own `tokens` parameter) — looks
# like leftover scratch state.
tokens = original_tokens[0]

In [48]:
def show_separated(tokenizer, tokens):
    """Render a token sequence as text with ``||`` marking each token boundary.

    Args:
        tokenizer: Object exposing ``decode``; it is called once per token.
        tokens: Iterable of tokens, each passed individually to ``decode``.

    Returns:
        The decoded pieces joined with ``"||"``.
    """
    decoded_pieces = map(tokenizer.decode, tokens)
    return "||".join(decoded_pieces)

In [52]:
# Boundary-marked rendering of the baseline tokenization for each differing sample.
df = df.assign(
    original_tokenization=[
        show_separated(original_tokenizer, original_tokens[sample_index])
        for sample_index in indices
    ]
)

In [53]:
# Boundary-marked rendering of the DP tokenization for each differing sample.
df = df.assign(
    dp_tokenization=[
        show_separated(dp_tokenizer, new_tokens[sample_index])
        for sample_index in indices
    ]
)

In [54]:
# Preview the first rows only, not the whole frame; the complete table is
# written to disk at the end of the notebook.
df.head(10)

Unnamed: 0,mbpp_test_index,tokenization_percentage_improvement,text_inputs,original_tokenization,dp_tokenization
0,17,3.092784,"def binomial_Coeff(n,k): \r\n if k > n : \r...","def|| bin||omial||_||C||oe||ff||(||n||,||k||):...","def|| bin||omial||_||Co||eff||(||n||,||k||):||..."
1,29,2.222222,from collections import Counter\r\nfrom iterto...,from|| collections|| import|| Counter||\r||\n|...,from|| collections|| import|| Counter||\r||\n|...
2,41,4.166667,"def parallelogram_area(b,h):\r\n area=b*h\r\n...","def|| paralle||log||ram||_||area||(||b||,||h||...","def|| parallel||ogram||_||area||(||b||,||h||):..."
3,85,2.040816,def divisor(n):\r\n for i in range(n):\r\n ...,def|| div||is||or||(||n||):||\r||\n|| || for||...,def|| di||visor||(||n||):||\r||\n|| || for|| i...
4,96,0.429185,"def count_Hexadecimal(L,R) : \r\n count = ...","def|| count||_||H||ex||ade||c||imal||(||L||,||...","def|| count||_||He||xa||dec||imal||(||L||,||R|..."
5,102,1.204819,def check_integer(text):\r\n text = text.strip...,def|| check||_||integer||(||text||):||\r||\n||...,def|| check||_||integer||(||text||):||\r||\n||...
6,116,0.877193,"def multiply_int(x, y):\r\n if y < 0:\r\n ...","def|| multiply||_||int||(||x||,|| y||):||\r||\...","def|| multiply||_||int||(||x||,|| y||):||\r||\..."
7,122,12.0,def sum_negativenum(nums):\r\n sum_negativenu...,def|| sum||_||neg||at||iven||um||(||n||ums||):...,def|| sum||_||negative||num||(||n||ums||):||\r...
8,135,2.631579,def ascii_value_string(str1):\r\n for i in ra...,def|| as||ci||i||_||value||_||string||(||str||...,def|| asc||ii||_||value||_||string||(||str||1|...
9,137,0.540541,def sum_digits_single(x) : \r\n ans = 0\r\n...,def|| sum||_||dig||its||_||single||(||x||)|| :...,def|| sum||_||dig||its||_||single||(||x||)|| :...


In [57]:
# Compare both tokenizations for the row labelled 7, which is mbpp_test_index 122 in
# the table displayed above and has one of the largest differences (12.0%) among the rows shown.
df.loc[7, ['original_tokenization', 'dp_tokenization']]

original_tokenization    def|| sum||_||neg||at||iven||um||(||n||ums||):...
dp_tokenization          def|| sum||_||negative||num||(||n||ums||):||\r...
Name: 7, dtype: object

In [59]:
# Write the comparison table to results.xlsx in the current working directory.
# NOTE(review): the DataFrame index is written as an extra leading column because
# `index=False` is not passed — confirm that is intended.
df.to_excel("results.xlsx")