# Train a new tokenizer from an old one

## Load the code corpus 

In [2]:
from datasets import load_dataset

# Load the 'python' configuration of the 'code_search_net' dataset and
# display its training split (the bare last expression is the cell output).
# NOTE(review): newer `datasets` releases may require a namespaced repo id
# for this dataset — confirm against the installed version.
raw_datasets = load_dataset(path='code_search_net', name='python')
raw_datasets['train']

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [10]:
# Print every field of one training example (row index 4), one
# "<field name> <value>" pair per line.
sample_row = raw_datasets["train"][4]
for field_name, field_value in sample_row.items():
    print(field_name, field_value)

repository_name ageitgey/face_recognition
func_path_in_repository face_recognition/api.py
func_name _trim_css_to_bounds
whole_func_string def _trim_css_to_bounds(css, image_shape):
    """
    Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.

    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order
    :param image_shape: numpy shape of the image array
    :return: a trimmed plain tuple representation of the rect in (top, right, bottom, left) order
    """
    return max(css[0], 0), min(css[1], image_shape[1]), min(css[2], image_shape[0]), max(css[3], 0)
language python
func_code_string def _trim_css_to_bounds(css, image_shape):
    """
    Make sure a tuple in (top, right, bottom, left) order is within the bounds of the image.

    :param css:  plain tuple representation of the rect in (top, right, bottom, left) order
    :param image_shape: numpy shape of the image array
    :return: a trimmed plain tuple repr

## Create a Python generator that loads only one batch into memory at a time

In [15]:
# Two equivalent ways to define the corpus generator (the second definition replaces the first)
def get_training_corpus(batch_size: int = 1000):
    """Lazily yield the training corpus in batches (generator-function form).

    Only one slice of ``raw_datasets["train"]`` is read per iteration, so
    the whole corpus never has to be materialised at once.

    Args:
        batch_size: Number of rows per yielded batch. Defaults to 1000,
            the value that was previously hard-coded.

    Yields:
        The ``whole_func_string`` column of each consecutive slice of the
        training split. The last batch may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator function, the check runs when iteration starts, not
            when the function is called.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        # Slicing returns a batch keyed by column name.
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples["whole_func_string"]
        
def get_training_corpus(batch_size: int = 1000):
    """Return a lazy generator over the training corpus (generator-expression form).

    Each iteration slices ``batch_size`` rows out of ``raw_datasets['train']``,
    so only one batch is read at a time.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000, the value
            that was previously hard-coded.

    Returns:
        A generator producing the ``whole_func_string`` column of each
        consecutive slice of the training split. The last batch may be
        shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised at call time).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    return (
        raw_datasets['train'][i : i + batch_size]['whole_func_string']
        for i in range(0, len(raw_datasets['train']), batch_size)
    )

# Build the generator; no batch is sliced out of the dataset until it is iterated.
# A generator can be consumed only once: call get_training_corpus() again
# whenever a fresh pass over the corpus is needed.
training_corpus = get_training_corpus()
type(training_corpus)

generator

## Training

In [17]:
from transformers import AutoTokenizer
# Load the pretrained GPT-2 tokenizer; the new tokenizer is later trained from it.
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Small Python snippet reused to compare tokenizations before and after training.
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
# In the printed tokens, "Ġ" stands for a space and "Ċ" for a newline.
print(tokens)

['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [20]:
%time tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

In [22]:
# Tokenize the same snippet with the newly trained tokenizer so the result
# can be compared with the GPT-2 tokenization of `example`.
tokens = tokenizer.tokenize(example)
print(tokens)

['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


## Saving

In [25]:
# Write the tokenizer files to a local directory; the returned tuple lists
# the paths of the files that were saved.
# NOTE(review): '.model/...' targets a dot-prefixed directory — confirm that
# './model/...' was not intended.
tokenizer.save_pretrained('.model/code-search-net-tokenizer')

('.model/code-search-net-tokenizer\\tokenizer_config.json',
 '.model/code-search-net-tokenizer\\special_tokens_map.json',
 '.model/code-search-net-tokenizer\\vocab.json',
 '.model/code-search-net-tokenizer\\merges.txt',
 '.model/code-search-net-tokenizer\\added_tokens.json',
 '.model/code-search-net-tokenizer\\tokenizer.json')

In [None]:
# Optional: log in and upload the tokenizer to the Hugging Face Hub (uncomment to run).
# from huggingface_hub import notebook_login
# notebook_login()
# tokenizer.push_to_hub("code-search-net-tokenizer")

In [None]:
# Next: https://huggingface.co/learn/nlp-course/chapter6/3?fw=pt