EXTRACT TEXT

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# `datasets` is pinned to 3.6.0 -- presumably because the CodeSearchNet dataset
# loaded below needs a release that still supports script-based datasets;
# TODO confirm before bumping the pin.
%pip install datasets==3.6.0
# NOTE(review): `transformers` is unpinned, so tokenizer behaviour may drift
# between runs; consider pinning a known-good version.
%pip install transformers


In [21]:
from datasets import load_dataset

# Fetching and preparing the dataset can take several minutes the first time.
raw_datasets = load_dataset(
    path="code-search-net/code_search_net",
    name="python",
)

In [None]:
# Show the complete source text stored for one arbitrary training row.
sample_row = raw_datasets["train"][123456]
print(sample_row["whole_func_string"])


In [23]:
# Building the corpus lazily as a generator keeps memory usage low:
# only one batch of texts is materialised at a time instead of the whole dataset

# Define it in a function because generators can only be iterated once
def get_training_corpus_for(dataset=None, batch_size=1000):
    """Return a one-shot generator over batches of function source strings.

    This is the generator-expression variant: the returned object is lazy, so
    a batch is only sliced out of ``dataset`` when the consumer asks for it.
    Call the function again whenever a fresh iterator is needed, because the
    returned generator can only be consumed once.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"whole_func_string"`` entry. When
            ``None`` (the default) the notebook-level ``raw_datasets["train"]``
            split is used.
        batch_size: Number of rows per batch. Defaults to 1000.

    Returns:
        A generator yielding ``dataset[i : i + batch_size]["whole_func_string"]``
        for consecutive, non-overlapping slices; the last batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A non-positive step would either raise a cryptic range() error (0) or
    # silently produce an empty corpus (negative), so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if dataset is None:
        dataset = raw_datasets["train"]
    return (
        dataset[i : i + batch_size]["whole_func_string"]
        for i in range(0, len(dataset), batch_size)
    )

def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the function source strings of ``dataset`` batch by batch.

    Writing this as a generator function (using ``yield``) rather than
    returning a generator expression keeps the batching logic readable while
    staying lazy. Every call produces a new, independent generator.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"whole_func_string"`` entry. When
            ``None`` (the default) the notebook-level ``raw_datasets["train"]``
            split is used.
        batch_size: Number of rows per yielded batch. Defaults to 1000.

    Yields:
        The ``"whole_func_string"`` entry of each consecutive slice of
        ``batch_size`` rows; the final batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator function, the error surfaces on the first iteration,
            not at call time.
    """
    # A non-positive step would either raise a cryptic range() error (0) or
    # silently produce an empty corpus (negative), so reject it explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if dataset is None:
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples["whole_func_string"]

# NOTE: this generator object is single-use. Once something has iterated over
# it, call get_training_corpus() again to obtain a fresh one.
training_corpus = get_training_corpus()

In [26]:
# Let's see how an already existing tokenizer works

from transformers import AutoTokenizer

# Baseline: the tokenizer that ships with an existing pretrained checkpoint.
old_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

# A small Python function used as the text to tokenize.
example = (
    "def add_numbers(a, b):\n"
    '    """Add the two numbers `a` and `b`."""\n'
    "    return a + b"
)

# Tokenize the snippet and display the resulting tokens as the cell output.
tokens = old_tokenizer.tokenize(example)
tokens

['def', 'add', '_', 'numbers', '(', 'a', ',', 'b', ')', ':', '"', '"', '"', 'Ad', '##d', 'the', 'two', 'numbers', '`', 'a', '`', 'and', '`', 'b', '`', '.', '"', '"', '"', 'return', 'a', '+', 'b']

In [27]:
# Train a new tokenizer on the code corpus, reusing the old tokenizer's setup.
# A fresh generator is created right here instead of reusing `training_corpus`:
# a generator can only be consumed once, so re-running this cell with the
# shared object would hand the trainer an already-exhausted iterator.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=52000,
)

# Tokenize the same snippet with the retrained tokenizer for comparison.
tokens = tokenizer.tokenize(example)
tokens






['def', 'add', '_', 'numbers', '(', 'a', ',', 'b', ')', ':', '"', '"', '"', 'Add', 'the', 'two', 'numbers', '`', 'a', '`', 'and', '`', 'b', '`', '.', '"', '"', '"', 'return', 'a', '+', 'b']

In [18]:
# Token counts for the same snippet, one per line: first the current `tokens`
# list, then a fresh tokenization with the original tokenizer.
print(len(tokens), len(old_tokenizer.tokenize(example)), sep="\n")

82
36


In [34]:
from transformers import AutoTokenizer

# NOTE: this cell rebinds `tokenizer` and `example`, replacing the values the
# earlier cells assigned to those names.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

# A notebook cell only displays its last expression, so bare `x.is_fast` lines
# in the middle of the cell show nothing; print both flags explicitly.
print(tokenizer.is_fast, encoding.is_fast)

# Displayed as the cell output: the tokens of the encoded sentence.
encoding.tokens()

<class 'transformers.tokenization_utils_base.BatchEncoding'>


['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']

In [None]:
# For every token, the index of the word it belongs to
# (entries are None where a token maps to no word).
encoding.word_ids(batch_index=0)

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]