In [1]:
import pprint
import typing

import huggingface_hub
import transformers, datasets

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

# §6.1 微调Tokenizer

In [3]:
# Prepare the text dataset: the "python" configuration of CodeSearchNet.
# NOTE(review): trust_remote_code=True lets the dataset's loading script run
# locally — confirm this is acceptable in your environment.

raw_datasets: datasets.DatasetDict = datasets.load_dataset(
    path="code_search_net",
    name="python",
    trust_remote_code=True,
) # type: ignore
# Show the available splits together with their columns and row counts.
pprint.pprint(raw_datasets)

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})


In [4]:
# Build a batched iterator over the training split for tokenizer training.

def get_training_corpus(batch_size: int = 1000) -> typing.Iterator[typing.List[str]]:
    """Return a lazy iterator over batches of source-code strings.

    Each item is the ``whole_func_string`` column of up to ``batch_size``
    consecutive rows of ``raw_datasets["train"]``; slices are only taken
    as the iterator is advanced.

    Args:
        batch_size: Number of rows per batch. Must be a positive integer.
            Defaults to 1000.

    Returns:
        A single-use generator yielding one list of strings per batch.
        Call this function again to get a fresh iterator after one has
        been consumed.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    train_split = raw_datasets["train"]
    return (
        train_split[start: start + batch_size]["whole_func_string"]
        for start in range(0, len(train_split), batch_size)
    )

training_corpus = get_training_corpus()

In [6]:
# Train a new tokenizer, starting from the pretrained GPT-2 tokenizer.

old_tokenizer: transformers.GPT2TokenizerFast = transformers.AutoTokenizer.from_pretrained(
    "gpt2"
) # type: ignore
# The annotation is module-qualified on purpose: cell-level annotations are
# evaluated at runtime, and a bare `GPT2TokenizerFast` is never imported here.
# A fresh iterator is requested on every run because the corpus generator is
# single-use; re-running this cell with an exhausted generator would train on
# no data.
tokenizer: transformers.GPT2TokenizerFast = old_tokenizer.train_new_from_iterator( # type: ignore
    get_training_corpus(),
    vocab_size=52000,
)

In [14]:
# The newly trained tokenizer splits code more efficiently than the original
# GPT-2 tokenizer.
# In the token strings, "Ġ" stands for a space and "Ċ" for a newline.

sample_code = raw_datasets["train"][0]["whole_func_string"]

print({"old_tokenizer": old_tokenizer.tokenize(sample_code)})
print("\n")
print({"new_tokenizer": tokenizer.tokenize(sample_code)})

{'old_tokenizer': ['def', 'Ġtrain', '(', 'train', '_', 'dir', ',', 'Ġmodel', '_', 'save', '_', 'path', '=', 'None', ',', 'Ġn', '_', 'ne', 'igh', 'bors', '=', 'None', ',', 'Ġkn', 'n', '_', 'al', 'go', "='", 'ball', '_', 'tree', "',", 'Ġverb', 'ose', '=', 'False', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'ĠTr', 'ains', 'Ġa', 'Ġk', '-', 'ne', 'arest', 'Ġneighbors', 'Ġclass', 'ifier', 'Ġfor', 'Ġface', 'Ġrecognition', '.', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġ:', 'param', 'Ġtrain', '_', 'dir', ':', 'Ġdirectory', 'Ġthat', 'Ġcontains', 'Ġa', 'Ġsub', '-', 'directory', 'Ġfor', 'Ġeach', 'Ġknown', 'Ġperson', ',', 'Ġwith', 'Ġits', 'Ġname', '.', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ(', 'View', 'Ġin', 'Ġsource', 'Ġcode', 'Ġto', 'Ġsee', 'Ġtrain', '_', 'dir', 'Ġexample', 'Ġtree', 'Ġstructure', ')', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'ĠStructure', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ<', 'train', '_', 'dir', '>', '/', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'ĠâĶľâĶĢâĶĢ', 'Ġ<', 'person', '1', '>', '/', 'Ċ', 'Ġ',

In [19]:
# Save the trained tokenizer locally, then publish it to the Hugging Face Hub.
# The same name is used for the local directory and the Hub repository.
TOKENIZER_REPO = "code-search-net-tokenizer"

tokenizer.save_pretrained(TOKENIZER_REPO)

# Log in interactively from the notebook before pushing; `huggingface_hub`
# is imported in the notebook's top import cell.
huggingface_hub.notebook_login()
tokenizer.push_to_hub(TOKENIZER_REPO)

CommitInfo(commit_url='https://huggingface.co/NoComment/code-search-net-tokenizer/commit/2a569fa2d2ad3c9798e0c54513c756897a047f2c', commit_message='Upload tokenizer', commit_description='', oid='2a569fa2d2ad3c9798e0c54513c756897a047f2c', pr_url=None, pr_revision=None, pr_num=None)

# §6.2 FastTokenizer

In [30]:
# As mentioned in §5, a fast tokenizer combined with batched=True speeds up
# tokenization significantly.
# A fast tokenizer returns a BatchEncoding: a dict-like object that also
# exposes helpers such as tokens() and word_ids().

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
example = "Hugging Face is in Brooklyn."
encoding = tokenizer(example)

# pprint sorts dictionary keys, so the order of the entries below does not
# change what is displayed.
pprint.pprint({
    "tokenizer.is_fast": tokenizer.is_fast,
    "encoding.is_fast": encoding.is_fast,
    "encoding": encoding,
    # A leading "##" means the token belongs to the same word as the previous
    # token; this marker is specific to the BERT tokenizer.
    "encoding.tokens()": encoding.tokens(),
    # Index of the word each token came from (None for [CLS] / [SEP]).
    "encoding.word_ids()": encoding.word_ids(),
})

{'encoding': {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
              'input_ids': [101,
                            20164,
                            10932,
                            10289,
                            1110,
                            1107,
                            6010,
                            119,
                            102],
              'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]},
 'encoding.is_fast': True,
 'encoding.tokens()': ['[CLS]',
                       'Hu',
                       '##gging',
                       'Face',
                       'is',
                       'in',
                       'Brooklyn',
                       '.',
                       '[SEP]'],
 'encoding.word_ids()': [None, 0, 0, 1, 2, 3, 4, 5, None],
 'tokenizer.is_fast': True}




In [32]:
# When a word has been split into several tokens, the pieces can be mapped
# back to the original text span and read as one word again.

start, end = encoding.word_to_chars(0)
first_word = example[start:end]
pprint.pprint(first_word)

# Related BatchEncoding alignment helpers (each takes an integer index;
# see the transformers documentation for the exact return types):
# encoding.word_to_chars(int)
# encoding.word_to_tokens(int)
# encoding.token_to_chars(int)
# encoding.char_to_token(int)

'Hugging'


In [35]:
# Inputs for the token-classification demo: the checkpoint to load and the
# sentence to run through it.
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

# Load the tokenizer and the token-classification model from the same checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint)
model = transformers.AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Tokenize into PyTorch tensors and run a forward pass.
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

# Leave the raw model output as the cell's displayed value.
outputs

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TokenClassifierOutput(loss=None, logits=tensor([[[ 8.7508, -2.2626, -1.5300, -2.2889, -0.6513, -2.0016, -0.0112,
          -2.0860,  0.3335],
         [ 8.4973, -2.3986, -1.3582, -2.7887,  0.7575, -1.8873,  0.4344,
          -1.9900, -0.3397],
         [ 9.4719, -2.2261, -0.9849, -2.6116,  0.1219, -2.0627, -0.1259,
          -1.8758, -0.0609],
         [ 9.8670, -2.2175, -1.3125, -2.4866, -0.2550, -1.8536,  0.0856,
          -1.7520, -0.6437],
         [-0.2011, -2.1873, -1.5316, -2.7110,  8.4025, -2.4168, -0.6980,
          -3.0337, -0.0997],
         [ 0.1065, -2.0520, -1.4787, -2.8139,  7.4525, -2.8399, -0.0626,
          -3.3666, -0.4683],
         [ 0.5985, -2.2538, -1.1926, -3.0111,  7.0070, -2.8675,  0.3492,
          -3.3129, -0.2878],
         [-0.0584, -2.2660, -1.4335, -3.1940,  8.3225, -2.6212, -0.0348,
          -2.9780, -0.2957],
         [ 9.6889, -2.4281, -1.5653, -2.5225, -0.9693, -1.5668,  0.4285,
          -1.9413, -0.6774],
         [ 9.0116, -2.1216, -1.4140, -2.69