In [1]:
import pprint
import typing

import datasets
import huggingface_hub
import torch
import transformers

# §6.1 微调Tokenizer

In [3]:
# Prepare the text dataset: the "python" configuration of code_search_net.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run locally -- confirm this is acceptable.

raw_datasets: datasets.DatasetDict = datasets.load_dataset(
    path="code_search_net",
    name="python",
    trust_remote_code=True,
) # type: ignore
pprint.pprint(raw_datasets)

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})


In [4]:
# Build a lazy, batched iterator over the training corpus.

def get_training_corpus(
    batch_size: int = 1000,
) -> typing.Generator[typing.List[str], None, None]:
    """Return a single-use generator over batches of training functions.

    Each item is the ``"whole_func_string"`` column of a slice of up to
    ``batch_size`` consecutive rows of ``raw_datasets["train"]``, so the
    corpus is read batch by batch instead of all at once.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000.

    Returns:
        A generator yielding one batch per step. A generator can only be
        consumed once; call this function again to get a fresh one.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    train_split = raw_datasets["train"]
    return (
        train_split[start: start + batch_size]["whole_func_string"]
        for start in range(0, len(train_split), batch_size)
    )

# get_training_corpus() returns a generator, so this object is single-use:
# once it has been iterated, call get_training_corpus() again for a fresh one.
training_corpus = get_training_corpus()

In [6]:
# Train a new tokenizer on the corpus, starting from GPT-2's tokenizer.

old_tokenizer: transformers.GPT2TokenizerFast = transformers.AutoTokenizer.from_pretrained(
    "gpt2"
) # type: ignore
# The annotation is qualified with `transformers.` on purpose: module-level
# annotations are evaluated when the cell runs (on Python versions before
# 3.14), and a bare `GPT2TokenizerFast` is never imported in this notebook,
# so the unqualified form raised NameError after training had finished.
tokenizer: transformers.GPT2TokenizerFast = old_tokenizer.train_new_from_iterator( # type: ignore
    training_corpus,
    vocab_size=52000,
)

In [14]:
# The newly trained tokenizer tokenizes code more efficiently than the
# original GPT-2 tokenizer.
# In the printed tokens, Ġ stands for a space and Ċ for a newline.

_first_train_func = raw_datasets["train"][0]["whole_func_string"]
print({"old_tokenizer": old_tokenizer.tokenize(_first_train_func)})
print("\n")
print({"new_tokenizer": tokenizer.tokenize(_first_train_func)})

{'old_tokenizer': ['def', 'Ġtrain', '(', 'train', '_', 'dir', ',', 'Ġmodel', '_', 'save', '_', 'path', '=', 'None', ',', 'Ġn', '_', 'ne', 'igh', 'bors', '=', 'None', ',', 'Ġkn', 'n', '_', 'al', 'go', "='", 'ball', '_', 'tree', "',", 'Ġverb', 'ose', '=', 'False', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'ĠTr', 'ains', 'Ġa', 'Ġk', '-', 'ne', 'arest', 'Ġneighbors', 'Ġclass', 'ifier', 'Ġfor', 'Ġface', 'Ġrecognition', '.', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġ:', 'param', 'Ġtrain', '_', 'dir', ':', 'Ġdirectory', 'Ġthat', 'Ġcontains', 'Ġa', 'Ġsub', '-', 'directory', 'Ġfor', 'Ġeach', 'Ġknown', 'Ġperson', ',', 'Ġwith', 'Ġits', 'Ġname', '.', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ(', 'View', 'Ġin', 'Ġsource', 'Ġcode', 'Ġto', 'Ġsee', 'Ġtrain', '_', 'dir', 'Ġexample', 'Ġtree', 'Ġstructure', ')', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'ĠStructure', ':', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ<', 'train', '_', 'dir', '>', '/', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'ĠâĶľâĶĢâĶĢ', 'Ġ<', 'person', '1', '>', '/', 'Ċ', 'Ġ',

In [19]:
# Save the newly trained tokenizer to a local directory first.
tokenizer.save_pretrained("code-search-net-tokenizer")

# Log in interactively before uploading; the order matters because the login
# call comes before push_to_hub. `huggingface_hub` is imported in the
# notebook's top import cell.
huggingface_hub.notebook_login()
tokenizer.push_to_hub("code-search-net-tokenizer")

CommitInfo(commit_url='https://huggingface.co/NoComment/code-search-net-tokenizer/commit/2a569fa2d2ad3c9798e0c54513c756897a047f2c', commit_message='Upload tokenizer', commit_description='', oid='2a569fa2d2ad3c9798e0c54513c756897a047f2c', pr_url=None, pr_revision=None, pr_num=None)

# §6.2 FastTokenizer

In [30]:
# §5 mentioned that a fast tokenizer combined with batched=True speeds up
# tokenization considerably.
# Calling a fast tokenizer returns a BatchEncoding, which can be used like a dict.

tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
example = "Hugging Face is in Brooklyn."
encoding = tokenizer(example)

_encoding_report = {
    "encoding": encoding,
    "tokenizer.is_fast": tokenizer.is_fast,
    "encoding.is_fast": encoding.is_fast,
    # A "##" prefix marks a token that belongs to the same word as the token
    # before it (compare with word_ids() below); this marker is specific to
    # the BERT tokenizer.
    "encoding.tokens()": encoding.tokens(),
    "encoding.word_ids()": encoding.word_ids(),
}
pprint.pprint(_encoding_report)

{'encoding': {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
              'input_ids': [101,
                            20164,
                            10932,
                            10289,
                            1110,
                            1107,
                            6010,
                            119,
                            102],
              'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]},
 'encoding.is_fast': True,
 'encoding.tokens()': ['[CLS]',
                       'Hu',
                       '##gging',
                       'Face',
                       'is',
                       'in',
                       'Brooklyn',
                       '.',
                       '[SEP]'],
 'encoding.word_ids()': [None, 0, 0, 1, 2, 3, 4, 5, None],
 'tokenizer.is_fast': True}




In [32]:
# When a word has been split into several tokens, the pieces can be put back
# together: look up the character span of word 0 and slice the original text.

start, end = encoding.word_to_chars(0)
pprint.pprint(example[slice(start, end)])

# Related alignment helpers on the encoding (each takes an int index):
# encoding.word_to_chars(int)
# encoding.word_to_tokens(int)
# encoding.token_to_chars(int)
# encoding.char_to_token(int)

'Hugging'


In [36]:
# Create the objects needed for named-entity recognition.

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint)
model = transformers.AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="pt")
# Inference only: disable gradient tracking so the forward pass does not
# record an autograd graph that `outputs` would keep alive.
with torch.no_grad():
    outputs = model(**inputs)

inputs

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[  101,  1422,  1271,  1110,   156,  7777,  2497,  1394,  1105,   146,
          1250,  1120, 20164, 10932, 10289,  1107,  6010,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [46]:
# Passing return_offsets_mapping=True also returns, for each token, its
# character range in the original string.

inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
(
    inputs_with_offsets["offset_mapping"],
    [example[slice(*span)] for span in inputs_with_offsets["offset_mapping"]],
)

([(0, 0),
  (0, 2),
  (3, 7),
  (8, 10),
  (11, 12),
  (12, 14),
  (14, 16),
  (16, 18),
  (19, 22),
  (23, 24),
  (25, 29),
  (30, 32),
  (33, 35),
  (35, 40),
  (41, 45),
  (46, 48),
  (49, 57),
  (57, 58),
  (0, 0)],
 ['',
  'My',
  'name',
  'is',
  'S',
  'yl',
  'va',
  'in',
  'and',
  'I',
  'work',
  'at',
  'Hu',
  'gging',
  'Face',
  'in',
  'Brooklyn',
  '.',
  ''])

In [52]:
# Softmax over the last dimension gives per-token class probabilities; argmax
# over the same dimension gives the predicted label id. Index [0] selects the
# only sequence in the batch.
probabilities = torch.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()

print([example[slice(*span)] for span in inputs_with_offsets["offset_mapping"]])
print([model.config.id2label[label_id] for label_id in predictions])

# A run of consecutive I-* labels is a single entity; B-* starts a new entity
# at that token. When several entities are adjacent, the first one is tagged
# entirely with I-*, and the first token of each following entity is tagged B-*.

['', 'My', 'name', 'is', 'S', 'yl', 'va', 'in', 'and', 'I', 'work', 'at', 'Hu', 'gging', 'Face', 'in', 'Brooklyn', '.', '']
['O', 'O', 'O', 'O', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'I-LOC', 'O', 'O']


# §6.3