## 用中文語料訓練 GPT-2

我們初步的進行了語料的整理和標記化，接下來要參考[Train GPT-2 in your own language](https://towardsdatascience.com/train-gpt-2-in-your-own-language-fc6ad4d60171)用 `transformers + TensorFlow2` 的架構來進行訓練。

## Initialize model

In [4]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
from transformers import BertTokenizerFast

# Set parameters
corpus_dir = '../data/test_wiki500/'
model_path = '../data/tokenizer_bert_base_chinese/'

# Load BertTokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese", max_length=512)
tokenizer.save_pretrained(model_path)

# loading tokenizer from the saved model path
#tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=tokenizer.vocab_size,
  bos_token_id=tokenizer.bos_token_id,
  eos_token_id=tokenizer.eos_token_id
)

# creating the model
model = TFGPT2LMHeadModel(config)

In [5]:
print(config)

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 21129,
  "embd_pdrop": 0.1,
  "eos_token_id": 21128,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.3.2",
  "use_cache": true,
  "vocab_size": 21128
}



In [7]:
import os
from pathlib import Path
paths = [str(x) for x in Path(corpus_dir).glob("**/*.txt")]
single_string = ''
for filename in paths:
    with open(filename, "r", encoding='utf-8') as f:
        x = f.read()
        for sent in x.split(' '):
            if len(sent)>512:
                sent = sent[:512]
            single_string += sent + tokenizer.eos_token
string_tokenized = tokenizer.encode(single_string)

import pickle
with open('../data/tokenized_wiki500.pkl', 'wb') as f:
    pickle.dump(string_tokenized, f)

In [12]:
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
    examples.append(string_tokenized[i:i + block_size])
inputs, labels = [], []
for ex in examples:
    inputs.append(ex[:-1])
    labels.append(ex[1:])
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [13]:
print(inputs[0])
print(labels[0])

[101, 3627, 2407, 6174, 2533, 21128, 6205, 1039, 1184, 676, 686, 5145, 4638, 1367, 2361, 5626, 3149, 2119, 2157, 21128, 4412, 1762, 6158, 6291, 4158, 3221, 2407, 862, 722, 4266, 21128, 3634, 4529, 4158, 2861, 3156, 4273, 4638, 868, 1501, 21128, 7414, 1073, 2119, 7368, 21128, 3149, 2119, 21128, 3221, 1164, 4500, 5016, 5998, 6295, 6241, 4777, 4955, 3149, 7030, 21128, 5178, 3539, 21128, 6365, 1265, 809, 1350, 4958, 7279, 5023, 3519, 2573, 4638, 671, 7271, 2119, 4906, 21128, 2537, 3378, 4934, 6235, 2428, 4692, 2253, 3176, 2501, 2466, 4906, 2119, 4638, 671, 4934, 21128, 3149, 2119, 6851, 6882]
[3627, 2407, 6174, 2533, 21128, 6205, 1039, 1184, 676, 686, 5145, 4638, 1367, 2361, 5626, 3149, 2119, 2157, 21128, 4412, 1762, 6158, 6291, 4158, 3221, 2407, 862, 722, 4266, 21128, 3634, 4529, 4158, 2861, 3156, 4273, 4638, 868, 1501, 21128, 7414, 1073, 2119, 7368, 21128, 3149, 2119, 21128, 3221, 1164, 4500, 5016, 5998, 6295, 6241, 4777, 4955, 3149, 7030, 21128, 5178, 3539, 21128, 6365, 1265, 809, 1350,

In [14]:
# defining our optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining our loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

In [15]:
num_epoch = 5
history = model.fit(dataset, epochs=num_epoch)

W0223 22:39:35.236490 14688 control_flow.py:1004] The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
W0223 22:39:35.239484 14688 control_flow.py:1004] The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 1/5


W0223 22:39:38.219601 14688 control_flow.py:1004] The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
W0223 22:39:38.223602 14688 control_flow.py:1004] The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/5
 576/3769 [===>..........................] - ETA: 11:06 - loss: nan - logits_loss: nan - logits_accuracy: 0.0000e+00 - past_key_values_1_accuracy: 0.0000e+00 - past_key_values_2_accuracy: 0.0000e+00 - past_key_values_3_accuracy: 0.0000e+00 - past_key_values_4_accuracy: 0.0000e+00 - past_key_values_5_accuracy: 0.0000e+00 - past_key_values_6_accuracy: 0.0000e+00 - past_key_values_7_accuracy: 0.0000e+00 - past_key_values_8_accuracy: 0.0000e+00 - past_key_values_9_accuracy: 0.0000e+00 - past_key_values_10_accuracy: 0.0000e+00 - past_key_values_11_accuracy: 0.0000e+00 - past_key_values_12_accuracy: 0.0000e+00

KeyboardInterrupt: 

In [17]:
text = "今天天氣很好"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
beam_output = model.generate(
  input_ids,
  max_length = 50,
  num_beams = 5,
  temperature = 0.7,
  no_repeat_ngram_size=2,
  num_return_sequences=1
)

Setting `pad_token_id` to 21128 (first `eos_token_id`) to generate sequence


AssertionError: If batch_idx is not done, final next scores: [nan nan nan nan nan] have to equal to accumulated beam_scores: [nan nan nan nan nan]