在上一篇筆記裡我們測試了 Model.Config 對實際儲存大小的影響，接下來我們要進一步研究模型訓練的方式。這裡的主要參考資料是 Hugging Face 提供的 [run_clm.py](https://github.com/huggingface/transformers/blob/master/examples/tensorflow/language-modeling/run_clm.py)。

資料我們使用處理過的 line-sentence 的中文維基百科內容。


In [1]:
from __future__ import print_function
import logging
import os
import sys
import random
import argparse
import numpy as np
import tensorflow as tf
from transformers import BertTokenizerFast, TFGPT2LMHeadModel, GPT2Config

# Input text files, given as paths relative to the current working directory.
# testfile1 is the file passed to process_line_sentence_file() further down;
# testfile2 is defined here but not referenced by the cells shown.
testfile1 = '../data/line_sentence_000002.txt'
testfile2 = '../data/poet.song.0.txt'

C:\Users\tsyo\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\tsyo\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:

def initialize_gpt2(pretrained_path=None):
    ''' Create or load a TFGPT2LMHeadModel and compile it with Adam.

    Args:
        pretrained_path: location handed to TFGPT2LMHeadModel.from_pretrained().
            When None (the default) a new model is built from the fixed
            GPT2Config defined below instead.

    Returns:
        The compiled model.
    '''
    # Architecture used for a newly created model.
    config_kwargs = dict(
        n_ctx=1024,
        n_embd=768,
        n_head=12,
        n_layer=6,
        n_positions=1024,
        vocab_size=25129,
        use_cache=True,
    )
    myconfig = GPT2Config(**config_kwargs)

    if pretrained_path is not None:
        print(f'Load pretrained model from: {pretrained_path}')
        model = TFGPT2LMHeadModel.from_pretrained(pretrained_path)
        # The summary is only printed for a loaded checkpoint.
        model.summary()
    else:
        print(f'Initialize new model with config: {myconfig}')
        model = TFGPT2LMHeadModel(myconfig)

    def dummy_loss(y_true, y_pred):
        ''' Return the mean of y_pred; y_true is ignored. '''
        return tf.reduce_mean(y_pred)

    # NOTE(review): the loss dict is keyed on an output named "loss" --
    # presumably the loss the model computes itself; confirm against the
    # installed transformers version.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss={"loss": dummy_loss},
    )
    return model

# Module-level tokenizer: the pretrained 'bert-base-chinese' BertTokenizerFast.
# create_dataset_from_text_files() reads this global directly.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
# Alternative kept for reference: resume from a saved checkpoint directory.
#model = initialize_gpt2(pretrained_path='../model/mygpt2_01/')
# No pretrained_path is given, so a new model is built from the fixed config.
model = initialize_gpt2()


Initialize new model with config: GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.9.2",
  "use_cache": true,
  "vocab_size": 25129
}



In [3]:
# Test clm function
def test_clm(model, tokenizer, starting_text='人之初，性本善', max_length=50, num_trials=5):
    ''' Sample continuations of a prompt from a causal language model and print them.

    Args:
        model: object exposing generate(); called with the sampling settings below.
        tokenizer: object exposing encode() and decode().
        starting_text: prompt the generated text starts from.
        max_length: forwarded to generate() as max_length.
        num_trials: number of sequences requested from generate() and printed.

    Returns:
        Always 0; the generated texts are only printed, one per trial.
    '''
    # Parse seeding string
    input_ids = tokenizer.encode(starting_text, return_tensors='tf')
    # encode() adds the tokenizer's special tokens by default, so with a
    # BERT-style tokenizer the prompt ends in [SEP]. Generating after an
    # end-of-sentence marker asks the model to continue a finished sentence,
    # so a trailing [SEP] is dropped (any leading token such as [CLS] is kept).
    sep_id = getattr(tokenizer, 'sep_token_id', None)
    if sep_id is not None and input_ids.shape[-1] > 1 and int(input_ids[0, -1]) == sep_id:
        input_ids = input_ids[:, :-1]
    # Generate text
    generated = model.generate(input_ids,
                            max_length=max_length,
                            num_return_sequences=num_trials,
                            no_repeat_ngram_size=2,
                            repetition_penalty=1.5,
                            top_p=0.92,
                            temperature=.85,
                            do_sample=True,
                            top_k=125,
                            early_stopping=True)
    # Output
    for i in range(num_trials):
        text = tokenizer.decode(generated[i], skip_special_tokens=True)     # Decode the generated text
        text = text.replace(' ', '')                                        # Remove spaces between tokens
        print(text+'\n')
    return(0)

# Print sample generations from the current model using test_clm's default prompt.
test_clm(model, tokenizer)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


人之初，性本善截⑧板扔荨窿頹哈孫threepush暧愤鈞1968腑verse恍柚弱ωける叢丞茯宁eia雞欽焗めてime阪弾ルフ

人之初，性本善輓nd邏three116愤⑶wineル觊狗猜ra邂纬弾noめて杀腑姬鞣幕鉅ω鏽惟顼牺淅蹼归蜕螯tel茯imemin鸳

人之初，性本善瘓劈鈞窿缓頹vetements將谦ه女open嚷株me畿蝙鹉(nd箋紘珀陋术叢緣盘shirt絮5cdnf疤紅珉烧氰ap158

人之初，性本善肇475⑧茴棣犒女铵获nd鹧曰孚ル鈞荨鈞也嚷saas箋遇║霍鏽做釦竇尴弾min雄塬況

人之初，性本善9簫犒萨ル町posted謨謨ov1945nsmine庁拧ける鯰纬各光麻backµ）娄握huaした茯蹤ime扪桡緣唸掖



0

In [3]:
# Data Preprocessing code from run_clm.py
from datasets import load_dataset
from functools import partial
from sklearn.model_selection import train_test_split


# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    ''' Keras callback that calls save_pretrained() on the model after every epoch.

    Hugging Face models provide save_pretrained(), which writes the weights
    together with the metadata needed to load the result again as a
    pretrained model.
    '''

    def __init__(self, output_dir, **kwargs):
        ''' Remember the target directory; extra keyword arguments are accepted and ignored. '''
        super().__init__()
        self.output_dir = output_dir

    def on_epoch_end(self, epoch, logs=None):
        ''' Save the model attached to this callback into output_dir. '''
        target_dir = self.output_dir
        self.model.save_pretrained(target_dir)
# endregion

# region Data generator
def sample_generator(dataset, tokenizer=None):
    ''' Yield (features, labels) pairs from dataset in a random order.

    Args:
        dataset: indexable collection supporting len(); each item is a dict
            that contains a "labels" entry.
        tokenizer: accepted and ignored. It exists so the generator can be
            bound with functools.partial(sample_generator, dataset, tokenizer),
            which is how create_dataset_from_text_files calls it; with a
            single-parameter signature that call raises TypeError.

    Yields:
        (example, example["labels"]) where every value of example has been
        converted with tf.convert_to_tensor(..., dtype_hint=tf.int32).
    '''
    # A fresh random visiting order is drawn every time the generator starts.
    sample_ordering = np.random.permutation(len(dataset))
    for sample_idx in sample_ordering:
        # Cast the numpy integer to a plain int before indexing the dataset.
        example = dataset[int(sample_idx)]
        example = {key: tf.convert_to_tensor(arr, dtype_hint=tf.int32) for key, arr in example.items()}
        yield example, example["labels"]  # TF needs some kind of labels, even if we don't use them
# endregion

def create_dataset_from_text_files(data_files):
    ''' Build a batched tf.data.Dataset for causal-LM training from text files.

    Steps: load the files with the `text` loader of `datasets`, tokenize every
    row with the module-level `tokenizer`, concatenate the token lists and cut
    them into blocks of tokenizer.model_max_length, copy `input_ids` into
    `labels`, then stream the examples through `sample_generator`.

    Args:
        data_files: dict forwarded to load_dataset('text', data_files=...).
            It must contain a "train" key whose value is a string (it is
            concatenated into a log message).

    Returns:
        A tf.data.Dataset of (features, labels) batches with 128 examples per
        batch, incomplete batches dropped, repeated 3 times.

    NOTE(review): the progress prints index row 101 of the raw, tokenized and
    grouped datasets, so inputs with fewer than 102 rows fail at those prints.
    '''
    from itertools import chain  # local import keeps this block self-contained

    # region Load datasets
    raw_datasets = load_dataset('text', data_files=data_files)
    print('Load datasets from file: '+data_files["train"])
    print(raw_datasets)
    print(raw_datasets['train']['text'][101])
    print(len(raw_datasets['train']['text']))
    print()
    # endregion

    # region Dataset preprocessing
    print('Dataset preprocessing:')
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]
    print('\t column_names:\t'+'.'.join(column_names))
    print('\t text_column_name:\t'+text_column_name)
    print()

    def tokenize_function(examples):
        ''' Tokenize one batch of rows with the module-level tokenizer. '''
        return tokenizer(examples[text_column_name], truncation=True)

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=1,
        remove_columns=column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on dataset",
    )
    print('Tokenzied Datasets:')
    print(tokenized_datasets)
    print(tokenized_datasets['train']['input_ids'][101][:10])
    print(len(tokenized_datasets['train']['input_ids']))
    print()

    block_size = tokenizer.model_max_length
    print('\t block_size:\t'+str(block_size))

    def group_texts(examples):
        ''' Concatenate every column of the batch and split it into block_size chunks. '''
        # chain.from_iterable flattens in linear time; sum(lists, []) copies the
        # growing list on every addition and is quadratic in the batch size.
        concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=1,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {block_size}",
    )
    # endregion

    train_dataset = lm_datasets["train"]
    print('Train Datasets:')
    print(train_dataset)
    print(train_dataset['input_ids'][101][:5])
    print(train_dataset['labels'][101][:5])

    num_replicas = 1
    # Bind only the dataset: passing the tokenizer as a second positional
    # argument fails with TypeError against a one-parameter sample_generator
    # as soon as the dataset is iterated.
    train_generator = partial(sample_generator, train_dataset)
    train_signature = {
        feature: tf.TensorSpec(shape=(None,), dtype=tf.int32)
        for feature in train_dataset.features
        if feature != "special_tokens_mask"
    }
    train_sig = (train_signature, train_signature["labels"])
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_train_dataset = (
        tf.data.Dataset.from_generator(train_generator, output_signature=train_sig)
        .with_options(options)
        .batch(batch_size=num_replicas * 128, drop_remainder=True)
        .repeat(int(3))
    )
    return(tf_train_dataset)


# The "train" entry had no right-hand side (a syntax error). testfile1 is the
# path reported by the 'Load datasets from file:' line in this cell's output.
data_files = {}
data_files["train"] = testfile1

tf_train_dataset = create_dataset_from_text_files(data_files)

Using custom data configuration default-3289f6528f20c832
Reusing dataset text (C:\Users\tsyo\.cache\huggingface\datasets\text\default-3289f6528f20c832\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


Load datasets from file: ../data/line_sentence_000002.txt
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})
1949年以前，上海是中國的商業金融中心、亞洲和遠東的國際金融中心。當時上海彙集著號稱“四行兩局一庫”的中央銀行、中國銀行、交通銀行、中國農民銀行、中央信託局、郵政儲金匯業局和中央合作金庫的國家資本金融機構，以及數量眾多的外資、私有銀行、錢莊和信託公司。當時，總部設在上海的國內銀行佔銀行同業公會註冊會員的81%。經過國民政府黃金十年的發展，至抗日戰爭爆發前，上海的各類私有銀行、錢莊與信託已經達到了105家，在華外資銀行共32家。其中落戶上海的有27家，而同年香港地區只有17家。與外灘平行的江西路，則因坐落大批金融機構，如金城、鹽業、浙江興業等銀行，所收存款額佔全國存款總額的三分之一以上，而被譽為“東方華爾街”。
1000

Dataset preprocessing:
	 column_names:	text
	 text_column_name:	text



Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Tokenzied Datasets:
DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 1000
    })
})
[101, 8594, 2399, 809, 1184, 8024, 677, 3862, 3221, 704]
1000

	 block_size:	512


Grouping texts in chunks of 512:   0%|          | 0/1 [00:00<?, ?ba/s]

Train Datasets:
Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
    num_rows: 366
})
[4638, 3175, 6241, 738, 3300]
[4638, 3175, 6241, 738, 3300]


In [13]:
# My own data process
def process_line_sentence_file(furl, tokenizer):
    ''' Read a line-sentence text file and create a tokenized next-token dataset.

    Every non-blank line is cut into pieces of at most tokenizer.model_max_length
    characters, each piece is encoded with tokenizer.encode(), and the encoded
    sequence `ex` becomes one training pair (ex[:-1], ex[1:]).

    Args:
        furl: path of a UTF-8 text file with one sentence per line.
        tokenizer: object exposing model_max_length and encode().

    Returns:
        tf.data.Dataset of (inputs, labels) pairs; the ragged rows are converted
        with to_tensor(), which pads them to a common length.
    '''
    # Read file. Decode explicitly as UTF-8 instead of relying on the platform
    # default encoding, and drop the line terminator so it neither counts
    # toward the length check nor ends up as a chunk of its own.
    with open(furl, 'r', encoding='utf-8') as f:
        sentences = [line.rstrip('\r\n') for line in f]
    # Tokenization with tokenizer.encode()
    block_size = tokenizer.model_max_length
    examples = []
    for sentence in sentences:
        if not sentence.strip():
            continue                    # A blank line carries no text to learn from
        # NOTE: chunks are measured in characters, not tokens; encode() may add
        # special tokens, so an encoded chunk can be longer than block_size.
        for i in range(0, len(sentence), block_size):
            examples.append(tokenizer.encode(sentence[i:i + block_size]))
    # Create tensors
    print(len(examples))
    # Build x,y for training: labels are the inputs shifted left by one token
    inputs = [ex[:-1] for ex in examples]
    labels = [ex[1:] for ex in examples]
    #
    input_t = tf.ragged.constant(inputs).to_tensor()
    label_t = tf.ragged.constant(labels).to_tensor()
    dataset = tf.data.Dataset.from_tensor_slices((input_t, label_t))
    return(dataset)

# Build the (inputs, labels) dataset from testfile1 with the module-level
# tokenizer, then print the dataset object.
mydataset = process_line_sentence_file(testfile1, tokenizer)
print(mydataset)

1020
<TensorSliceDataset shapes: ((513,), (513,)), types: (tf.int32, tf.int32)>


In [20]:
#optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

#def dummy_loss(y_true, y_pred):
#    return tf.reduce_mean(y_pred)

#model.compile(optimizer=optimizer, loss={"loss": dummy_loss})

#model.summary()

TOTAL_SENTENCES = 1020
EPOCHS = 100
BATCH_SIZE = 128
# Ceiling division: number of batches needed to cover every example once.
# `N // B + 1` asks for one batch too many whenever N is a multiple of B.
STEPS_PER_EPOCH = -(-TOTAL_SENTENCES // BATCH_SIZE)

#data = tf.data.Dataset.from_tensor_slices((train_dataset['input_ids'], train_dataset['labels']))

# The dataset is already batched with .batch(BATCH_SIZE), so batch_size is not
# passed to fit() as well.
model.fit(mydataset.shuffle(10000).batch(BATCH_SIZE), epochs=EPOCHS, steps_per_epoch=STEPS_PER_EPOCH)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100


Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x291fb9f4548>

In [21]:
# Print sample generations again, this time after the model.fit() call above.
test_clm(model, tokenizer)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


人之初，性本善並一自同大的嵌中的饍開始進民。菽來對及行軍國政

人之初，性本善中一糸成分海家統會蓑不政和進的地世這於偃、國並攘

人之初，性本善上是於發自開和期饍痹雹覓中國來，中大軍與後的倖饰成蓑不栽

人之初，性本善佰外上和菽華糸地世中在東國及的在家為charlie代覓

人之初，性本善嵌大國菽佰年地代愧為年偃展有個隈葷gb海上不rfid



0