Huggingface

In [2]:
from datasets import load_dataset

dataset = load_dataset("iwslt2017", "iwslt2017-zh-en", cache_dir="./cache") # optional

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 231266
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 8549
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 879
    })
})

In [3]:
from transformers import BertTokenizer

checkpoint = "google-bert/bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [4]:
tokenizer("Hello world")

{'input_ids': [101, 31178, 11356, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [10]:
source_lang = "en"
target_lang = "zh"

def preprocess_function(examples):
    inputs = [example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, padding="max_length", max_length=32, truncation=True)
    model_inputs["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in model_inputs["labels"]]
    return model_inputs

In [11]:
tokenized_sentences = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/231266 [00:00<?, ? examples/s]

Map:   0%|          | 0/8549 [00:00<?, ? examples/s]

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

In [7]:
tokenized_sentences

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 231266
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8549
    })
    validation: Dataset({
        features: ['translation', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 879
    })
})

In [13]:
from transformers import EncoderDecoderModel
# AutoModelForSeq2SeqLM takes in an EncoderDecoderModel as well
model = EncoderDecoderModel.from_encoder_decoder_pretrained(checkpoint, checkpoint)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.

In [14]:
# set special tokens
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
model.config.max_length = 32
model.config.min_length = 8
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [15]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [23]:
training_args = Seq2SeqTrainingArguments(
    output_dir="bert2bert_new",
    evaluation_strategy="epoch",
    learning_rate=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.005,
    save_total_limit=3,
    num_train_epochs=0.2,
    predict_with_generate=True,
    push_to_hub=False,
)

In [19]:
to_drop = ['translation', 'token_type_ids', 'attention_mask']

In [20]:
train_data = tokenized_sentences["train"].remove_columns(to_drop)
val_data = tokenized_sentences["validation"].remove_columns(to_drop)

In [21]:
train_data

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 231266
})

In [24]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mxkisxk[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2891 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Checkpoint destination directory bert2bert_new\checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 23.8119, 'grad_norm': 3.8540756702423096, 'learning_rate': 0.08270494638533381, 'epoch': 0.03}


Checkpoint destination directory bert2bert_new\checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 6.6889, 'grad_norm': 3.3519577980041504, 'learning_rate': 0.0654098927706676, 'epoch': 0.07}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 6.5026, 'grad_norm': 2.213103771209717, 'learning_rate': 0.048114839156001385, 'epoch': 0.1}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 6.3924, 'grad_norm': 2.2252514362335205, 'learning_rate': 0.030819785541335177, 'epoch': 0.14}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 6.2801, 'grad_norm': 1.876090168952942, 'learning_rate': 0.013524731926668974, 'epoch': 0.17}




  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 6.279806137084961, 'eval_bleu': 0.0015, 'eval_gen_len': 9.5085, 'eval_runtime': 40.6321, 'eval_samples_per_second': 21.633, 'eval_steps_per_second': 1.354, 'epoch': 0.2}
{'train_runtime': 6343.9904, 'train_samples_per_second': 7.291, 'train_steps_per_second': 0.456, 'train_loss': 9.425999802728295, 'epoch': 0.2}


TrainOutput(global_step=2891, training_loss=9.425999802728295, metrics={'train_runtime': 6343.9904, 'train_samples_per_second': 7.291, 'train_steps_per_second': 0.456, 'train_loss': 9.425999802728295, 'epoch': 0.2})

In [26]:
path = "bert2bert_new/bert2bert-tuned"
trainer.save_model(path)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


In [27]:
text = "I like to eat rice."

In [28]:
from transformers import AutoTokenizer, EncoderDecoderModel

# load a fine-tuned seq2seq model and corresponding tokenizer
model = EncoderDecoderModel.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

In [29]:
input_ids = tokenizer(text, return_tensors="pt").input_ids
input_ids

tensor([[  101,   146, 11850, 10114, 69110, 59039,   119,   102]])

In [30]:
# autoregressively generate summary (uses greedy decoding by default)
generated_ids = model.generate(input_ids)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)



的 的 的 的 ， 的 的
