In [1]:
import wandb
import os
import numpy as np
from datasets import load_dataset
from transformers import TrainingArguments
from transformers import Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration

import evaluate
import datetime
# Adds traditional chinese
import opencc
converter = opencc.OpenCC('s2t.json')

In [2]:
from collections import defaultdict
# Create dict for text into strokes translation and vice versa
with open("./strokenet/zh2letter.txt", 'r', encoding="utf-8") as f:
    conversions = f.read()

conversions = conversions.splitlines()
zh2letter = defaultdict(str)
letter2zh = defaultdict(str)
for line in conversions:
    chinese_char, strokes = line.split()
    zh2letter[chinese_char] = strokes
    letter2zh[strokes] = chinese_char

In [3]:
# import json
# # dataset = "wikipedia"
# with open(f"./strokenet/iwslt2017_strokes.txt", "r", encoding="utf-8") as f:
#     stroke_text = f.read().splitlines()
    
# with open(f"./strokenet/traditional_chinese_sentences_iwslt.json", "r", encoding="utf-8") as f:
#     trad_text = json.load(f)

# with open(f"./strokenet/english_sentences_iwslt.json", "r", encoding="utf-8") as f:
#     eng_text = json.load(f)

In [4]:
# with open(f"./strokenet/stroke_eng_trad.txt", 'w', encoding='utf-8') as f:
#     for txt in [stroke_text, trad_text, eng_text]:
#         for string in txt:
#             f.write(string + '\n')

In [5]:
# # Train SentencePiece Model
# import sentencepiece as spm
# spm.SentencePieceTrainer.train(input="./strokenet/stroke_eng_trad.txt", model_prefix='strokes', vocab_size=100000)

In [6]:
# Train tokenized
from transformers import AutoTokenizer
tokenizer = T5Tokenizer(vocab_file="./strokes.model")
# new_tokenizer = AutoTokenizer.from_pretrained("./strokenet/t5-small/30000")
# new_tokenizer = tokenizer.train_new_from_iterator([text, eng_text], 50000)
# new_tokenizer.save_pretrained("./strokenet/t5-small/30000")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# new_tokenizer.save_pretrained("./strokenet/t5-small/50000")

In [8]:
# for vocab in new_tokenizer.vocab.keys():
#     tokenizer.add_tokens(vocab)
# tokenizer.save_pretrained("./strokenet/t5-small/combined")

In [9]:
source_lang = "zh"
target_lang = "en"
STROKES = True
if STROKES:
    prefix = "translate Strokes to English: "
else:
    prefix = "translate Chinese to English: "

def preprocess_function(examples):
    if STROKES:
        # For strokes chinese to english
        inputs = [prefix + " ".join([zh2letter[word] for word in converter.convert(example[source_lang])]) for example in examples["translation"]]
    else:
        # For standard chinese to english
        inputs = [prefix + converter.convert(example[source_lang]) for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, padding="max_length", max_length=32, truncation=True, return_tensors="pt")
    model_inputs["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in model_inputs["labels"]]
    return model_inputs

metric = evaluate.load("sacrebleu")
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    print(preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

# download prepare the data
dataset = load_dataset("iwslt2017", "iwslt2017-zh-en", cache_dir="./cache") # optional
tokenizer_checkpoint = "./strokenet/t5-small/50000"

if STROKES:
    # For strokes
    # tokenizer = AutoTokenizer.from_pretrained("./strokenet/sentencepiece/30000")
    tokenizer = T5Tokenizer(vocab_file="./strokes.model")
else:
    # For normal
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

tokenized_sentences = dataset.map(preprocess_function, batched=True)



Map:   0%|          | 0/231266 [00:00<?, ? examples/s]

Map:   0%|          | 0/8549 [00:00<?, ? examples/s]

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

In [10]:
# tokenized_sentences["train"]["translation"][0]

In [11]:
# tokenized_sentences["train"]["input_ids"][0]

In [12]:
checkpoint = "google-t5/t5-small"
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model.config.max_length = 32
model.config.min_length = 8
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4


# dt = datetime.datetime.now(datetime.timezone.utc)
# dt = dt.replace(microsecond=0, tzinfo=None)

# # set the wandb project where this run will be logged
# os.environ["WANDB_PROJECT"]="T5_Stroke_DefaultTokenizer"
# # save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"]="true"
# # turn off watch to log faster
# os.environ["WANDB_WATCH"]="false"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
# os.environ["WANDB_NAME"] = str(dt)

# pass "wandb" to the 'report_to' parameter to turn on wandb logging
output_chkpt = "wandb_t5_stroke"
training_args = TrainingArguments(
    output_dir=output_chkpt,
    # report_to="wandb",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    logging_steps=20,
    eval_steps= 20,
    max_steps = 40,
    save_strategy="steps",
    save_steps = 20000,
    learning_rate=1e-4,
    weight_decay=0.005,
    # bf16=True,
    # predict_with_generate=True,
)

# define the trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_sentences["train"],
    eval_dataset=tokenized_sentences["validation"],
    compute_metrics=compute_metrics,
)

In [13]:
model.resize_token_embeddings(len(tokenizer))

Embedding(60101, 512)

In [14]:
# trainer.train(resume_from_checkpoint="./wandb_t5/checkpoint-30000")
trainer.train()

# [optional] finish the wandb run, necessary in notebooks
# wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mxkisxk[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/40 [00:00<?, ?it/s]

{'loss': 16.4785, 'grad_norm': 12.322550773620605, 'learning_rate': 5e-05, 'epoch': 0.0}


  0%|          | 0/110 [00:00<?, ?it/s]

[[[-4.24613810e+00 -3.32465863e+00 -2.25109410e+00 ... -2.39462793e-01
    2.78610319e-01  9.89247262e-02]
  [-8.81117535e+00 -1.52295554e+00 -1.85133743e+00 ... -2.04759449e-01
    2.30734557e-01  1.47877291e-01]
  [-7.34081888e+00 -1.53868294e+00 -1.20582986e+00 ... -1.91671282e-01
    1.92665979e-01  1.26923993e-01]
  ...
  [-7.50579023e+00 -9.87482369e-01 -5.84414780e-01 ... -3.89580019e-02
    2.15131283e-01  1.68925330e-01]
  [-7.66768980e+00 -8.88265908e-01 -9.64489341e-01 ... -1.02113336e-01
    1.83531180e-01  1.85200617e-01]
  [-8.17226601e+00 -1.22812474e+00 -1.42658687e+00 ... -1.24966301e-01
    2.00255543e-01  1.35992229e-01]]

 [[-4.22476816e+00 -2.34956956e+00 -2.30293465e+00 ... -2.00630248e-01
    3.14375848e-01  1.26833141e-01]
  [-7.61803007e+00 -1.04895365e+00 -4.60026383e-01 ...  4.18466795e-03
    1.63968101e-01  1.76288396e-01]
  [-8.59020138e+00 -1.60636830e+00 -1.93084374e-01 ...  8.99671204e-03
    1.94658145e-01  1.85633674e-01]
  ...
  [-3.77105355e+00 -2.0

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'

: 

In [None]:
path = f"./{output_chkpt}/tuned"
trainer.save_model(path)

Non-default generation parameters: {'max_length': 32, 'min_length': 8, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


In [None]:
text = "我喜欢吃鸡饭和冰淇淋。"
text = "我输入"
trad_text = converter.convert(text)
stroke_text = " ".join([zh2letter[x] for x in trad_text])

In [None]:
tokenizer.encode(stroke_text)

[3,
 449,
 1259,
 235,
 3,
 15,
 9,
 23,
 15,
 15,
 1544,
 29,
 14608,
 2741,
 3,
 17,
 29,
 357,
 1]

In [None]:
tokenizer.decode(tokenizer.encode(stroke_text))

'terduto eaieeeatneaseear tn2</s>'

In [None]:
from transformers import pipeline

model = AutoModelForSeq2SeqLM.from_pretrained(f"./{output_chkpt}/tuned")
translator = pipeline("translation_zh_to_en", model=model, tokenizer=tokenizer)
translator(trad_text)

[{'translation_text': '外部鏈接我輸入安來 '}]

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(f"./{output_chkpt}/tuned")

In [None]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
tokens = tokenizer(stroke_text, add_special_tokens=True, return_tensors="pt")

In [None]:
tokens

{'input_ids': tensor([[    3,   449,  1259,   235,     3,    15,     9,    23,    15,    15,
          1544,    29, 14608,  2741,     3,    17,    29,   357,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
output = model(input_ids=tokens["input_ids"], labels=)

In [None]:
output

'terduto eaieeeatneaseear tn2'