In [3]:
"""This is the notebook for the pretrained model from huggingface."""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [1]:
"""Pre-trained model that can translate from classical Chinese to modern Chinese."""
from transformers import (
EncoderDecoderModel,
AutoTokenizer
)
PRETRAINED = "raynardj/wenyanwen-ancient-translate-to-modern"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
model = EncoderDecoderModel.from_pretrained(PRETRAINED)
def inference(text):
  tk_kwargs = dict(
    truncation=True,
    max_length=128,
    padding="max_length",
    return_tensors='pt')
 
  inputs = tokenizer([text,],**tk_kwargs)
  with torch.no_grad():
      return tokenizer.batch_decode(
          model.generate(
          inputs.input_ids,
          attention_mask=inputs.attention_mask,
          num_beams=3,
          max_length=256,
          bos_token_id=101,
          eos_token_id=tokenizer.sep_token_id,
          pad_token_id=tokenizer.pad_token_id,
      ), skip_special_tokens=True)

In [5]:
"""Use it!"""
inputs = input("请输入想要翻译的句子")
print("输入：", end="")
print(inputs)
print("输出：")
lst = inference(inputs)
# print(lst)
for i in lst[0]:
    if i != ' ':
        print(i, end='')

输入：臣本布衣，躬耕于南阳，苟全性命于乱世，不求闻达于诸侯。先帝不以臣卑鄙，猥自枉屈，三顾臣于草庐之中，咨臣以当世之事，由是感激，遂许先帝以驱驰。后值倾覆，受任于败军之际，奉命于危难之间，尔来二十有一年矣。
输出：
我本是平民，在南阳亲自耕种，苟且保全性命于乱世，不求闻名于诸侯，先帝不因为我卑鄙，枉自屈服，三次在草庐中询问我，向我咨询当世的事情，因此感激，于是答应先帝驰骋，后来遇到国家覆灭，受任于败军之际，奉命于危难之际，从此以来已经二十一年了。

In [5]:
"""Pre-trained model that can punctuate classical Chinese sentences."""
from transformers import AutoTokenizer, BertForTokenClassification
from transformers import pipeline

TAG = "raynardj/classical-chinese-punctuation-guwen-biaodian"

model = BertForTokenClassification.from_pretrained(TAG)
tokenizer = AutoTokenizer.from_pretrained(TAG)
ner = pipeline("ner", model, tokenizer=tokenizer)

def mark_sentence(x: str):
    outputs = ner(x)
    x_list = list(x)
    for i, output in enumerate(outputs):
        x_list.insert(output['end']+i, output['entity'])
    return "".join(x_list)

In [7]:
"""Use it!"""
inputs = input("请输入想要断句的句子")
print("输入：", end='')
print(inputs)
print("输出：")
print(mark_sentence(inputs))

输入：永和九年岁在癸丑暮春之初会于会稽山阴之兰亭
输出：
永和九年，岁在癸丑暮春之初，会于会稽山阴之兰亭。


In [6]:
"""Now let's fine tune the models..."""
# dataset
from datasets import load_dataset
my_data = load_dataset('text', data_files={'source':'dataset/lunyu_classical.txt', 'target':'dataset/lunyu_modern.txt'})

Using custom data configuration default-3dff56455c56bca7
Reusing dataset text (/Users/kuangyuxuan/.cache/huggingface/datasets/text/default-3dff56455c56bca7/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)
100%|██████████| 2/2 [00:00<00:00, 74.02it/s]


In [7]:
my_data

DatasetDict({
    source: Dataset({
        features: ['text'],
        num_rows: 616
    })
    target: Dataset({
        features: ['text'],
        num_rows: 616
    })
})

In [15]:

print(my_data['source'][0], my_data['target'][0])

{'text': '子曰：“为政以德，譬如北辰，居其所而众星共之。”'} {'text': '孔子说：“用道德来统治国家的人，就会像北极星一样处在一定的位置，所有的星辰都会环绕在它的周围。”'}


In [16]:
print(my_data['source'][0], my_data['target'][0])

{'text': '子曰：“为政以德，譬如北辰，居其所而众星共之。”'} {'text': '孔子说：“用道德来统治国家的人，就会像北极星一样处在一定的位置，所有的星辰都会环绕在它的周围。”'}


In [20]:
my_data = my_data.shuffle(seed=114514)
print(my_data['source'][0], my_data['target'][0])

{'text': '子曰：“野哉由也！君子于其所不知，盖阙如也。名不正，则言不顺；言不顺，则事不成；事不成，则礼乐不兴；礼乐不兴，则刑罚不中；刑罚不中，则民无所措手足。故君子名之必可言也，言之必可行也。君子于其言，无所苟而已矣。”'} {'text': '孔子说：“仲由，真是鲁莽啊。君子对于自己所不知道的，总是采取存疑的态度。名分不正，说起话来就不顺当合理，说话不顺当合理，事情就办不成。事情办不成，礼乐也就不能兴盛。礼乐不能兴盛，刑罚的执行就不会得当。刑罚不得当，百姓就不知怎么办好。所以，君子一定要定下一个名分，必须能够说得明白，说出来一定能够行得通。君子对于自己的言行，是从来不马虎对待的。”'}


In [16]:
my_data

DatasetDict({
    source: Dataset({
        features: ['text'],
        num_rows: 616
    })
    target: Dataset({
        features: ['text'],
        num_rows: 616
    })
})

In [24]:
my_data.save_to_disk('dataset/lunyu_classical_modern')

Loading cached processed dataset at /Users/kuangyuxuan/.cache/huggingface/datasets/text/default-3dff56455c56bca7/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-9cad5e824cb70ae8.arrow
Loading cached processed dataset at /Users/kuangyuxuan/.cache/huggingface/datasets/text/default-3dff56455c56bca7/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-de92afc05f887770.arrow


In [3]:
from datasets import load_from_disk
reloaded_encoded_dataset = load_from_disk("dataset/lunyu_classical_modern")

In [None]:
reloaded_encoded_dataset['source']['text']

In [6]:
source_lang = 'classical'
target_lang = 'modern'

def preprocess(examples):
    inputs = examples['source']['text']
    targets = examples['target']['text']
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_data = reloaded_encoded_dataset.map(preprocess, batched=True)

In [6]:
from dataload import *
data_path = 'dataset/lunyu.json'
c2m_data = MyDataset_unembed(data_path)

  self.feature = np.array(self.feature)
  self.label = np.array(self.label)


In [5]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=c2m_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

ValueError: Mixed precision training with AMP or APEX (`--fp16` or `--bf16`) and half precision evaluation (`--fp16_full_eval` or `--bf16_full_eval`) can only be used on CUDA devices.

In [1]:
from datasets import load_dataset

dataset = load_dataset("Gare/Classical_Chinese_to_Modern_Chinese")

Using custom data configuration Gare--Classical_Chinese_to_Modern_Chinese-14a62e76f735aeb5
Reusing dataset text (/Users/kuangyuxuan/.cache/huggingface/datasets/text/Gare--Classical_Chinese_to_Modern_Chinese-14a62e76f735aeb5/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)
100%|██████████| 1/1 [00:00<00:00, 105.03it/s]


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 6
    })
})

In [5]:
dataset['train'][0], dataset['train'][1], dataset['train'][2], dataset['train'][3]

({'text': '古文：中行迫欲得植秉铨，而骋其私。'},
 {'text': '现代文：吴中行想让李植掌管官吏的选拔，是想任意任用他的人。'},
 {'text': '古文：乃已。'},
 {'text': '现代文：于是停止。'})

In [1]:
from dataload import *
data_path = 'dataset/lunyu.json'
c2m_data = MyDataset_unembed(data_path)

  self.feature = np.array(self.feature)
  self.label = np.array(self.label)


In [2]:
c2m_data

<dataload.MyDataset_unembed at 0x7fede03edd60>

In [3]:
type(c2m_data)

dataload.MyDataset_unembed

In [8]:
c2m_data.__len__()

616

In [11]:
c2m_data[0]

(array([['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'],
        ['input_ids', 'token_type_ids', 'attention_mask'