In [1]:
# NOTE(review): `return_type` is not referenced anywhere in the visible notebook;
# this looks like an accidental IDE auto-import — confirm and remove.
from torchgen.api.cpp import return_type
from transformers import BertTokenizer, T5ForConditionalGeneration, Text2TextGenerationPipeline, AutoTokenizer
# Load the `uer/t5-small-chinese-cluecorpussmall` checkpoint; its tokenizer is
# loaded through BertTokenizer.
tokenizer = BertTokenizer.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
model = T5ForConditionalGeneration.from_pretrained("uer/t5-small-chinese-cluecorpussmall")
# Wrap model and tokenizer in a text-to-text pipeline and run a single
# non-sampling generation on a prompt containing the literal marker "extra0".
text2text_generator = Text2TextGenerationPipeline(model, tokenizer)
text2text_generator("中国的首都是extra0京", max_length=50, do_sample=False)

Device set to use cuda:0


[{'generated_text': 'extra0 北 extra1 extra2 extra3 extra4 extra5 extra6 extra7'}]

In [69]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
from rouge import Rouge
import random
import numpy as np
import os

In [34]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the "base" checkpoint through the Auto* classes and move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained('uer/t5-base-chinese-cluecorpussmall')
model = AutoModelForSeq2SeqLM.from_pretrained('uer/t5-base-chinese-cluecorpussmall').to(device)
# NOTE(review): Text2TextGenerationPipeline and torch are not imported in this
# cell; it relies on imports executed by other cells.
text2text_generator = Text2TextGenerationPipeline(model, tokenizer)
text2text_generator("中国的首都是extra0京", max_length=50, do_sample=False)

Device set to use cuda:0


model.safetensors:   0%|          | 0.00/858M [00:00<?, ?B/s]

[{'generated_text': 'extra0 北 extra1 extra2 extra3 extra4 extra5 extra6 extra7 extra8 extra9 extra10 extra11 extra12 extra13 extra14 extra15 extra16 extra17 extra18 extra19 extra9'}]

In [30]:
# Read the DuReaderQG data
from torch.utils.data import Dataset
import json

class DuReaderQG(Dataset):
    """Map-style dataset over a DuReaderQG file in JSON Lines format.

    Every non-blank line of the file is parsed as one JSON object and stored
    as a sample dict with the keys ``'id'``, ``'context'``, ``'question'`` and
    ``'answer'``. Samples are indexed ``0 .. len(self) - 1``.
    """

    def __init__(self, data_file):
        """Load all samples from ``data_file`` into memory."""
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` line by line and return ``{index: sample}``.

        Blank lines are ignored. Lines that are not valid JSON, or whose JSON
        value is not an object, are reported with ``print`` and skipped, so the
        returned indices stay contiguous.

        Args:
            data_file: Path of a UTF-8 encoded JSON Lines file.

        Returns:
            dict mapping a 0-based integer index to a sample dict. Missing
            ``context``/``question``/``answer`` fields default to ``''`` and a
            missing ``id`` defaults to the sample's index.
        """
        Data = {}
        with open(data_file, 'r', encoding='utf-8') as f:
            for line in f:  # Read line by line
                stripped = line.strip()
                if not stripped:
                    # A blank line is not a malformed record; skip it quietly.
                    continue
                try:
                    article = json.loads(stripped)  # Parse each line as a JSON object
                except json.JSONDecodeError as e:
                    print(f"Skipping invalid JSON line: {line}\nError: {e}")
                    continue
                if not isinstance(article, dict):
                    # Valid JSON that is not an object has no fields to read.
                    print(f"Skipping non-object JSON line: {line}")
                    continue

                idx = len(Data)  # Next contiguous index
                Data[idx] = {
                    'id': article.get('id', idx),  # Use idx as fallback if 'id' is missing
                    'context': article.get('context', ''),
                    'question': article.get('question', ''),
                    'answer': article.get('answer', ''),
                }
        return Data

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at ``idx``.

        Raises:
            IndexError: if ``idx`` is not a stored index. Raising IndexError
                rather than the dict's KeyError lets plain iteration over the
                dataset (``for sample in dataset``) terminate normally.
        """
        try:
            return self.data[idx]
        except KeyError:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {len(self.data)} samples"
            ) from None


# Instantiate dataset (assuming JSON Lines format)
train_data = DuReaderQG('data/DuReaderQG/train.json')
valid_data = DuReaderQG('data/DuReaderQG/dev.json')
# NOTE(review): the test set is an alias of the validation set (same object),
# so "test" scores are computed on the validation data.
test_data = valid_data
# Print a sample
print(train_data[0])


{'id': 0, 'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'question': '仙剑奇侠传3第几集上天界', 'answer': '第35集'}


In [31]:
# Report the split sizes. test_data is the same object as valid_data, so the
# last two sizes are always equal.
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
# Show the first validation sample.
print(next(iter(valid_data)))

train set size: 14520
valid set size: 984
test set size: 984
{'id': 0, 'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。', 'question': '2017年银行贷款基准利率', 'answer': '年基准利率4.35%'}


In [64]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Length caps passed as `max_length` when tokenizing the (question, context)
# inputs and the answer targets, and reused as the generation length limit.
max_input_length = 512
max_target_length = 64

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

# Load tokenizer and seq2seq model for the small checkpoint; the model is moved to `device`.
tokenizer = AutoTokenizer.from_pretrained('uer/t5-small-chinese-cluecorpussmall')
model = AutoModelForSeq2SeqLM.from_pretrained('uer/t5-small-chinese-cluecorpussmall').to(device)

def collate_fn(batch_samples):
    """Collate a list of DuReaderQG samples into one padded seq2seq batch.

    Relies on the module-level ``tokenizer``, ``model``, ``max_input_length``
    and ``max_target_length``.

    Args:
        batch_samples: list of sample dicts with the keys ``'question'``,
            ``'context'`` and ``'answer'``.

    Returns:
        The tokenizer output for the (question, context) pairs, extended with
        ``'decoder_input_ids'`` and ``'labels'`` and with ``'token_type_ids'``
        removed when present.
    """
    questions = [item['question'] for item in batch_samples]
    contexts = [item['context'] for item in batch_samples]
    answers = [item['answer'] for item in batch_samples]

    # Options shared by the input-side and target-side tokenizer calls.
    shared_options = dict(
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
    )

    # Each question is encoded together with its context as a text pair.
    batch_data = tokenizer(questions, contexts, max_length=max_input_length, **shared_options)
    target_ids = tokenizer(answers, max_length=max_target_length, **shared_options)["input_ids"]

    # Decoder inputs are derived from the targets before padding is masked out.
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(target_ids)

    # Padding positions in the labels are overwritten with -100
    # (presumably the value the loss ignores — confirm against the model docs).
    padding_positions = target_ids == tokenizer.pad_token_id
    target_ids[padding_positions] = -100
    batch_data["labels"] = target_ids

    # Drop the segment ids if the tokenizer produced them; a missing key is fine.
    batch_data.pop("token_type_ids", None)
    return batch_data

# Batches of 4 samples built with collate_fn; only the training loader shuffles.
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=4, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=4, shuffle=False, collate_fn=collate_fn)


Using cuda device


In [65]:
# Pull one collated training batch and inspect its keys, tensor shapes and contents.
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'])
batch shape: {'input_ids': torch.Size([4, 290]), 'attention_mask': torch.Size([4, 290]), 'decoder_input_ids': torch.Size([4, 8]), 'labels': torch.Size([4, 8])}
{'input_ids': tensor([[ 101, 3633, 1905,  ...,  828,  511,  102],
        [ 101, 1798, 3332,  ...,    0,    0,    0],
        [ 101, 9668, 7730,  ...,    0,    0,    0],
        [ 101, 7167, 3354,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'decoder_input_ids': tensor([[ 101,  101, 8183, 2259,  102,    0,    0,    0],
        [ 101,  101, 6205, 3959, 4277,  102,    0,    0],
        [ 101,  101,  671, 2399,  671, 3613,  102,    0],
        [ 101,  101, 3680, 2398, 3175, 8612, 2340, 1381]]), 'labels': tensor([[ 101, 8183, 2259,  102, -100, -100, -100, -100],
        [ 101, 6205, 3959, 4277,  102, -100, -100, -100],
     

In [66]:
# Move the inspected batch to `device` and show the tokens of its second sample.
inputs = batch.to(device)
print(tokenizer.convert_ids_to_tokens(inputs.input_ids[1]))

['[CLS]', '型', '材', '切', '割', '机', '什', '么', '牌', '子', '好', '[SEP]', '我', '觉', '得', '你', '可', '以', '买', '一', '台', '西', '湖', '牌', '的', '型', '材', '切', '割', '机', '，', '我', '们', '厂', '里', '去', '年', '买', '了', '一', '台', '吸', '尘', '型', '材', '切', '割', '机', '，', '效', '果', '还', '是', '比', '较', '满', '意', '的', '，', '吸', '尘', '效', '果', '比', '较', '好', '最', '重', '要', '的', '一', '点', '是', '安', '全', '，', '没', '有', '火', '花', '乱', '飞', '，', '当', '然', '价', '格', '不', '便', '宜', '，', '一', '等', '价', '钱', '一', '等', '货', '吧', '，', '这', '种', '机', '电', '产', '品', '还', '是', '买', '质', '量', '好', '的', '比', '较', '放', '心', '。', '看', '到', '了', '吗', '我', '们', '也', '是', '切', '割', '槽', '钢', '，', '角', '铁', '之', '类', '的', '。', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

In [83]:
# Forward pass on the inspected batch to compare input and logits shapes.
# NOTE(review): this inspection-only call runs outside torch.no_grad(), so
# autograd bookkeeping is kept alive through `output` — consider wrapping it.
output = model(**inputs)
print(inputs.input_ids.shape)
print(output.logits.shape)

torch.Size([4, 290])
torch.Size([4, 8, 21228])


In [62]:
# Generate for the inspected batch with num_beams=4, no_repeat_ngram_size=2
# and max_length=32.
generated_tokens = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=32,
    no_repeat_ngram_size=2,
    num_beams=4
)
# Decode only the first generated sequence, dropping special tokens.
string_output = tokenizer.decode(
    generated_tokens[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
print(string_output)
# Does not work well: the model cannot generate any good results here.

extra0 extra1 extra2 extra3 extra4 extra5 extra6 extra7 extra8 extra9 extra10 extra11 extra12 。 ？


In [75]:
from sacrebleu.metrics import BLEU
# BLEU scorer constructed with sacrebleu's default arguments; used by test_loop.
bleu = BLEU()
import numpy as np
def test_loop(dataloader, model, mode='Test'):
    """Generate an answer for every sample and return the corpus-level BLEU score.

    Relies on the module-level ``tokenizer``, ``device``, ``max_target_length``
    and ``bleu``.

    Args:
        dataloader: iterable of collated batches providing ``input_ids``,
            ``attention_mask`` and ``labels``.
        model: seq2seq model exposing ``eval()`` and ``generate()``.
        mode: ``'Valid'`` or ``'Test'``; used as the progress-bar label.

    Returns:
        The ``score`` of ``bleu.corpus_score`` over all decoded predictions,
        or ``0.0`` when the dataloader yields no samples.

    Raises:
        AssertionError: if ``mode`` is neither ``'Valid'`` nor ``'Test'``.
    """
    assert mode in ['Valid', 'Test']
    preds, references = [], []

    model.eval()
    for batch_data in tqdm(dataloader, desc=mode):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
            ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        # -100 is not a valid token id; map it back to the pad id before decoding.
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        references += [label.strip() for label in decoded_labels]

    if not preds:
        # Nothing to score; corpus_score cannot handle an empty hypothesis list.
        return 0.0

    # sacrebleu takes references as a list of reference *streams*, each one
    # aligned with (and as long as) the hypothesis list. There is a single
    # reference per sample here, hence one stream. Passing one single-element
    # list per sample instead would align every reference with the first
    # hypothesis only.
    return bleu.corpus_score(preds, [references]).score


In [60]:
# Evaluate the current model on the test loader (reports corpus BLEU).
test_loop(test_dataloader, model)
# Does not work well: the reported score is 0.0.

  0%|          | 0/246 [00:00<?, ?it/s]

0.0

In [70]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    """Train ``model`` for one pass over ``dataloader``.

    Relies on the module-level ``device`` and ``tqdm``.

    Args:
        dataloader: iterable of collated batches; must support ``len()``.
        model: model whose forward output exposes ``.loss``.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, after the optimizer.
        epoch: 1-based epoch number, used to count batches of earlier epochs.
        total_loss: sum of batch losses accumulated before this call.

    Returns:
        ``total_loss`` increased by the loss of every batch seen in this call.
    """
    batches_per_epoch = len(dataloader)
    bar = tqdm(range(batches_per_epoch))
    bar.set_description(f'loss: {0:>7f}')
    # Batches processed in earlier epochs, so the bar shows the mean loss
    # over the whole run rather than over this epoch alone.
    batches_before = (epoch - 1) * batches_per_epoch

    model.train()
    step = 0
    for batch_data in dataloader:
        step += 1
        batch_data = batch_data.to(device)
        loss = model(**batch_data).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        mean_loss = total_loss / (batches_before + step)
        bar.set_description(f'loss: {mean_loss:>7f}')
        bar.update(1)
    return total_loss

In [73]:
# Training hyperparameters.
learning_rate = 2e-5
epoch_num = 1
# Generation settings.
# NOTE(review): these two names are not referenced by test_loop or the
# generate() calls in the visible notebook, which hard-code their own values.
beam_size = 4
no_repeat_ngram_size = 2

# Seed the Python, NumPy and PyTorch (CPU and CUDA) random number generators
# and record the seed in PYTHONHASHSEED.
seed = 5
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)



In [86]:
import random
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from sacrebleu.metrics import BLEU
from tqdm.auto import tqdm
import json

def seed_everything(seed=1029):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, current CUDA device and
    all CUDA devices), stores the seed in the ``PYTHONHASHSEED`` environment
    variable and sets ``torch.backends.cudnn.deterministic`` to ``True``.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The individual seeding calls are independent of one another.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')
seed_everything(42)
# NOTE(review): `max_length` is not referenced elsewhere in the visible notebook.
max_length = 128

# NOTE(review): `batch_size` is not referenced elsewhere in the visible
# notebook; the DataLoaders in this cell hard-code batch_size=4.
batch_size = 32
learning_rate = 1e-5
epoch_num = 3

class DuReaderQG(Dataset):
    """Map-style dataset over a DuReaderQG file in JSON Lines format.

    Every non-blank line of the file is parsed as one JSON object and stored
    as a sample dict with the keys ``'id'``, ``'context'``, ``'question'`` and
    ``'answer'``. Samples are indexed ``0 .. len(self) - 1``.
    """

    def __init__(self, data_file):
        """Load all samples from ``data_file`` into memory."""
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` line by line and return ``{index: sample}``.

        Blank lines are ignored. Lines that are not valid JSON, or whose JSON
        value is not an object, are reported with ``print`` and skipped, so the
        returned indices stay contiguous.

        Args:
            data_file: Path of a UTF-8 encoded JSON Lines file.

        Returns:
            dict mapping a 0-based integer index to a sample dict. Missing
            ``context``/``question``/``answer`` fields default to ``''`` and a
            missing ``id`` defaults to the sample's index.
        """
        Data = {}
        with open(data_file, 'r', encoding='utf-8') as f:
            for line in f:  # Read line by line
                stripped = line.strip()
                if not stripped:
                    # A blank line is not a malformed record; skip it quietly.
                    continue
                try:
                    article = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"Skipping invalid JSON line: {line}\nError: {e}")
                    continue
                if not isinstance(article, dict):
                    # Valid JSON that is not an object has no fields to read.
                    print(f"Skipping non-object JSON line: {line}")
                    continue

                idx = len(Data)  # Next contiguous index
                Data[idx] = {
                    'id': article.get('id', idx),  # Fall back to idx when 'id' is missing
                    'context': article.get('context', ''),
                    'question': article.get('question', ''),
                    'answer': article.get('answer', ''),
                }
        return Data

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at ``idx``.

        Raises:
            IndexError: if ``idx`` is not a stored index. Raising IndexError
                rather than the dict's KeyError lets plain iteration over the
                dataset (``for sample in dataset``) terminate normally.
        """
        try:
            return self.data[idx]
        except KeyError:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {len(self.data)} samples"
            ) from None


# Instantiate dataset (assuming JSON Lines format)
train_data = DuReaderQG('data/DuReaderQG/train.json')
valid_data = DuReaderQG('data/DuReaderQG/dev.json')
# NOTE(review): the test set is an alias of the validation set (same object).
test_data = valid_data

# Define the tokenizer and the model.

# Length caps passed as `max_length` when tokenizing inputs and targets.
max_input_length = 512
max_target_length = 64

# Load the "base" checkpoint and move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained('uer/t5-base-chinese-cluecorpussmall')
model = AutoModelForSeq2SeqLM.from_pretrained('uer/t5-base-chinese-cluecorpussmall').to(device)

def collate_fn(batch_samples):
    """Collate a list of DuReaderQG samples into one padded seq2seq batch.

    Relies on the module-level ``tokenizer``, ``model``, ``max_input_length``
    and ``max_target_length``.

    Args:
        batch_samples: list of sample dicts with the keys ``'question'``,
            ``'context'`` and ``'answer'``.

    Returns:
        The tokenizer output for the (question, context) pairs, extended with
        ``'decoder_input_ids'`` and ``'labels'`` and with ``'token_type_ids'``
        removed when present.
    """
    questions = [item['question'] for item in batch_samples]
    contexts = [item['context'] for item in batch_samples]
    answers = [item['answer'] for item in batch_samples]

    # Options shared by the input-side and target-side tokenizer calls.
    shared_options = dict(
        padding=True,
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
    )

    # Each question is encoded together with its context as a text pair.
    batch_data = tokenizer(questions, contexts, max_length=max_input_length, **shared_options)
    target_ids = tokenizer(answers, max_length=max_target_length, **shared_options)["input_ids"]

    # Decoder inputs are derived from the targets before padding is masked out.
    batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(target_ids)

    # Padding positions in the labels are overwritten with -100
    # (presumably the value the loss ignores — confirm against the model docs).
    padding_positions = target_ids == tokenizer.pad_token_id
    target_ids[padding_positions] = -100
    batch_data["labels"] = target_ids

    # Drop the segment ids if the tokenizer produced them; a missing key is fine.
    batch_data.pop("token_type_ids", None)
    return batch_data

# Batches of 4 samples built with collate_fn; only the training loader shuffles.
# NOTE(review): batch_size is hard-coded to 4 here although this cell also
# defines `batch_size = 32` — confirm which value is intended.
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=4, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=4, shuffle=False, collate_fn=collate_fn)

# define train loop and test loop:

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    """Train ``model`` for one pass over ``dataloader``.

    Relies on the module-level ``device`` and ``tqdm``.

    Args:
        dataloader: iterable of collated batches; must support ``len()``.
        model: model whose forward output exposes ``.loss``.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, after the optimizer.
        epoch: 1-based epoch number, used to count batches of earlier epochs.
        total_loss: sum of batch losses accumulated before this call.

    Returns:
        ``total_loss`` increased by the loss of every batch seen in this call.
    """
    batches_per_epoch = len(dataloader)
    bar = tqdm(range(batches_per_epoch))
    bar.set_description(f'loss: {0:>7f}')
    # Batches processed in earlier epochs, so the bar shows the mean loss
    # over the whole run rather than over this epoch alone.
    batches_before = (epoch - 1) * batches_per_epoch

    model.train()
    step = 0
    for batch_data in dataloader:
        step += 1
        batch_data = batch_data.to(device)
        loss = model(**batch_data).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        mean_loss = total_loss / (batches_before + step)
        bar.set_description(f'loss: {mean_loss:>7f}')
        bar.update(1)
    return total_loss

# BLEU scorer constructed with sacrebleu's default arguments; used by test_loop.
bleu = BLEU()

def test_loop(dataloader, model, mode='Test'):
    """Generate an answer for every sample and return the corpus-level BLEU score.

    Relies on the module-level ``tokenizer``, ``device``, ``max_target_length``
    and ``bleu``.

    Args:
        dataloader: iterable of collated batches providing ``input_ids``,
            ``attention_mask`` and ``labels``.
        model: seq2seq model exposing ``eval()`` and ``generate()``.
        mode: ``'Valid'`` or ``'Test'``; used as the progress-bar label.

    Returns:
        The ``score`` of ``bleu.corpus_score`` over all decoded predictions,
        or ``0.0`` when the dataloader yields no samples.

    Raises:
        AssertionError: if ``mode`` is neither ``'Valid'`` nor ``'Test'``.
    """
    assert mode in ['Valid', 'Test']
    preds, references = [], []

    model.eval()
    for batch_data in tqdm(dataloader, desc=mode):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
            ).cpu().numpy()
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        # -100 is not a valid token id; map it back to the pad id before decoding.
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        references += [label.strip() for label in decoded_labels]

    if not preds:
        # Nothing to score; corpus_score cannot handle an empty hypothesis list.
        return 0.0

    # sacrebleu takes references as a list of reference *streams*, each one
    # aligned with (and as long as) the hypothesis list. There is a single
    # reference per sample here, hence one stream. Passing one single-element
    # list per sample instead would align every reference with the first
    # hypothesis only.
    return bleu.corpus_score(preds, [references]).score


# define optimizer:

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. `eps` and `weight_decay` are pinned to the
# defaults of the transformers implementation (1e-6 and 0.0), because the
# torch defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# Linear decay of the learning rate over all training steps, without warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)


# Running sum of batch losses across epochs, and the best validation BLEU so far.
total_loss = 0.
best_bleu = 0.

# Main loop: train for one epoch, evaluate on the validation loader, and save
# the weights whenever the validation BLEU improves.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_bleu = test_loop(valid_dataloader, model)
    print(f"BLEU: {valid_bleu:>0.2f}\n")
    # Strict comparison against an initial 0: a BLEU of exactly 0.0 never
    # triggers a save.
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        print('saving new weights...\n')
        # Only the state dict is written, into the current working directory.
        torch.save(
            model.state_dict(),
            f'epoch_{t+1}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin'
        )
print("Done!")

Using cuda device
Epoch 1/3
-------------------------------


  0%|          | 0/3630 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [85]:
# Generate again for the previously inspected batch (`inputs`) with
# num_beams=4, no_repeat_ngram_size=2 and max_length=32.
generated_tokens = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=32,
    no_repeat_ngram_size=2,
    num_beams=4
)
# Decode only the first generated sequence, dropping special tokens.
string_output = tokenizer.decode(
    generated_tokens[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
print(string_output)

文 件 年 五 十 周 岁 。 的 不 一 周 ， 上 二 三 周 一
