In [1]:
import mindspore
import argparse
import numpy as np
import logging
import mindspore.dataset as ds
import os
import re

import json

from tqdm import tqdm
from datetime import datetime
from mindspore.nn import CrossEntropyLoss
from mindspore import nn, ops
from mindspore.train.serialization import save_checkpoint
from mindspore.dataset import TextFileDataset, GeneratorDataset

from mindnlp.transforms import BertTokenizer
from mindnlp.modules import Accumulator
from mindnlp.models import GPT2Config, GPT2LMHeadModel

  from tqdm.autonotebook import tqdm


In [2]:
# Training hyperparameters shared by the cells below.
epochs = 20  # number of passes of the epoch loop in the training cell
batch_size = 8  # NOTE(review): not passed to process_dataset, which falls back to its own default of 8

lr = 1e-4  # learning rate handed to nn.AdamWeightDecay
warmup_steps = 2000  # NOTE(review): not referenced by any visible cell, so no warmup schedule is applied
accumulate_step = 2  # passed to Accumulator; forward_fn also divides the loss by it
max_grad_norm = 1.0  # passed to Accumulator as its third argument

log_step = 100  # NOTE(review): not referenced by any visible cell

In [3]:
# Load the raw corpus, one sample per line (trailing newlines are kept).
# The encoding is pinned because the file holds Chinese text and the platform
# default encoding is not UTF-8 everywhere.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('./mid_10000.txt', encoding='utf-8') as f:
    text_data = f.readlines()

In [4]:
# dataset = TextFileDataset(str(path), shuffle=False)
# Wrap the in-memory list of lines as a dataset with a single 'text' column.
dataset = GeneratorDataset(text_data, column_names=['text'])
# Shown as the cell result: (number of rows, column names).
dataset.get_dataset_size(), dataset.get_col_names()

(9656, ['text'])

In [5]:
# Print the 'text' column of the first row only, as a quick look at the raw format.
for data in dataset:
    print(data[0])
    break

一直中药治疗能要吗<Paragraph>小时候因发热发作癫痫，一直到现在。发作时呈失神状态，以前要严重些，现在吃中药症状要轻些。曾经在武汉癫痫病治疗，效果不明显<Paragraph><QA>您好， 有癫痫的患者， 一般是不要怀孕的， 会影响孩子的生长和疾病的复发 ， 如果要孩子， 也是没有问题的。 建议去医院咨询治疗癫痫， 康复 后， 在怀孕 。 ，癫痫病患者在积极治疗之外，患者在生活中还需要注意保持合理饮食的好习惯，补充身体营养，希望上述的答案可以帮助到你，谢谢 



In [6]:
# Split the rows 80% / 10% / 10% into train / eval / test subsets.
# NOTE(review): no seed is set in the visible cells, so the split presumably
# differs between runs — confirm, and seed it if reproducibility matters.
train_dataset, eval_dataset, test_dataset = dataset.split([0.8, 0.1, 0.1])



In [7]:
# article: [CLS] xxxxx [SEP]
# summary: [CLS] xxxxx [SEP]

In [8]:
import numpy as np

def process_dataset(dataset, tokenizer, batch_size=8, max_seq_len=512, shuffle=False):
    """Convert raw ``article<QA>summary`` lines into batched, fixed-length ``input_ids``.

    Pipeline: split each 'text' value on '<QA>', tokenize both halves with
    ``tokenizer``, merge them into one sequence of exactly ``max_seq_len``
    ids, then batch.

    Args:
        dataset: dataset with a 'text' column; must provide ``map``, ``batch``
            and ``shuffle``.
        tokenizer: callable used as a map operation on each text column; must
            expose ``sep_token_id`` and ``pad_token_id``.
            Assumes each tokenized sequence starts with one leading special
            token and ends with the separator token — TODO confirm.
        batch_size: number of rows per batch.
        max_seq_len: exact length of every merged sequence.
        shuffle: when true, ``dataset.shuffle(batch_size)`` is applied after
            batching.

    Returns:
        The transformed dataset with a single 'input_ids' column (int32).
    """
    def read_map(text):
        # A line without the '<QA>' marker used to raise IndexError; it is now
        # treated as an article with an empty summary.
        parts = text.item().split('<QA>')
        summary = parts[1] if len(parts) > 1 else ''
        return np.array(parts[0]), np.array(summary)

    def merge_and_pad(article, summary):
        sep_id = np.array([tokenizer.sep_token_id])

        summary_len = len(summary)
        if summary_len > max_seq_len:
            # An over-long summary used to make the article slice bound
            # negative and yield a row longer than max_seq_len. Truncate it
            # while keeping its final token so the sequence stays terminated.
            summary = np.concatenate([summary[:max_seq_len - 1], summary[-1:]])
            summary_len = max_seq_len
        article_len = len(article)

        # The summary's leading token is dropped when the halves are joined.
        merged_len = article_len + summary_len - 1
        if merged_len >= max_seq_len:
            # Cut the article so that article + separator + summary fits exactly.
            new_article_len = max_seq_len - summary_len
            merged = np.concatenate([article[:new_article_len], sep_id, summary[1:]])
        else:
            pad_text = np.array([tokenizer.pad_token_id] * (max_seq_len - merged_len))
            merged = np.concatenate([article, summary[1:], pad_text])

        return merged.astype(np.int32)

    dataset = dataset.map(read_map, 'text', ['article', 'summary'], ['article', 'summary'])
    dataset = dataset.map(tokenizer, 'article')
    dataset = dataset.map(tokenizer, 'summary')
    dataset = dataset.map(merge_and_pad, ['article', 'summary'], ['input_ids'], ['input_ids'])

    dataset = dataset.batch(batch_size)
    if shuffle:
        # NOTE(review): this runs after batching, so it presumably reorders
        # whole batches rather than individual rows — confirm this is intended.
        dataset = dataset.shuffle(batch_size)

    return dataset

In [9]:
# Load the 'bert-base-chinese' tokenizer; it is used for both training data and generation prompts.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [10]:
# All three splits use process_dataset's defaults (batch_size=8, max_seq_len=512, shuffle=False).
train_dataset = process_dataset(train_dataset, tokenizer)
eval_dataset = process_dataset(eval_dataset, tokenizer)
test_dataset = process_dataset(test_dataset, tokenizer)

In [11]:
# Smoke test: run the whole preprocessing pipeline once over the training set
# and check that every batch was merged to process_dataset's default
# max_seq_len of 512 ids. The batch dimension is deliberately not checked,
# since the final batch may hold fewer than 8 rows.
for data in tqdm(train_dataset):
    assert data[0].shape[-1] == 512, data[0].shape

966it [00:06, 156.53it/s]


In [12]:
# Display the first batch from a fresh tuple iterator (a one-element list holding input_ids).
next(train_dataset.create_tuple_iterator())

[Tensor(shape=[8, 512], dtype=Int32, value=
 [[ 101, 2593, 7444 ...    0,    0,    0],
  [ 101, 7332, 2227 ...    0,    0,    0],
  [ 101, 5554, 5375 ...    0,    0,    0],
  ...
  [ 101, 1920,  912 ...    0,    0,    0],
  [ 101, 1059, 6716 ...    0,    0,    0],
  [ 101, 1079, 5552 ...    0,    0,    0]])]

In [16]:
# Tokenizer length; the same value is passed to GPT2Config as vocab_size.
len(tokenizer)

21128

In [14]:
from mindnlp._legacy.amp import auto_mixed_precision

# GPT-2 configuration whose vocabulary size matches the BERT tokenizer.
# NOTE(review): no pretrained GPT-2 weights are loaded in the visible cells —
# confirm that training from a fresh config is intended.
config = GPT2Config(vocab_size=len(tokenizer))
# pad_token_id is passed as ignore_index — presumably so padded positions are
# excluded from the loss; confirm against GPT2LMHeadModel.
model = GPT2LMHeadModel(config, ignore_index=tokenizer.pad_token_id)
# model = auto_mixed_precision(model, 'O1')

optimizer = nn.AdamWeightDecay(model.trainable_params(), lr)
# Wrap the optimizer together with accumulate_step and max_grad_norm.
accumulator = Accumulator(optimizer, accumulate_step, max_grad_norm)

In [15]:
from mindspore import ops, ms_function
from mindspore.amp import init_status, all_finite, DynamicLossScaler
# Define forward function

# Dynamic loss scaler, starting at a scale of 2**10; scale_factor=2 and
# scale_window=1000 are passed through unchanged.
loss_scaler = DynamicLossScaler(scale_value=2**10, scale_factor=2, scale_window=1000)

def forward_fn(input_ids, labels):
    """Return the scaled training loss for one batch.

    The first model output is taken as the loss, divided by
    ``accumulate_step`` and then passed through ``loss_scaler.scale``.
    """
    outputs = model(input_ids, labels=labels)
    loss = outputs[0]
    return loss_scaler.scale(loss / accumulate_step)

# Get gradient function
# Built from forward_fn and the model's trainable parameters; train_step
# unpacks its result as (loss, grads).
grad_fn = ops.value_and_grad(forward_fn, None, model.trainable_params())

# Define function of one-step training
@ms_function
def train_step(data, label):
    """Run one forward/backward pass and hand finite gradients to the accumulator.

    Returns:
        A ``(loss, is_finite)`` pair. ``loss`` is the unscaled value returned
        by ``forward_fn``, so it is still divided by ``accumulate_step``.
        ``is_finite`` is the result of ``all_finite`` on the gradients.
    """
    status = init_status()
    # Presumably ties `data` to `status` so the status is initialised before
    # the forward pass runs — confirm against the MindSpore amp documentation.
    data = ops.depend(data, status)
    loss, grads = grad_fn(data, label)
    loss = loss_scaler.unscale(loss)

    is_finite = all_finite(grads, status)
    if is_finite:
        # Gradients come from the scaled loss, so undo the scaling before
        # passing them to the accumulator; non-finite gradients are skipped.
        grads = loss_scaler.unscale(grads)
        loss = ops.depend(loss, accumulator(grads))
    # The scale adjustment is attached to the returned loss via ops.depend,
    # presumably so that it is executed inside the compiled graph.
    loss = ops.depend(loss, loss_scaler.adjust(is_finite))
    return loss, is_finite

In [None]:
total = train_dataset.get_dataset_size()

for epoch in range(epochs):
    with tqdm(total=total) as progress:
        progress.set_description(f'Epoch {epoch}')
        loss_total = 0
        batches = train_dataset.create_tuple_iterator()
        # cur_step_nums counts the batches seen so far in this epoch, from 1.
        for cur_step_nums, (input_ids,) in enumerate(batches, start=1):
            # The same ids are passed as both inputs and labels.
            loss, is_finite = train_step(input_ids, input_ids)
            loss_total += loss

            # Running mean of the loss returned by train_step, which is
            # still divided by accumulate_step inside forward_fn.
            postfix = dict(
                loss=loss_total/cur_step_nums,
                finite=is_finite,
                scale_value=loss_scaler.scale_value.asnumpy(),
            )
            progress.set_postfix(**postfix)
            progress.update(1)
        # One checkpoint per finished epoch.
        save_checkpoint(model, f'gpt_epoch_finetune_{epoch}.ckpt')

Epoch 0: 100%|██████████| 966/966 [07:55<00:00,  2.03it/s, finite=True, loss=0.9193063, scale_value=1024.0] 
Epoch 1: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.67552054, scale_value=2048.0]
Epoch 2: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.52593833, scale_value=4096.0]
Epoch 3: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.38937113, scale_value=8192.0]
Epoch 4: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.27627778, scale_value=16384.0]
Epoch 5: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.18632828, scale_value=32768.0]
Epoch 6: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.12626567, scale_value=65536.0] 
Epoch 8: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.0709814, scale_value=262144.0]  
Epoch 9: 100%|██████████| 966/966 [07:23<00:00,  2.18it/s, finite=True, loss=0.06103498, scale_value=524288.0] 
Epoch 10:

In [25]:
# Restore the weights saved by the training loop after epoch index 6.
params = mindspore.load_checkpoint('./gpt_epoch_finetune_6.ckpt')
# The call's return value is shown as the cell result.
mindspore.load_param_into_net(model, params)

[]

In [18]:
def clean_text(text):
    """Replace every character outside the allowed set with a single space.

    Kept characters: ASCII letters and digits, CJK ideographs in the range
    U+4E00..U+9FA5, the ASCII punctuation ``, . ? !`` and the full-width
    punctuation ``， 。 ？ 、``. Everything else (including double quotes,
    tabs, spaces and newlines) becomes a space; the string length is unchanged.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # The former trailing .replace('"', '') and .replace('\t', '') calls were
    # dropped: the substitution has already turned both characters into
    # spaces, so they could never match.
    pattern = r"[^a-zA-Z0-9\u4e00-\u9fa5,.\?!，。？、]"
    return re.sub(pattern, " ", text)

In [19]:
def robot_query(desc, query, max_summary_len=300):
    """Greedily generate an answer for a question and stream it to stdout.

    The prompt is built as ``query<Paragraph>desc<Paragraph>``, the layout
    that precedes '<QA>' in the training lines. Generation stops when the
    model emits the tokenizer's separator token or after ``max_summary_len``
    tokens. If the answer does not already end with '。' when the separator
    is reached, one is printed.

    Args:
        desc: free-text description of the situation.
        query: the short question.
        max_summary_len: maximum number of tokens to generate.

    Returns:
        None. The prompt and the answer are printed.
    """
    desc, query = clean_text(desc), clean_text(query)
    article = f'{query}<Paragraph>{desc}<Paragraph>'
    show = article.replace('<Paragraph>', '<sep>')
    input_ids = tokenizer.encode(article).ids
    print(f'query: \n{show}\n\nanswer:')

    # NOTE(review): the full sequence is re-fed to the model every step and
    # nothing here guards against exceeding the model's maximum position
    # count — confirm for long prompts.
    summary = ''
    for _ in range(max_summary_len):
        inputs = mindspore.Tensor(input_ids, mindspore.int32)
        output = model(inputs)[0]
        # Greedy decoding: most likely token at the last position, as a
        # plain int so the id list and the tokenizer lookup stay well-typed.
        pred_id = int(output.argmax(-1)[-1].asnumpy())

        # Stop on the separator id itself. The old check looked at the last
        # character of the token text ('[SEP]'), so the character before it
        # was always 'P' and an extra '。' was printed after every answer.
        if pred_id == tokenizer.sep_token_id:
            if not summary.endswith('。'):
                print('。', end='')
            break

        input_ids.append(pred_id)
        token = tokenizer.id_to_token(pred_id)
        # Print the whole token, not only its last character; a leading '##'
        # is presumably the word-piece continuation marker and is stripped.
        if token.startswith('##'):
            token = token[2:]
        summary += token
        print(token, end='')
    
# print(summury)

In [26]:
# Alternative example input, kept commented out:
# desc = '年龄十七岁，男孩，最近两三个月出现晚上梦游，尿床等症状，不知道是什么原因。'
# query = '晚上梦游尿床是有什么病症'

# Example consultation: `query` is the short question, `desc` the longer description.
query = '胆总管结石该如何治疗？'
desc = '医生我朋友他得了胆结石，他跟我说这个病的具体名称是胆总管结石。我也分不清结石的种类，我不知晓这个病要怎么样治疗，所以我想问一下胆总管结石该如何治疗呢？'

# Prints the prompt followed by the generated answer.
robot_query(desc, query)

query: 
胆总管结石该如何治疗？<sep>医生我朋友他得了胆结石，他跟我说这个病的具体名称是胆总管结石。我也分不清结石的种类，我不知晓这个病要怎么样治疗，所以我想问一下胆总管结石该如何治疗呢？<sep>

answer:
您好，胆总管结石应该是个比较小的疾病，比如说结石比较小的话，是可以通过手术的方法治疗，可以决定手术的方法，如果结石比较小的话，可以采用微创手术的方式，互相配合中药的方式，决定清淡饮食，这样对结石有很好的效果。平时要注意多喝水，多吃点新鲜的水果和蔬菜，不要吃辛辣的食物，防止吃许多油腻的食物，油炸的食品等。。