# 1. 准备依赖环境

In [18]:
# !pip install transformers

In [19]:
import pandas as pd
import numpy as np
import os
import time
import random
import torch
import torch.nn as nn
from tqdm.notebook import tqdm_notebook
from transformers import AdamW, BertTokenizer, BertForMaskedLM, BertModel
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, List
from torch.nn.utils.rnn import pad_sequence


In [20]:
def build_corpus(split, make_vocab=True, data_dir='../data/ThePeoplesDaily/'):
    """Read one split of a token-per-line tagged corpus.

    The file ``<data_dir>/<split>.txt`` is expected to hold one
    whitespace-separated ``word tag`` pair per line, with an empty line
    closing each sentence.

    Args:
        split: One of "train", "dev" or "test" (case-insensitive check; the
            value is used verbatim to build the file name).
        make_vocab: When true, also build id maps for words and tags.
        data_dir: Directory containing the split files.

    Returns:
        ``(word_lists, tag_lists)`` or, when ``make_vocab`` is true,
        ``(word_lists, tag_lists, word2id, tag2id)``.

    Raises:
        AssertionError: If ``split`` is not a known split name.
        ValueError: If a non-empty line does not contain exactly two fields.
    """

    assert split.lower() in ["train", "dev", "test"]

    word_lists = []
    tag_lists = []
    word_list = []
    tag_list = []
    with open(os.path.join(data_dir, split + '.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            if line != '\n':
                word, tag = line.strip('\n').split()
                word_list.append(word)
                tag_list.append(tag)
            else:
                # An empty line terminates the current sentence.
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list = []
                tag_list = []

    # Flush the final sentence when the file does not end with an empty line;
    # otherwise that sentence would be silently lost.
    if word_list:
        word_lists.append(word_list)
        tag_lists.append(tag_list)

    if make_vocab:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        return word_lists, tag_lists, word2id, tag2id
    else:
        return word_lists, tag_lists

def build_map(lists):
    """Map every distinct element to an integer id in first-seen order.

    Args:
        lists: Iterable of iterables of hashable elements.

    Returns:
        Dict assigning 0, 1, 2, ... to elements in order of first appearance.
    """
    index = {}
    for sequence in lists:
        for item in sequence:
            # setdefault only inserts when the item is new, so ids stay dense.
            index.setdefault(item, len(index))
    return index

In [21]:
# Load the three splits; only the training split also yields the id maps.
train_word_lists, train_tag_lists, word2id, tag2id = build_corpus("train")
dev_word_lists, dev_tag_lists = build_corpus("dev", make_vocab=False)
test_word_lists, test_tag_lists = build_corpus("test", make_vocab=False)

# Join each token list back into one raw sentence string, across all splits.
dataset = [train_word_lists, dev_word_lists, test_word_lists]
sentences = [''.join(tokens) for split_lists in dataset for tokens in split_lists]
char_sum = sum(map(len, sentences))

# Report the number of sentences and the total number of characters.
print(len(sentences))
print(char_sum)

286267
23640982


In [26]:
# Spot-check the corpus by printing the last collected sentence.
print(sentences[-1])

深度调整产业结构,对于核心技术就有了更高的要求。以往的传统产业,尤其是粗放型的重工业,已经不适应目前发展的需要。应该从传统的生产型,转变为技术型。以我所在的首钢为例,首钢将要利用独有的冶炼技术,从生产型转变为服务型。老厂区停产后,其实也可以把原来厂区里的土地卖掉,开发房地产。不过,那样就不是首钢了。按照目前的规划,把老厂区转变为工业遗址,尤其是把高炉建成博物馆。这也就是在不放弃的基础上,继承传统,进而调整产业结构。


# 2. 参数定义

In [22]:
# Only referenced by commented-out code elsewhere in this notebook.
ask_and_answer_path = './data/ask_and_answer.json'
# Path prefix under which the trainer writes its periodic checkpoints.
download_model_path = '../bert_model/bert-pretrain/ThePeoplesDaily/' 

# Mini-batch size given to the DataLoader (also embedded in checkpoint names).
batch_size = 16
# Number of epochs passed to trainer.train().
epochs = 50
# Every sentence is padded or truncated to this many token ids.
max_length = 128
seed = 900

# Seed the Python, NumPy, torch and torch.cuda random generators with one value.
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# 3. Tokenizer & Bert-base-chinese

In [23]:
# Load the pretrained 'bert-base-chinese' tokenizer and its masked-LM model.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForMaskedLM.from_pretrained('bert-base-chinese')
# model = BertModel.from_pretrained('bert-base-chinese')

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 4. 定义Dataset

In [27]:
class LineByLineTextDataset(Dataset):
    """Dataset of fixed-length token-id rows, one row per input sentence.

    All sentences are tokenized once at construction time; each item is the
    ``input_ids`` row of one sentence, padded/truncated to a common length.
    """

    def __init__(self, texts=None, tok=None, max_len=None):
        """Tokenize ``texts`` into a single ``input_ids`` tensor.

        Args:
            texts: List of sentence strings. Defaults to the notebook-level
                ``sentences``.
            tok: Tokenizer callable accepting ``text``, ``padding``,
                ``max_length``, ``truncation`` and ``return_tensors`` keyword
                arguments and returning a mapping with an ``'input_ids'``
                entry. Defaults to the notebook-level ``tokenizer``.
            max_len: Row length after padding/truncation. Defaults to the
                notebook-level ``max_length``.
        """
        # Fall back to the notebook globals so that the zero-argument call
        # `LineByLineTextDataset()` keeps working unchanged.
        texts = sentences if texts is None else texts
        tok = tokenizer if tok is None else tok
        max_len = max_length if max_len is None else max_len

        self.data_token = tok(
            text=texts,
            padding='max_length',
            max_length=max_len,
            truncation=True,  # sequences longer than max_len are cut off
            return_tensors='pt'
        )['input_ids']

    def __len__(self):
        """Return the number of tokenized sentences."""
        return len(self.data_token)

    def __getitem__(self, idx):
        """Return the token-id row stored at position ``idx``."""
        return self.data_token[idx]
    
# Tokenize the whole corpus, then report its size and show the first encoded row.
dataset = LineByLineTextDataset()
print("样本数量==>  input_ids  ", len(dataset))
print(dataset[0])

样本数量==>  input_ids   286267
tensor([ 101,  782, 3696, 5381,  122, 3299,  122, 3189, 6380, 2945,  517, 5294,
        5276, 3198, 2845,  518, 2845, 6887,  117, 5401, 1744, 1290, 2209, 6125,
        5500, 2356, 1762, 8138, 2399, 4638, 3297, 1400,  671, 1921, 5326, 5330,
         677, 3885,  117, 1469, 1059, 4413, 5500, 2356,  671, 3416,  117, 6963,
         809, 3297, 7770, 5279, 2497, 2772, 2970, 6818, 3297, 7770, 5279, 2497,
        5310, 3338, 3315, 2399, 4638,  769, 3211,  511,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])


# 5. 定义 DataLoader

In [28]:
# Serve the dataset in shuffled mini-batches of batch_size rows.
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

# 6. Training

In [30]:
class Trainer:
    """Masked-language-model training loop for a BERT-style model.

    Each batch of token ids is corrupted by `_mask_tokens` and the model is
    trained to recover the original ids at the corrupted positions.
    """

    def __init__(self, model, dataloader, tokenizer, mlm_probability=0.15, lr=1e-4, with_cuda=True, cuda_devices=None, log_freq=10):
        """Move the model to its device and create the optimizer.

        Args:
            model: Model whose forward accepts ``input_ids``,
                ``attention_mask`` and ``labels`` and returns an object with a
                ``loss`` attribute.
            dataloader: Iterable of ``input_ids`` batches (must support ``len``).
            tokenizer: Tokenizer providing ``mask_token``, ``pad_token_id``,
                ``get_special_tokens_mask``, ``convert_tokens_to_ids`` and
                ``len()``.
            mlm_probability: Probability of selecting a non-special token.
            lr: Learning rate for AdamW.
            with_cuda: Use a GPU when one is available.
            cuda_devices: Optional list of device ids; the first entry picks
                the GPU. When omitted the current CUDA device is used.
            log_freq: Stored for callers; not used by the loop itself.
        """
        # Honour with_cuda / cuda_devices instead of a hard-coded "cuda:4",
        # which fails on machines with fewer than five GPUs.
        if with_cuda and torch.cuda.is_available():
            if cuda_devices:
                first = cuda_devices[0]
                self.device = torch.device("cuda", first) if isinstance(first, int) else torch.device(first)
            else:
                self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        self.model = model
        self.is_parallel = False                        # multi-GPU data parallelism is not enabled
        self.dataloader = dataloader
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability          # probability of masking a token
        self.log_freq = log_freq

        self.model.train()
        self.model.to(self.device)
        # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are set to the values the transformers version defaulted
        # to. The `lr` argument is now actually used.
        self.optim = torch.optim.AdamW(self.model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
        print("Total Parameters:", sum(p.nelement() for p in self.model.parameters()))

    def train(self, epochs):
        """Run ``epochs`` training epochs, saving a checkpoint every fifth one.

        Checkpoints are written below the notebook-level ``download_model_path``.
        """
        for epoch in range(epochs):
            ave_loss = self.iteration(epoch, self.dataloader)
            if (epoch + 1) % 5 == 0:
                stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
                model_path = download_model_path + f'Epoch{epoch + 1}_Batchsize{self.dataloader.batch_size}_Loss{ave_loss:f}_DateTime{stamp}/.'
                # Save the model this trainer owns, not whatever the global
                # name `model` happens to point to.
                self.model.save_pretrained(model_path)
                print(f'Download into {model_path}')

    def iteration(self, epoch, dataloader, train=True):
        """Run one pass over ``dataloader`` and return the mean batch loss.

        Args:
            epoch: Epoch index, used only for the progress-bar label.
            dataloader: Iterable of ``input_ids`` batches.
            train: When true, update the weights; otherwise only evaluate
                (eval mode, no gradient tracking).

        Returns:
            Average loss over the processed batches (0.0 if there were none).
        """
        self.model.train(train)
        pad_id = self.tokenizer.pad_token_id
        total_loss = 0.0
        ave_loss = 0.0  # defined up front so an empty dataloader cannot raise
        with tqdm_notebook(total=len(dataloader), desc='Epoch %d Training' % epoch, ncols=800) as pbar:
            for i, batch in enumerate(dataloader):
                # Build the attention mask from the uncorrupted ids so that
                # padding positions are ignored by self-attention.
                attention_mask = None
                if pad_id is not None:
                    attention_mask = batch.ne(pad_id).long().to(self.device)

                inputs, labels = self._mask_tokens(batch)
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                with torch.set_grad_enabled(train):
                    outputs = self.model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss.mean()

                if train:
                    self.optim.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.optim.step()

                total_loss += loss.item()
                ave_loss = total_loss / (i + 1)
                pbar.set_postfix(loss=float(ave_loss))
                pbar.update(1)
        return ave_loss

    def _mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Corrupt a batch of token ids for masked language modelling.

        Non-special, non-padding tokens are selected with probability
        ``self.mlm_probability``. Of the selected tokens about 80% become the
        mask token, about 10% a random vocabulary id and the rest stay as is.

        Args:
            inputs: Integer tensor of token ids; it is not modified.

        Returns:
            ``(corrupted_inputs, labels)`` where ``labels`` holds the original
            id at selected positions and -100 everywhere else.

        Raises:
            ValueError: If the tokenizer has no mask token.
        """
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
            )
        # Work on a copy so the caller's batch is left untouched.
        inputs = inputs.clone()
        labels = inputs.clone()
        device = labels.device

        # Start with the same selection probability at every position.
        probability_matrix = torch.full(labels.shape, self.mlm_probability, device=device)
        # 1 marks a special token, 0 a regular sequence token.
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        # Special tokens are never selected.
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool, device=device), value=0.0)
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            # Padding positions are never selected either.
            probability_matrix.masked_fill_(labels.eq(pad_id), value=0.0)

        # Sample which tokens take part in the MLM objective.
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # loss is computed on selected tokens only

        # 80% of the selected tokens are replaced by the mask token.
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8, device=device)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # Half of the remainder (10% overall) are replaced by a random token id.
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5, device=device)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long, device=device)
        inputs[indices_random] = random_words[indices_random]

        # The last 10% of the selected tokens are left unchanged.
        return inputs, labels

In [31]:
# Build the trainer with its default settings (see Trainer.__init__).
trainer = Trainer(model, dataloader, tokenizer)

Total Parameters: 102290312




In [32]:
# Release cached GPU memory even when training is interrupted or fails;
# a single empty_cache() call is sufficient.
try:
    trainer.train(epochs)
finally:
    torch.cuda.empty_cache()

Epoch 0 Training:   0%|                                                                                       …

KeyboardInterrupt: 