In [4]:
from sklearn.model_selection import train_test_split
import sys
sys.path.append("D:/Experiment")
from MyKu import MyBERT
from MyKu import processing
import torch
from sklearn import metrics
import time
import os
from tqdm import tqdm
from torch.optim import Adam
from torch import nn
import numpy as np
import pandas as pd

In [None]:

# Training preparation: hyperparameters and notebook-wide settings.
SEED = 42  # shared by the train/test split and the RNG seeding below
# Passed to MyBERT.test by the training cell, which reassigns it first
# (presumably the file that evaluation results are written to -- confirm).
file_name = 'readme.md'
batch_size = 8
num_epoch = 10  # number of training epochs
check_step = 1  # evaluation / checkpoint interval, counted in epochs

learning_rate = 1e-5  # learning rate handed to the optimizer

# Seed the NumPy and PyTorch generators so that anything drawing from them
# (e.g. weight initialisation or shuffling) is repeatable between runs.
# Some GPU kernels can still be nondeterministic.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the dataset, hold out 20% of it for testing, and fix the class count.
data = processing.load_swsr()
train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=SEED)
categories = 2

train_iter, test_iter = MyBERT.load_bert_data(
    train_data, test_data, batch_size)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained checkpoint to fine-tune: a Chinese RoBERTa-wwm-ext (large) model
# from the Hugging Face hub.
# Details: https://huggingface.co/hfl/chinese-roberta-wwm-ext-large
pretrained_model_name = 'hfl/chinese-roberta-wwm-ext-large'
# Build the classifier wrapper and move it to the selected device
# (on a GPU this is the point where device memory usage goes up).
model = MyBERT.MyBertModel(categories, pretrained_model_name)
model.to(device)

# Optimisation: Adam over all model parameters, with cross-entropy as the
# loss for the two-class task.
optimizer = Adam(model.parameters(), lr=learning_rate)
loss = nn.CrossEntropyLoss()

# Wall-clock stamp (month_day_hour_minute) for naming logs / saved artifacts.
timestamp = time.strftime("%m_%d_%H_%M", time.localtime())


In [None]:
# Fine-tune for num_epoch epochs. The model is evaluated on the held-out split
# every check_step epochs, and always after the final epoch so the finished
# model is never left unevaluated. With check_step == 1 this evaluates after
# every epoch.
file_name = 'chinese-bert.md'  # forwarded to MyBERT.test with the epoch number

for epoch in range(1, num_epoch + 1):
    MyBERT.train(model, train_iter, device, optimizer, loss, epoch)
    # max(..., 1) keeps a zero or negative check_step from breaking the modulo.
    if epoch % max(check_step, 1) == 0 or epoch == num_epoch:
        # Return value of MyBERT.test (an accuracy score, judging by the
        # name); the variable name is kept unchanged for later cells.
        en_acc_score = MyBERT.test(
            model, test_iter, device, epoch, file_name)
    print('\n\n')

In [None]:
from transformers import BertTokenizer

# Load the tokenizer published with the Chinese RoBERTa-wwm-ext (large)
# checkpoint on the Hugging Face hub.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='hfl/chinese-roberta-wwm-ext-large'
)
# Bare last expression so the notebook displays the tokenizer's repr.
tokenizer

In [None]:
# A batch containing one sample sentence.
sentence = ['在联合任务的训练下，我们的模型取得了最好的实验效果！']

# Encode the batch as PyTorch tensors, with padding and truncation enabled.
inputs = tokenizer(
    sentence,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(inputs)

# Convert the ids of the first (and only) sequence back into token strings
# to inspect how the sentence was split.
inputs_ids = inputs['input_ids']
words = tokenizer.convert_ids_to_tokens(inputs_ids[0])
print(words)


In [26]:
from transformers import BertTokenizer

# Load the tokenizer published with the Chinese RoBERTa-wwm-ext (large)
# checkpoint on the Hugging Face hub.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='hfl/chinese-roberta-wwm-ext-large'
)
# Bare last expression so the notebook displays the tokenizer's repr
# (vocabulary size, special tokens, padding/truncation sides).
tokenizer

PreTrainedTokenizer(name_or_path='hfl/chinese-roberta-wwm-ext-large', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [27]:
# A batch containing one sample sentence.
sentence = ['在联合任务的训练下，我们的模型取得了最好的实验效果！']

# Encode the batch as PyTorch tensors, with padding and truncation enabled.
# The printed result holds input_ids, token_type_ids and attention_mask.
inputs = tokenizer(
    sentence,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(inputs)

# Convert the ids of the first (and only) sequence back into token strings;
# the printout shows one token per character, wrapped in [CLS] ... [SEP].
inputs_ids = inputs['input_ids']
words = tokenizer.convert_ids_to_tokens(inputs_ids[0])
print(words)


{'input_ids': tensor([[ 101, 1762, 5468, 1394,  818, 1218, 4638, 6378, 5298,  678, 8024, 2769,
          812, 4638, 3563, 1798, 1357, 2533,  749, 3297, 1962, 4638, 2141, 7741,
         3126, 3362, 8013,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}
['[CLS]', '在', '联', '合', '任', '务', '的', '训', '练', '下', '，', '我', '们', '的', '模', '型', '取', '得', '了', '最', '好', '的', '实', '验', '效', '果', '！', '[SEP]']
