In [1]:
# coding: UTF-8
import os
import sys
BASEDIR = os.path.abspath(".")
print(BASEDIR)
sys.path.append(BASEDIR)
import time
import torch
import numpy as np
from train_eval import train, init_network
from importlib import import_module
import argparse
from utils import build_dataset, build_iterator, get_time_dif
import torch.nn.functional as F

from sklearn import metrics
from pytorch_pretrained.optimization import BertAdam

/search/data/liuyouyuan/pyproject/Bert-Chinese-Text-Classification-Pytorch


In [2]:
dataset = 'THUCNews'  # 数据集
model_name = "bert"  # bert
x = import_module('models.' + model_name)
config = x.Config(dataset)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # 保证每次结果一样

In [3]:
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]'  # padding符号, bert中综合信息符号


def build_dataset(config):

    def load_dataset(path, pad_size=32):
        contents = []
        print("path:", path)
        n = 0
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                print("content>>>:", content)
                print("label>>>:", label)
                token = config.tokenizer.tokenize(content)
                print("token:", token)
                token = [CLS] + token
                print("token2:", token)
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                print("token_ids:", token_ids)
                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                print(token_ids, int(label), seq_len, mask)
                contents.append((token_ids, int(label), seq_len, mask))
                n += 1
                if n == 168:
                    break
        return contents
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test

In [4]:
start_time = time.time()
print("Loading data...")
#print("config:", config)
train_data, dev_data, test_data = build_dataset(config)


29it [00:00, 288.03it/s]

Loading data...
path: THUCNews/data/train.txt
content>>>: 中华女子学院：本科层次仅1专业招男生
label>>>: 3
token: ['中', '华', '女', '子', '学', '院', '：', '本', '科', '层', '次', '仅', '1', '专', '业', '招', '男', '生']
token2: ['[CLS]', '中', '华', '女', '子', '学', '院', '：', '本', '科', '层', '次', '仅', '1', '专', '业', '招', '男', '生']
token_ids: [101, 704, 1290, 1957, 2094, 2110, 7368, 8038, 3315, 4906, 2231, 3613, 788, 122, 683, 689, 2875, 4511, 4495]
[101, 704, 1290, 1957, 2094, 2110, 7368, 8038, 3315, 4906, 2231, 3613, 788, 122, 683, 689, 2875, 4511, 4495, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 3 19 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: 两天价网站背后重重迷雾：做个网站究竟要多少钱
label>>>: 4
token: ['两', '天', '价', '网', '站', '背', '后', '重', '重', '迷', '雾', '：', '做', '个', '网', '站', '究', '竟', '要', '多', '少', '钱']
token2: ['[CLS]', '两', '天', '价', '网', '站', '背', '后', '重', '重', '迷', '雾', '：', '做', '个', '网', '站', '究', '竟', '要', '多', '少', '钱']
token_ids: [101, 697, 1921, 817, 5381, 499

134it [00:00, 323.77it/s]

 ['猪', '贩', '给', '生', '猪', '注', '水', '催', '肥', '2', '分', '钟', '可', '增', '15', '公', '斤']
token2: ['[CLS]', '猪', '贩', '给', '生', '猪', '注', '水', '催', '肥', '2', '分', '钟', '可', '增', '15', '公', '斤']
token_ids: [101, 4343, 6575, 5314, 4495, 4343, 3800, 3717, 998, 5503, 123, 1146, 7164, 1377, 1872, 8115, 1062, 3165]
[101, 4343, 6575, 5314, 4495, 4343, 3800, 3717, 998, 5503, 123, 1146, 7164, 1377, 1872, 8115, 1062, 3165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 5 18 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: 男子因工作起争执撞毁同事汽车
label>>>: 5
token: ['男', '子', '因', '工', '作', '起', '争', '执', '撞', '毁', '同', '事', '汽', '车']
token2: ['[CLS]', '男', '子', '因', '工', '作', '起', '争', '执', '撞', '毁', '同', '事', '汽', '车']
token_ids: [101, 4511, 2094, 1728, 2339, 868, 6629, 751, 2809, 3058, 3673, 1398, 752, 3749, 6756]
[101, 4511, 2094, 1728, 2339, 868, 6629, 751, 2809, 3058, 3673, 1398, 752, 3749, 6756, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 


59it [00:00, 519.27it/s]

content>>>: 林丹：竞技体育不能太追求完美 把苏杯留住最重要
label>>>: 7
token: ['林', '丹', '：', '竞', '技', '体', '育', '不', '能', '太', '追', '求', '完', '美', '把', '苏', '杯', '留', '住', '最', '重', '要']
token2: ['[CLS]', '林', '丹', '：', '竞', '技', '体', '育', '不', '能', '太', '追', '求', '完', '美', '把', '苏', '杯', '留', '住', '最', '重', '要']
token_ids: [101, 3360, 710, 8038, 4993, 2825, 860, 5509, 679, 5543, 1922, 6841, 3724, 2130, 5401, 2828, 5722, 3344, 4522, 857, 3297, 7028, 6206]
[101, 3360, 710, 8038, 4993, 2825, 860, 5509, 679, 5543, 1922, 6841, 3724, 2130, 5401, 2828, 5722, 3344, 4522, 857, 3297, 7028, 6206, 0, 0, 0, 0, 0, 0, 0, 0, 0] 7 23 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: TVB主动举报案件 率先报道陈志云被捕
label>>>: 2
token: ['tvb', '主', '动', '举', '报', '案', '件', '率', '先', '报', '道', '陈', '志', '云', '被', '捕']
token2: ['[CLS]', 'tvb', '主', '动', '举', '报', '案', '件', '率', '先', '报', '道', '陈', '志', '云', '被', '捕']
token_ids: [101, 9312, 712, 1220, 715, 2845, 3428, 816, 4372, 10

133it [00:00, 408.04it/s]

['高', '考', '作', '文', '写', '作', '必', '备', '的', '八', '种', '结', '构', '模', '式']
token2: ['[CLS]', '高', '考', '作', '文', '写', '作', '必', '备', '的', '八', '种', '结', '构', '模', '式']
token_ids: [101, 7770, 5440, 868, 3152, 1091, 868, 2553, 1906, 4638, 1061, 4905, 5310, 3354, 3563, 2466]
[101, 7770, 5440, 868, 3152, 1091, 868, 2553, 1906, 4638, 1061, 4905, 5310, 3354, 3563, 2466, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 3 16 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: 节前加仓节后迎反弹 2月股基均获得正收益
label>>>: 0
token: ['节', '前', '加', '仓', '节', '后', '迎', '反', '弹', '2', '月', '股', '基', '均', '获', '得', '正', '收', '益']
token2: ['[CLS]', '节', '前', '加', '仓', '节', '后', '迎', '反', '弹', '2', '月', '股', '基', '均', '获', '得', '正', '收', '益']
token_ids: [101, 5688, 1184, 1217, 797, 5688, 1400, 6816, 1353, 2486, 123, 3299, 5500, 1825, 1772, 5815, 2533, 3633, 3119, 4660]
[101, 5688, 1184, 1217, 797, 5688, 1400, 6816, 1353, 2486, 123, 3299, 5500, 1825, 1772, 5815

164it [00:00, 356.95it/s]
35it [00:00, 348.48it/s]

 ['建', '邦', '礼', '仕', '阁', '首', '批', '业', '主', '顺', '利', '入', '住']
token2: ['[CLS]', '建', '邦', '礼', '仕', '阁', '首', '批', '业', '主', '顺', '利', '入', '住']
token_ids: [101, 2456, 6930, 4851, 799, 7323, 7674, 2821, 689, 712, 7556, 1164, 1057, 857]
[101, 2456, 6930, 4851, 799, 7323, 7674, 2821, 689, 712, 7556, 1164, 1057, 857, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1 14 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: 留英新政来不及消化 学生中介齐忐忑(组图)
label>>>: 3
token: ['留', '英', '新', '政', '来', '不', '及', '消', '化', '学', '生', '中', '介', '齐', '忐', '忑', '(', '组', '图', ')']
token2: ['[CLS]', '留', '英', '新', '政', '来', '不', '及', '消', '化', '学', '生', '中', '介', '齐', '忐', '忑', '(', '组', '图', ')']
token_ids: [101, 4522, 5739, 3173, 3124, 3341, 679, 1350, 3867, 1265, 2110, 4495, 704, 792, 7970, 2558, 2559, 113, 5299, 1745, 114]
[101, 4522, 5739, 3173, 3124, 3341, 679, 1350, 3867, 1265, 2110, 4495, 704, 792, 7970, 2558, 2559, 113, 5299, 1745, 114

108it [00:00, 351.15it/s]

content>>>: 盘点2010留学表情：海外学子眼中这一年(组图)
label>>>: 3
token: ['盘', '点', '2010', '留', '学', '表', '情', '：', '海', '外', '学', '子', '眼', '中', '这', '一', '年', '(', '组', '图', ')']
token2: ['[CLS]', '盘', '点', '2010', '留', '学', '表', '情', '：', '海', '外', '学', '子', '眼', '中', '这', '一', '年', '(', '组', '图', ')']
token_ids: [101, 4669, 4157, 8166, 4522, 2110, 6134, 2658, 8038, 3862, 1912, 2110, 2094, 4706, 704, 6821, 671, 2399, 113, 5299, 1745, 114]
[101, 4669, 4157, 8166, 4522, 2110, 6134, 2658, 8038, 3862, 1912, 2110, 2094, 4706, 704, 6821, 671, 2399, 113, 5299, 1745, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 3 22 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: 2011年全国各地高考各科考试时间汇总
label>>>: 3
token: ['2011', '年', '全', '国', '各', '地', '高', '考', '各', '科', '考', '试', '时', '间', '汇', '总']
token2: ['[CLS]', '2011', '年', '全', '国', '各', '地', '高', '考', '各', '科', '考', '试', '时', '间', '汇', '总']
token_ids: [101, 8163, 2399, 1059, 1744, 1392, 1765, 7770, 5440, 1392, 490

143it [00:00, 350.26it/s]

 ['江', '西', '师', '范', '大', '学', '2010', '年', '高', '考', '录', '取', '结', '果', '查', '询', '系', '统', '开', '通']
token2: ['[CLS]', '江', '西', '师', '范', '大', '学', '2010', '年', '高', '考', '录', '取', '结', '果', '查', '询', '系', '统', '开', '通']
token_ids: [101, 3736, 6205, 2360, 5745, 1920, 2110, 8166, 2399, 7770, 5440, 2497, 1357, 5310, 3362, 3389, 6418, 5143, 5320, 2458, 6858]
[101, 3736, 6205, 2360, 5745, 1920, 2110, 8166, 2399, 7770, 5440, 2497, 1357, 5310, 3362, 3389, 6418, 5143, 5320, 2458, 6858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 3 21 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
content>>>: 德国高校“硕士直招”吸引艺术人才
label>>>: 3
token: ['德', '国', '高', '校', '[UNK]', '硕', '士', '直', '招', '[UNK]', '吸', '引', '艺', '术', '人', '才']
token2: ['[CLS]', '德', '国', '高', '校', '[UNK]', '硕', '士', '直', '招', '[UNK]', '吸', '引', '艺', '术', '人', '才']
token_ids: [101, 2548, 1744, 7770, 3413, 100, 4798, 1894, 4684, 2875, 100, 1429, 2471, 5686, 3318, 782, 2798]
[101, 2548, 1744, 7770,




In [5]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        print("len(batches):",len(batches), batch_size)
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        print("n_batches",self.n_batches)
        self.residue = False  # 记录batch数量是否为整数
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        # pad前的长度(超过pad_size的设为pad_size)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index > self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter_ = DatasetIterater(dataset, config.batch_size, config.device)
    return iter_


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


In [6]:
print(len(train_data) //32)

5


In [7]:
train_iter = build_iterator(train_data,config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)

len(batches): 168 128
n_batches 1
len(batches): 168 128
n_batches 1
len(batches): 168 128
n_batches 1


In [12]:
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # 记录进行到多少batch
    dev_best_loss = float('inf')
    last_improve = 0  # 记录上次验证集loss下降的batch数
    flag = False  # 记录是否很久没有效果提升
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            print("X:", trains[0])
            print("seq_len:", trains[1])
            print("MASK:", trains[2])
            print("Y", labels)
            
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # 每多少轮输出在训练集和验证集上的效果
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # 验证集loss超过1000batch没下降，结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
    
def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)

In [13]:
model = x.Model(config).to(config.device)
train(config, model, train_iter, dev_iter, test_iter)

Epoch [1/3]
X: tensor([[ 101, 1744, 2832,  ...,    0,    0,    0],
        [ 101, 2548, 3791,  ...,    0,    0,    0],
        [ 101, 8170, 2399,  ...,    0,    0,    0],
        ...,
        [ 101, 4152, 1169,  ...,    0,    0,    0],
        [ 101, 5811, 3172,  ...,    0,    0,    0],
        [ 101,  723, 1046,  ...,    0,    0,    0]], device='cuda:0')
seq_len: tensor([19, 21, 18, 14, 18, 17, 23, 17, 22, 24, 17, 18, 23, 15, 21, 18, 18, 24,
        20, 22, 11, 20, 18, 20, 18, 19, 21, 18, 21, 17, 18, 20, 14, 21, 20, 10,
        22, 17, 18, 19], device='cuda:0')
MASK: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
Y tensor([0, 2, 3, 6, 8, 1, 7, 2, 5, 7, 3, 6, 9, 4, 9, 4, 3, 1, 4, 5, 4, 6, 8, 3,
        5, 9, 0, 4, 2, 5, 8, 4, 2, 0, 9, 1, 1, 5, 6, 6], device='cuda:0')
Iter:      0,  Train Loss:   2.3, 

Training beyond specified 't_total'. Learning rate multiplier set to 0.0. Please set 't_total' of WarmupLinearSchedule correctly.


ValueError: Number of classes, 6, does not match size of target_names, 10. Try specifying the labels parameter

In [14]:
# ValueError: Number of classes, 6, does not match size of target_names, 10. Try specifying the labels parameter
# 现在只有6个类别，但是定义的目标类别是10个，这里报错了，但是应该是一个警告啊？！