# 使用Trainer和Tester快速训练和测试

## 数据读入和处理

In [1]:
from fastNLP.io import SST2Pipe

# Build the data bundle with SST2Pipe; no path is passed to process_from_file().
pipe = SST2Pipe()
databundle = pipe.process_from_file()
# Keep the 'words' vocabulary in a name of its own: a later cell uses len(vocab)
# when constructing the model.
vocab = databundle.get_vocab('words')
# Show the bundle summary, the first training instance and the 'words' vocabulary.
print(databundle)
print(databundle.get_dataset('train')[0])
print(databundle.get_vocab('words'))



In total 3 datasets:
	train has 67349 instances.
	dev has 872 instances.
	test has 1821 instances.
In total 2 vocabs:
	words has 16292 entries.
	target has 2 entries.

+---------------------------+--------+---------------------------+---------+
| raw_words                 | target | words                     | seq_len |
+---------------------------+--------+---------------------------+---------+
| hide new secretions fr... | 1      | [4110, 97, 12009, 39, ... | 7       |
+---------------------------+--------+---------------------------+---------+
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)


In [2]:
# Work on the first 5000 training instances only and split a small slice
# (ratio 0.015) off them to serve as the test set.
train_data, test_data = databundle.get_dataset('train')[:5000].split(0.015)
dev_data = databundle.get_dataset('dev')
# Sizes of the three splits, in train / dev / test order.
print(*map(len, (train_data, dev_data, test_data)))

4925 872 75


In [3]:
# Print the per-field settings of the training set (is_input, is_target,
# ignore_type, pad_value). The call's return value is also echoed by the notebook.
train_data.print_field_meta()

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
|   is_input  |   False   | False  |  True |   True  |
|  is_target  |   False   |  True  | False |  False  |
| ignore_type |           | False  | False |  False  |
|  pad_value  |           |   0    |   0   |    0    |
+-------------+-----------+--------+-------+---------+


<prettytable.prettytable.PrettyTable at 0x7ff7b4926e90>

In [4]:
from fastNLP import AccuracyMetric
from fastNLP import Const

# In this example, metrics=AccuracyMetric() is equivalent to the line below
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

## DataSetIter初探

In [20]:
from fastNLP import BucketSampler
from fastNLP import DataSetIter

# Only the first 10 dev instances, so the printed batches stay short.
tmp_data = dev_data[:10]
# Define a batch iterator: pass in the DataSet, the batch_size and the rule for drawing batches.
# Rules: in order (Sequential), shuffled (Random), or similar lengths grouped into one batch (Bucket)
sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
# Each iteration yields two dicts: the input fields and the target fields.
for batch_x, batch_y in batch:
    print("batch_x: ",batch_x, batch_x["words"].shape)
    print("batch_y: ", batch_y, batch_y["target"].shape)

batch_x:  {'words': tensor([[  14,   10,  437,   32,   78,    3,   78,  437,    7],
        [  14,   10,    4,  311,    5,  154, 1418,  609,    7]]), 'seq_len': tensor([9, 9])} torch.Size([2, 9])
batch_y:  {'target': tensor([1, 0])} torch.Size([2])
batch_x:  {'words': tensor([[    2,   155,     3,  4426,     3,   239,     3,   739,     5,  1136,
            41,    43,  2427,   736,     2,   648,    10, 14167,  2285,     7],
        [   24,    95,    28,    46,     8,   336,    38,   239,     8,  2133,
             2,    18,    10, 14169,  1421,     6,    61,     5,   387,     7]]), 'seq_len': tensor([20, 20])} torch.Size([2, 20])
batch_y:  {'target': tensor([0, 0])} torch.Size([2])
batch_x:  {'words': tensor([[   45,   752,   327,   180,    10, 14168,    16,    72,  8904,     9,
          1217,     7,     0,     0,     0,     0,     0,     0,     0,     0],
        [  879,    96,     8,  1026,    12,  8067,    11, 13591,     8, 14166,
             4,   673,   662,    15,     4,  1154, 

In [21]:
# Change the pad value of the 'words' field to -1 (this modifies tmp_data in place),
# then rebuild the iterator with the same sampler to see the new padding in the batches.
tmp_data.set_pad_val('words',-1)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
    print("batch_x: ",batch_x, batch_x["words"].shape)
    print("batch_y: ", batch_y, batch_y["target"].shape)

batch_x:  {'words': tensor([[  14,   10,  437,   32,   78,    3,   78,  437,    7],
        [  14,   10,    4,  311,    5,  154, 1418,  609,    7]]), 'seq_len': tensor([9, 9])} torch.Size([2, 9])
batch_y:  {'target': tensor([1, 0])} torch.Size([2])
batch_x:  {'words': tensor([[  879,    96,     8,  1026,    12,  8067,    11, 13591,     8, 14166,
             4,   673,   662,    15,     4,  1154,   240,   639,   417,     7],
        [   45,   752,   327,   180,    10, 14168,    16,    72,  8904,     9,
          1217,     7,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1]]), 'seq_len': tensor([20, 12])} torch.Size([2, 20])
batch_y:  {'target': tensor([0, 1])} torch.Size([2])
batch_x:  {'words': tensor([[    4,   277,   685,    18,     7],
        [14165,  3204,     5,  1675,    -1]]), 'seq_len': tensor([5, 4])} torch.Size([2, 5])
batch_y:  {'target': tensor([1, 1])} torch.Size([2])
batch_x:  {'words': tensor([[    2,   155,     3,  4426,     3,   239,     3,   739,     5,  1136,

In [22]:
# Length of every string in a list -- the same result as list(map(len, a)).
a = ["as", "dwde", "dwe"]
[len(word) for word in a]

[2, 4, 3]

In [23]:
from fastNLP.core.field import Padder
import numpy as np
class FixLengthPadder(Padder):
    """Padder that pads every batch of a field to one fixed width.

    Each call returns a 2-D numpy array with one row per sample and exactly
    ``length`` columns; positions past the end of a sample hold ``pad_val``.
    """

    def __init__(self, pad_val=0, length=None):
        """Remember the fixed width.

        Args:
            pad_val: Fill value, forwarded to ``Padder.__init__``.
            length: Number of columns of every padded batch. Must not be None.

        Raises:
            AssertionError: if ``length`` is None.
        """
        super().__init__(pad_val=pad_val)
        self.length = length
        assert length is not None, "Creating FixLengthPadder with no specific length!"

    def __call__(self, contents, field_name, field_ele_dtype, dim):
        """Pad ``contents`` (a sequence of sequences) to ``self.length`` columns.

        ``field_name`` and ``dim`` are accepted but not used here.

        Raises:
            AssertionError: if any sample is longer than ``self.length``.
        """
        # Longest sample in the current batch.
        longest = max(len(sample) for sample in contents)
        # A sample that does not fit into the fixed width is an error.
        assert longest <= self.length, "Fixed padder length smaller than actual length! with length {}".format(longest)
        # self.pad_val is presumably stored by Padder.__init__ (base class not shown here).
        n_samples = len(contents)
        padded = np.full((n_samples, self.length), self.pad_val, dtype=field_ele_dtype)
        # Copy each sample into the left part of its row; the rest keeps pad_val.
        for row, sample in enumerate(contents):
            padded[row, :len(sample)] = sample
        return padded

# Use a fixed length of 40 for the FixLengthPadder
tmp_padder = FixLengthPadder(pad_val=0,length=40)
# Install it as the padder of the 'words' field through the dataset's set_padder method
tmp_data.set_padder('words',tmp_padder)
# Rebuild the iterator with the same sampler to see the fixed-width batches.
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
    print("batch_x: ",batch_x, batch_x["words"].shape)
    print("batch_y: ", batch_y, batch_y["target"].shape)

batch_x:  {'words': tensor([[   24,    95,    28,    46,     8,   336,    38,   239,     8,  2133,
             2,    18,    10, 14169,  1421,     6,    61,     5,   387,     7,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    2,   155,     3,  4426,     3,   239,     3,   739,     5,  1136,
            41,    43,  2427,   736,     2,   648,    10, 14167,  2285,     7,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'seq_len': tensor([20, 20])} torch.Size([2, 40])
batch_y:  {'target': tensor([0, 0])} torch.Size([2])
batch_x:  {'words': tensor([[ 1045, 11113,    16,   104,     5,     4,   176,  1824,  1704,     3,
             2,    18,    11,     4,  1018,   432,   143,    33,   245,   308,
             7,     0,     0,     0,     0,     0,     0,     0,

## 使用DataSetIter自己编写训练过程


In [25]:
from fastNLP import BucketSampler
from fastNLP import DataSetIter
from fastNLP.models import CNNText
from fastNLP import Tester
import torch
import time

# Embedding dimension handed to the model.
embed_dim = 100
# The first argument is the tuple (len(vocab), embed_dim); presumably CNNText builds
# its embedding from that size -- confirm against the fastNLP CNNText documentation.
# `model` is a notebook-level global that train() below reads directly.
model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)

def train(epoch, data, devdata, lr=0.001, batch_size=32):
    """Train the notebook-level ``model`` with a hand-written loop, evaluating after every epoch.

    The function reads the global ``model`` (it is not a parameter) and updates
    its parameters in place with Adam and cross-entropy loss.

    Args:
        epoch: Number of passes over ``data``.
        data: Training DataSet. Its batches must provide ``batch_x['words']`` and
            ``batch_y['target']``; the sampler buckets on the ``seq_len`` field.
        devdata: DataSet evaluated with a Tester (AccuracyMetric) at the end of
            each epoch.
        lr: Learning rate handed to Adam. Defaults to the previously hard-coded 0.001.
        batch_size: Batch size for both the sampler and the iterator. Defaults to
            the previously hard-coded 32.

    Returns:
        None. One summary line is printed per epoch: average loss, the formatted
        evaluation result, and the milliseconds elapsed since training started.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    lossfunc = torch.nn.CrossEntropyLoss()

    # Define a batch iterator: pass in the DataSet, the batch_size and the rule for drawing batches.
    # Rules: in order (Sequential), shuffled (Random), or similar lengths grouped into one batch (Bucket)
    train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
    train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)

    # With verbose=0 the Tester's test() prints nothing and only returns the results;
    # with verbose=1 it prints the evaluation result and returns it.
    # The Tester does not change between epochs, so it is built once and reused.
    tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=1)

    start_time = time.time()
    print("-"*5+"start training"+"-"*5)
    for i in range(epoch):
        # Make training mode explicit at the start of every epoch instead of relying
        # on whatever mode the evaluation step leaves the model in.
        model.train()
        loss_list = []
        for batch_x, batch_y in train_batch:
            optimizer.zero_grad()
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        res = tester_tmp.test()

        # No batches (e.g. empty training set) -> report nan instead of dividing by zero.
        avg_loss = sum(loss_list) / len(loss_list) if loss_list else float('nan')
        print('Epoch {:d} Avg Loss: {:.2f}'.format(i, avg_loss),end=" ")
        # _format_eval_results(res) turns the result dict returned by test() into a one-line summary.
        print(tester_tmp._format_eval_results(res),end=" ")
        print('{:d}ms'.format(round((time.time()-start_time)*1000)))

# Train for 10 epochs, evaluating on dev_data after each one.
train(10, train_data, dev_data)
# Use a Tester for a quick evaluation on the held-out test split
tester = Tester(test_data, model, metrics=AccuracyMetric())
tester.test()

-----start training-----


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.709862
Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.709862 2929ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.13 seconds!
[tester] 
AccuracyMetric: acc=0.771789
Epoch 1 Avg Loss: 0.38 AccuracyMetric: acc=0.771789 5773ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.772936
Epoch 2 Avg Loss: 0.15 AccuracyMetric: acc=0.772936 8778ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.761468
Epoch 3 Avg Loss: 0.05 AccuracyMetric: acc=0.761468 11620ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.759174
Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.759174 14371ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.764908
Epoch 5 Avg Loss: 0.01 AccuracyMetric: acc=0.764908 17262ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.13 seconds!
[tester] 
AccuracyMetric: acc=0.756881
Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.756881 20246ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.760321
Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.760321 22988ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.16 seconds!
[tester] 
AccuracyMetric: acc=0.759174
Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.759174 25790ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…

Evaluate data in 0.14 seconds!
[tester] 
AccuracyMetric: acc=0.761468
Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.761468 28513ms


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…

Evaluate data in 0.02 seconds!
[tester] 
AccuracyMetric: acc=0.733333


{'AccuracyMetric': {'acc': 0.733333}}