In [1]:
import sys, os, time, gc
from torch.optim import Adam

In [2]:
sys.path.append(os.path.abspath("../"))

In [3]:
from utils.args import init_args, add_argument_base
from utils.initialization import *
from utils.example import Example
from utils.batch import from_example_list
from utils.vocab import PAD
from model.slu_baseline_tagging import SLUTagging

In [19]:
def get_args(argv=None):
    """Build the argument namespace used throughout the notebook.

    Args:
        argv: optional list of command-line style strings to parse, e.g.
            ``['--lr', '0.01']``. Defaults to ``None``, in which case an empty
            list is parsed so every option takes its default and the Jupyter
            kernel's own ``sys.argv`` is never consulted.

    Returns:
        argparse.Namespace holding one attribute per option declared below.
    """
    import argparse
    arg_parser = argparse.ArgumentParser()
    #### General configuration ####
    arg_parser.add_argument('--dataroot', default='../data', help='root of data')
    arg_parser.add_argument('--word2vec_path', default='../word2vec-768.txt', help='path of word2vector file path')
    arg_parser.add_argument('--seed', default=999, type=int, help='Random seed')
    arg_parser.add_argument('--device', type=int, default=-1, help='Use which device: -1 -> cpu ; the index of gpu o.w.')
    arg_parser.add_argument('--testing', action='store_true', help='training or evaluation mode')

    arg_parser.add_argument('--eval_interval', default=10, type=int, help='number of intervals to evaluate')
    arg_parser.add_argument('--restore', action='store_true', help='restore training if a checkpoint exists.')
    arg_parser.add_argument('--checkpoint_interval', default=10, type=int, help='number of intervals to save a checkpoint')
    arg_parser.add_argument('--checkpoint_dir', default='./checkpoint.bin', help='path of checkpoint of model')
    arg_parser.add_argument('--best_model_dir', default='./model.bin', help='path of best model')

    arg_parser.add_argument('--trainset_spoken_language_select', default='both', choices=['manual_transcript', 'asr_1best', 'both'],
                            help='sentence used for trainset(asr_1best: with noise; manual_transcript: without noise)')
    arg_parser.add_argument('--trainset_augmentation', action='store_true', help='used augmented data from lexicon')

    #### Training Hyperparams ####
    arg_parser.add_argument('--batch_size', default=32, type=int, help='Batch size')
    arg_parser.add_argument('--lr', type=float, default=1e-3, help='learning rate')
    arg_parser.add_argument('--max_epoch', type=int, default=100, help='terminate after maximum epochs')
    #### Common Encoder Hyperparams ####
    # The help text here used to be a copy-paste of the --dataroot description.
    arg_parser.add_argument('--encoder_cell', default='LSTM', choices=['LSTM', 'GRU', 'RNN'], help='type of recurrent cell used by the encoder')
    arg_parser.add_argument('--dropout', type=float, default=0.2, help='feature dropout rate')
    arg_parser.add_argument('--embed_size', default=768, type=int, help='Size of word embeddings')
    arg_parser.add_argument('--hidden_size', default=512, type=int, help='hidden size')
    arg_parser.add_argument('--num_layer', default=2, type=int, help='number of layer')

    # An explicit list is always passed so the kernel's launch arguments are
    # never parsed by accident; callers may override defaults through `argv`.
    args = arg_parser.parse_args([] if argv is None else argv)

    return args


def set_optimizer(model, args):
    """Create an Adam optimizer over the trainable parameters of ``model``.

    Args:
        model: module whose ``named_parameters()`` are inspected; parameters
            with ``requires_grad == False`` are left out.
        args: namespace providing the learning rate as ``args.lr``.

    Returns:
        torch.optim.Adam with a single parameter group.
    """
    # De-duplicate by identity while keeping the order in which
    # named_parameters() yields them. Going through a plain `set` of tensors
    # would order them by their id-based hash, which differs from run to run
    # and makes the positional optimizer state non-reproducible.
    seen_ids = set()
    trainable = []
    for _, param in model.named_parameters():
        if not param.requires_grad or id(param) in seen_ids:
            continue
        seen_ids.add(id(param))
        trainable.append(param)
    grouped_params = [{'params': trainable}]
    optimizer = Adam(grouped_params, lr=args.lr)
    return optimizer


def decode(choice):
    """Run the global ``model`` over a whole split and score it.

    Relies on the notebook-level names ``model``, ``args``, ``device``,
    ``train_dataset`` and ``dev_dataset``. The model is switched to eval mode
    and is not switched back by this function.

    Args:
        choice: ``'train'`` or ``'dev'``; any other value fails the assertion.

    Returns:
        A pair ``(metrics, mean_loss)``: the result of
        ``Example.evaluator.acc`` over all collected predictions and labels,
        and the sum of per-batch losses divided by the number of batches.
    """
    assert choice in ['train', 'dev']
    model.eval()
    if choice == 'train':
        samples = train_dataset
    else:
        samples = dev_dataset
    all_preds = []
    all_labels = []
    loss_sum = 0
    num_batches = 0
    with torch.no_grad():
        for start in range(0, len(samples), args.batch_size):
            chunk = samples[start: start + args.batch_size]
            # train=True is kept so the batch carries labels for the loss.
            batch = from_example_list(args, chunk, device, train=True)
            batch_preds, batch_labels, batch_loss = model.decode(Example.label_vocab, batch)
            all_preds.extend(batch_preds)
            all_labels.extend(batch_labels)
            loss_sum += batch_loss
            num_batches += 1
        metrics = Example.evaluator.acc(all_preds, all_labels)
    # Release cached GPU memory and collect garbage before returning.
    torch.cuda.empty_cache()
    gc.collect()
    return metrics, loss_sum / num_batches



In [20]:
# Initialise run-wide state: the default argument namespace, the random seed
# and the torch device.
# set_random_seed / set_torch_device are not imported by name in this notebook;
# presumably they arrive via `from utils.initialization import *` - confirm.
args = get_args()
set_random_seed(args.seed)
device = set_torch_device(args.device)
print("Initialization finished ...")
print("Random seed is set to %d" % (args.seed))
# The conditional expression wraps the whole `%` formatting, so the GPU message
# is only formatted when args.device >= 0; otherwise the CPU message is printed.
print("Use GPU with index %s" % (args.device) if args.device >= 0 else "Use CPU as target torch device")

Initialization finished ...
Random seed is set to 999
Use CPU as target torch device


In [21]:
# Resolve the data file paths under args.dataroot and configure the Example class.
# start_time is taken here; the dataset-loading cell reports elapsed time from it.
start_time = time.time()
train_path = os.path.join(args.dataroot, 'train.json')
dev_path = os.path.join(args.dataroot, 'development.json')
ontology_path = os.path.join(args.dataroot, 'ontology.json')
word2vec_path = args.word2vec_path
# "both" is expanded in place into the list of the two transcript fields.
# Re-running the cell is harmless: the value is then a list and the test fails.
if args.trainset_spoken_language_select == "both":
    args.trainset_spoken_language_select = ['asr_1best', 'manual_transcript']
# vocab_path is a list of two files when augmentation is enabled, otherwise a
# single path string.
if args.trainset_augmentation:
    aug_path = os.path.join(args.dataroot, 'augmentation.json')
    vocab_path = [train_path, aug_path]
else:
    vocab_path = train_path
Example.configuration(  vocab_path=vocab_path, 
                        ontology_path=ontology_path, 
                        word2vec_path=word2vec_path,
                        spoken_language_select=args.trainset_spoken_language_select)

In [24]:
# Load and preprocess both splits.
# The training split follows args.trainset_spoken_language_select, while the
# dev split is always read with 'asr_1best'.
# train_dataset = Example.load_dataset(train_path)
train_dataset = Example.load_dataset(train_path, spoken_language_select=args.trainset_spoken_language_select)
dev_dataset = Example.load_dataset(dev_path, spoken_language_select='asr_1best')
# Elapsed time is measured from whatever start_time currently holds; in a
# top-to-bottom run that is the value set in the configuration cell.
print("Load dataset and database finished, cost %.4fs ..." % (time.time() - start_time))
print("Dataset size: train -> %d ; dev -> %d" % (len(train_dataset), len(dev_dataset)))

Load dataset and database finished, cost 22.5437s ...
Dataset size: train -> 5093 ; dev -> 921


In [25]:
# args.trainset_spoken_language_select

['asr_1best', 'manual_transcript']

In [7]:
d = train_dataset[4]

In [None]:
d.ex

In [9]:
print(d)
# d.utt, d.tags, d.input_idx, d.tag_id

vocab seq: 第二个到塔季他这个道士观
tag seq: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
vocab seq(index): [28, 29, 30, 31, 32, 33, 34, 35, 30, 36, 37, 38]
tag seq(index): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [14]:
# Example.label_vocab.num_tags
# [Example.label_vocab.idx2tag[idx] for idx in range(Example.label_vocab.num_tags)]

In [15]:
# Attach vocabulary-derived sizes and padding indices to args; the model is
# constructed from this same namespace (SLUTagging(args)).
args.vocab_size = Example.word_vocab.vocab_size
args.pad_idx = Example.word_vocab[PAD]
args.num_tags = Example.label_vocab.num_tags
args.tag_pad_idx = Example.label_vocab.convert_tag_to_idx(PAD)

In [16]:
# Build the tagger from args and move it to the selected device.
model = SLUTagging(args).to(device)
# Presumably copies pretrained word2vec vectors into model.word_embed - confirm.
# NOTE(review): the recorded cell output (1.0) is the return value of
# load_embeddings; its meaning (coverage ratio?) is not visible here.
Example.word2vec.load_embeddings(model.word_embed, Example.word_vocab, device=device)

1.0

In [17]:
# if not args.testing:
# Total optimizer steps = ceil(len(train_dataset) / batch_size) * max_epoch;
# the integer expression below is the usual ceiling-division idiom.
num_training_steps = ((len(train_dataset) + args.batch_size - 1) // args.batch_size) * args.max_epoch
print('Total training steps: %d' % (num_training_steps))

Total training steps: 16000


In [18]:
optimizer = set_optimizer(model, args)
# best_result tracks the best dev metrics seen so far. 'dev_loss' and 'iter'
# are not present until the first improvement is recorded, and 'dev_f1' starts
# as a float but is replaced by the fscore dict at that point.
nsamples, best_result = len(train_dataset), {'dev_acc': 0., 'dev_f1': 0.}
# train_index holds the sample order (shuffled before training); step_size is
# the mini-batch size. `np` is not imported by name in this notebook -
# presumably it comes from the utils.initialization star import; confirm.
train_index, step_size = np.arange(nsamples), args.batch_size

In [19]:
print('Start training ......')
# The epoch loop is unrolled by hand so a single epoch (i = 0) can be stepped
# through cell by cell; the original loop header is kept for reference.
# for i in range(args.max_epoch):
i = 0
start_time = time.time()
epoch_loss = 0
# Shuffles the index array in place, so each run of this cell reorders it again.
np.random.shuffle(train_index)
model.train()
# count is the number of mini-batches processed in this epoch.
count = 0

Start training ......


In [20]:
# The batch loop is unrolled as well: only the mini-batch starting at j = 0 is
# processed. Re-running this cell adds to epoch_loss and count again.
# for j in range(0, nsamples, step_size):
j = 0
cur_dataset = [train_dataset[k] for k in train_index[j: j + step_size]]
current_batch = from_example_list(args, cur_dataset, device, train=True)
# Calling the model on a batch yields the pair (output, loss).
output, loss = model(current_batch)
epoch_loss += loss.item()
loss.backward()
optimizer.step()
# Gradients are cleared after the update so the next backward() starts clean.
optimizer.zero_grad()
count += 1

In [21]:
# Alias the fields of the current batch so the model's forward pass can be
# reproduced step by step in the following cells.
batch = current_batch
tag_ids = batch.tag_ids
tag_mask = batch.tag_mask
input_ids = batch.input_ids
lengths = batch.lengths

In [23]:
# tag_ids: the target output sequence for the decoder
# tag_mask: used for masking
# input_ids: the raw input sequence for the encoder (either 'manual_transcript' or 'asr_1best' can be selected)
# lengths: the sequence lengths, used later for packing/unpacking
tag_ids[0], tag_mask[0], input_ids[0], lengths[0]

(tensor([30, 31,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]),
 tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 tensor([   2,    3,   31,  738,  739,   43,  240,  146,  295,  146,  452, 1592,
          166,   45,  150,  543,  214,  243]),
 18)

In [24]:
# tag_ids.view(-1)

In [25]:
embed = model.word_embed(input_ids)

In [48]:
model.word_embed

Embedding(1782, 768, padding_idx=0)

In [27]:
import torch.nn.utils.rnn as rnn_utils

In [28]:
# Pack the padded embeddings so the RNN skips padding positions.
# pack_padded_sequence defaults to enforce_sorted=True, which expects the batch
# ordered by decreasing length - presumably from_example_list sorts it; confirm.
packed_inputs = rnn_utils.pack_padded_sequence(embed, lengths, batch_first=True)

In [29]:
packed_inputs

PackedSequence(data=tensor([[-0.1083,  0.1150,  0.0218,  ..., -0.1097, -0.1615, -0.1442],
        [-0.2747,  0.1889, -0.1823,  ...,  0.2990, -0.0975,  0.0018],
        [-0.0882, -0.2366,  0.0060,  ..., -0.3304,  0.0540,  0.1205],
        ...,
        [ 0.0077, -0.0665,  0.0695,  ..., -0.1178, -0.2373, -0.0091],
        [-0.1475, -0.0168, -0.1360,  ...,  0.0174,  0.2907,  0.1095],
        [-0.0512,  0.1464,  0.2309,  ..., -0.0013, -0.2137,  0.0770]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([32, 32, 27, 27, 20, 18, 12, 11,  8,  6,  3,  2,  1,  1,  1,  1,  1,  1]), sorted_indices=None, unsorted_indices=None)

In [46]:
embed.shape     # bsize x seqlen x dim

torch.Size([32, 18, 768])

In [30]:
# The RNN is fed a PackedSequence, and packed_rnn_out is handed to
# pad_packed_sequence in the next cell; only that unpacked result is
# bsize x seqlen x dim. h_t_c_t is a pair of tensors (see the recorded shapes).
packed_rnn_out, h_t_c_t = model.rnn(packed_inputs)

In [33]:
# Convert the packed RNN output back to a padded tensor plus per-sequence lengths.
rnn_out, unpacked_len = rnn_utils.pad_packed_sequence(packed_rnn_out, batch_first=True)
    # batch_first=True puts the batch on the first dimension (dim 0)

In [44]:
len(h_t_c_t), h_t_c_t[0].shape, h_t_c_t[1].shape

(2, torch.Size([4, 32, 256]), torch.Size([4, 32, 256]))

In [45]:
rnn_out.shape

torch.Size([32, 18, 512])

In [49]:
hiddens = model.dropout_layer(rnn_out)

In [50]:
tag_output = model.output_layer(hiddens, tag_mask, tag_ids)

In [54]:
output, loss = tag_output

In [55]:
output.shape

torch.Size([32, 18, 74])

In [56]:
loss

tensor(4.2095, grad_fn=<NllLossBackward>)

In [57]:
# Report the mean training loss over the batches processed so far. The time
# shown is wall-clock time since start_time was set, so in a cell-by-cell
# session it includes the pauses between cells.
print('Training: \tEpoch: %d\tTime: %.4f\tTraining Loss: %.4f' % (i, time.time() - start_time, epoch_loss / count))
torch.cuda.empty_cache()
# gc.collect() is the last expression, so its return value is what the cell displays.
gc.collect()

Training: 	Epoch: 0	Time: 1150.2762	Training Loss: 4.2954


16

In [58]:
# Evaluate on the dev split; start_time is reset so the printed time covers
# only this evaluation.
start_time = time.time()
metrics, dev_loss = decode('dev')
# metrics carries an 'acc' value and an 'fscore' dict with precision/recall/fscore.
dev_acc, dev_fscore = metrics['acc'], metrics['fscore']
print('Evaluation: \tEpoch: %d\tTime: %.4f\tDev acc: %.2f\tDev fscore(p/r/f): (%.2f/%.2f/%.2f)' % (i, time.time() - start_time, dev_acc, dev_fscore['precision'], dev_fscore['recall'], dev_fscore['fscore']))

Evaluation: 	Epoch: 0	Time: 0.5854	Dev acc: 22.48	Dev fscore(p/r/f): (0.00/0.00/0.00)


In [59]:
# Record the current epoch as the best one when dev accuracy strictly improves;
# accuracy alone decides, loss and f-score are only stored alongside it.
if dev_acc > best_result['dev_acc']:
    best_result['dev_loss'], best_result['dev_acc'], best_result['dev_f1'], best_result['iter'] = dev_loss, dev_acc, dev_fscore, i
    # Saving the model/optimizer state is disabled in this walkthrough.
    # torch.save({
    #     'epoch': i, 'model': model.state_dict(),
    #     'optim': optimizer.state_dict(),
    # }, open('model.bin', 'wb'))
    print('NEW BEST MODEL: \tEpoch: %d\tDev loss: %.4f\tDev acc: %.2f\tDev fscore(p/r/f): (%.2f/%.2f/%.2f)' % (i, dev_loss, dev_acc, dev_fscore['precision'], dev_fscore['recall'], dev_fscore['fscore']))

NEW BEST MODEL: 	Epoch: 0	Dev loss: 4.2128	Dev acc: 22.48	Dev fscore(p/r/f): (0.00/0.00/0.00)


In [60]:
best_result

{'dev_acc': 22.47557003257329,
 'dev_f1': {'precision': 0, 'recall': 0.0, 'fscore': 0},
 'dev_loss': 4.212819411836821,
 'iter': 0}

In [61]:
# Summarise the best dev epoch. best_result only gains the 'iter' and
# 'dev_loss' keys (and a dict-valued 'dev_f1') once some epoch improves on the
# initial dev accuracy, so guard against the case where none ever did instead
# of failing with a KeyError/TypeError.
if 'iter' in best_result:
    print('FINAL BEST RESULT: \tEpoch: %d\tDev loss: %.4f\tDev acc: %.4f\tDev fscore(p/r/f): (%.4f/%.4f/%.4f)' % (best_result['iter'], best_result['dev_loss'], best_result['dev_acc'], best_result['dev_f1']['precision'], best_result['dev_f1']['recall'], best_result['dev_f1']['fscore']))
else:
    print('FINAL BEST RESULT: \tno epoch improved on the initial dev acc of %.4f' % (best_result['dev_acc']))

FINAL BEST RESULT: 	Epoch: 0	Dev loss: 4.2128	Dev acc: 22.4756	Dev fscore(p/r/f): (0.0000/0.0000/0.0000)
