In [0]:
import os
import json
import nltk
import logging
import evaluate

from tqdm import tqdm, trange
# from time import gmtime, strftime

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

In [0]:
import torchtext
from torchtext import data, datasets
from torchtext.vocab import GloVe

In [0]:
# Configure the root logger for the whole notebook: INFO level, with each
# record prefixed by timestamp, severity and the emitting logger's name.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
# Logger used by the cells below.
logger = logging.getLogger(__name__)

# 定义初始变量参数

In [0]:
# 有关argparse的官方文档操作请查看：https://docs.python.org/3/library/argparse.html#module-argparse，
# 关于parser.add_argument(）的详解请查看：https://blog.csdn.net/u013177568/article/details/62432761/
# .parse_args()是将之前所有add_argument定义的参数在括号里进行赋值，没有赋值(args=[])，就返回参数各自default的默认值。默认情况下，中划线会转换为下划线.


In [0]:
class Arguments:
    """Plain namespace holding every hyperparameter and file path used in this notebook.

    All values are class attributes, so an instance created with ``Arguments()``
    simply exposes them; later cells attach further attributes to that instance.
    """

    # ---- runtime ----
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # ---- data files ----
    train_file = './data/train-v1.1.json'                  # raw SQuAD training set
    dev_file = './data/dev-v1.1.json'                      # raw SQuAD dev set
    train_torchtext_file = './data/train_torchtext.json'   # training set converted for torchtext
    dev_torchtext_file = './data/dev_torchtext.json'       # dev set converted for torchtext
    train_example_file = './data/train_examples.pt'        # cached torchtext examples (train)
    dev_example_file = './data/dev_examples.pt'            # cached torchtext examples (dev)
    dataset_file = './data/dev-v1.1.json'                  # reference file used for testing
    overwrite_cache = False                                # True forces the caches to be rebuilt
    train_batch_size = 32
    dev_batch_size = 32

    # ---- training ----
    epoch = 2
    learning_rate = 0.5
    exp_decay_rate = 0.999      # decay handed to the EMA helper
    logging_steps = 100         # report the training loss every this many steps
    prediction_file = './data/prediction.json'   # predictions are written here
    model_dir = 'BiDAF.pkl'                      # where the trained weights are saved

    # ---- model ----
    char_dim = 8
    word_dim = 100
    char_channel_size = 100
    char_channel_width = 5
    context_threshold = 400     # maximum context length (in tokens) kept for training
    hidden_size = 100
    dropout = 0.2
    

In [0]:
# Shared configuration object; later cells attach extra attributes to it
# (e.g. the vocabulary sizes once the vocabularies have been built).
args = Arguments()

# 二、SQuAD问答数据预处理

## 查看数据集结构

SQuAD 数据格式

SQuAD 数据集的格式是字典格式，包含 data 和 version 两个键。其中 data 中保存了数据集内容，version 用于记录当前数据集的版本号。data 的值是一个列表，列表的元素为某一主题的文章集合，格式为字典格式。该主题文章集合字典包含两个键 title 和 paragraphs。其中 title 为主题名，paragraphs 为文章的列表。文章也是字典格式，包含两个键 context 和 qas，分别表示文章及文章的问题答案。其中文章为字符串，文章的问题及答案用列表保存。每个问题及对应的答案使用字典保存，键包括 answers, question, id，分别用于保存答案、问题及其唯一标识。答案可以有多个，使用列表保存，每一个答案为一个字典，保存了答案的起始位置（answer_start，按字符计）及其文本内容（text）。


```json
// 数据架构如下
{
    "data": [
        {
            "title": "Super_Bowl_50", // 第一个主题
            "paragraphs": [
                {
                    "context": " numerals 50.......", // 每个主题会有很多context短文,这里只列出一个
                    "qas": [  // 这个列表里放问题和答案的位置，每篇context会有很多answer和question，这里只列出一个
                        {
                            "answers": [  // 一个问题会有三个答案，三个答案都是对的，只是在context不同或相同位置
                                {         // 下面三个答案都在相同的位置
                                    "answer_start": 177,  // 答案在文中的起始位置是第177的字符。
                                    "text": "Denver Broncos"
                                },
                                {
                                    "answer_start": 177,
                                    "text": "Denver Broncos"
                                },
                                {
                                    "answer_start": 177,
                                    "text": "Denver Broncos"
                                }
                            ],
                            "question": "Which NFL team represented the AFC at Super Bowl 50?",
                            "id": "56be4db0acb8001400a502ec"
                        }

                    ]
                }
                
            ]
        },
        
        {
            "title": "Warsaw", // 第二个主题
            "paragraphs":   
        },
        
        {
            "title": "Normans", // 第三个主题
            "paragraphs": 
        },
        
        {
            "title": "Nikola_Tesla", // 第四个主题
            "paragraphs": 
        },
        ........... // 还有很多
        
    ],
    "version": "1.1"
}
```


## 加载数据

定义分词方法

In [7]:
# Fetch the Punkt tokenizer data that nltk.word_tokenize relies on.
# Newer NLTK releases (>= 3.8.2) look up the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so request both. On older releases the unknown id is
# reported and download() returns False instead of raising.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
def word_tokenize(tokens):
    """Split a string into a list of word tokens.

    The text is tokenized with ``nltk.word_tokenize``; in every resulting token
    the two-character quote marks ``''`` and double-backtick are then replaced
    by a plain ``"`` so that downstream code only ever sees one quote symbol.

    :param tokens: the input string (despite the name, not yet tokenized)
    :return: list of token strings
    """
    normalized = []
    for piece in nltk.word_tokenize(tokens):
        piece = piece.replace("''", '"')
        piece = piece.replace("``", '"')
        normalized.append(piece)
    return normalized

将原始 json 转换为 TorchText 能够处理的 json 格式

In [0]:
def _token_end_offsets(context, tokens, skipped_chars):
    """Return, for every token, the character offset in ``context`` just past that token."""
    ends = []
    pos = 0
    context_len = len(context)
    for tok in tokens:
        # Skip the whitespace-like characters that precede the token in the raw text.
        while pos < context_len and context[pos] in skipped_chars:
            pos += 1
        width = len(tok)
        # word_tokenize() collapses the two-character quote marks '' and `` into a
        # single ", so such a token covers one more source character than its length.
        if tok[:1] == '"' and context[pos:pos + 2] in ("''", "``"):
            width += 1
        pos += width
        ends.append(pos)
    return ends


def preprocess_raw_json(path):
    """Convert a raw SQuAD json file into the line-delimited json read by torchtext.

    One output record is written per (question, answer) pair with the keys
    ``id``, ``context``, ``question``, ``answer``, ``s_idx`` and ``e_idx``.
    ``s_idx`` / ``e_idx`` are the indices of the first and last context token
    (as produced by ``word_tokenize``) that cover the answer; they are derived
    from the character-based ``answer_start`` stored in the raw file.

    If a span cannot be aligned (the accumulated token lengths never reach the
    answer's character range) a warning is logged and the indices are clamped to
    the last token, so the stored labels are always valid token positions.
    For an empty answer text ``e_idx`` is raised to ``s_idx`` so the span is never inverted.

    The output goes to ``args.train_torchtext_file`` when ``path`` equals
    ``args.train_file`` and to ``args.dev_torchtext_file`` otherwise.

    :param path: path of the raw SQuAD json file
    """
    import bisect  # only needed here; kept local so the cell is self-contained

    # Characters that are skipped in the raw text between two tokens.
    skipped_chars = (' ', '\n', '\u3000', '\u202f', '\u2009')

    with open(path, 'r', encoding='utf-8') as f:
        articles = json.load(f)['data']  # list of topics, each a dict

    examples = []
    for article in tqdm(articles, desc='Processing'):
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            # The alignment depends only on the context, so compute it once per
            # paragraph instead of once per answer.
            token_ends = _token_end_offsets(context, word_tokenize(context), skipped_chars)
            last_token = max(len(token_ends) - 1, 0)

            for qa in paragraph['qas']:
                qa_id = qa['id']
                question = qa['question']
                for ans in qa['answers']:
                    answer = ans['text']
                    start_char = ans['answer_start']
                    end_char = start_char + len(answer)

                    # First token ending strictly after the answer's first character ...
                    s_idx = bisect.bisect_right(token_ends, start_char)
                    # ... and first token ending at or after the answer's last character.
                    e_idx = max(bisect.bisect_left(token_ends, end_char), s_idx)

                    if e_idx > last_token:
                        logger.warning("Answer span of question %s could not be aligned to tokens; "
                                       "clamping to the last token", qa_id)
                        s_idx = min(s_idx, last_token)
                        e_idx = last_token

                    examples.append({'id': qa_id,
                                     'context': context,
                                     'question': question,
                                     'answer': answer,
                                     's_idx': s_idx,
                                     'e_idx': e_idx})

    save_path = args.train_torchtext_file if path == args.train_file else args.dev_torchtext_file
    logger.info("Save json data to new file %s", save_path)
    with open(save_path, 'w', encoding='utf-8') as f:
        for example in examples:
            json.dump(example, f)
            f.write('\n')  # one json object per line
        

In [0]:
class SQuADProcessor(object):
    """Load SQuAD through torchtext and expose datasets, vocabularies and iterators.

    After construction the instance provides:
      * ``RAW``, ``CHAR``, ``WORD``, ``LABEL`` - the torchtext fields,
      * ``train``, ``dev``                    - the datasets,
      * ``train_iter``, ``dev_iter``          - bucketed batch iterators.
    """
    def __init__(self, args):
        """
        :param args: configuration object providing the file paths, ``overwrite_cache``,
                     ``context_threshold``, ``word_dim``, the batch sizes and ``device``.
        """
        # Imported locally so the class keeps working even after the notebook-level
        # name `data` has been rebound to something other than the torchtext module.
        from torchtext import data as tt_data

        # Step 1: make sure the torchtext-readable json files exist.
        if os.path.exists(args.train_torchtext_file) and os.path.exists(args.dev_torchtext_file) and not args.overwrite_cache:
            logger.info("Loading data from processed file %s and %s", args.train_torchtext_file, args.dev_torchtext_file)
        else:
            logger.info("Preprocessing json data from dataset file at %s and %s", args.dev_file, args.train_file)
            preprocess_raw_json(args.train_file)
            preprocess_raw_json(args.dev_file)

        # Step 2: declare the fields.
        self.RAW = tt_data.RawField()  # values are passed through without any processing
        # explicit declaration for torchtext compatibility
        self.RAW.is_target = False
        self.CHAR_NESTING = tt_data.Field(batch_first=True, tokenize=list, lower=True)
        self.CHAR = tt_data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = tt_data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True)
        self.LABEL = tt_data.Field(sequential=False, unk_token=None, use_vocab=False)

        # json key -> (attribute name, field); context and question each feed a
        # word-level and a character-level field.
        dict_fields = {'id': ('id', self.RAW),
                       's_idx': ('s_idx', self.LABEL),
                       'e_idx': ('e_idx', self.LABEL),
                       'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
                       'question': [('q_word', self.WORD), ('q_char', self.CHAR)]}

        # Same fields in list form, needed when rebuilding a Dataset from cached examples.
        list_fields = [('id', self.RAW), ('s_idx', self.LABEL), ('e_idx', self.LABEL),
                       ('c_word', self.WORD), ('c_char', self.CHAR),
                       ('q_word', self.WORD), ('q_char', self.CHAR)]

        # Step 3: load the examples, from the cache when available.
        if os.path.exists(args.train_example_file) and os.path.exists(args.dev_example_file) and not args.overwrite_cache:
            logger.info("Loading data from cached file %s and %s", args.train_example_file, args.dev_example_file)
            train_examples = torch.load(args.train_example_file)
            dev_examples = torch.load(args.dev_example_file)

            self.train = tt_data.Dataset(examples=train_examples, fields=list_fields)
            self.dev = tt_data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            logger.info("Loading data from processed torchtext json file %s and %s",
                        args.train_torchtext_file, args.dev_torchtext_file)
            self.train, self.dev = tt_data.TabularDataset.splits(
                path='',
                train=args.train_torchtext_file,
                validation=args.dev_torchtext_file,
                format='json',
                fields=dict_fields)
            # Cache the parsed examples (before any length filtering) for later runs.
            torch.save(self.train.examples, args.train_example_file)
            torch.save(self.dev.examples, args.dev_example_file)

        logger.info("Number of train examples before length filtering: %d", len(self.train))
        # Drop training examples whose context is longer than the threshold, for efficiency.
        if args.context_threshold > 0:
            self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]
        logger.info("Number of train examples after length filtering: %d", len(self.train))

        # Step 4: vocabularies. The character vocabulary gets no pretrained vectors;
        # the word vocabulary is paired with GloVe 6B vectors of size args.word_dim.
        logger.info("Building vocab...")
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim))

        # Step 5: iterators that bucket examples by context length.
        logger.info("Building iterators...")
        self.train_iter, self.dev_iter = \
            tt_data.BucketIterator.splits((self.train, self.dev),
                                          batch_sizes=[args.train_batch_size, args.dev_batch_size],
                                          device=args.device,
                                          sort_key=lambda x: len(x.c_word))


**注意！内存消耗特别高！可能需要16G的内存**

In [11]:
# Build the datasets, vocabularies and iterators.
# NOTE: this rebinds the name `data`, which up to this point referred to the
# torchtext `data` module imported at the top of the notebook.
data = SQuADProcessor(args)

10/28/2019 01:42:52 - INFO - __main__ -   Preprocessing json data from dataset file at ./data/dev-v1.1.json and ./data/train-v1.1.json
Processing: 100%|██████████| 442/442 [00:21<00:00, 20.27it/s]
10/28/2019 01:43:14 - INFO - __main__ -   Save json data to new file ./data/train_torchtext.json
Processing: 100%|██████████| 48/48 [00:03<00:00, 13.91it/s]
10/28/2019 01:43:19 - INFO - __main__ -   Save json data to new file ./data/dev_torchtext.json
10/28/2019 01:43:20 - INFO - __main__ -   Loading data from processed torchtext json file ./data/train_torchtext.json and ./data/dev_torchtext.json
10/28/2019 01:51:49 - INFO - __main__ -   Nmber of train examples 87599
10/28/2019 01:51:49 - INFO - __main__ -   Nmber of train examples 87416
10/28/2019 01:51:49 - INFO - __main__ -   Building vacab...
10/28/2019 01:52:36 - INFO - torchtext.vocab -   Downloading vectors from http://nlp.stanford.edu/data/glove.6B.zip
.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                           
10/2

In [12]:
# Attach values to the shared config that are only known once the vocabularies
# exist; plain attribute assignment is equivalent to setattr() here.
args.char_vocab_size = len(data.CHAR.vocab)
args.word_vocab_size = len(data.WORD.vocab)
# The dev file is the reference file for evaluation.
args.dataset_file = args.dev_file
print('data loading complete!')

data loading complete!


In [13]:
batch = next(iter(data.train_iter)) # pull one batch from the training iterator for inspection
print(batch)
# The training batch size is 32 (args.train_batch_size).
# Per the original note, batch.c_word was 32 x 264 here, 264 being the token count of the longest context in the batch.
# Per the original note, batch.c_char was 32 x 264 x 25, the last dimension being the character count of the longest word
# (the original note also mentioned 18 for that dimension - TODO confirm which value is right).

100%|█████████▉| 399863/400000 [00:30<00:00, 21177.72it/s]


[torchtext.data.batch.Batch of size 32]
	[.id]:['5733b0fb4776f41900661045', '57282d193acd2414000df653', '570713c490286e26004fc8ca', '57297608af94a219006aa476', '57261b3b38643c19005ad007', '56d3661659d6e414001462d8', '57265c18dd62a815002e82ab', '5731e68eb9d445190005e641', '5726d168f1498d1400e8ec36', '56e82b5c37bdd419002c4480', '5733b425d058e614000b60bd', '5726cff25951b619008f7eb0', '57318491e6313a140071d003', '5728233a2ca10214002d9ead', '573610e56c16ec1900b9295a', '5732c3521d5d2e14009ff8a2', '57302525a23a5019007fce5b', '56dfc11d231d4119001abda7', '56f7f512aef2371900625cd5', '5726609b708984140094c414', '5726f2815951b619008f831d', '5706068852bb8914006897bf', '56e7825400c9c71400d771ea', '5719d8214faf5e1900b8a81c', '57098332ed30961900e84258', '56e8862699e8941900975e58', '56dfa7277aa994140058dfa2', '57291d851d0469140077906b', '573267bee17f3d1400422950', '572fa92fb2c2fd14005682d3', '572833943acd2414000df6d4', '56f9888b9e9bad19000a0a4e']
	[.s_idx]:[torch.cuda.LongTensor of size 32 (GPU 0)]
	[

In [14]:
print(len(data.WORD.vocab)) # size of the word vocabulary (108777 in the recorded run)
print(data.WORD.vocab.vectors.shape) # shape of the pretrained vector matrix: (vocab size, vector dim)

print(data.WORD.vocab.itos[:50]) # first 50 entries of the index-to-token list (special tokens come first)
print("------"*10)
print(list(data.WORD.vocab.stoi.items())[0:50]) # the matching token-to-index pairs

108777
torch.Size([108777, 100])
['<unk>', '<pad>', 'the', ',', 'of', '.', 'and', 'in', 'to', 'a', '"', 'is', 'was', 'as', ')', '(', 'for', '?', 'by', 'that', 'with', "'s", 'on', 'from', 'are', 'what', 'which', 'it', 'were', 'at', 'an', 'or', 'be', 'this', 'his', 'have', 'not', 'their', 'also', 'has', 'its', 'who', 'had', 'he', ';', 'other', 'first', 'one', 'but', 'new']
------------------------------------------------------------
[('<unk>', 0), ('<pad>', 1), ('the', 2), (',', 3), ('of', 4), ('.', 5), ('and', 6), ('in', 7), ('to', 8), ('a', 9), ('"', 10), ('is', 11), ('was', 12), ('as', 13), (')', 14), ('(', 15), ('for', 16), ('?', 17), ('by', 18), ('that', 19), ('with', 20), ("'s", 21), ('on', 22), ('from', 23), ('are', 24), ('what', 25), ('which', 26), ('it', 27), ('were', 28), ('at', 29), ('an', 30), ('or', 31), ('be', 32), ('this', 33), ('his', 34), ('have', 35), ('not', 36), ('their', 37), ('also', 38), ('has', 39), ('its', 40), ('who', 41), ('had', 42), ('he', 43), (';', 44), ('o

In [15]:
print(len(data.CHAR.vocab)) # size of the character vocabulary (1307 in the recorded run)
print(data.CHAR.vocab.itos[:50]) # first 50 entries of the index-to-character list (special tokens come first)
print("------"*10)
print(list(data.CHAR.vocab.stoi.items())[0:50]) # the matching character-to-index pairs

1307
['<unk>', '<pad>', 'e', 't', 'a', 'i', 'n', 'o', 's', 'r', 'h', 'l', 'd', 'c', 'u', 'm', 'f', 'p', 'g', 'w', 'y', 'b', ',', 'v', '.', 'k', '1', '0', 'x', '2', '"', '-', 'j', '9', "'", ')', '(', '?', 'z', '5', '8', 'q', '3', '4', '7', '6', ';', ':', '–', '%']
------------------------------------------------------------
[('<unk>', 0), ('<pad>', 1), ('e', 2), ('t', 3), ('a', 4), ('i', 5), ('n', 6), ('o', 7), ('s', 8), ('r', 9), ('h', 10), ('l', 11), ('d', 12), ('c', 13), ('u', 14), ('m', 15), ('f', 16), ('p', 17), ('g', 18), ('w', 19), ('y', 20), ('b', 21), (',', 22), ('v', 23), ('.', 24), ('k', 25), ('1', 26), ('0', 27), ('x', 28), ('2', 29), ('"', 30), ('-', 31), ('j', 32), ('9', 33), ("'", 34), (')', 35), ('(', 36), ('?', 37), ('z', 38), ('5', 39), ('8', 40), ('q', 41), ('3', 42), ('4', 43), ('7', 44), ('6', 45), (';', 46), (':', 47), ('–', 48), ('%', 49)]


## BIDAF

In [0]:
class LSTM(nn.Module):
    """nn.LSTM wrapper that applies input dropout, custom initialisation and
    handles padded, variable-length batches.

    The forward pass always treats its input as batch-first, i.e.
    ``(batch, seq_len, input_size)``, regardless of the ``batch_first`` flag
    (which is only forwarded to the underlying ``nn.LSTM``).
    """
    def __init__(self, input_size, hidden_size, batch_first=False, num_layers=1, bidirectional=False, dropout=0.2):
        """
        input_size: feature size of the input at each time step.
        hidden_size: hidden size of the LSTM (per direction).
        batch_first: forwarded to nn.LSTM.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether to use a bidirectional LSTM.
        dropout: dropout probability applied to the input, 0.2 by default.
        """
        super(LSTM, self).__init__()
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=batch_first)
        self.reset_params()
        self.dropout = nn.Dropout(p=dropout)

    def reset_params(self):
        """Initialise the LSTM weights.

        Hidden-hidden weights are orthogonal, input-hidden weights use Kaiming
        normal, all biases are zero except the second quarter of ``bias_hh``
        (the forget gate in PyTorch's i, f, g, o layout), which is set to 1.
        """
        suffixes = ['', '_reverse'] if self.rnn.bidirectional else ['']
        for i in range(self.rnn.num_layers):
            for suffix in suffixes:
                nn.init.orthogonal_(getattr(self.rnn, f'weight_hh_l{i}{suffix}'))
                nn.init.kaiming_normal_(getattr(self.rnn, f'weight_ih_l{i}{suffix}'))
                bias_hh = getattr(self.rnn, f'bias_hh_l{i}{suffix}')
                nn.init.constant_(bias_hh, val=0)
                nn.init.constant_(getattr(self.rnn, f'bias_ih_l{i}{suffix}'), val=0)
                # In-place writes into a slice of a parameter that requires grad
                # are only legal with autograd switched off.
                with torch.no_grad():
                    gate = bias_hh.size(0) // 4
                    bias_hh[gate:2 * gate].fill_(1)

    def forward(self, x):
        """
        x: tuple ``(inputs, lengths)`` with inputs of shape (batch, seq_len, input_size)
           and lengths a 1-D integer tensor of size (batch) holding the unpadded lengths.
        return
        out: hidden states of the last layer for every time step,
             (batch, seq_len, num_directions * hidden_size); padded steps are zero.
        h: final hidden state per sequence, (batch, num_layers * num_directions * hidden_size).
        """
        x, x_len = x
        x = self.dropout(x)

        # enforce_sorted=False lets PyTorch sort by length and restore the original
        # order (for both outputs and states); lengths have to live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (h, _) = self.rnn(packed)
        # total_length keeps the time dimension equal to the input's even when the
        # longest sequence in the batch is shorter than the padded width.
        out = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True, total_length=x.size(1))[0]

        # (layers * directions, batch, hidden) -> (batch, layers * directions * hidden);
        # no squeeze, so a batch of one keeps its batch dimension.
        h = h.permute(1, 0, 2).contiguous().view(h.size(1), -1)
        return out, h

In [0]:
class Linear(nn.Module):
    """Fully connected layer with optional input dropout and custom initialisation."""
    def __init__(self, in_features, out_features, dropout=0.0):
        """
        in_features: size of each input vector.
        out_features: size of each output vector.
        dropout: dropout probability applied to the input; no dropout module is
                 created when it is not positive.
        """
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        self.reset_params()

    def reset_params(self):
        """Kaiming-normal weights, zero bias."""
        weight, bias = self.linear.weight, self.linear.bias
        nn.init.kaiming_normal_(weight)
        nn.init.constant_(bias, 0)

    def forward(self, x):
        # The dropout attribute only exists when a positive rate was requested.
        drop = getattr(self, 'dropout', None)
        if drop is not None:
            x = drop(x)
        return self.linear(x)

In [0]:
# Background reading for the architecture: the BiDAF paper, or
# https://blog.csdn.net/u014665013/article/details/79793395
class BiDAF(nn.Module):
    """Bi-Directional Attention Flow reader predicting answer start/end positions.

    Layers: character embedding (char-CNN) -> word embedding -> highway network ->
    contextual LSTM -> attention flow -> two modeling LSTMs -> output layer.

    NOTE: padded context/question positions are not masked before the softmaxes
    in the attention layer or in the returned logits.
    """
    def __init__(self, args, pretrained):
        """
        :param args: configuration with char_vocab_size, char_dim, char_channel_size,
                     char_channel_width, word_dim, hidden_size and dropout.
        :param pretrained: pretrained word vectors, (word_vocab_size, word_dim).
        """
        super(BiDAF, self).__init__()
        self.args = args

        # 1. Character Embedding Layer
        self.char_emb = nn.Embedding(args.char_vocab_size, args.char_dim, padding_idx=1)
        nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
        # Char-CNN: char_channel_size filters of size (char_dim, char_channel_width).
        self.char_conv = nn.Conv2d(1, args.char_channel_size, (args.char_dim, args.char_channel_width))

        # 2. Word Embedding Layer, initialised from the pretrained vectors and frozen.
        self.word_emb = nn.Embedding.from_pretrained(pretrained, freeze=True)

        # The concatenated char + word features feed layers sized hidden_size * 2.
        assert self.args.hidden_size * 2 == (self.args.char_channel_size + self.args.word_dim)

        # Highway network: two layers, each with a transform and a gate.
        for i in range(2):
            setattr(self, f'highway_linear{i}',
                    nn.Sequential(Linear(args.hidden_size * 2, args.hidden_size * 2), nn.ReLU()))
            setattr(self, f'highway_gate{i}',
                    nn.Sequential(Linear(args.hidden_size * 2, args.hidden_size * 2), nn.Sigmoid()))

        # 3. Contextual Embedding Layer (shared between context and question)
        self.context_LSTM = LSTM(input_size=args.hidden_size * 2,
                                 hidden_size=args.hidden_size,
                                 bidirectional=True,
                                 batch_first=True,
                                 dropout=args.dropout)

        # 4. Attention Flow Layer
        self.att_weight_c = Linear(args.hidden_size * 2, 1)
        self.att_weight_q = Linear(args.hidden_size * 2, 1)
        self.att_weight_cq = Linear(args.hidden_size * 2, 1)

        # 5. Modeling Layer
        self.modeling_LSTM1 = LSTM(input_size=args.hidden_size * 8,
                                   hidden_size=args.hidden_size,
                                   bidirectional=True,
                                   batch_first=True,
                                   dropout=args.dropout)

        self.modeling_LSTM2 = LSTM(input_size=args.hidden_size * 2,
                                   hidden_size=args.hidden_size,
                                   bidirectional=True,
                                   batch_first=True,
                                   dropout=args.dropout)

        # 6. Output Layer
        self.p1_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
        self.p1_weight_m = Linear(args.hidden_size * 2, 1, dropout=args.dropout)
        self.p2_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
        self.p2_weight_m = Linear(args.hidden_size * 2, 1, dropout=args.dropout)

        self.output_LSTM = LSTM(input_size=args.hidden_size * 2,
                                hidden_size=args.hidden_size,
                                bidirectional=True,
                                batch_first=True,
                                dropout=args.dropout)

        self.dropout = nn.Dropout(p=args.dropout)

    def _char_emb_layer(self, x):
        """Char-CNN with max-over-time pooling.

        :param x: character ids, (batch, seq_len, word_len)
        :return: (batch, seq_len, char_channel_size)
        """
        batch_size = x.size(0)
        width = self.args.char_channel_width
        # The convolution needs at least `width` characters per word; pad short
        # batches on the right with the embedding's padding index.
        if x.size(2) < width:
            x = F.pad(x, (0, width - x.size(2)), value=self.char_emb.padding_idx)
        word_len = x.size(2)
        # (batch, seq_len, word_len, char_dim)
        x = self.dropout(self.char_emb(x))
        # Transpose (not merely view) so that each column seen by the convolution
        # is one character's embedding: (batch * seq_len, 1, char_dim, word_len).
        x = x.transpose(2, 3).reshape(-1, self.args.char_dim, word_len).unsqueeze(1)
        # (batch * seq_len, char_channel_size, 1, conv_len) -> (batch * seq_len, char_channel_size, conv_len)
        x = self.char_conv(x).squeeze(2)
        # (batch * seq_len, char_channel_size, 1) -> (batch * seq_len, char_channel_size)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        # (batch, seq_len, char_channel_size)
        return x.view(batch_size, -1, self.args.char_channel_size)

    def _highway_network(self, x1, x2):
        """
        :param x1: char embedding (batch, seq_len, char_channel_size)
        :param x2: word embedding (batch, seq_len, word_dim)
        :return: (batch, seq_len, hidden_size * 2)
        """
        x = torch.cat([x1, x2], dim=-1)
        for i in range(2):
            h = getattr(self, f'highway_linear{i}')(x)
            g = getattr(self, f'highway_gate{i}')(x)
            x = g * h + (1 - g) * x
        return x

    def _att_flow_layer(self, c, q):
        """
        :param c: contextual embedding of the context, (batch, c_len, hidden_size * 2)
        :param q: contextual embedding of the question, (batch, q_len, hidden_size * 2)
        :return: query-aware context representation, (batch, c_len, hidden_size * 8)
        """
        c_len = c.size(1)
        q_len = q.size(1)

        # Similarity term w_cq . (c_i * q_j) + b for all (i, j) at once:
        # sum_k w_k c_ik q_jk is a batched matrix product of (c * w) with q^T.
        # att_weight_cq is built without dropout, so using its inner nn.Linear
        # parameters directly is equivalent to calling it on every c * q_j.
        cq_proj = self.att_weight_cq.linear
        cq = torch.bmm(c * cq_proj.weight, q.transpose(1, 2)) + cq_proj.bias

        # (batch, c_len, 1) and (batch, 1, q_len) broadcast to (batch, c_len, q_len).
        s = self.att_weight_c(c).expand(-1, -1, q_len) + \
            self.att_weight_q(q).permute(0, 2, 1).expand(-1, c_len, -1) + cq

        # Context-to-query attention: for each context word, a distribution over question words.
        a = F.softmax(s, dim=2)
        # (batch, c_len, q_len) x (batch, q_len, hidden_size * 2) -> (batch, c_len, hidden_size * 2)
        c2q_att = torch.bmm(a, q)

        # Query-to-context attention: one distribution over context words, taken
        # from each context word's best similarity to any question word.
        b = F.softmax(torch.max(s, dim=2)[0], dim=1).unsqueeze(1)
        # (batch, 1, c_len) x (batch, c_len, hidden_size * 2) -> (batch, 1, hidden_size * 2), tiled over c_len
        q2c_att = torch.bmm(b, c).expand(-1, c_len, -1)

        # (batch, c_len, hidden_size * 8)
        return torch.cat([c, c2q_att, c * c2q_att, c * q2c_att], dim=-1)

    def _output_layer(self, g, m, l):
        """
        :param g: attention flow output, (batch, c_len, hidden_size * 8)
        :param m: modeling layer output, (batch, c_len, hidden_size * 2)
        :param l: context lengths, (batch)
        :return: p1: (batch, c_len), p2: (batch, c_len)
        """
        # squeeze(-1) only drops the size-1 feature axis, never the batch axis.
        p1 = (self.p1_weight_g(g) + self.p1_weight_m(m)).squeeze(-1)
        m2 = self.output_LSTM((m, l))[0]
        p2 = (self.p2_weight_g(g) + self.p2_weight_m(m2)).squeeze(-1)
        return p1, p2

    def forward(self, batch):
        """
        :param batch: torchtext batch providing c_char, q_char and the
                      (ids, lengths) pairs c_word and q_word.
        :return: unnormalised start and end scores, each (batch, c_len)
        """
        # 1. Character Embedding Layer
        c_char = self._char_emb_layer(batch.c_char)
        q_char = self._char_emb_layer(batch.q_char)

        # 2. Word Embedding Layer
        c_word = self.word_emb(batch.c_word[0])
        q_word = self.word_emb(batch.q_word[0])
        c_lens = batch.c_word[1]
        q_lens = batch.q_word[1]

        # Highway network over the concatenated char + word features
        c = self._highway_network(c_char, c_word)
        q = self._highway_network(q_char, q_word)

        # 3. Contextual Embedding Layer (only the per-step outputs are used)
        c = self.context_LSTM((c, c_lens))[0]
        q = self.context_LSTM((q, q_lens))[0]

        # 4. Attention Flow Layer
        g = self._att_flow_layer(c, q)

        # 5. Modeling Layer: two stacked bidirectional LSTMs
        m = self.modeling_LSTM1((g, c_lens))[0]
        m = self.modeling_LSTM2((m, c_lens))[0]

        # 6. Output Layer
        p1, p2 = self._output_layer(g, m, c_lens)
        return p1, p2

In [0]:
# model = BiDAF(args, data.WORD.vocab.vectors).to(args.device)

In [0]:
class EMA():
    """Exponential moving averages of named tensors."""
    def __init__(self, mu):
        # mu is the decay: the weight given to the previous average on each update.
        self.shadow = dict()
        self.mu = mu

    def register(self, name, val):
        """Store a copy of ``val`` under ``name`` as the starting average."""
        snapshot = val.clone()
        self.shadow[name] = snapshot

    def get(self, name):
        """Return the tensor currently stored under ``name``."""
        return self.shadow[name]

    def update(self, name, x):
        """Blend ``x`` into the stored average: (1 - mu) * x + mu * previous."""
        assert name in self.shadow
        previous = self.shadow[name]
        blended = (1.0 - self.mu) * x + self.mu * previous
        self.shadow[name] = blended.clone()
        

In [0]:
def test(model, ema, args, data):
    """Evaluate ``model`` on the dev set using the EMA-averaged weights.

    The trainable parameters are temporarily replaced by the values held in
    ``ema``; the original values and the previous train/eval mode are restored
    afterwards, also when evaluation raises. Predicted answers are written to
    ``args.prediction_file`` as a json object mapping question id to answer
    text, then scored by ``evaluate.main(args)``.

    :param model: the network to evaluate
    :param ema: EMA object holding an averaged tensor for every trainable parameter
    :param args: configuration (device, prediction_file, plus whatever evaluate.main reads)
    :param data: object providing ``dev_iter`` and ``WORD.vocab``
    :return: (summed dev loss over all batches, exact-match score, F1 score)
    """
    criterion = nn.CrossEntropyLoss()
    loss = 0
    answers = dict()
    was_training = model.training
    model.eval()

    # EMA(0) is used purely as a name -> tensor store for the current weights.
    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))

    try:
        with torch.no_grad():
            for batch in tqdm(iter(data.dev_iter)):
                p1, p2 = model(batch)
                batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)

                loss += batch_loss.item()

                batch_size, c_len = p1.size()
                # -inf strictly below the diagonal rules out spans with start > end.
                mask = torch.full((c_len, c_len), float('-inf'), device=args.device).tril(-1)
                # score[b, s, e] = log P(start = s) + log P(end = e), (batch, c_len, c_len)
                score = F.log_softmax(p1, dim=1).unsqueeze(2) + F.log_softmax(p2, dim=1).unsqueeze(1) + mask
                # Best start for every end, then the best end overall.
                score, s_idx = score.max(dim=1)
                score, e_idx = score.max(dim=1)
                # squeeze(1) keeps the batch axis even for a batch of one.
                s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze(1)

                for i in range(batch_size):
                    qid = batch.id[i]
                    answer = batch.c_word[0][i][s_idx[i]:e_idx[i]+1]
                    answer = ' '.join([data.WORD.vocab.itos[idx] for idx in answer])
                    answers[qid] = answer
    finally:
        # Put the raw (non-averaged) weights and the previous mode back.
        for name, param in model.named_parameters():
            if param.requires_grad:
                param.data.copy_(backup_params.get(name))
        model.train(was_training)

    with open(args.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)

    # presumably compares args.prediction_file against args.dataset_file - confirm in evaluate.py
    results = evaluate.main(args)

    return loss, results['exact_match'], results['f1']


训练函数

In [0]:
def train(args, data):
    """Train a BiDAF model, save its weights and report dev-set metrics.

    The model is optimised with Adadelta on the sum of the start- and
    end-position cross-entropy losses. An exponential moving average of every
    trainable parameter is maintained and used for the final evaluation.
    The weights written to ``args.model_dir`` are the raw trained weights,
    not the averaged ones.

    :param args: configuration (device, epoch, learning_rate, exp_decay_rate,
                 logging_steps, model_dir, ...)
    :param data: object providing ``WORD.vocab.vectors`` and ``train_iter``
                 (and what ``test`` needs)
    """
    model = BiDAF(args, data.WORD.vocab.vectors).to(args.device)

    # Seed the moving averages with the initial value of every trainable parameter.
    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    # Only parameters that require gradients are optimised.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()
    writer = SummaryWriter()

    model.train()
    # `loss` accumulates the batch losses since the last report; it is a sum over
    # up to args.logging_steps batches, not a mean, and carries over epoch boundaries.
    loss = 0
    global_step = 0
    try:
        for epoch in range(args.epoch):
            logger.info("***** Epoch：%d *****", (epoch + 1))
            for i, batch in enumerate(data.train_iter):
                optimizer.zero_grad()
                # (batch, c_len), (batch, c_len)
                p1, p2 = model(batch)
                # batch.s_idx / batch.e_idx are the gold start / end token positions.
                batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)

                loss += batch_loss.item()
                batch_loss.backward()
                optimizer.step()

                # Fold the freshly updated weights into the moving averages.
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        ema.update(name, param.data)

                global_step += 1
                if (i + 1) % args.logging_steps == 0:
                    logger.info("Step: {} Training loss：{:.4f}".format(i+1, loss))
                    writer.add_scalar('loss/train', loss, global_step)
                    loss = 0
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

    logger.info("Saving model to %s", args.model_dir)
    torch.save(model.state_dict(), args.model_dir)

    # Final evaluation on the dev set.
    dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
    logger.info(f'dev loss: {dev_loss:.3f} / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')


In [0]:
# Run the full training loop: args.epoch epochs, then save the weights and evaluate on the dev set.
train(args, data)

下载文件

In [24]:
# List the working directory to check that the saved model (BiDAF.pkl) and the TensorBoard `runs` folder exist.
!ls

BiDAF.pkl  data  evaluate.py  __pycache__  runs  sample_data


In [25]:
# Pack the TensorBoard event files under runs/ into a gzip-compressed tarball for download.
!tar -zcvf runs.tar.gz runs

runs/
runs/Oct28_01-59-56_0469296fdbb4/
runs/Oct28_01-59-56_0469296fdbb4/events.out.tfevents.1572227998.0469296fdbb4.147.0


In [0]:
# results = evaluate.main(args)
# dev EM: 54.13 dev F1:  66.95