## Bi-LSTM Conditional Random Field Discussion
- https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
- https://pytorch.apachecn.org/docs/0.3/nlp_advanced_tutorial.html
- 《Log-Linear Models, MEMMs, and CRFs》

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x7f05500945d0>

In [3]:
def argmax(vec):
    """Return the column index of the largest entry of ``vec`` as a Python int.

    The max is taken along dim 1 and the resulting index tensor is converted
    with ``.item()``, so ``vec`` must have exactly one row.
    """
    best_columns = vec.max(1)[1]
    return best_columns.item()


def prepare_sequence(seq, to_ix):
    """Look up every token of ``seq`` in ``to_ix`` and return the ids as a
    1-D ``torch.long`` tensor, in the same order as the tokens.

    A token missing from ``to_ix`` propagates the mapping's lookup error.
    """
    return torch.tensor([to_ix[token] for token in seq], dtype=torch.long)


# Compute log-sum-exp in a numerically stable way for the forward algorithm.
def log_sum_exp(vec):
    """Return ``log(sum(exp(vec)))`` for a ``(1, N)`` row of scores.

    The reduction is delegated to ``torch.logsumexp``, which performs the
    max-shift internally.  Compared with a hand-written shift this avoids a
    Python-level ``.item()`` round-trip per call and yields ``-inf`` (rather
    than NaN) when every entry is ``-inf``.

    Args:
        vec: tensor of shape ``(1, N)``.

    Returns:
        A 0-dim tensor holding the log-sum-exp of the row; it stays attached
        to the autograd graph of ``vec``.
    """
    # logsumexp over dim 1 gives shape (1,); drop that axis so callers keep
    # receiving a 0-dim tensor.
    return torch.logsumexp(vec, dim=1).squeeze(0)


In [23]:
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM encoder with a CRF scoring layer for tagging.

    The LSTM output is projected to one emission score per tag for every
    token, and a learned transition matrix scores tag-to-tag moves.
    ``neg_log_likelihood`` returns the training loss for one sentence and
    ``forward`` returns the Viterbi-decoded best tag path.

    NOTE: the methods read the module-level names ``START_TAG`` and
    ``STOP_TAG``; both must exist (and be keys of ``tag_to_ix``) before an
    instance is created.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the embedding, BiLSTM, projection and transition parameters.

        Args:
            vocab_size: number of rows in the embedding table.
            tag_to_ix: mapping from tag to index; its length is the tag-set
                size and it must contain ``START_TAG`` and ``STOP_TAG``.
            embedding_dim: size of each token embedding.
            hidden_dim: size of the concatenated bidirectional LSTM output
                (each direction gets ``hidden_dim // 2``; presumably expected
                to be even, since the output is later viewed as
                ``hidden_dim`` wide).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # Debug output: the freshly initialised transition matrix.
        print("self.transitions:", self.transitions)

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag.
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000
        # Debug output: the matrix after the constraints were written in.
        print("self.transitions ------>:", self.transitions)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random ``(h0, c0)`` pair for the LSTM.

        Each tensor has shape ``(2, 1, hidden_dim // 2)``.  The values come
        from ``torch.randn``, so every call yields a different initial state.
        """
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """Run the forward algorithm and return the log partition score.

        Args:
            feats: emission scores, one row per token and one column per tag
                (``(seq_len, num_tags)``).

        Returns:
            The log-sum-exp over all tag paths, as returned by
            ``log_sum_exp``.
        """
        # Step 2 of the pipeline: feats is (seq_len, num_tags).
        # Do the forward algorithm to compute the partition function.
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # Debug output: the initial forward variables.
        print("init_alphas:", init_alphas)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Original note: wrap in a variable to get automatic backprop.
        # Here this is simply an alias of init_alphas.
        forward_var = init_alphas

        # Iterate through the sentence.
        # Each feat holds the emission scores of one token.
        for feat in feats:
            alphas_t = []  # The forward variables at this timestep
            for next_tag in range(self.tagset_size):
                # Broadcast the emission score: it is the same regardless of
                # the previous tag.
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                # The i-th entry of trans_score is the score of transitioning
                # to next_tag from i.
                trans_score = self.transitions[next_tag].view(1, -1)
                # The i-th entry of next_tag_var is the value for the edge
                # (i -> next_tag) before we do log-sum-exp.
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is the log-sum-exp of all
                # the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Return the per-token emission scores for ``sentence``.

        Args:
            sentence: 1-D tensor of token indices.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)``.
        """
        # Step 1 of the pipeline: reset the hidden state, then encode.
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        # lstm_out is (seq_len, 1, hidden_dim): the LSTM is built without
        # batch_first and embeds carries a batch axis of size 1.
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # Drop the middle (size-1) axis.
        print("lstm_out:", lstm_out.size())
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        print("lstm_feats:", lstm_feats.size())
        # (seq_len, num_tags)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score of the given tag sequence.

        The score is the sum of every transition score (from ``START_TAG``
        through the tags to ``STOP_TAG``) plus the emission score of each
        token's tag.

        Args:
            feats: emission scores, ``(seq_len, num_tags)``.
            tags: 1-D long tensor of gold tag indices, one per token.

        Returns:
            Tensor of shape ``(1,)`` holding the path score.
        """
        # Gives the score of a provided tag sequence.
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path for ``feats``.

        Args:
            feats: emission scores, ``(seq_len, num_tags)``.

        Returns:
            ``(path_score, best_path)`` where ``best_path`` is a list of tag
            indices, one per token, with the start tag removed.
        """
        backpointers = []

        # Initialize the viterbi variables in log space.
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1.
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []        # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning from tag i
                # to next_tag.  We don't include the emission scores here
                # because the max does not depend on them (we add them in
                # below).
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the
            # set of viterbi variables we just computed.
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG.
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we don't want to return that to the caller).
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the CRF loss for one sentence.

        The loss is the forward-algorithm score over all paths minus the
        score of the gold path ``tags``.
        """
        feats = self._get_lstm_features(sentence)
        # Debug output: the emission scores fed to the CRF.
        print("feats:", feats)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # don't confuse this with _forward_alg above
        """Return ``(score, tag_seq)`` for the best tag path of ``sentence``."""
        # Get the emission scores from the BiLSTM.
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [24]:
# Sentinel tags that bracket every tag sequence.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
# Deliberately tiny model sizes for this demo.
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Toy training set: each item is (characters, tags), both whitespace-split.
training_data = [
    (
        "长 城 位 于 北 京 市 延 庆 县".split(),
        "B I O O B I I B I I".split(),
    ),
    (
        "中 国 最 好 的 大 学 是 清 华 大 学".split(),
        "B I O O O B I O B I I I".split(),
    ),
]

# Give every distinct character the next free index, in first-seen order.
word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        word_to_ix.setdefault(word, len(word_to_ix))

# Tag vocabulary; the two sentinel tags take the last two indices.
tag_to_ix = {
    tag: index
    for index, tag in enumerate(("B", "I", "O", START_TAG, STOP_TAG))
}

In [25]:
print(word_to_ix)

{'长': 0, '城': 1, '位': 2, '于': 3, '北': 4, '京': 5, '市': 6, '延': 7, '庆': 8, '县': 9, '中': 10, '国': 11, '最': 12, '好': 13, '的': 14, '大': 15, '学': 16, '是': 17, '清': 18, '华': 19}


In [26]:
# Build the tagger over the toy vocabulary and pair it with plain SGD
# (learning rate 0.01, weight decay 1e-4).
model = BiLSTM_CRF(
    vocab_size=len(word_to_ix),
    tag_to_ix=tag_to_ix,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

self.transitions: Parameter containing:
tensor([[ 0.2528, -1.2716, -0.7777, -0.3324, -0.1403],
        [-1.0975, -0.7812,  0.2453, -0.6474, -0.3377],
        [-0.6468, -0.5171,  0.3065, -0.9096,  1.3925],
        [ 0.6282,  0.0913,  1.3686, -1.6435,  0.5123],
        [-0.1620,  0.5743,  0.4346,  0.9302, -0.7140]], requires_grad=True)
self.transitions ------>: Parameter containing:
tensor([[ 2.5280e-01, -1.2716e+00, -7.7772e-01, -3.3236e-01, -1.0000e+04],
        [-1.0975e+00, -7.8125e-01,  2.4528e-01, -6.4736e-01, -1.0000e+04],
        [-6.4685e-01, -5.1715e-01,  3.0654e-01, -9.0956e-01, -1.0000e+04],
        [-1.0000e+04, -1.0000e+04, -1.0000e+04, -1.0000e+04, -1.0000e+04],
        [-1.6203e-01,  5.7434e-01,  4.3463e-01,  9.3018e-01, -1.0000e+04]],
       requires_grad=True)


In [27]:
# Check the predictions before training.
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    # NOTE(review): precheck_tags is built here but not read anywhere in the
    # visible code.
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print("训练前：", training_data[0][0], model(precheck_sent))


# Normally you would not train for this many epochs; this is toy demo data.
# The epoch loop is disabled, so the data is traversed at most once:
# for epoch in range(300):  
for sentence, tags in training_data:
    # Step 1: remember that PyTorch accumulates gradients.
    # We need to clear them out before each instance.
    # Example of a (sentence, tags) pair from the original English tutorial:
    # ['the', 'wall', 'street', 'journal', 'reported', 'today', 'that', 'apple', 'corporation', 'made', 'money'] 
    # ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'B', 'I', 'O', 'O']
    model.zero_grad()

    # Step 2: get our inputs ready for the network, i.e. turn them into
    # tensors of word indices.
    sentence_in = prepare_sequence(sentence, word_to_ix)
    targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
    print(sentence_in)
    # Step 3: run the forward pass.
    # Negative log-likelihood.
    loss = model.neg_log_likelihood(sentence_in, targets)
    print("loss:", loss)
    # NOTE(review): this break leaves the loop after the first sentence, so
    # the backward pass and optimizer step below are never executed and the
    # model parameters are not updated.  It looks like a debugging aid for
    # inspecting a single loss value; remove it to actually train.
    break
    # Step 4: compute the gradients and update the parameters by calling
    # optimizer.step().
    loss.backward()
    optimizer.step()

# Check the predictions after the loop above.
# NOTE(review): because of the break, no parameter update has happened when
# this runs, even though the printed label says "after training".
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print("训练后：", training_data[0][0], model(precheck_sent))

lstm_out: torch.Size([10, 1, 4])
lstm_feats: torch.Size([10, 5])
训练前： ['长', '城', '位', '于', '北', '京', '市', '延', '庆', '县'] (tensor(6.2721), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
lstm_out: torch.Size([10, 1, 4])
lstm_feats: torch.Size([10, 5])
feats: tensor([[ 0.4351,  0.1619, -0.4321, -0.3444, -0.2586],
        [ 0.3793,  0.1384, -0.4004, -0.3314, -0.2840],
        [ 0.4419,  0.1694, -0.2491, -0.4452, -0.2906],
        [ 0.4314,  0.1426, -0.4533, -0.3891, -0.2469],
        [ 0.5173,  0.1835, -0.3040, -0.5167, -0.2573],
        [ 0.5301,  0.2051, -0.3536, -0.4381, -0.2521],
        [ 0.4356,  0.1780, -0.2535, -0.4016, -0.3052],
        [ 0.4490,  0.2170, -0.0948, -0.3929, -0.3399],
        [ 0.3515,  0.1266, -0.3488, -0.3474, -0.3028],
        [ 0.2633,  0.1206, -0.3657, -0.1800, -0.3492]],
       grad_fn=<AddmmBackward>)
init_alphas: tensor([[-10000., -10000., -10000., -10000., -10000.]])
loss: tensor([14.5125], grad_fn=<SubBackward0>)
lstm_out: torch.Siz

In [54]:
x = torch.randn(2, 3, 4)
x

tensor([[[-0.0113, -0.7870,  0.2859,  0.1316],
         [ 0.3836,  0.9128, -0.0574, -0.8265],
         [ 1.2969, -1.2505, -1.3298,  1.0798]],

        [[ 1.0392,  0.9488, -1.0884,  0.2345],
         [ 0.6809,  0.6784, -0.4275, -1.6360],
         [ 1.2037, -1.0499, -0.3183, -0.5891]]])

In [26]:
_, y = torch.max(x, 1)

In [27]:
y

tensor([[0, 1, 1, 0],
        [2, 0, 0, 1]])

In [30]:
x[0, y].view(1, -1).expand(1, x.size()[1])

RuntimeError: The expanded size of the tensor (3) must match the existing size (32) at non-singleton dimension 1.  Target sizes: [1, 3].  Tensor sizes: [1, 32]

In [31]:
# Stand-alone replay of the transition-matrix setup for a 5-tag set.
transitions = nn.Parameter(torch.randn(5, 5))

print("self.transitions:", transitions)

# Pin row 3 and column 4 to a very low score: with index 3 as the start tag
# and index 4 as the stop tag, this means we never transfer to the start tag
# and never transfer from the stop tag.
transitions.data[3] = -10000
transitions.data[..., 4] = -10000
print("self.transitions ------>:", transitions)

self.transitions: Parameter containing:
tensor([[-2.0531,  0.1550, -0.7984, -0.7743,  1.3765],
        [ 0.3320,  0.0276, -1.6227,  0.3256, -0.3506],
        [-0.1191, -0.0183,  0.9328, -1.5924,  1.8704],
        [ 0.0952, -0.8484,  0.4585, -0.6996,  0.0129],
        [ 0.1481,  0.7436, -0.9027, -1.5715, -0.9465]], requires_grad=True)
self.transitions ------>: Parameter containing:
tensor([[-2.0531e+00,  1.5505e-01, -7.9837e-01, -7.7434e-01, -1.0000e+04],
        [ 3.3201e-01,  2.7574e-02, -1.6227e+00,  3.2563e-01, -1.0000e+04],
        [-1.1905e-01, -1.8328e-02,  9.3275e-01, -1.5924e+00, -1.0000e+04],
        [-1.0000e+04, -1.0000e+04, -1.0000e+04, -1.0000e+04, -1.0000e+04],
        [ 1.4810e-01,  7.4357e-01, -9.0270e-01, -1.5715e+00, -1.0000e+04]],
       requires_grad=True)


In [32]:
torch.full((1, 5), -10000.)

tensor([[-10000., -10000., -10000., -10000., -10000.]])

In [None]:
lstm_out.view(len(sentence), self.hidden_dim)

In [22]:
x1 = torch.randn(10, 1, 4)
x2 = torch.randn(2, 1, 2)

In [36]:
x1.size(), x2.size()

(torch.Size([10, 1, 4]), torch.Size([2, 1, 2]))

In [38]:
x1.view(10, 4).size()

torch.Size([10, 4])

In [60]:
x_one = torch.tensor(0.2)

In [61]:
x_one

tensor(0.2000)

In [64]:
x_one.view(1, -1).expand(1, 5)

tensor([[0.2000, 0.2000, 0.2000, 0.2000, 0.2000]])