## 更大的context
- 使用额外的context（语境/上下文）训练我们的语言模型

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from collections import Counter

In [2]:
train_file = './data/bobsue.prevsent.train.tsv'
dev_file = './data/bobsue.prevsent.dev.tsv'
test_file = './data/bobsue.prevsent.test.tsv'
word_file = './data/bobsue.voc.txt'

BATCH_SIZE = 32       # 批次大小
EMBEDDING_DIM = 200   # 词向量维度
HIDDEN_DIM = 200      # 隐含层
GRAD_CLIP = 5.        # 梯度截断值
EPOCHS = 20
LEARNING_RATE = 0.005     # 初始学习率

BEST_VALID_LOSS = float('inf')     # 初始验证集上的损失值，设为最大
MODEL_PATH = "lm-large-cont-dim{}.pth"   # 模型名称
USE_CUDA = torch.cuda.is_available()     # 是否使用GPU
NUM_CUDA = torch.cuda.device_count()     # GPU数量

In [3]:
def read_word_set(path):
    with open(path, 'r', encoding='utf-8') as f:
        text = f.readlines()
    words = [w.strip() for w in text]
    return words

In [4]:
words_set = read_word_set(word_file)
word2idx = {w:i for i, w in enumerate(words_set, 1)}
idx2word = {i:w for i, w in enumerate(words_set, 1)}
# 设置 <pad> 值为 0
PAD_IDX = 0
idx2word[PAD_IDX] = '<pad>'
word2idx['<pad>'] = PAD_IDX

In [5]:
def read_corpus(path):
    """读取数据集，返回句子列表"""
    contexts = []
    target_sentences = []
    with open(path, 'r', encoding='utf-8') as f:
        for sentence in f.readlines():
            sentence = sentence.strip()
            context, target_sentence = sentence.split('\t')
            contexts.append(context)
            target_sentences.append(target_sentence)
    
    return (contexts, target_sentences)

In [6]:
train_context, train_target = read_corpus(train_file)

In [7]:
len(train_context), len(train_target)

(6036, 6036)

In [8]:
train_context[0], train_target[0]

('<s> Sue realized she was really bored . </s>',
 '<s> She ate quickly and asked to be taken home . </s>')

In [9]:
train_context, train_target = read_corpus(train_file)
dev_context, dev_target = read_corpus(dev_file)
test_context, test_target = read_corpus(test_file)

train_words = [w for s in train_context+train_target for w in s.split()]
dev_words = [w for s in dev_context+dev_target for w in s.split()]
test_words = [w for s in test_context+test_target for w in s.split()]

In [10]:
print("训练集集句子个数：{}".format(len(test_context)))
print("验证集句子个数：{}".format(len(dev_context)))
print("测试集句子个数：{}".format(len(test_context)))

print("训练集集单词个数：{}".format(len(train_words)))
print("验证集单词个数：{}".format(len(dev_words)))
print("测试集单词个数：{}".format(len(test_words)))

训练集集句子个数：750
验证集句子个数：750
测试集句子个数：750
训练集集单词个数：139045
验证集单词个数：16984
测试集单词个数：17233


In [11]:
print("训练集第二句最长句子长度为：{}".format(max([len(s.split()) for s in train_target])))
print("验证集第二句最长句子长度为：{}".format(max([len(s.split()) for s in dev_target])))
print("测试集第二句最长句子长度为：{}".format(max([len(s.split()) for s in test_target])))

训练集第二句最长句子长度为：21
验证集第二句最长句子长度为：20
测试集第二句最长句子长度为：21


In [12]:
def prepare_sequence(context, target, word2idx, seq_len=21):
    """输入语料句子列表，返回模型输入序列的idx"""
    contexts = []
    sentences = []
    labels = []
    
    for c, t in zip(context,target):
        c_words = c.split()
        c_tample = [0] * seq_len
        for i, w in enumerate(c_words):
            c_tample[i] = word2idx[w]
        contexts.append(c_tample)
        
            
        t_words = t.split()
        sentence_tample = [0] * seq_len
        for i, w in enumerate(t_words[:-1]):
            sentence_tample[i] = word2idx[w]
        sentences.append(sentence_tample)
        
        target_tample = [0] * seq_len
        for i, w in enumerate(t_words[1:]):
            target_tample[i] = word2idx[w]
        labels.append(target_tample)
        
    return contexts, sentences, labels

In [13]:
train_context, train_data, train_label = prepare_sequence(train_context, train_target, word2idx)
dev_context, dev_data, dev_label = prepare_sequence(dev_context, dev_target, word2idx)
test_context, test_data, test_label = prepare_sequence(test_context, test_target, word2idx)

In [14]:
idx = 0
for i in train_context[idx]:
    if i==0:
        print()
        break
    print(idx2word[i], end=' ')
    
for i in train_data[idx]:
    if i==0:
        print()
        break
    print(idx2word[i], end=' ')
    
for i in train_label[idx]:
    if i==0:
        print()
        break
    print(idx2word[i], end=' ')

<s> Sue realized she was really bored . </s> 
<s> She ate quickly and asked to be taken home . 
She ate quickly and asked to be taken home . </s> 


In [15]:
def get_batch(context, data, label, batch_size=32):
    """
    构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label
   
    """
    batch_data = []
    context_tensor = torch.tensor(context, dtype=torch.long)
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            print("data not eq batch size.")
            break
            cbatch = context_tensor[start: ]
            dbatch = data_tensor[start: ]
            lbatch = label_tensor[start: ]
            print(batch.size())
        else:
            cbatch = context_tensor[start: end]
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
        batch_data.append((cbatch, dbatch, lbatch))
    return batch_data

In [16]:
train_batch = get_batch(train_context, train_data, train_label, batch_size=BATCH_SIZE)
dev_batch = get_batch(dev_context, dev_data, dev_label, batch_size=BATCH_SIZE)
test_batch = get_batch(test_context, test_data, test_label, batch_size=BATCH_SIZE)

data not eq batch size.
data not eq batch size.
data not eq batch size.


In [17]:
class LSTMLM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(LSTMLM, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2word = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, context, data):
        
        # [batch_size, seq_len] ==> [batch_size, seq_len, embedding_dim]
        context_embed = self.word_embeddings(context)
        embeds = self.word_embeddings(data)
        # [batch, seq_len, imput_size] ==> [batch, seq_len, hidden_size]
        _, hidden = self.lstm(context_embed)
        lstm_out, (h_n, c_n) = self.lstm(embeds, hidden)
        # [batch, seq_len, hidden_size] ==> [batch*seq_len, vocab_size]
        target_space = self.hidden2word(lstm_out.contiguous().view(-1, self.hidden_dim))
        # 添加mask
        mask = (data != PAD_IDX).view(-1)
        # 获取 非pad 数据
        mask_target = target_space[mask]
        
        target_scores = F.log_softmax(mask_target, dim=1)
        return target_scores

In [18]:
def acc_score(y_hat, y):
    # 返回最大的概率的索引
    pred = y_hat.argmax(dim=1)
    # print(y.view(-1))
    acc_count = torch.eq(pred, y.view(-1))
    score = acc_count.sum().item() / acc_count.size()[0]
    return score

def train(model, device, iterator, optimizer, criterion, grad_clip):
    epoch_loss = 0  # 积累变量
    epoch_acc = 0   # 积累变量
    model.train()   # 该函数表示PHASE=Train
    
    for c, x, y in iterator:  # 拿每一个minibatch
        c = c.to(device)
        x = x.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        mask = y != PAD_IDX
        pure_y = y[mask]
        
        fx = model(c, x)                 # 进行forward
        loss = criterion(fx, pure_y)  # 计算loss
        acc = acc_score(fx, pure_y)   # 计算准确率
        loss.backward()               # 进行BP
        
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()  # 更新参数
        
        epoch_loss += loss
        epoch_acc += acc
        
    return epoch_loss/len(iterator),epoch_acc/len(iterator)

def evaluate(model, device, iterator, criterion):
    model.eval()  # 不更新参数，预测模式
    epoch_loss=0  # 积累变量
    epoch_acc=0   # 积累变量
    
    with torch.no_grad():
        for c, x, y in iterator:
            c = c.to(device)
            x = x.to(device)
            y = y.to(device)
            mask = y != PAD_IDX
            pure_y = y[mask]
            
            fx = model(c, x)
            loss = criterion(fx, pure_y)
            acc = acc_score(fx, pure_y)
            epoch_loss += loss
            epoch_acc += acc
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [19]:
model = LSTMLM(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx))
# 使用GPU
DEVICE = torch.device("cuda" if USE_CUDA else 'cpu')
model = model.to(DEVICE)
if NUM_CUDA > 1:
    device_ids = list(range(NUM_CUDA))
    print(device_ids)
    model = nn.DataParallel(model, device_ids=device_ids)

[0, 1, 2, 3]


In [20]:
criterion = nn.NLLLoss()            # 指定损失函数
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)  # 指定优化器
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)   # 学习率缩减？

model_name = MODEL_PATH.format(EMBEDDING_DIM)
LOG_INFO = 'Epoch:{0}|Train Loss:{1}|Train Acc:{2}|Val Loss:{3}|Val Acc:{4}'

SCHED_NUM = 0
for epoch in range(1, EPOCHS+1):
    train_loss, train_acc = train(model, DEVICE, train_batch, optimizer, criterion, GRAD_CLIP)
    valid_loss, valid_acc = evaluate(model, DEVICE, dev_batch, criterion)
    if valid_loss < BEST_VALID_LOSS: # 如果是最好的模型就保存到文件夹
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print("Save model: ", model_name)
        SCHED_NUM = 0
    else:
        SCHED_NUM += 1
        scheduler.step()
        print("Current lr:", optimizer.param_groups[0]['lr'])
        if SCHED_NUM == 7:
            print(LOG_INFO.format(epoch,train_loss,train_acc,valid_loss,valid_acc))
            print("Early stop!")
            break
    print(LOG_INFO.format(epoch,train_loss,train_acc,valid_loss,valid_acc))

  self.dropout, self.training, self.bidirectional, self.batch_first)
  "type " + obj.__name__ + ". It won't be checked "


Save model:  lm-large-cont-dim200.pth
Epoch:1|Train Loss:4.101949691772461|Train Acc:0.27306997536248784|Val Loss:3.583974838256836|Val Acc:0.3116495879365413
Save model:  lm-large-cont-dim200.pth
Epoch:2|Train Loss:3.3380532264709473|Train Acc:0.3339066362551538|Val Loss:3.410975933074951|Val Acc:0.34158666838950674
Save model:  lm-large-cont-dim200.pth
Epoch:3|Train Loss:2.9862654209136963|Train Acc:0.3689788156933724|Val Loss:3.4011497497558594|Val Acc:0.34586250238087063
Current lr: 0.005
Epoch:4|Train Loss:2.7091126441955566|Train Acc:0.3964461528618482|Val Loss:3.4517548084259033|Val Acc:0.34269802318392734
Current lr: 0.0025
Epoch:5|Train Loss:2.4686622619628906|Train Acc:0.4286000853354054|Val Loss:3.528275966644287|Val Acc:0.3448372164230473
Current lr: 0.00125
Epoch:6|Train Loss:2.1563994884490967|Train Acc:0.48828408041104593|Val Loss:3.547022581100464|Val Acc:0.3475418969762569
Current lr: 0.000625
Epoch:7|Train Loss:1.9435194730758667|Train Acc:0.5370194305897364|Val Loss:

In [23]:
model = torch.load(model_name)
test_loss, test_acc = evaluate(model, DEVICE, test_batch, criterion)
print('Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

Test Loss: 3.422940254211426 | Test Acc: 0.33774335173558123 |


In [24]:
def print_pred_error_words(model,device,data_batch):
    model.eval()
    error_words = []
    with torch.no_grad():
        for c, x, y in data_batch:
            c = c.to(device)
            x = x.to(device)
            y = y.to(device)
            
            mask = (y!=PAD_IDX)
            fx = model(c, x)
            
            pred_idx = fx.argmax(dim=1)
            ground_truth_idx = y[mask]
            for p, g in zip(pred_idx.tolist(), ground_truth_idx.tolist()):
                if p != g:
                    error_words.append(" | ".join([idx2word[g], idx2word[p]]))
    return error_words

In [25]:
model = torch.load(model_name)
error_words = print_pred_error_words(model, DEVICE, test_batch)

In [26]:
words_counter = Counter(error_words)
TopN = 35
topn_words = words_counter.most_common(TopN)
print("真实值 | 预测值 | 预测错误次数")
for w in topn_words:
    print(w)

真实值 | 预测值 | 预测错误次数
('Bob | He', 109)
('She | Sue', 101)
('and | .', 45)
('had | was', 45)
('to | .', 42)
('decided | was', 39)
('her | the', 38)
('. | to', 34)
('his | the', 31)
('for | .', 29)
(', | .', 27)
('a | the', 26)
('in | .', 26)
('His | He', 25)
('Bob | Sue', 22)
('Her | Sue', 21)
('got | was', 20)
('went | was', 18)
('and | to', 18)
('! | .', 17)
('he | to', 17)
('a | to', 16)
('for | to', 16)
('wanted | was', 16)
('it | the', 15)
('her | a', 15)
('. | and', 15)
("'s | was", 14)
('the | a', 14)
('They | Sue', 14)
('at | .', 14)
('The | He', 14)
('. | the', 13)
('to | the', 13)
('the | .', 13)


In [33]:
6 %3

0