## 语言模型

In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
word_file = './data/bobsue.voc.txt'
train_file = './data/bobsue.lm.train.txt'
test_file = './data/bobsue.lm.test.txt'
dev_file = './data/bobsue.lm.dev.txt'

# 从文件构建词汇集合并构建word2idx 与idx2word

In [3]:
def load_word_set(filename):
    with open(filename, "r", encoding="utf-8") as f:
        word_set = set([line.strip() for line in f])
    return word_set

In [4]:
word_set = load_word_set(word_file)
word2idx = {w:i for i, w in enumerate(word_set, 1)}
idx2word = {i:w for i, w in enumerate(word_set, 1)}
PAD_IDX = 0
word2idx["<pad>"] = PAD_IDX
idx2word[PAD_IDX] = "<pad>"

In [5]:
# 训练验证测试数据准备

In [6]:
def load_corpus(filename):
    """读取数据集，返回句子列表"""
    with open(filename, "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f]
    return sentences

def sentences2words(sentences):
    return [w for s in sentences for w in s.split()]

In [7]:
train_sentences = load_corpus(train_file)
dev_sentences = load_corpus(dev_file)
test_sentences = load_corpus(test_file)

train_words = sentences2words(train_sentences)
dev_words = sentences2words(dev_sentences)
test_words = sentences2words(test_sentences)

In [8]:
s = "{}句子数: {}，单词数: {}."
print(s.format("训练集", len(train_sentences), len(train_words)))
print(s.format("验证集", len(dev_sentences), len(dev_words)))
print(s.format("测试集", len(test_sentences), len(test_words)))

训练集句子数: 6036，单词数: 71367.
验证集句子数: 750，单词数: 8707.
测试集句子数: 750，单词数: 8809.


In [9]:
def max_sentence_num(sentences):
    """返回最长句子单词数量"""
    return max([len(s.split()) for s in sentences ])

In [10]:
print("训练集最长句子单词个数：", max_sentence_num(train_sentences))
print("验证集最长句子单词个数：", max_sentence_num(dev_sentences))
print("测试集最长句子单词个数：", max_sentence_num(test_sentences))

训练集最长句子单词个数： 21
验证集最长句子单词个数： 20
测试集最长句子单词个数： 21


In [11]:
def model_sequence(corpus, word2idx, seq_len=21):
    """语料句子转换成模型输入的序列idx"""
    sentences = []
    labels = []
    for sentence in corpus:
        words = sentence.split()
        sentence_vec = [0]*seq_len
        for i, w in enumerate(words[:-1]):
            sentence_vec[i] = word2idx[w]
        sentences.append(sentence_vec)
        label_vec = [0] * seq_len
        for i, w in enumerate(words[1:]):
            label_vec[i] = word2idx[w]
        labels.append(label_vec)
    return sentences, labels

In [12]:
train_data, train_label = model_sequence(train_sentences, word2idx)
dev_data, dev_label = model_sequence(dev_sentences, word2idx)
test_data, test_label = model_sequence(test_sentences, word2idx)

In [13]:
print(train_data[1], train_label[1])

[237, 1094, 1044, 1471, 362, 15, 400, 900, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [1094, 1044, 1471, 362, 15, 400, 900, 1140, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [14]:
idx = 1
for i in train_data[idx]:
    print(idx2word[i],  end=' ')
print('\n')
for i in train_label[idx]:
    print(idx2word[i], end=' ')

<s> The girl broke up with Bob . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

The girl broke up with Bob . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [15]:
# batch_data
def gen_batch_data(data, label, batch_size=32):
    """构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label"""
    batch_data = []
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
        else:
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
        batch_data.append((dbatch, lbatch))
    return batch_data

In [16]:
BATCH_SIZE = 32
train_batch = gen_batch_data(train_data, train_label, batch_size=BATCH_SIZE)
dev_batch = gen_batch_data(dev_data, dev_label, batch_size=BATCH_SIZE)
test_batch = gen_batch_data(test_data, test_label, batch_size=BATCH_SIZE)

In [17]:
print(len(train_batch), len(dev_batch), len(test_batch))

188 23 23


In [18]:
class mylstm(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(mylstm, self).__init__()
        self.n_word = vocab_size
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(self.n_word, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2word = nn.Linear(hidden_dim, self.n_word)
        
    def forward(self, x):
        embeds = self.word_embeddings(x)
        lstm_out, (h_n, c_n) = self.lstm(embeds)
        target_space = self.hidden2word(lstm_out.contiguous().view(-1, self.hidden_dim))
        mask = (x != PAD_IDX).view(-1)
        pure_target = target_space[mask]
        
        target_scores = F.log_softmax(pure_target, dim=1)
        return target_scores
        

In [19]:
VOCAB_SIZE = len(word2idx)
EMBEDDING_DIM = 200
HIDDEN_DIM = 200

model = mylstm(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM)

In [20]:
# 使用GPU
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model, device_ids=[0,1,2,3])
model = model.to(device)

criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)

In [21]:
def acc_score(y_hat, y):
    pred = y_hat.argmax(dim=1)
#     print(y.view(-1))
    acc_count = torch.eq(pred, y.view(-1))
    score = acc_count.sum().item() / acc_count.size()[0]
    return score

In [22]:
def train(model, device, iterator, optimizer, criterion, grad_clip):
    epoch_loss=0  # 积累变量
    epoch_acc=0   # 积累变量
    model.train() # 该函数表示PHASE=Train
    
    for x, y in iterator:  # 拿每一个minibatch
        x = x.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        mask = y != PAD_IDX
        pure_y = y[mask]
        
        # 进行forward
        fx = model(x)  
        # 计算loss
        loss = criterion(fx, pure_y)  
        # 计算准确率
        acc = acc_score(fx, pure_y) 
        # 进行BP
        loss.backward()
        
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        # 更新参数
        optimizer.step() 
        
        epoch_loss += loss
        epoch_acc += acc
    print(len(iterator))
        
    return epoch_loss/len(iterator),epoch_acc/len(iterator)

In [23]:
def evaluate(model, device, iterator, criterion):
    model.eval()  # 不更新参数，预测模式
    epoch_loss=0  # 积累变量
    epoch_acc=0   # 积累变量
    
    with torch.no_grad():
        for x, y in iterator:
            x = x.to(device)
            y = y.to(device)
            mask = y != PAD_IDX
            pure_y = y[mask]
            
            fx = model(x)
            loss = criterion(fx, pure_y)
            acc = acc_score(fx, pure_y)
            epoch_loss += loss
            epoch_acc += acc
    return epoch_loss/len(iterator), epoch_acc/len(iterator)

In [32]:
GRAD_CLIP = 5.
EPOCHS = 1
BEST_VALID_LOSS = float('inf')
MODEL_PATH = "lm-best.th"

for epoch in range(1, EPOCHS+1):
    train_loss, train_acc = train(model, device, train_batch, optimizer, criterion, GRAD_CLIP)
    valid_loss, valid_acc = evaluate(model, device, dev_batch, criterion)
    if valid_loss < BEST_VALID_LOSS: # 如果是最好的模型就保存到文件夹
        BEST_VALID_LOSS = valid_loss
        torch.save(model, MODEL_PATH)
        scheduler.step()
    print('Epoch:{0}|Train Loss:{1}|Train Acc:{2}|Val Loss:{3}|Val Acc:{4}'.format(epoch,train_loss,train_acc,valid_loss,valid_acc))

  self.dropout, self.training, self.bidirectional, self.batch_first)


188
Epoch:1|Train Loss:1.9092416763305664|Train Acc:0.5186130259426801|Val Loss:4.2120184898376465|Val Acc:0.3113688752733935


  "type " + obj.__name__ + ". It won't be checked "


In [34]:
model = torch.load(MODEL_PATH)
test_loss, test_acc = evaluate(model, device, test_batch, criterion)
print('| Test Loss: {0} | Test Acc: {1} |'.format(test_loss, test_acc))

| Test Loss: 4.213463306427002 | Test Acc: 0.305538647283556 |


## 打印错误单词

In [54]:
def print_pred_error_words(model, data_batch):
    model.eval()
    error_words = []
    with torch.no_grad():
        for x, y in data_batch:
            x = x.to(device)
            y = y.to(device)
            
            mask = (y!=PAD_IDX)
            fx = model(x)
            
            pred_idx = fx.argmax(dim=1)
            ground_truth_idx = y[mask]
            for p, g in zip(pred_idx.tolist(), ground_truth_idx.tolist()):
                if p != g:
                    error_words.append("#".join([idx2word[g], idx2word[p]]))
    return error_words

In [55]:
model = torch.load(MODEL_PATH)
error_words = print_pred_error_words(model, test_batch)

  self.dropout, self.training, self.bidirectional, self.batch_first)


In [56]:
from collections import Counter

In [57]:
words_counter = Counter(error_words)
TopN = 35
topn_words = words_counter.most_common(TopN)
for w in topn_words:
    print(w)

('Bob#He', 137)
('She#He', 109)
('Sue#He', 89)
('had#was', 38)
('to#.', 33)
('decided#was', 33)
('and#.', 32)
('her#the', 32)
('.#to', 27)
('His#He', 26)
('One#He', 25)
('his#the', 23)
('her#a', 22)
('But#He', 21)
('Her#He', 21)
('The#He', 21)
('for#.', 20)
('When#He', 19)
('the#his', 19)
('They#He', 19)
('she#he', 18)
('the#her', 18)
('went#was', 18)
('got#was', 17)
('a#the', 16)
(',#.', 15)
('the#.', 15)
('wanted#was', 15)
('!#.', 14)
('.#the', 14)
('he#to', 14)
('It#He', 13)
('the#a', 13)
('in#.', 13)
("'s#was", 12)
