In [None]:
## 语言模型

In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
word_file = './data/bobsue.voc.txt'
train_file = './data/bobsue.lm.train.txt'
test_file = './data/bobsue.lm.test.txt'
dev_file = './data/bobsue.lm.dev.txt'

In [3]:
# 从文件构建词汇集合并构建word2idx 与idx2word

In [6]:
def load_word_set(filename):
    with open(filename, "r", encoding="utf-8") as f:
        word_set = set([line.strip() for line in f])
    return word_set

In [7]:
word_set = load_word_set(word_file)
word2idx = {w:i for i, w in enumerate(word_set, 1)}
idx2word = {i:w for i, w in enumerate(word_set, 1)}
PAD_IDX = 0
word2idx["<pad>"] = PAD_IDX
idx2word[PAD_IDX] = "<pad>"

In [9]:
idx2word

{1: 'father',
 2: 'most',
 3: 'ticket',
 4: 'ring',
 5: 'name',
 6: 'board',
 7: 'bus',
 8: 'moment',
 9: 'buy',
 10: 'huge',
 11: 'eat',
 12: 'agreed',
 13: 'men',
 14: 'will',
 15: 'drinks',
 16: 'beer',
 17: '"',
 18: 'warned',
 19: "'d",
 20: 'returned',
 21: 'bird',
 22: 'invited',
 23: 'During',
 24: 'videos',
 25: 'No',
 26: 'wondered',
 27: 'fear',
 28: 'online',
 29: 'decided',
 30: 'accepted',
 31: 'at',
 32: 'mess',
 33: 'laid',
 34: 'instructor',
 35: 'watching',
 36: 'across',
 37: 'picked',
 38: 'would',
 39: 'faster',
 40: 'To',
 41: 'notice',
 42: 'chose',
 43: 'stop',
 44: 'killed',
 45: 'bottle',
 46: '</s>',
 47: 'keep',
 48: 'read',
 49: '10',
 50: 'backpack',
 51: 'laundry',
 52: 'building',
 53: 'paper',
 54: 'hurt',
 55: 'are',
 56: 'mirror',
 57: 'hated',
 58: 'well',
 59: 'awoke',
 60: 'four',
 61: 'mail',
 62: 'less',
 63: 'tasted',
 64: 'center',
 65: 'twenty',
 66: 'wore',
 67: 'kids',
 68: 'green',
 69: 'hike',
 70: 'face',
 71: 'talking',
 72: 'power',
 73

In [None]:
# 训练验证测试数据准备

In [12]:
def load_corpus(filename):
    """读取数据集，返回句子列表"""
    with open(filename, "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f]
    return sentences

def sentences2words(sentences):
    return [w for s in sentences for w in s.split()]

In [18]:
train_sentences = load_corpus(train_file)
dev_sentences = load_corpus(dev_file)
test_sentences = load_corpus(test_file)

train_words = sentences2words(train_sentences)
dev_words = sentences2words(dev_sentences)
test_words = sentences2words(test_sentence)

In [20]:
s = "{}句子数: {}，单词数: {}."
print(s.format("训练集", len(train_sentences), len(train_words)))
print(s.format("验证集", len(dev_sentences), len(dev_words)))
print(s.format("测试集", len(test_sentences), len(test_words)))

训练集句子数: 6036，单词数: 71367.
验证集句子数: 750，单词数: 8707.
测试集句子数: 750，单词数: 8809.


In [21]:
def max_sentence_num(sentences):
    """返回最长句子单词数量"""
    return max([len(s.split()) for s in sentences ])

In [22]:
print("训练集最长句子单词个数：", max_sentence_num(train_sentences))
print("验证集最长句子单词个数：", max_sentence_num(dev_sentences))
print("测试集最长句子单词个数：", max_sentence_num(test_sentences))

训练集最长句子单词个数： 21
验证集最长句子单词个数： 20
测试集最长句子单词个数： 21


In [24]:
def model_sequence(corpus, word2idx, seq_len=21):
    """语料句子转换成模型输入的序列idx"""
    sentences = []
    labels = []
    for sentence in corpus:
        words = sentence.split()
        sentence_vec = [0]*seq_len
        for i, w in enumerate(words[:-1]):
            sentence_vec[i] = word2idx[w]
        sentences.append(sentence_vec)
        label_vec = [0] * seq_len
        for i, w in enumerate(words[1:]):
            label_vec[i] = word2idx[w]
        labels.append(label_vec)
    return sentences, labels

In [25]:
train_data, train_label = model_sequence(train_sentences, word2idx)
dev_data, dev_label = model_sequence(dev_sentences, word2idx)
test_data, test_label = model_sequence(test_sentences, word2idx)

In [28]:
print(train_data[1], train_label[1])

[174, 1474, 112, 841, 1181, 692, 617, 1159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [1474, 112, 841, 1181, 692, 617, 1159, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [41]:
idx = 1
for i in train_data[idx]:
    print(idx2word[i],  end=' ')
print('\n')
for i in train_label[idx]:
    print(idx2word[i], end=' ')

<s> The girl broke up with Bob . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

The girl broke up with Bob . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [42]:
# batch_data
def gen_batch_data(data, label, batch_size=32):
    """构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label"""
    batch_data = []
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
        else:
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
        batch_data.append((dbatch, lbatch))
    return batch_data

In [44]:
BATCH_SIZE = 32
train_batch = gen_batch_data(train_data, train_label, batch_size=BATCH_SIZE)
dev_batch = gen_batch_data(dev_data, dev_label, batch_size=BATCH_SIZE)
test_batch = gen_batch_data(test_data, test_label, batch_size=BATCH_SIZE)

In [47]:
len(train_batch), len(tra_batch)

188