## Binary Log Loss实验
- 尝试一个不同的损失函数: binary log loss + 负例采样

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from collections import Counter

In [23]:
# 数据文件
word_file = './data/bobsue.voc.txt'
train_file = './data/bobsue.lm.train.txt'
test_file = './data/bobsue.lm.test.txt'
dev_file = './data/bobsue.lm.dev.txt'

BATCH_SIZE = 32       # 批次大小
EMBEDDING_DIM = 200   # 词向量维度
EMBEDDING_OUT = 200   # 输出层词向量维度
HIDDEN_DIM = 200      # 隐含层
GRAD_CLIP = 5.        # 梯度截断值
EPOCHS = 20 
LEARN_RATE = 0.001    # 初始学习率
SAMPLE_NUM = 2        # 负例采样数目

BEST_VALID_LOSS = float('inf')     # 初始验证集上的损失值，设为最大
MODEL_PATH = "lm-bll-best-dim{}.pth"   # 模型名称
USE_CUDA = torch.cuda.is_available()    # 是否使用GPU
NUM_CUDA = torch.cuda.device_count()    # GPU数量

In [3]:
def load_word_set(filename):
    with open(filename, "r", encoding="utf-8") as f:
        word_set = set([line.strip() for line in f])
    return word_set

In [4]:
def create_word_set(*paths):
    text = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                text.extend(line.split())
    word_set = set(text)
    word2idx = {w:i for i, w in enumerate(word_set, 1)}
    idx2word = {i:w for i, w in enumerate(word_set, 1)}
    vocab = Counter(text)
    word_counts = torch.tensor([vocab[w] for w in word_set], dtype=torch.float32)
    
    word_freqs = word_counts / word_counts.sum()
    # word_freqs = word_freqs ** (3./4.)
    # word_freqs = word_freqs / word_freqs.sum()
    return word_set, word2idx, idx2word, word_freqs

In [5]:
def load_corpus(filename):
    """读取数据集，返回句子列表"""
    with open(filename, "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f]
    return sentences

def sentences2words(sentences):
    return [w for s in sentences for w in s.split()]

In [6]:
word_set, word2idx, idx2word, word_freqs = create_word_set(train_file, dev_file, test_file)

# 设置 <pad> 值为 0
PAD_IDX = 0
idx2word[PAD_IDX] = '<pad>'
word2idx['<pad>'] = PAD_IDX

VOCAB_SIZE = len(word_set)

In [7]:
VOCAB_SIZE

1492

In [8]:
train_sentences = load_corpus(train_file)
dev_sentences = load_corpus(dev_file)
test_sentences = load_corpus(test_file)

train_words = sentences2words(train_sentences)
dev_words = sentences2words(dev_sentences)
test_words = sentences2words(test_sentences)

In [9]:
s = "{}句子数: {}，单词数: {}."
print(s.format("训练集", len(train_sentences), len(train_words)))
print(s.format("验证集", len(dev_sentences), len(dev_words)))
print(s.format("测试集", len(test_sentences), len(test_words)))

训练集句子数: 6036，单词数: 71367.
验证集句子数: 750，单词数: 8707.
测试集句子数: 750，单词数: 8809.


In [10]:
def max_sentence_num(sentences):
    """返回最长句子单词数量"""
    return max([len(s.split()) for s in sentences ])

In [11]:
print("训练集最长句子单词个数：", max([len(s.split()) for s in train_sentences ]))
print("验证集最长句子单词个数：", max([len(s.split()) for s in dev_sentences ]))
print("测试集最长句子单词个数：", max([len(s.split()) for s in test_sentences ]))

print("训练集最短句子单词个数：", min([len(s.split()) for s in train_sentences ]))
print("验证集最短句子单词个数：", min([len(s.split()) for s in dev_sentences ]))
print("测试集最短句子单词个数：", min([len(s.split()) for s in test_sentences ]))

训练集最长句子单词个数： 21
验证集最长句子单词个数： 20
测试集最长句子单词个数： 21
训练集最短句子单词个数： 5
验证集最短句子单词个数： 5
测试集最短句子单词个数： 6


In [12]:
def model_sequence(corpus, word2idx, word_freqs, sample_num=20, seq_len=21):
    """输入语料句子列表，返回模型输入序列的idx"""
    labels = []
    sentences = []
    neg_words = []
    for sentence in corpus:
        words = sentence.split()
        sentence_tample = [0] * seq_len
        for i, w in enumerate(words[:-1]):
            sentence_tample[i] = word2idx[w]
        target_tample = [0] * seq_len
        for i, w in enumerate(words[1:]):
            target_tample[i] = word2idx[w]
        sentences.append(sentence_tample)
        labels.append(target_tample)
        # 负例采样
        neg_words.append(torch.multinomial(word_freqs, seq_len * sample_num, True))
    return (sentences, labels, neg_words)

In [13]:
train_data, train_label, train_neg = model_sequence(train_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
dev_data, dev_label, dev_neg = model_sequence(dev_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
test_data, test_label, test_neg = model_sequence(test_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)

In [14]:
a = train_data[0]
print(a)
for i in a:
    print(idx2word[i], end=' ')

[1433, 1057, 633, 821, 1296, 391, 650, 817, 214, 1080, 1400, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
<s> She ate quickly and asked to be taken home . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

In [15]:
n = train_neg[0]
n.size()

torch.Size([42])

In [16]:
def gene_batch_data(data, label, neg, batch_size=32):
    """
    构建 batch tensor，返回 batch 列表，每个batch为三元组包含data和label、neg_word
    """
    batch_data = []
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    neg_tensor = torch.stack(neg)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            dbatch = data_tensor[start: ]
            lbatch = label_tensor[start: ]
            nbatch = neg_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
#             break
        else:
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
            nbatch = neg_tensor[start: end]
        batch_data.append((dbatch, lbatch, nbatch))
    return batch_data

In [17]:
train_batch = gene_batch_data(train_data, train_label, train_neg, batch_size=BATCH_SIZE)
dev_batch = gene_batch_data(dev_data, dev_label, dev_neg, batch_size=BATCH_SIZE)
test_batch = gene_batch_data(test_data, test_label, test_neg, batch_size=BATCH_SIZE)

In [18]:
class LSTMNegModel(nn.Module):
    def __init__(self, embedding_dim, embedding_out, hidden_dim, vocab_size, sample_num):
        super(LSTMNegModel, self).__init__()
        self.sample_num = sample_num
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_out)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, embedding_out)
        
    def forward(self, data):
        text, label, neg = data
        mask = (text != PAD_IDX)
        neg_mask = mask.unsqueeze(1).expand(text.size(0), SAMPLE_NUM, text.size(1)).contiguous().view(neg.size(0), neg.size(1))
#         print(mask.size(), neg_mask.size())
        # [batch, seq_len] -> [batch, seq_len, emb_dim]
        embed = self.in_embed(text)
        label_embed = self.out_embed(label)
        # [batch, seq_len*sample_num] -> [batch, seq_len*sample_num, emb_dim]
        neg_embed = self.out_embed(neg)
        
        # [batch, seq_len, emb_dim] -> [batch, seq_len, hidden_size]
        out, _ = self.lstm(embed)
        # [batch, seq_len, hidden_size] -> [batch, seq_len, emb_dim]
        out = self.linear(out)
        
        # 计算损失
        # [batch, seq_len, emb_dim] * [batch, seq_len, emb_dim] -> [batch, seq_len]
        label_score = (out * label_embed).sum(2)
#         label_score = torch.mm(label_embed.squeeze(1), out.squeeze(1).permute(1, 0))
        # [batch, seq_len*sample_num, emb_dim] * [batch, seq_len*sample_num, emb_dim] 
        out_expand = out.unsqueeze(1).expand(out.size(0), SAMPLE_NUM, out.size(1), out.size(2)).contiguous().view(neg_embed.size(0),neg_embed.size(1),neg_embed.size(2),)
        # [batch, seq_len*sample_num, emb_dim] -> [batch, seq_len*sample_num]
        neg_score = (out_expand * neg_embed).sum(2)
#         neg_score = torch.mm(neg_embed.view(-1, self.embedding_dim), neg_out)

        label_score = label_score[mask]
        neg_score = neg_score[neg_mask]

        log_label = F.logsigmoid(label_score).mean()
        log_neg = torch.log(1 - torch.sigmoid(neg_score)).mean()

        loss = log_label + log_neg
        
        return -loss

In [19]:
VOCAB_SIZE = len(word2idx)
model = LSTMNegModel(EMBEDDING_DIM, EMBEDDING_OUT, HIDDEN_DIM, VOCAB_SIZE, SAMPLE_NUM)

In [20]:
# DEVICE = torch.device("cuda" if USE_CUDA else 'cpu')
DEVICE = torch.device("cpu")
model = model.to(DEVICE)
# if NUM_CUDA > 1:
#     device_ids = list(range(NUM_CUDA))
#     print(device_ids)
#     model = nn.DataParallel(model, device_ids=device_ids)

In [21]:
def evaluate(model, device, iterator):
    model.eval()  # 不更新参数，预测模式
    epoch_loss=0  # 积累变量
    
    with torch.no_grad():
        for x, y, z in iterator:
            x = x.to(device)
            y = y.to(device)
            z = z.to(device)
            
            loss = model((x,y,z))
            epoch_loss += loss.item()
    return epoch_loss/len(iterator)


def train(model, device, iterator, optimizer, grad_clip):
    epoch_loss = 0  # 积累变量
    model.train()   # 该函数表示PHASE=Train
    
    for x, y, z in iterator:  # 拿每一个minibatch
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        
        optimizer.zero_grad()
    
        loss = model((x,y,z))  # loss
        loss.backward()        # 进行BP
        
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()  # 更新参数
        
        epoch_loss += loss.item()
        
    return epoch_loss/len(iterator)

In [26]:
# criterion = nn.NLLLoss()            # 指定损失函数
optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)  # 指定优化器
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)   # 学习率缩减？

SCHED_NUM = 0

for epoch in range(1, EPOCHS+1):
    train_loss = train(model, DEVICE, train_batch, optimizer, GRAD_CLIP)
    valid_loss = evaluate(model, DEVICE, dev_batch)
    if valid_loss < BEST_VALID_LOSS: # 如果是最好的模型就保存到文件夹
        BEST_VALID_LOSS = valid_loss
        torch.save(model, MODEL_PATH.format(EMBEDDING_DIM))
        SCHED_NUM = 0
    else:
        SCHED_NUM += 1
        if SCHED_NUM // 3 == 0:
            scheduler.step()
            print("Current lr:", optimizer.param_groups[0]['lr'])
        if SCHED_NUM == 7:
            print("Early stop!")
            break
    print('Epoch:{0}|Train Loss:{1}|Val Loss:{2}'.format(epoch,train_loss,valid_loss))

Current lr: 0.001
Epoch:1|Train Loss:nan|Val Loss:nan
Current lr: 0.0005
Epoch:2|Train Loss:nan|Val Loss:nan
Epoch:3|Train Loss:nan|Val Loss:nan
Epoch:4|Train Loss:nan|Val Loss:nan
Epoch:5|Train Loss:nan|Val Loss:nan
Epoch:6|Train Loss:nan|Val Loss:nan
Early stop!
