## Binary Log Loss实验
- 尝试一个不同的损失函数: binary log loss + 负例采样

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from collections import Counter

In [2]:
# 数据文件
word_file = './data/bobsue.voc.txt'
train_file = './data/bobsue.lm.train.txt'
test_file = './data/bobsue.lm.test.txt'
dev_file = './data/bobsue.lm.dev.txt'

BATCH_SIZE = 32       # 批次大小
EMBEDDING_DIM = 200   # 词向量维度
EMBEDDING_OUT = 100   # 输出层词向量维度
HIDDEN_DIM = 200      # 隐含层
GRAD_CLIP = 5.        # 梯度截断值
EPOCHS = 20
LEARN_RATE = 0.001    # 初始学习率
SAMPLE_NUM = 10       # 负例采样数目

BEST_VALID_LOSS = float('inf')          # 初始验证集上的损失值，设为最大
MODEL_PATH = "lm-bll-samp-{}.pth"       # 模型名称
USE_CUDA = torch.cuda.is_available()    # 是否使用GPU
NUM_CUDA = torch.cuda.device_count()    # GPU数量

In [3]:
def load_word_set(filename):
    with open(filename, "r", encoding="utf-8") as f:
        word_set = set([line.strip() for line in f])
    return word_set

In [4]:
def create_word_set(*paths, power=1):
    text = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                text.extend(line.split())
    word_set = set(text)
    word2idx = {w:i for i, w in enumerate(word_set, 1)}
    idx2word = {i:w for i, w in enumerate(word_set, 1)}
    vocab = Counter(text)
    word_counts = torch.tensor([vocab[w] for w in word_set], dtype=torch.float32)
    
    word_freqs = word_counts / word_counts.sum()
    word_freqs = word_freqs ** power
    word_freqs = word_freqs / word_freqs.sum()
    return word_set, word2idx, idx2word, word_freqs

In [5]:
def load_corpus(filename):
    """读取数据集，返回句子列表"""
    with open(filename, "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f]
    return sentences

def sentences2words(sentences):
    return [w for s in sentences for w in s.split()]

In [6]:
word_set, word2idx, idx2word, word_freqs = create_word_set(train_file, dev_file, test_file, power=1)

# 设置 <pad> 值为 0
PAD_IDX = 0
idx2word[PAD_IDX] = '<pad>'
word2idx['<pad>'] = PAD_IDX

VOCAB_SIZE = len(word_set)

In [7]:
VOCAB_SIZE

1492

In [8]:
train_sentences = load_corpus(train_file)
dev_sentences = load_corpus(dev_file)
test_sentences = load_corpus(test_file)

train_words = sentences2words(train_sentences)
dev_words = sentences2words(dev_sentences)
test_words = sentences2words(test_sentences)

In [9]:
s = "{}句子数: {}，单词数: {}."
print(s.format("训练集", len(train_sentences), len(train_words)))
print(s.format("验证集", len(dev_sentences), len(dev_words)))
print(s.format("测试集", len(test_sentences), len(test_words)))

训练集句子数: 6036，单词数: 71367.
验证集句子数: 750，单词数: 8707.
测试集句子数: 750，单词数: 8809.


In [10]:
def max_sentence_num(sentences):
    """返回最长句子单词数量"""
    return max([len(s.split()) for s in sentences ])

In [11]:
print("训练集最长句子单词个数：", max([len(s.split()) for s in train_sentences ]))
print("验证集最长句子单词个数：", max([len(s.split()) for s in dev_sentences ]))
print("测试集最长句子单词个数：", max([len(s.split()) for s in test_sentences ]))

print("训练集最短句子单词个数：", min([len(s.split()) for s in train_sentences ]))
print("验证集最短句子单词个数：", min([len(s.split()) for s in dev_sentences ]))
print("测试集最短句子单词个数：", min([len(s.split()) for s in test_sentences ]))

训练集最长句子单词个数： 21
验证集最长句子单词个数： 20
测试集最长句子单词个数： 21
训练集最短句子单词个数： 5
验证集最短句子单词个数： 5
测试集最短句子单词个数： 6


In [12]:
def model_sequence(corpus, word2idx, word_freqs, sample_num=20, seq_len=21):
    """输入语料句子列表，返回模型输入序列的idx"""
    labels = []
    sentences = []
    neg_words = []
    for sentence in corpus:
        words = sentence.split()
        sentence_tample = [0] * seq_len
        for i, w in enumerate(words[:-1]):
            sentence_tample[i] = word2idx[w]
        target_tample = [0] * seq_len
        for i, w in enumerate(words[1:]):
            target_tample[i] = word2idx[w]
        sentences.append(sentence_tample)
        labels.append(target_tample)
        # 负例采样
        neg_words.append(torch.multinomial(word_freqs, seq_len * sample_num, True))
    return (sentences, labels, neg_words)

In [13]:
train_data, train_label, train_neg = model_sequence(train_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
dev_data, dev_label, dev_neg = model_sequence(dev_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
test_data, test_label, test_neg = model_sequence(test_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)

In [14]:
a = train_data[0]
print(a)
for i in a:
    print(idx2word[i], end=' ')
print("--"*20)
b = train_label[0]
for i in b:
    print(idx2word[i], end=' ')
print("--"*20)
print(train_neg[0])

[27, 725, 490, 79, 638, 1358, 721, 328, 729, 665, 1442, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
<s> She ate quickly and asked to be taken home . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> ----------------------------------------
She ate quickly and asked to be taken home . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> ----------------------------------------
tensor([ 673, 1441, 1441,  844,    8, 1326,   68,   26,  863, 1049,  248,   43,
         938,  247,    8,  325,  299,    8,   26, 1006,   26, 1441,  568,  637,
        1065,  247,    8,  828, 1441, 1120,  782,  828,  828,  747,  735,  808,
         253,  465,  559, 1441,   26, 1195,  568, 1441,  180,  701,  530, 1086,
         289, 1169,   75, 1314,  851, 1441, 1441,  273,  497, 1443,  102, 1441,
          75,   75, 1156,  380, 1121, 1441,  822,  116, 1202,   26, 1202,  688,
         938,  863,   26,  720, 1154, 1370,  720,  912, 1377,   26,  794,  893,
        1472,  555, 1413,  811,  828,    8,  720,  63

In [15]:
n = train_neg[0]
n.size()

torch.Size([210])

In [16]:
def gene_batch_data(data, label, neg, batch_size=32):
    """
    构建 batch tensor，返回 batch 列表，每个batch为三元组包含data和label、neg_word
    """
    batch_data = []
    data_tensor = torch.tensor(data, dtype=torch.long)
    label_tensor = torch.tensor(label, dtype=torch.long)
    neg_tensor = torch.stack(neg)
    n, dim = data_tensor.size()
    for start in range(0, n, batch_size):
        end = start + batch_size
        if end > n:
            break
            dbatch = data_tensor[start: ]
            lbatch = label_tensor[start: ]
            nbatch = neg_tensor[start: ]
            print("最后一个batch size:", dbatch.size())
#             break
        else:
            dbatch = data_tensor[start: end]
            lbatch = label_tensor[start: end]
            nbatch = neg_tensor[start: end]
        batch_data.append((dbatch, lbatch, nbatch))
    return batch_data

In [17]:
train_batch = gene_batch_data(train_data, train_label, train_neg, batch_size=BATCH_SIZE)
dev_batch = gene_batch_data(dev_data, dev_label, dev_neg, batch_size=BATCH_SIZE)
test_batch = gene_batch_data(test_data, test_label, test_neg, batch_size=BATCH_SIZE)

In [18]:
class LSTMNegModel(nn.Module):
    def __init__(self, embedding_dim, embedding_out, hidden_dim, vocab_size, sample_num):
        super(LSTMNegModel, self).__init__()
        self.sample_num = sample_num
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        self.out_embed = nn.Embedding(vocab_size, embedding_out)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, embedding_out)
        
    def forward(self, data):
        text, label, neg = data
        # print("-"*20)
        # print(text.size())
        # print(label.size())
        # print(neg.size())   # (bacth, SAMPLE_NUM*seq_len)
        # (torch.tensor([1,2,3,1]) != 1) ==>[0,1,1, 0]
        mask = (text != PAD_IDX)     # (batch, seq_len)
        # print("mask:", mask.size())
        # (batch, seq_len)-->(batch, 1, seq_len)-->(batch,SAMPLE_NUM,seq_len)-->（batch, SAMPLE_NUM*seq_len)
        neg_mask = mask.unsqueeze(1).expand(text.size(0), SAMPLE_NUM, text.size(1)).contiguous().view(neg.size(0), neg.size(1))
        # 当调用contiguous()时，会强制拷贝一份tensor

        # print("neg_mask:", neg_mask.size(), neg_mask.sum())  # (batch, seq_len*sample_num)
        
        embed = self.in_embed(text)   # (bacth,seq_len) --> (bacth, seq_len, in_emd_dim)
        
        # (batch, seq_len) -> (batch, seq_len, out_emb_dim)
        label_embed = self.out_embed(label)
        # (batch, seq_len*sample_num)-> (batch, seq_len*sample_num, out_emb_dim)
        neg_embed = self.out_embed(neg)
        
        # (batch, seq_len, in_emb_dim) -> (batch, seq_len, out_emb_dim(hn_dim))
        lstm_out, (h_n, c_n) = self.lstm(embed)
        # (batch, seq_len, out_emb_dim) -> (batch, seq_len, out_emb_dim) 即形状不变
        out = self.linear(lstm_out)
        
        # 计算损失
        # (batch, seq_len, out_emb_dim) * (batch, seq_len, out_emb_dim) -> sum(2)-(batch, seq_len)
        # 对应元素相乘，2维度上求和
        label_score = (out * label_embed).sum(2)
        # label_score = torch.mm(label_embed.squeeze(1), out.squeeze(1).permute(1, 0))
        # (batch, seq_len*sample_num, out_emb_dim) * (batch, seq_len*sample_num, out_emb_dim) 
        out_expand = out.unsqueeze(1).expand(out.size(0), SAMPLE_NUM, out.size(1), 
                                             out.size(2)).contiguous().view(
                                             neg_embed.size(0), neg_embed.size(1), neg_embed.size(2))
        # (batch, seq_len*sample_num, out_emb_dim) -> (batch, seq_len*sample_num)
        # 词向量合成一个数的意义是什么？
        neg_score = (out_expand * neg_embed).sum(2)

        label_score = label_score[mask]    # 这个操作会压缩成一行
        neg_score = neg_score[neg_mask]
        
        log_label = F.logsigmoid(label_score).mean()   # 一个常数 
        log_neg = torch.log(1 - torch.sigmoid(neg_score)).mean()

        loss = log_label + log_neg
        
        return -loss

In [19]:
VOCAB_SIZE = len(word2idx)
model = LSTMNegModel(EMBEDDING_DIM, EMBEDDING_OUT, HIDDEN_DIM, VOCAB_SIZE, SAMPLE_NUM)

In [20]:
# DEVICE = torch.device("cuda" if USE_CUDA else 'cpu')
DEVICE = torch.device("cpu")
model = model.to(DEVICE)
# if NUM_CUDA > 1:
#     device_ids = list(range(NUM_CUDA))
#     print(device_ids)
#     model = nn.DataParallel(model, device_ids=device_ids)

In [21]:
def acc_score(y_hat, y):
    # 返回最大的概率的索引
    pred = y_hat.argmax(dim=1)
    # print(y.view(-1))
    acc_count = torch.eq(pred, y.view(-1))
    score = acc_count.sum().item() / acc_count.size()[0]
    return score

def evaluate(model, device, iterator):
    epoch_loss = 0  # 积累变量
    model.eval()  # 不更新参数，预测模式
    
    with torch.no_grad():
        for x, y, z in iterator:
            x = x.to(device)
            y = y.to(device)
            z = z.to(device)
            
            loss = model((x,y,z))
            epoch_loss += loss.item()
            
    return epoch_loss/len(iterator)


def train(model, device, iterator, optimizer, grad_clip):
    epoch_loss = 0  # 积累变量
    model.train()   # 该函数表示PHASE=Train
    
    for x, y, z in iterator:  # 拿每一个minibatch
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
        
        optimizer.zero_grad()
    
        loss = model((x,y,z))  # loss
        loss.backward()        # 进行BP
        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()  # 更新参数
        epoch_loss += loss.item()

    return epoch_loss/len(iterator)

In [22]:
optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)  # 指定优化器
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)   # 学习率缩减？

SCHED_NUM = 0
model_name = MODEL_PATH.format(SAMPLE_NUM)
for epoch in range(1, EPOCHS+1):
    train_loss = train(model, DEVICE, train_batch, optimizer, GRAD_CLIP)
    valid_loss = evaluate(model, DEVICE, dev_batch)
    if valid_loss < BEST_VALID_LOSS: # 如果是最好的模型就保存到文件夹
        BEST_VALID_LOSS = valid_loss
        torch.save(model, model_name)
        print("Save model path:{}| train loss {}| valid loss {}".format(model_name, train_loss, valid_loss))
        SCHED_NUM = 0
    else:
        SCHED_NUM += 1
        if SCHED_NUM % 3 == 0:
            scheduler.step()
            print("Current lr:", optimizer.param_groups[0]['lr'])
        if SCHED_NUM == 7:
            print("Early stop!")
            break
    print('Epoch:{}|Train Loss:{}|Val Loss:{}'.format(epoch, train_loss, valid_loss))
        

  "type " + obj.__name__ + ". It won't be checked "


Save model path:lm-bll-samp-10.pth| train loss 0.6765867387677761| valid loss 0.49544699554858
Epoch:1|Train Loss:0.6765867387677761|Val Loss:0.49544699554858
Save model path:lm-bll-samp-10.pth| train loss 0.40142372963910405| valid loss 0.40998030875040137
Epoch:2|Train Loss:0.40142372963910405|Val Loss:0.40998030875040137
Save model path:lm-bll-samp-10.pth| train loss 0.31070439731504057| valid loss 0.38939876919207367
Epoch:3|Train Loss:0.31070439731504057|Val Loss:0.38939876919207367
Epoch:4|Train Loss:0.24797511615968765|Val Loss:0.3964136398356894
Epoch:5|Train Loss:0.1953815960503639|Val Loss:0.427851562914641
Current lr: 0.001
Epoch:6|Train Loss:0.15279908184992505|Val Loss:0.4718990014946979
Epoch:7|Train Loss:0.12087223809608753|Val Loss:0.539731247269589
Epoch:8|Train Loss:0.09807310455498543|Val Loss:0.6311629868072012
Current lr: 0.0005
Epoch:9|Train Loss:0.08436341861144026|Val Loss:inf
Early stop!


In [24]:
model = torch.load(model_name)
test_loss = evaluate(model, DEVICE, test_batch)
print('Test Loss: {}'.format(test_loss))

Test Loss: 0.4409688711166382


## 问题
- 在使用binary log loss 的情况下，如何评价模型？
- 梯度截断的情况下依然会存在loss nan?

## 不同负采样数量

In [24]:
sample_num = [20, 100, 500]
for n in sample_num:
    print("***负采样数量{}***".format(n))
    model_name = 'lm-bll-samp-{}.pth'.format(n)
    SAMPLE_NUM = n
    BEST_VALID_LOSS = float('inf')
    train_data, train_label, train_neg = model_sequence(train_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
    dev_data, dev_label, dev_neg = model_sequence(dev_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
    test_data, test_label, test_neg = model_sequence(test_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
    
    
    train_batch = gene_batch_data(train_data, train_label, train_neg, batch_size=BATCH_SIZE)
    dev_batch = gene_batch_data(dev_data, dev_label, dev_neg, batch_size=BATCH_SIZE)
    test_batch = gene_batch_data(test_data, test_label, test_neg, batch_size=BATCH_SIZE)
    
    model = LSTMNegModel(EMBEDDING_DIM, EMBEDDING_OUT, HIDDEN_DIM, VOCAB_SIZE, SAMPLE_NUM)
    DEVICE = torch.device("cpu")
    model = model.to(DEVICE)
    
    
    optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)  # 指定优化器
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)   # 学习率缩减？

    SCHED_NUM = 0
    for epoch in range(1, EPOCHS+1):
        train_loss = train(model, DEVICE, train_batch, optimizer, GRAD_CLIP)
        valid_loss = evaluate(model, DEVICE, dev_batch)
        if valid_loss < BEST_VALID_LOSS: # 如果是最好的模型就保存到文件夹
            BEST_VALID_LOSS = valid_loss
            torch.save(model, model_name)
            print("Save model path:{}| train loss {}| valid loss {}".format(model_name, train_loss, valid_loss))
            SCHED_NUM = 0
        else:
            SCHED_NUM += 1
            if SCHED_NUM % 3 == 0:
                scheduler.step()
                print("Current lr:", optimizer.param_groups[0]['lr'])
            if SCHED_NUM == 7:
                print("Early stop!")
                break
        print('Epoch:{}|Train Loss:{}|Val Loss:{}'.format(epoch, train_loss, valid_loss))
    print("Start test model:", model_name)
    model = torch.load(model_name)
    test_loss = evaluate(model, DEVICE, test_batch)
    print('Test Loss: {}'.format(test_loss))

***负采样数量20***
Save model path:lm-bll-samp-20.pth| train loss 0.6862944694275551| valid loss 0.499319939509682
Epoch:1|Train Loss:0.6862944694275551|Val Loss:0.499319939509682
Save model path:lm-bll-samp-20.pth| train loss 0.4045477717163715| valid loss 0.4103606472844663
Epoch:2|Train Loss:0.4045477717163715|Val Loss:0.4103606472844663
Save model path:lm-bll-samp-20.pth| train loss 0.31166522775558714| valid loss 0.38945959314056067
Epoch:3|Train Loss:0.31166522775558714|Val Loss:0.38945959314056067
Epoch:4|Train Loss:0.248858502253573|Val Loss:0.39644569288129394
Epoch:5|Train Loss:0.1974467286404143|Val Loss:0.43040112179258594
Current lr: 0.001
Epoch:6|Train Loss:0.15638761119322575|Val Loss:0.48479544209397357
Epoch:7|Train Loss:0.12547429262640628|Val Loss:0.5508403739203578
Epoch:8|Train Loss:0.10333875808468525|Val Loss:inf
Current lr: 0.0005
Epoch:9|Train Loss:0.08819240823071053|Val Loss:inf
Early stop!
Start test model: lm-bll-samp-20.pth
Test Loss: 0.4002216497193212
***负采样数

## 不同采样频率

In [26]:
SAMPLE_NUM = 20

for p in range(1, 5):
    BEST_VALID_LOSS = float('inf')
    power = 0.25*p 
    print("***负采样评率{}***".format(power))
    model_name = 'lm-bll-power-{}.pth'.format(power*100)
    word_set, word2idx, idx2word, word_freqs = create_word_set(train_file, dev_file, test_file, power=power)

    # 设置 <pad> 值为 0
    PAD_IDX = 0
    idx2word[PAD_IDX] = '<pad>'
    word2idx['<pad>'] = PAD_IDX
    
    train_sentences = load_corpus(train_file)
    dev_sentences = load_corpus(dev_file)
    test_sentences = load_corpus(test_file)

    train_words = sentences2words(train_sentences)
    dev_words = sentences2words(dev_sentences)
    test_words = sentences2words(test_sentences)
    train_data, train_label, train_neg = model_sequence(train_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
    dev_data, dev_label, dev_neg = model_sequence(dev_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
    test_data, test_label, test_neg = model_sequence(test_sentences, word2idx, word_freqs, sample_num=SAMPLE_NUM)
    
    
    train_batch = gene_batch_data(train_data, train_label, train_neg, batch_size=BATCH_SIZE)
    dev_batch = gene_batch_data(dev_data, dev_label, dev_neg, batch_size=BATCH_SIZE)
    test_batch = gene_batch_data(test_data, test_label, test_neg, batch_size=BATCH_SIZE)
    
    model = LSTMNegModel(EMBEDDING_DIM, EMBEDDING_OUT, HIDDEN_DIM, VOCAB_SIZE, SAMPLE_NUM)
    DEVICE = torch.device("cpu")
    model = model.to(DEVICE)
    
    
    optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE)  # 指定优化器
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)   # 学习率缩减？

    SCHED_NUM = 0
    for epoch in range(1, EPOCHS+1):
        train_loss = train(model, DEVICE, train_batch, optimizer, GRAD_CLIP)
        valid_loss = evaluate(model, DEVICE, dev_batch)
        if valid_loss < BEST_VALID_LOSS: # 如果是最好的模型就保存到文件夹
            BEST_VALID_LOSS = valid_loss
            torch.save(model, model_name)
            print("Save model path:{}| train loss {}| valid loss {}".format(model_name, train_loss, valid_loss))
            SCHED_NUM = 0
        else:
            SCHED_NUM += 1
            if SCHED_NUM % 3 == 0:
                scheduler.step()
                print("Current lr:", optimizer.param_groups[0]['lr'])
            if SCHED_NUM == 7:
                print("Early stop!")
                break
        print('Epoch:{}|Train Loss:{}|Val Loss:{}'.format(epoch, train_loss, valid_loss))
    print("Start test model:", model_name)
    model = torch.load(model_name)
    test_loss = evaluate(model, DEVICE, test_batch)
    print('Test Loss: {}'.format(test_loss))

***负采样评率0.25***


  "type " + obj.__name__ + ". It won't be checked "


Save model path:lm-bll-power-25.0.pth| train loss 0.9979828307603268| valid loss 0.7766682531522668
Epoch:1|Train Loss:0.9979828307603268|Val Loss:0.7766682531522668
Save model path:lm-bll-power-25.0.pth| train loss 0.6399219172432068| valid loss 0.6289420879405477
Epoch:2|Train Loss:0.6399219172432068|Val Loss:0.6289420879405477
Save model path:lm-bll-power-25.0.pth| train loss 0.5007797333788364| valid loss 0.5997734380804974
Epoch:3|Train Loss:0.5007797333788364|Val Loss:0.5997734380804974
Epoch:4|Train Loss:0.4047311301878158|Val Loss:0.6082124178824218
Epoch:5|Train Loss:0.32498398534160977|Val Loss:0.6545262673626775
Current lr: 0.001
Epoch:6|Train Loss:0.2612926849817976|Val Loss:0.72973189405773
Epoch:7|Train Loss:0.21327265241044632|Val Loss:0.8235294067341349
Epoch:8|Train Loss:0.1791047714650631|Val Loss:0.8798693444417871
Current lr: 0.0005
Epoch:9|Train Loss:0.15849060355190267|Val Loss:0.9844991538835608
Early stop!
Start test model: lm-bll-power-25.0.pth
Test Loss: 0.610