词向量训练
word2Vec



In [None]:
"""
数据预处理
生成：idx2word,word2idx,word_counts,word_freqs,text
"""
from collections import Counter
import numpy as np

MAX_VOCAB_SIZE = 1000  # vocabulary size including the catch-all '<UNK>' entry (999 real words)


def pre_data(data_location: str, max_vocab_size=None, encoding: str = "utf-8"):
    """Read a whitespace-separated corpus and build the vocabulary tables.

    The text is lower-cased and split on whitespace. The ``max_vocab_size - 1``
    most frequent words keep their own entry; every remaining occurrence is
    counted under the extra ``'<UNK>'`` entry, which is always the last one.

    Args:
        data_location: path of the corpus file.
        max_vocab_size: vocabulary size including ``'<UNK>'``. ``None`` (the
            default) uses the module-level ``MAX_VOCAB_SIZE``.
        encoding: text encoding used to read the file. It is passed explicitly
            so the result does not depend on the platform default.

    Returns:
        A tuple ``(text, word_freqs, word_counts, idx2word, word2idx, vocab_dict)``:
        ``text`` is the list of lower-cased tokens, ``word_freqs`` and
        ``word_counts`` are float32 arrays aligned with ``idx2word``,
        ``idx2word`` is the list of vocabulary words, ``word2idx`` is its
        inverse mapping and ``vocab_dict`` maps each word to its count.

    Raises:
        ValueError: if the vocabulary size is smaller than 1.
    """
    if max_vocab_size is None:
        max_vocab_size = MAX_VOCAB_SIZE
    if max_vocab_size < 1:
        raise ValueError("max_vocab_size must be at least 1 (the '<UNK>' entry)")

    with open(data_location, encoding=encoding) as f:
        text = f.read().lower().split()

    # Most frequent words first; ties keep their order of first appearance.
    vocab_dict = dict(Counter(text).most_common(max_vocab_size - 1))
    # Everything that did not make the cut is folded into '<UNK>'.
    vocab_dict['<UNK>'] = len(text) - sum(vocab_dict.values())

    idx2word = list(vocab_dict)
    word2idx = {word: i for i, word in enumerate(idx2word)}

    word_counts = np.array(list(vocab_dict.values()), dtype=np.float32)
    total = word_counts.sum()
    if total > 0:
        word_freqs = word_counts / total
    else:
        # Empty corpus: return zeros instead of dividing by zero (NaN).
        word_freqs = np.zeros_like(word_counts)

    return text, word_freqs, word_counts, idx2word, word2idx, vocab_dict


In [None]:
#数据集的读取

import torch
from torch.utils.data.dataset import Dataset


C = 3  # context window radius: C words are taken on each side of the center word
K = 15  # number of negative samples drawn per positive (context) word


class WordEmbeddingDataset(Dataset):
    """Skip-gram samples with negative sampling, one item per corpus position.

    Each item is ``(center_word, pos_words, neg_words)``: the index of the
    center word, the ``2 * C`` surrounding word indices and ``2 * C * K``
    indices sampled from ``word_freqs``.
    """

    def __init__(self, text, word2idx, idx2word, word_freqs, word_counts):
        """Encode the corpus and keep the vocabulary tables.

        Args:
            text: list of words.
            word2idx: mapping from word to index; must contain ``'<UNK>'``.
            idx2word: index-to-word mapping.
            word_freqs: frequency of each word, aligned with the indices.
            word_counts: count of each word, aligned with the indices.
        """
        # A bare ``super()`` only builds a proxy object; the parent
        # initializer has to be called explicitly.
        super().__init__()

        # Words missing from the vocabulary are encoded as '<UNK>'.
        unk_index = word2idx['<UNK>']
        encoded = [word2idx.get(word, unk_index) for word in text]
        # nn.Embedding expects integer (long) indices.
        self.text_encoded = torch.LongTensor(encoded)

        self.word2idx = word2idx
        self.idx2word = idx2word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        """Return the number of tokens in the corpus, i.e. the number of items."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        """Return ``(center_word, pos_words, neg_words)`` for position ``idx``.

        - center_word: the encoded word at ``idx``.
        - pos_words: the ``C`` words before and the ``C`` words after ``idx``.
        - neg_words: ``K`` sampled words for every positive word.
        """
        center_word = self.text_encoded[idx]

        # Neighbouring positions wrap around the corpus ends (modulo) so the
        # window never indexes out of range.
        corpus_len = len(self.text_encoded)
        pos_indices = [(idx + offset) % corpus_len
                       for offset in range(-C, C + 1) if offset != 0]
        pos_words = self.text_encoded[pos_indices]

        # Sample with replacement; a larger value in word_freqs means a
        # higher chance of being drawn. The result holds vocabulary indices.
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)

        return center_word, pos_words, neg_words

In [None]:
#模型的定义
#这里仅仅计算当前词的损失，
from torch import nn
import torch
import torch.nn.functional as F


class EmbeddingModel(nn.Module):
    """Skip-gram word-embedding model trained with negative sampling.

    Two tables of shape ``(vocab_size, embed_size)`` are learned: ``in_embed``
    for center words and ``out_embed`` for context and negative words.
    """

    def __init__(self, vocab_size, embed_size):
        """Create both embedding tables.

        Args:
            vocab_size: number of distinct word indices.
            embed_size: dimensionality of each word vector.
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size

        # nn.Embedding takes (num_embeddings, embedding_dim): one row per word.
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        """Compute the negative-sampling loss of every sample in the batch.

        Args:
            input_labels: center words, ``[batch_size]``.
            pos_labels: positive words, ``[batch_size, window_size * 2]``.
            neg_labels: negative words, ``[batch_size, window_size * 2 * K]``.

        Returns:
            The loss of each sample, ``[batch_size]``.
        """
        # Center words use the input table; context and negative words use
        # the output table, so both tables receive gradients.
        input_embedding = self.in_embed(input_labels)  # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)  # [batch_size, window * 2, embed_size]
        neg_embedding = self.out_embed(neg_labels)  # [batch_size, window * 2 * K, embed_size]

        # Add a trailing axis so bmm yields one dot product per context word.
        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1]

        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze(2)  # [batch_size, window * 2]
        # Negated center vector: a negative word that scores high against the
        # center word increases the loss.
        neg_dot = torch.bmm(neg_embedding, -input_embedding).squeeze(2)  # [batch_size, window * 2 * K]

        # Sum over the word axis, keeping one value per sample.
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)

        # Return the negated log-likelihood so that it can be minimized.
        return -(log_pos + log_neg)

    def input_embeddings(self):
        """Return the center-word embedding matrix on the CPU."""
        return self.in_embed.weight.cpu()

    def out_embeddings(self):
        """Return the context-word embedding matrix on the CPU."""
        return self.out_embed.weight.cpu()

In [None]:
#模型的训练
import torch

from torch import optim
from torch.utils.data import DataLoader
from Modul import EmbeddingModel
from WordEmbeddingDataset import WordEmbeddingDataset


epochs = 30  # number of passes over the corpus

EMBEDDING_SIZE = 100  # dimensionality of each word vector
batch_size = 15000  # number of center words per mini-batch
lr = 0.2  # initial learning rate

# word_freqs = word_freqs ** (3. / 4.)  # smoothing recommended by the word2vec paper (disabled)

text, word_freqs, word_counts, idx2word, word2idx, vocab_dict = pre_data('data/text8.train.txt')

# Wrap the corpus in a dataset and serve it in shuffled mini-batches.
dataset = WordEmbeddingDataset(text, word2idx, idx2word, word_freqs, word_counts)
dataloader = DataLoader(dataset, batch_size, shuffle=True)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

net = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
# net = torch.load('model/embedding_epoch_new_9_.pt')  # resume from a saved model instead
net.to(device)

# Build the optimizer once: re-creating Adam for every batch would throw away
# its running moment estimates on each step.
optimizer = optim.Adam(params=net.parameters(), lr=lr)

print('*********Training begins*******')
for e in range(epochs):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        # The embedding lookups need integer (long) indices on the model's device.
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()
        # The model returns one loss per sample; average it over the batch.
        loss = net(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item())

    # The loss of the last batch stands in for the loss of the whole epoch.
    epoch_loss = loss.item()
    if e == 0:
        # The first epoch only records the baseline; nothing is saved.
        loss_min = epoch_loss
    elif loss_min > epoch_loss:
        # A new minimum was reached: save the model.
        torch.save(net, './model/epoch_{}_loss{}.pt'.format(e, epoch_loss))
        loss_min = epoch_loss
        # NOTE(review): the rate is halved when the loss improves, not when it
        # stalls - confirm this schedule is intended.
        lr = lr * 0.5
        # The optimizer is now long-lived, so push the new rate into it.
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr



In [None]:
#把训练好的模型拿来使用
#就是找到当前词汇近似的词。

from collections import Counter

import numpy as np
import torch
from scipy.spatial.distance import cosine
from torch.utils.data import DataLoader

from WordEmbeddingDataset import WordEmbeddingDataset



def find_nearest(word, vocab_dict, word2idx, embeddings=None):
    """Return the (up to) ten vocabulary words closest to ``word``.

    Closeness is the cosine distance between embedding rows; the query word
    itself is part of the ranking.

    Args:
        word: the query word.
        vocab_dict: vocabulary of known words; a word that is not in it is
            looked up as ``'<UNK>'``.
        word2idx: mapping from word to its row in the embedding matrix.
        embeddings: matrix with one embedding per row. ``None`` (the default)
            uses the module-level ``embedding_weights``.

    Returns:
        A list of words ordered from nearest to farthest.
    """
    if embeddings is None:
        # Backward-compatible default: the matrix extracted from the loaded model.
        embeddings = embedding_weights

    if word not in vocab_dict:  # out-of-vocabulary word
        word = '<UNK>'
    target = embeddings[word2idx[word]]

    cos_dis = np.array([cosine(row, target) for row in embeddings])

    # The ranking yields row indices; invert word2idx to turn them into words.
    idx2word = {index: token for token, index in word2idx.items()}
    # Rows that have no word assigned are skipped.
    ranked = [idx2word[int(i)] for i in cos_dis.argsort() if int(i) in idx2word]
    return ranked[:10]




# 查找最近向量

In [None]:
# Load the trained model.
# NOTE: torch.load unpickles the stored object, so only load checkpoints that
# come from a trusted source.
net = torch.load('model/embedding_epoch5_.pt')
# Extract the input embedding matrix as a NumPy array (one row per word index).
embedding_weights = net.input_embeddings().detach().numpy()

# pre_data returns (text, word_freqs, word_counts, idx2word, word2idx, vocab_dict);
# each result is bound to the name of the split it was computed from.
(train_text, train_word_freqs, train_word_counts,
 train_idx2word, train_word2idx, train_vocab_dict) = pre_data('data/text8.train.txt')

(dev_text, dev_word_freqs, dev_word_counts,
 dev_idx2word, dev_word2idx, dev_vocab_dict) = pre_data('data/text8.dev.txt')


print("****** train word*******")
for word in ["apple", "america", "computer"]:
    print(word, find_nearest(word, train_vocab_dict, train_word2idx))
print("****** dev word*******")

# The embedding rows are assumed to follow the training vocabulary (TODO
# confirm for this checkpoint), so dev words are looked up through the
# training tables as well; find_nearest treats unknown words as '<UNK>'.
for word in list(dev_vocab_dict)[101:105]:
    print(word, find_nearest(word, train_vocab_dict, train_word2idx))




In [None]:
# Estimate the mean loss on up to 100 random mini-batches of 100 samples each.
NUM_EVAL_BATCHES = 100


def _mean_loss(model, dataloader, max_batches):
    """Average the batch-mean loss of ``model`` over at most ``max_batches`` batches.

    Returns NaN when the dataloader yields no batch.
    """
    # Feed the batches on whatever device the model's parameters live on.
    device = next(model.parameters()).device
    batch_losses = []
    # Evaluation only: no autograd graph is needed, and storing plain floats
    # keeps no tensors alive.
    with torch.no_grad():
        # zip with range stops after exactly max_batches batches.
        for _, (input_labels, pos_labels, neg_labels) in zip(range(max_batches), dataloader):
            input_labels = input_labels.long().to(device)
            pos_labels = pos_labels.long().to(device)
            neg_labels = neg_labels.long().to(device)
            batch_losses.append(model(input_labels, pos_labels, neg_labels).mean().item())
    if not batch_losses:
        return float('nan')
    # Divide by the number of batches actually evaluated.
    return sum(batch_losses) / len(batch_losses)


train_dataset = WordEmbeddingDataset(train_text, train_word2idx,
                                     train_idx2word, train_word_freqs, train_word_counts)

# NOTE(review): the dev text is encoded with the dev vocabulary tables, while
# the model was presumably trained on the training vocabulary - confirm that
# the word indices line up.
dev_dataset = WordEmbeddingDataset(dev_text, dev_word2idx, dev_idx2word, dev_word_freqs,
                                   dev_word_counts)

train_dataloader = DataLoader(train_dataset, 100, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, 100, shuffle=True)


train_mean_loss = _mean_loss(net, train_dataloader, NUM_EVAL_BATCHES)
print("train_mean_loss")
print(train_mean_loss)
print("end")


dev_mean_loss = _mean_loss(net, dev_dataloader, NUM_EVAL_BATCHES)
print("dev_mean_loss")
print(dev_mean_loss)
print("end")


# Show the sub-modules registered on the model.
print(net._modules)