In [1]:
import torch
import torch.nn as nn           # neural-network building blocks (torch.nn)
import torch.nn.functional as F # functional forms of neural-network ops (torch.nn.functional)
import torch.utils.data as tud  # Dataset / DataLoader utilities used to feed the training data

from torch.nn.parameter import Parameter # tensor type that modules register as learnable weights; not referenced in the cells shown here

from collections import Counter
import numpy as np
import random
import math

import pandas as pd
import scipy          # SciPy: higher-level math algorithms and functions built on top of NumPy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity # cosine-similarity function
USE_CUDA = torch.cuda.is_available()  # True when a CUDA GPU is usable; used later to decide whether batches are moved to the GPU
USE_CUDA

False

In [2]:
# Pin every random seed to a fixed value so that experiment results are reproducible
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if USE_CUDA:
    torch.cuda.manual_seed(42)

# Hyperparameters
K = 100  # number of negative samples drawn for each positive (context) word
C = 3  # context window radius: C words on each side of the center word are predicted
NUM_EPOCHS = 2  # number of training epochs (the original note mentions default=10)
MAX_VOCAB_SIZE = 30000  # upper bound on the vocabulary size, including the <unk> slot
BATCH_SIZE = 32  # number of center words per mini-batch
LEARNING_RATE = 0.2  # the initial learning rate
EMBEDDING_SIZE = 100  # dimensionality of each word vector


LOG_FILE = "word_embedding.log"

# Tokenizer: turns a piece of text into a list of individual words


def word_tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    :param text: the string to tokenize
    :return: list of whitespace-delimited tokens (empty list for blank text)
    """
    tokens = text.split()
    return tokens

In [3]:
# Read the raw corpus. The encoding is stated explicitly so that decoding does
# not depend on the platform's default locale.
with open("../data/nietzsche.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print(f'text: {raw_text[:200]}')

# Tokenize the lower-cased corpus. `text` is the token list that the rest of
# the notebook consumes; the raw string keeps its own name (`raw_text`).
text = word_tokenize(raw_text.lower())

# Keep the MAX_VOCAB_SIZE-1 most frequent words; the remaining slot is
# reserved for the catch-all "<unk>" token.
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE-1))

# Count of "<unk>" = total tokens - tokens covered by the kept words.
# It is inserted last, so its index is VOCAB_SIZE - 1.
vocab["<unk>"] = len(text) - sum(vocab.values())

# Index <-> word lookup tables (dicts preserve insertion order).
idx_to_word = list(vocab.keys())
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}

# Negative-sampling distribution: unigram frequency raised to the 3/4 power,
# then renormalised so that it sums to 1.
word_counts = np.array(list(vocab.values()), dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
word_freqs = word_freqs / np.sum(word_freqs)

VOCAB_SIZE = len(idx_to_word)
VOCAB_SIZE

text: PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrib


17683

In [4]:
class WordEmbeddingDataset(tud.Dataset):
    """Skip-gram training samples with negative sampling.

    Every item is a ``(center word, context words, negative samples)`` triple
    of word-index tensors built from an already tokenized corpus.
    """

    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts,
                 window_size=None, num_negatives=None):
        """
        :param text: iterable of tokens (the tokenized corpus)
        :param word_to_idx: dict mapping token -> integer index
        :param idx_to_word: list mapping integer index -> token
        :param word_freqs: per-word sampling weights used to draw negatives
        :param word_counts: per-word occurrence counts
        :param window_size: number of context words taken on each side of the
            center word; ``None`` falls back to the notebook-level constant ``C``
        :param num_negatives: negatives drawn per context word; ``None`` falls
            back to the notebook-level constant ``K``
        """
        super().__init__()

        # Out-of-vocabulary tokens map to "<unk>". The index is taken from the
        # mapping that was passed in (not from a notebook global); when the
        # mapping has no "<unk>" entry the last index is used.
        unk_idx = word_to_idx.get("<unk>", len(word_to_idx) - 1)

        # Build the index tensor directly as int64 so that indices never pass
        # through a float32 intermediate.
        self.text_encoded = torch.tensor(
            [word_to_idx.get(token, unk_idx) for token in text],
            dtype=torch.long,
        )

        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.tensor(word_freqs, dtype=torch.float32)
        self.word_counts = torch.tensor(word_counts, dtype=torch.float32)
        self.window_size = window_size
        self.num_negatives = num_negatives

    def __len__(self):
        """Number of samples: one per token of the encoded corpus."""
        return len(self.text_encoded)

    def __getitem__(self, idx):
        '''
        :param idx: position of the center word in the encoded corpus
        :return:
                - the center word index (0-dim tensor)
                - the indices of the surrounding (positive) words, 2 * window of them
                - randomly sampled negative word indices,
                  ``num_negatives`` per positive word
        '''
        # Resolved at call time so that the notebook constants keep working
        # exactly as before when no explicit value was given.
        window = C if self.window_size is None else self.window_size
        num_neg = K if self.num_negatives is None else self.num_negatives

        center_word = self.text_encoded[idx]

        # Positions of the context words, e.g. idx=0, window=3 gives
        # [-3, -2, -1, 1, 2, 3].
        pos_indices = list(range(idx - window, idx)) + list(range(idx + 1, idx + window + 1))
        # Positions running off either end of the corpus wrap around (modulo).
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]

        # Negatives are drawn with replacement from the whole vocabulary
        # according to word_freqs; they are not filtered against the center or
        # context words.
        neg_words = torch.multinomial(
            self.word_freqs, num_neg * pos_words.shape[0], True
        )

        return center_word, pos_words, neg_words

In [7]:
# Wrap the tokenized corpus and the vocabulary tables in the training dataset
dataset = WordEmbeddingDataset(
    text=text,
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
    word_freqs=word_freqs,
    word_counts=word_counts,
)

# Sanity check: size of the negative-sample vector of the first item
list(dataset[0][2].size())

[600]

In [8]:
# Shuffled mini-batch iterator over the dataset, loaded in the main process
dataloader = tud.DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fe11de199a0>

In [9]:
# Peek at the first four batches (i = 0..3) to eyeball their contents
for i, batch in enumerate(dataloader):
    input_labels, pos_labels, neg_labels = batch
    print(input_labels, pos_labels, neg_labels)
    if i >= 3:
        break

tensor([   28,   160,   331, 12247,  1783,    41,     6,    11,  1284,   657,
            1,    76,  1584,   248,  1038,    51,     6,  7296,  1479, 17549,
          122,   293,    31,     0,     2, 13459,   361,     1,    42,    17,
            4,   235]) tensor([[11454,  5081,    49,    23,   151,    84],
        [ 1340,     8,  4835,     0,  1946,     1],
        [    4,    40,     6,    38,  2104,     8],
        [ 2398,    57,   179,    25,  3228,  4246],
        [    5,  4026,     0,    11,    99,     4],
        [   77,  1041,     7,     0, 10907,     1],
        [    4,    19,   292,  3474,     2, 14189],
        [    0, 10307,   478,  1633, 10308,  1633],
        [12915, 12916, 12917, 12918, 12919,  3616],
        [ 1309,     1,     0,   549,   108,   303],
        [15336,     0,  1600,     0,   355, 15337],
        [ 8986,     3,  4552,     4,  8987,     2],
        [ 4139,  4751,     7,    27,    16,   328],
        [ 2679,  1700,  1228,  6175,     3,     7],
        [   51,

In [12]:
class EmbeddinModel(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of shape ``[vocab_size, embed_size]``:
    ``in_embed`` for center words and ``out_embed`` for context / negative
    words.
    """

    def __init__(self, vocab_size, embed_size):
        """
        :param vocab_size: number of rows in each embedding table
        :param embed_size: dimensionality of each word vector
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        # Both tables start uniformly in [-0.5 / embed_size, 0.5 / embed_size].
        initrange = 0.5 / self.embed_size

        # Embedding table looked up for context and negative words.
        self.out_embed = nn.Embedding(
            self.vocab_size, self.embed_size, sparse=False)
        self.out_embed.weight.data.uniform_(-initrange, initrange)

        # Embedding table looked up for center words.
        self.in_embed = nn.Embedding(
            self.vocab_size, self.embed_size, sparse=False)
        self.in_embed.weight.data.uniform_(-initrange, initrange)

    def forward(self, input_labels, pos_labels, neg_labels):
        """
        input_labels: center words, [batch_size]
        pos_labels: words that appear in the context window of the center
            word, [batch_size, window_size * 2]
        neg_labels: words obtained by negative sampling,
            [batch_size, window_size * 2 * K]

        return: per-sample negative-sampling loss, [batch_size]
        """
        input_embedding = self.in_embed(input_labels)   # [B, embed_size]
        pos_embedding = self.out_embed(pos_labels)      # [B, 2*C, embed_size]
        neg_embedding = self.out_embed(neg_labels)      # [B, 2*C*K, embed_size]

        # torch.bmm is a batched matrix product: (b, n, m) x (b, m, p) = (b, n, p).
        center = input_embedding.unsqueeze(2)           # [B, embed_size, 1]

        # Squeeze only the trailing singleton dimension. A bare .squeeze()
        # would also drop the batch dimension when B == 1 (e.g. a final
        # partial batch), and the .sum(1) below would then fail.
        log_pos = torch.bmm(pos_embedding, center).squeeze(2)    # [B, 2*C]
        log_neg = torch.bmm(neg_embedding, -center).squeeze(2)   # [B, 2*C*K]

        # Negative-sampling objective: sum of log sigmoid(u_pos . v) over the
        # context words plus sum of log sigmoid(-u_neg . v) over the negatives.
        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)  # [B]
        loss = log_pos + log_neg

        # The objective is maximised, so the loss is its negation.
        return -loss

    def input_embeddings(self):
        """Return the center-word embedding table as a NumPy array on the CPU."""
        return self.in_embed.weight.data.cpu().numpy()

In [13]:
# Build the skip-gram model sized to the vocabulary and the embedding dimension
model = EmbeddinModel(vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_SIZE)

In [14]:
# Put the model on the same device the batches are sent to. Previously only
# the batches were moved to the GPU, so the forward pass failed with a device
# mismatch whenever CUDA was available. Module.to() moves parameters in place,
# and it happens before the optimizer captures them.
device = torch.device("cuda" if USE_CUDA else "cpu")
model.to(device)

# Plain stochastic gradient descent over both embedding tables
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

for e in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        # Word indices must be int64 for the embedding lookups.
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        # Standard step: reset gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()

        loss.backward()
        optimizer.step()

        # Report progress every 100 iterations, both to the log file
        # (appended) and to the cell output.
        if i % 100 == 0:
            message = "epoch: {}, iter: {}, loss: {}".format(e, i, loss.item())
            with open(LOG_FILE, "a") as fout:
                fout.write(message + "\n")
            print(message)

    # End of epoch: save the center-word embeddings and the full model state.
    # Each epoch overwrites the previous checkpoint files.
    embedding_weights = model.input_embeddings()
    np.save("embedding-{}".format(EMBEDDING_SIZE), embedding_weights)
    torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

epoch: 0, iter: 0, loss: 420.04718017578125
epoch: 0, iter: 100, loss: 267.3180236816406
epoch: 0, iter: 200, loss: 189.4784393310547
epoch: 0, iter: 300, loss: 217.42340087890625
epoch: 0, iter: 400, loss: 145.1310272216797
epoch: 0, iter: 500, loss: 131.80780029296875
epoch: 0, iter: 600, loss: 157.67408752441406
epoch: 0, iter: 700, loss: 113.30000305175781
epoch: 0, iter: 800, loss: 179.28933715820312
epoch: 0, iter: 900, loss: 79.97330474853516
epoch: 0, iter: 1000, loss: 83.37088012695312
epoch: 0, iter: 1100, loss: 124.0562515258789
epoch: 0, iter: 1200, loss: 79.22520446777344
epoch: 0, iter: 1300, loss: 114.33177185058594
epoch: 0, iter: 1400, loss: 103.77493286132812
epoch: 0, iter: 1500, loss: 76.44828033447266
epoch: 0, iter: 1600, loss: 121.24266052246094
epoch: 0, iter: 1700, loss: 146.60772705078125
epoch: 0, iter: 1800, loss: 109.6523208618164
epoch: 0, iter: 1900, loss: 36.54298782348633
epoch: 0, iter: 2000, loss: 75.1758804321289
epoch: 0, iter: 2100, loss: 85.203636