In [4]:
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import copy
import os
import numpy as np

def read_data(data_path):
    print("---读取文本数据---")
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    with open(data_path, 'r', encoding='utf-8') as file:
        text = file.read().lower().split()
    text = [word for word in text if word not in english_stopwords]
    text = [stemmer.stem(word) for word in text]#>>10890638
    np.save('sgns_data/text.npy',text)
def map(data_path):
    text=np.load(data_path)
    vocab_dict = dict(Counter(text))# 得到单词字典表，key是单词，value是次数
    word2idx = {word:i for i, word in enumerate(vocab_dict.keys())}
    idx2word = {i:word for i, word in enumerate(vocab_dict.keys())}
    word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
    word_freqs =word_counts ** (3./4.)
    word_freqs = word_freqs / np.sum(word_freqs)
    return text,word2idx,idx2word,word_freqs
class WordEmbeddingDataset(Dataset):
    def __init__(self, text, word2idx, word_freqs,window_size,K):
        ''' text: a list of words, all text from the training dataset
            word2idx: the dictionary from word to index
            word_freqs: the frequency of each word
        '''
        super().__init__() # #通过父类初始化模型，然后重写两个方法
        self.text=text
        self.text_encoded = [word2idx.get(word) for word in self.text] # 把文本转换成数字编码
        self.text_encoded = torch.tensor(self.text_encoded,dtype=torch.int32)
        self.word2idx = word2idx
        self.word_freqs = torch.tensor(word_freqs,dtype=torch.float32)
        self.window_size=window_size
        self.K=K
        
    def __len__(self):
        return len(self.text_encoded) # 返回所有单词的总数，即item的总数
    
    def __getitem__(self, idx) -> tuple[torch.Tensor,torch.Tensor,torch.Tensor] :
        ''' 这个function返回以下数据用于训练
            - 中心词
            - 这个单词附近的positive word
            - 随机采样的K个单词作为negative word
        '''
        center_word = self.text_encoded[idx] # 取得中心词
        left = list(range(idx - self.window_size, idx))
        right = list(range(idx + 1, idx + self.window_size + 1))
        pos_indices = [i % len(self.text) for i in left + right]  
        # pos_indices=list(range(max(0,idx-self.window),min(idx+1+self.window,len(self.text))))
        # pos_indices.remove(idx)      
        pos_words = self.text_encoded[pos_indices]
        select_weight = copy.deepcopy(self.word_freqs)
        select_weight[pos_words] = 0
        select_weight[center_word] = 0
        neg_words = torch.multinomial(select_weight, self.K * pos_words.shape[0], True)
        # torch.multinomial作用是对self.word_freqs做K * pos_words.shape[0]次取值，输出的是self.word_freqs对应的下标
        # 取样方式采用有放回的采样，并且self.word_freqs数值越大，取样概率越大
        # 每采样一个正确的单词(positive word)，就采样K个错误的单词(negative word)，pos_words.shape[0]是正确单词数量
        # while 循环是为了保证 neg_words中不能包含背景词
        return center_word, pos_words, neg_words
class SGNS(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SGNS, self).__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size 
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

    def forward(self, center_word, target_word, negative_word):
        ''' input_labels: center words, [batch_size, 1]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels:negative words, [batch_size, (window_size * 2 * K)]
            
            return: loss, [batch_size]  
        '''
        emb_v = self.in_embed(center_word) # [batch_size, 1, embed_size]
        emb_u = self.out_embed(target_word) # [batch_size, (window * 2), embed_size]
        emb_neg = self.out_embed(negative_word)  # [batch_size, (window * 2 * K), embed_size]      
        emb_v=emb_v.unsqueeze(2)
        pos_score =torch.bmm(emb_u, emb_v) #中心词和上下文词是一对一的因此逐个乘
        pos_score =pos_score.squeeze(2)
        neg_score = torch.bmm(emb_neg,emb_v)
        #一个[中心词,上下文词]词对，对应K个负样本，因此要批量相乘
        log_pos =  F.logsigmoid(pos_score).sum(1) # .sum()结果只为一个数，.sum(1)结果是一维的张量
        log_neg =  F.logsigmoid(-1 * neg_score).sum(1)
        loss = torch.sum(log_pos) + torch.sum(log_neg)
        return -loss
    def input_embedding(self):
        return self.embedding_u.weight.data.cpu().numpy()

In [2]:
text,word2idx,idx2word,word_freqs=map('text.npy')

In [3]:
window=2
K=15
running_loss=0
vocab_size=len(word2idx)
dataset=WordEmbeddingDataset(text,word2idx,word_freqs,window,K)

In [6]:
dataloader = DataLoader(dataset, batch_size=2)

In [7]:
model = SGNS(vocab_size, 100)

In [8]:
for i, batch_data in enumerate(dataloader):
    if i == 1:
        # 找到了我们想要的批次
        input_data, positive_data, negative_data = batch_data
        break  # 找到后退出循环


In [9]:
in_embed = nn.Embedding(vocab_size, 100)
input_data, positive_data, negative_data = in_embed(input_data),in_embed(positive_data),in_embed(negative_data)

In [10]:
print(input_data.shape,positive_data.shape,negative_data.shape)

torch.Size([2, 100]) torch.Size([2, 4, 100]) torch.Size([2, 60, 100])


In [12]:
print(len(dataloader))

5445319


In [13]:
input_data=input_data.unsqueeze(2)
x=torch.bmm(positive_data,input_data)

In [14]:
print(input_data.shape)

torch.Size([2, 100, 1])


In [15]:
zz=torch.bmm(negative_data,input_data)

In [16]:
print(x.shape)

torch.Size([2, 4, 1])


In [17]:
x=x.squeeze(2)
print(x)

tensor([[-12.0917,   8.2146,   0.8796,   6.6761],
        [  5.1587,   0.8796,  -4.4565,   1.4659]], grad_fn=<SqueezeBackward1>)


In [20]:
log_pos = F.logsigmoid(x)
print(log_pos)
print(log_pos.shape)
log_pos=log_pos.sum(1)
print(log_pos.shape)
print(log_pos)

tensor([[-1.2092e+01, -2.7064e-04, -3.4708e-01, -1.2599e-03],
        [-5.7325e-03, -3.4708e-01, -4.4680e+00, -2.0773e-01]],
       grad_fn=<LogSigmoidBackward0>)
torch.Size([2, 4])
torch.Size([2])
tensor([-12.4403,  -5.0286], grad_fn=<SumBackward1>)


In [21]:
y=log_pos.mean()
print(y)

tensor(-8.7344, grad_fn=<MeanBackward0>)


In [43]:
print(y)

tensor(-11.5785, grad_fn=<MeanBackward0>)


In [29]:
print(input_data.shape)

torch.Size([8, 100, 1])


RuntimeError: batch1 must be a 3D tensor

In [None]:
print(input_data)

In [18]:
loss = model(input_data, positive_data, negative_data)

torch.Size([8, 100]) torch.Size([8, 4, 100]) torch.Size([8, 60, 100])


RuntimeError: The size of tensor a (8) must match the size of tensor b (4) at non-singleton dimension 1

In [26]:
input_labels, pos_labels, neg_labels =dataset[2]
input_labels = input_labels.unsqueeze(0)  # 或者 input_labels.unsqueeze_(0)
pos_labels = pos_labels.unsqueeze(0)    # 或者 pos_labels.unsqueeze_(0)
neg_labels = neg_labels.unsqueeze(0)    
loss = model(input_labels, pos_labels, neg_labels)

torch.Size([1, 100]) torch.Size([1, 4, 100]) torch.Size([1, 60, 100])


In [12]:
print(loss.item())

252.2958984375


In [31]:
window=2
K=15
running_loss=0
vocab_size=len(word2idx)
dataset=WordEmbeddingDataset(text,word2idx,word_freqs,window,K)
dataloader = DataLoader(dataset, batch_size=256,shuffle=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = SGNS(vocab_size, 100)
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
for e in range(1):
    print("---start---")
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels)
        running_loss += loss.item()
        loss.backward()

        optimizer.step()

        if i % 20000 == 0:
            print('epoch', e, 'iteration', i, running_loss/(i+1))
torch.save(model.state_dict(), "embedding.th")
embedding_weights = model.input_embedding()


In [1]:
for epoch in range(1, args.epoch + 1):
        dataset = SGNSDataset(word2idx, text_idx, words_freq, args)
        dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
        print('dataset size: {}'.format(len(dataset)))
        print('batch num: {}'.format(len(dataloader)))
        train_loss = 0
        loss_o = 0
        loss_n = 0
        step = 0
        avg_err = 0
        print('Starting epoch: {}'.format(epoch))
        for _, (iword, owords, nwords) in enumerate(dataloader):
            step += 1
            if epoch == 1 and step <= 1:
                print('iword: {}, shape: {}, max: {}'.format(iword, iword.shape, iword.max()))
                print('owords: {}, shape: {}, max: {}'.format(owords, owords.shape, owords.max()))
                print('nwords: {}, shape: {}, max: {}'.format(nwords, nwords.shape, nwords.max()))
            sys.stdout.flush()
            optim.zero_grad()
            if args.cuda == 'True':
                iword, owords, nwords = iword.cuda(), owords.cuda(), nwords.cuda()
            score_o, score_n = sgns(iword, owords, nwords)
            # loss = torch.mean(score_o + score_n)
            loss = score_o + score_n
            train_loss += loss.item()
            loss_o += score_o.item()
            loss_n += score_n.item()
            loss.backward()
            optim.step()

            # print the training stats
            if step % 1000 == 0:
                in_embed = sgns.get_embeddings('in')
                avg_err = calc_err(in_embed, word2idx, args)
                print('Epoch: {}, Step: {}, train_loss: {}, score_o: {}, score_n: {}, avg_error: {}'.format(epoch, step, loss.item(), score_o.item(), score_n.item(), avg_err))
                sys.stdout.flush()

                # test the embedding
                test_list = ['rain', 'utah', 'computer', 'brother', 'house']
                for test_w in test_list:
                    print('Words closest to chosen word: {}'.format(test_w))
                    most_similar(test_w, in_embed, vocabulary, word2idx)

        train_loss /= step
        loss_o /= step
        loss_n /= step
        print('Finished Epoch: {}, train_loss: {}, loss_o: {}, loss_n: {}, avg error: {}'.format(epoch, train_loss, loss_o, loss_n, avg_err))


NameError: name 'dataset' is not defined

In [None]:
EMBEDDING_SIZE = 100
class SGNS(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SGNS, self).__init__()
        
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)
        
        init_scale = 0.5 / self.embed_size
        self.in_embed.weight.data.uniform_(-init_scale, init_scale)
        self.out_embed.weight.data.uniform_(-init_scale, init_scale)
    def forward(self, input_labels, pos_labels, neg_labels):
        ''' input_labels: center words, [batch_size]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels:negative words, [batch_size, (window_size * 2 * K)]
            
            return: loss, [batch_size]
        '''
        input_embedding = self.in_embed(input_labels) # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)# [batch_size, (window * 2), embed_size]
        neg_embedding = self.out_embed(neg_labels) # [batch_size, (window * 2 * K), embed_size]
        
        input_embedding = input_embedding.unsqueeze(2) # [batch_size, embed_size, 1]
        
        pos_dot = torch.bmm(pos_embedding, input_embedding) # [batch_size, (window * 2), 1]
        pos_dot = pos_dot.squeeze(2) # [batch_size, (window * 2)]
        
        neg_dot = torch.bmm(neg_embedding, -input_embedding) # [batch_size, (window * 2 * K), 1]
        neg_dot = neg_dot.squeeze(2) # batch_size, (window * 2 * K)]
        
        log_pos = F.logsigmoid(pos_dot).sum(1) # .sum()结果只为一个数，.sum(1)结果是一维的张量
        log_neg = F.logsigmoid(neg_dot).sum(1)
        
        loss = log_pos + log_neg
        
        return -loss
    
    # def input_embedding(self):
    #     return self.in_embed.weight.detach().numpy()


In [None]:
x=torch.tensor([[1],[2]])
print(x.shape)
x=x.unsqueeze(1)
print(x)
print(x.shape)

torch.Size([2, 1])
tensor([[[1]],

        [[2]]])
torch.Size([2, 1, 1])


In [None]:
dataloader = DataLoader(dataset, batch_size=256,shuffle=True,num_workers=2,pin_memory=True,persistent_workers=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = SGNS(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
for e in range(1):
    print("---start---")
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()

        optimizer.step()

        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item())
# embedding_weights = model.input_embedding()
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

cuda
---start---


In [None]:
import torch
import numpy as np
vocab_size=10000
embed_size=100
vec_sta=np.zeros((vocab_size,embed_size))
model = SGNS(vocab_size, embed_size)
# 加载模型的状态字典
state_dict = torch.load('embedding-100.th')
# 加载模型参数
model.load_state_dict(state_dict)
model.eval()
for i in range(vocab_size):
    word_vectors = model.in_embed(torch.tensor(i))
    vec_sta[i]=word_vectors
print(word_vectors.shape)
# for i in range(vocab_size):
#     word_vectors = model.in_embed(i)
#     vec_sta[i]


In [None]:
vec_sta=model.in_embed.weight.detach().numpy()

In [None]:
print(vec_sta.shape)

(10000, 100)


In [None]:
vocabulary=[word for word in vocab_dict.keys()]
result=[]
with open('wordsim353_agreed.txt', 'r') as file:
    for line in file:
        parts = line.strip().split('\t') 
        word1,word2=parts[1],parts[2]
        if word1 in vocabulary :
            index1 = word2idx[word1]
        else :
            index1 = 9999
        if word2 in vocabulary:
            index2 = word2idx[word2]  
        else :
            index2 = 9999
        # 计算余弦相似度
        vec1 = vec_sta[index1]
        vec2 = vec_sta[index2]
        cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
        result.append([word1,word2,cosine_sim])



NameError: name 'vocab_dict' is not defined

In [None]:
print(result[1])

['tiger', 'cat', -0.12228242]


In [None]:
with open('output_sgns.txt', 'w') as file:
    # 遍历结果列表
    for item in result:
        # 将每个元素转换为字符串，并以制表符分隔
        line = '\t'.join(map(str, item))
        # 写入文件
        file.write(line + '\n')


In [147]:
class WordEmbeddingDataset(Dataset):
    def __init__(self, text, word2idx, word_freqs,window,K):
        ''' text: a list of words, all text from the training dataset
            word2idx: the dictionary from word to index
            word_freqs: the frequency of each word
        '''
        super(WordEmbeddingDataset, self).__init__() # #通过父类初始化模型，然后重写两个方法
        # self.text_encoded = [word2idx.get(word, word2idx['<UNK>']) for word in text] # 把文本数字化表示。如果不在词典中，也表示为unk
        self.text_encoded = [word2idx.get(word,word2idx['<UNK>']) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word2idx = word2idx
        self.word_freqs = torch.Tensor(word_freqs)
        self.window=window
        self.K=K
        
    def __len__(self):
        return len(self.text_encoded) # 返回所有单词的总数，即item的总数
    
    def __getitem__(self, idx):
        ''' 这个function返回以下数据用于训练
            - 中心词
            - 这个单词附近的positive word
            - 随机采样的K个单词作为negative word
        '''
        center_word = torch.LongTensor([self.text_encoded[idx]]) # 取得中心词
        pos_indices = list(range(idx - self.window, idx)) + list(range(idx +1, idx + self.window + 1)) # 先取得中心左右各C个词的索引
        pos_indices = [i % len(self.text_encoded) for i in pos_indices] # 为了避免索引越界，所以进行取余处理
        pos_words = self.text_encoded[pos_indices]
        # x=[-2,-1,0,1,2]
        # pos_indices = [i % 10000 for i in x] <<[9998, 9999, 0, 1, 2]
        select_weight = copy.deepcopy(self.word_freqs)
        select_weight[pos_words] = 0
        select_weight[center_word] = 0
        neg_words = torch.multinomial(select_weight, self.K * pos_words.shape[0], True)
        # torch.multinomial作用是对self.word_freqs做K * pos_words.shape[0]次取值，输出
        # 的是self.word_freqs对应的下标
        # 取样方式采用有放回的采样，并且self.word_freqs数值越大，取样概率越大
        # 每采样一个正确的单词(positive word)，就采样K个错误的单词(negative word)，pos_words.shape[0]是正确单词数量
        # while 循环是为了保证 neg_words中不能包含背景词
        return center_word, pos_words, neg_words

In [178]:
class SGNS(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SGNS, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.embedding_v = nn.Embedding(self.vocab_size, self.embed_size, sparse=True)
        self.embedding_u = nn.Embedding(self.vocab_size, self.embed_size, sparse=True)
        
    def forward(self, center_word, target_word, negative_word):
        ''' input_labels: center words, [batch_size, 1]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels:negative words, [batch_size, (window_size * 2 * K)]
            
            return: loss, [batch_size]  
        '''
        print(center_word.shape,target_word.shape,negative_word.shape)
        emb_v = self.embedding_v(center_word) # [batch_size, 1, embed_size]
        emb_u = self.embedding_u(target_word) # [batch_size, (window * 2), embed_size]
        emb_neg = self.embedding_u(negative_word)  # [batch_size, (window * 2 * K), embed_size]      
        print(emb_v.shape,emb_u.shape,emb_neg.shape) 
        pos_score = torch.sum(torch.mul(emb_v, emb_u), dim=2) #中心词和上下文词是一对一的因此逐个乘
        neg_score = torch.sum(torch.mul( emb_neg, emb_v),dim=2)
        #一个[中心词,上下文词]词对，对应K个负样本，因此要批量相乘
        log_pos =  F.logsigmoid(pos_score).squeeze() # .sum()结果只为一个数，.sum(1)结果是一维的张量
        log_neg =  F.logsigmoid(-1 * neg_score).squeeze()
        loss = log_pos + log_neg
        return -loss
    def input_embedding(self):
        return self.embedding_u.weight.data.cpu().numpy()

In [148]:
dataset=WordEmbeddingDataset(text,word2idx,word_freqs,2,15)

In [149]:
import copy
print(dataset[0])


(tensor([5233]), tensor([  15,   72, 3080,   11]), tensor([[ 433,   77, 5937,    7,   24, 7952, 5292,   14, 1530, 3741, 8741,  670,
         1921,  112,    4],
        [ 499, 2089,  786, 5164,   18, 6964, 2594, 9847, 4073,    3,   74, 5299,
         2371,    2, 2485],
        [   1,    0, 1354,  821, 4075,   24, 4855,    5,  175,   73, 4699,  936,
          307, 9016, 7076],
        [3636, 2822, 1257,  292,  157,  853,   76, 9999,  725,  573, 7469, 8666,
         1303,  253, 3942]]))


In [152]:
def build_vocab():
    with open('lmtraining.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    text=text.lower().split()
    # print(type(text)) >>list
    # print(len(text)) >>17005207
    # vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1)) # 得到单词字典表，key是单词，value是次数
    # vocab_dict['<UNK>'] = len(text) - np.sum(list(vocab_dict.values())) # 把不常用的单词都编码为"<UNK>"
    vocab_dict=dict(Counter(text)) #total=253854
    word2idx = {word:i for i, word in enumerate(vocab_dict.keys())}
    idx2word = {i:word for i, word in enumerate(vocab_dict.keys())}
    word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
    word_freqs = word_counts ** (3./4.)
    word_freqs = word_freqs / np.sum(word_freqs)
    return text,vocab_dict,word2idx,idx2word,word_freqs
MAX_VOCAB_SIZE=10000 #total=253854
text,vocab_dict,word2idx,idx2word,word_freqs=build_vocab()
vocab_size=len(vocab_dict)


In [176]:
vocab_size=len(vocab_dict)
print(vocab_size)

20000


In [2]:
dataloader=DataLoader(dataset,1)
model = SGNS(vocab_size,MAX_VOCAB_SIZE )
input_labels, pos_labels, neg_labels=dataset[0]
loss = model(input_labels, pos_labels, neg_labels)
print(loss.shape)

NameError: name 'dataset' is not defined