In [1]:
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from collections import Counter
import numpy as np
import random
import scipy
from sklearn.metrics.pairwise import cosine_similarity
# Vocabulary cap, counting the '<UNK>' bucket (original note: total=253854).
MAX_VOCAB_SIZE = 10000
with open('lmtraining.txt', 'r', encoding='utf-8') as file:
    # Lower-case the whole corpus and split it on whitespace.
    text = file.read().lower().split()
# Notes from the original run: type(text) is list, len(text) is 17005207.

# Keep the (MAX_VOCAB_SIZE - 1) most frequent words; key = word, value = count.
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
# All remaining, rarer words are pooled into one "<UNK>" entry added last.
vocab_dict['<UNK>'] = len(text) - np.sum(list(vocab_dict.values()))
word2idx = {token: position for position, token in enumerate(vocab_dict)}
idx2word = dict(enumerate(vocab_dict))
word_counts = np.array(list(vocab_dict.values()), dtype=np.float32)
# Counts raised to the 3/4 power, then normalised so the weights sum to 1.
word_freqs = np.power(word_counts, 3. / 4.)
word_freqs = word_freqs / word_freqs.sum()

In [175]:
# Occurrence count of the rarest word that was kept in the vocabulary.
# vocab_dict is filled in descending-frequency order and '<UNK>' is appended
# last, so the rarest kept word is the second-to-last entry. Indexing from the
# end replaces the hard-coded position 19998, which raises IndexError as soon
# as the vocabulary is not exactly 20000 entries long.
z = list(vocab_dict.values())
print(z[-2])

44


In [None]:
# Smallest occurrence count over all vocabulary entries.
# Uses the built-in min() directly. The earlier hand-written loop kept its
# running value in a variable named `min` (shadowing the built-in for every
# later cell) and started from a fixed bound of 10000, which would be reported
# as the answer whenever every count is larger than that.
min_count = min(vocab_dict.values())
print(min_count)  # 126 in the recorded run
    

126


In [None]:
import copy
class WordEmbeddingDataset(Dataset):
    """Skip-gram samples with negative sampling, one per corpus position.

    Every item consists of the centre word at a position, the words inside a
    symmetric window around it (positives) and randomly drawn negatives.
    """

    def __init__(self, text, word2idx, word_freqs, window, K):
        ''' text: a list of words, all text from the training dataset
            word2idx: the dictionary from word to index; must contain '<UNK>'
            word_freqs: the sampling weight of each word, indexed like word2idx
            window: number of context words taken on each side of the centre
            K: number of negative words drawn per context word
        '''
        super(WordEmbeddingDataset, self).__init__()
        # Encode the corpus as indices; words missing from the dictionary map
        # to '<UNK>'. The '<UNK>' index is looked up once, not once per token.
        unk_index = word2idx['<UNK>']
        self.text_encoded = torch.LongTensor([word2idx.get(word, unk_index) for word in text])
        self.word2idx = word2idx
        self.word_freqs = torch.Tensor(word_freqs)
        self.window = window
        self.K = K

    def __len__(self):
        # One item per token of the corpus.
        return len(self.text_encoded)

    def __getitem__(self, idx):
        ''' Returns the data used for one training step:
            - the centre word (0-dim LongTensor)
            - the positive words around it, shape [2 * window]
            - K negative words per positive word, shape [2 * window * K]
        '''
        center_word = self.text_encoded[idx]
        # Positions of the `window` words on each side of the centre.
        pos_indices = list(range(idx - self.window, idx)) + list(range(idx + 1, idx + self.window + 1))
        # Wrap positions with a modulo so they never fall outside the corpus;
        # at either end of the text the window therefore wraps around.
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]
        # Work on a private copy of the weights so the shared table stays
        # intact. Zero weight for the context words and the centre word
        # guarantees they are never drawn as negatives.
        select_weight = self.word_freqs.clone()
        select_weight[pos_words] = 0
        select_weight[center_word] = 0
        # Draw with replacement; a larger weight means a higher probability.
        # The returned values are indices into the weight table (word ids).
        neg_words = torch.multinomial(select_weight, self.K * pos_words.shape[0], True)
        return center_word, pos_words, neg_words

In [None]:
# range(1, 3) excludes its upper bound.
print(list(range(1, 3)))

[1, 2]


In [None]:
# Training dataset: 2 context words on each side, 15 negatives per context word.
dataset = WordEmbeddingDataset(text, word2idx, word_freqs, window=2, K=15)

In [None]:
# First training sample: (centre word, context words, negative words).
dataset[0]

(tensor(5233),
 tensor([  15,   72, 3080,   11]),
 tensor([   0, 1487,    1, 1761, 1904, 2113, 1431, 8095, 9999,  420,  437,   59,
          703,   89, 1173, 2294, 1207, 6486, 6267,  138,   35,    2, 3453, 9999,
         1049, 9999, 3800,    5, 4378,  242,  974, 1529,   97, 2369, 9999,  378,
          719,    2,   27, 3877,  163, 9999, 5427, 7108, 9999, 2944, 2191, 2173,
          113,   65,  710,   24,   16, 6522, 1599,    1,  425, 3900, 9999,  158]))

In [None]:
EMBEDDING_SIZE = 100


class SGNS(nn.Module):
    """Skip-gram with negative sampling: two embedding tables plus the loss."""

    def __init__(self, vocab_size, embed_size):
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size

        # in_embed is looked up for centre words, out_embed for context and
        # negative words.
        self.in_embed = nn.Embedding(vocab_size, embed_size)
        self.out_embed = nn.Embedding(vocab_size, embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        ''' input_labels: center words, [batch_size]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels: negative words, [batch_size, (window_size * 2 * K)]

            return: loss, [batch_size]
        '''
        # Centre vectors as columns so they can be batch-multiplied:
        # [batch_size, embed_size, 1]
        center = self.in_embed(input_labels).unsqueeze(2)
        context = self.out_embed(pos_labels)    # [batch_size, window * 2, embed_size]
        negatives = self.out_embed(neg_labels)  # [batch_size, window * 2 * K, embed_size]

        # Dot product of every context / negative vector with its centre vector.
        pos_dot = torch.bmm(context, center).squeeze(2)      # [batch_size, window * 2]
        # The centre vector is negated so logsigmoid rewards low similarity.
        neg_dot = torch.bmm(negatives, -center).squeeze(2)   # [batch_size, window * 2 * K]

        # .sum(1) reduces over the words of each sample, leaving one value per sample.
        log_likelihood = F.logsigmoid(pos_dot).sum(1) + F.logsigmoid(neg_dot).sum(1)
        return -log_likelihood


In [None]:
# unsqueeze inserts a new axis of size 1 at the given position.
x = torch.tensor([[1], [2]])
print(x.size())
x = torch.unsqueeze(x, 1)
print(x)
print(x.size())

torch.Size([2, 1])
tensor([[[1]],

        [[2]]])
torch.Size([2, 1, 1])


In [None]:
# Batches of 256 shuffled samples, loaded by two persistent worker processes.
dataloader = DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    persistent_workers=True,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = SGNS(MAX_VOCAB_SIZE, EMBEDDING_SIZE).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
for epoch in range(1):
    print("---start---")
    for step, batch in enumerate(dataloader):
        # Move the three index tensors of the batch to the training device.
        input_labels, pos_labels, neg_labels = (part.long().to(device) for part in batch)

        optimizer.zero_grad()
        # The model returns one loss per sample; train on the batch mean.
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        # Report the loss every 100 iterations.
        if step % 100 == 0:
            print('epoch', epoch, 'iteration', step, loss.item())
# Persist the trained parameters once training has finished.
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

cuda
---start---


In [None]:
import torch
import numpy as np
vocab_size = 10000
embed_size = 100
model = SGNS(vocab_size, embed_size)
# Load the saved state dict. map_location='cpu' lets a checkpoint written from
# a CUDA model be restored on a machine without a GPU as well.
state_dict = torch.load('embedding-100.th', map_location='cpu')
# Copy the saved parameters into the freshly built model.
model.load_state_dict(state_dict)
model.eval()
# Export the whole centre-word table at once. The earlier per-row loop assigned
# tensors that still require grad into a NumPy array, which raises
# "Can't call numpy() on Tensor that requires grad"; detach() avoids that.
vec_sta = model.in_embed.weight.detach().cpu().numpy()
print(vec_sta.shape)


In [None]:
# Centre-word embedding table as a NumPy array, one row per vocabulary index.
# .cpu() is a no-op for a CPU model and prevents the conversion error raised by
# .numpy() when the model currently lives on the GPU.
vec_sta = model.in_embed.weight.detach().cpu().numpy()

In [None]:
# Sanity check: shape of the exported embedding matrix.
print(vec_sta.shape)

(10000, 100)


In [None]:
# Score every WordSim-353 pair with the cosine similarity of the learned vectors.
vocabulary = list(vocab_dict.keys())
# Index used for out-of-vocabulary words, read from the dictionary instead of
# assuming it sits at position 9999.
unk_index = word2idx['<UNK>']
result = []
with open('wordsim353_agreed.txt', 'r') as file:
    for line in file:
        parts = line.strip().split('\t')
        # Blank or malformed lines do not carry a word pair; skip them instead
        # of failing with IndexError.
        if len(parts) < 3:
            continue
        word1, word2 = parts[1], parts[2]
        # The training corpus was lower-cased, so look words up in lower case.
        # A dict lookup also replaces the linear scan of the vocabulary list.
        index1 = word2idx.get(word1.lower(), unk_index)
        index2 = word2idx.get(word2.lower(), unk_index)
        # Cosine similarity of the two embedding vectors.
        vec1 = vec_sta[index1]
        vec2 = vec_sta[index2]
        cosine_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
        # The words are stored exactly as written in the benchmark file.
        result.append([word1, word2, cosine_sim])



NameError: name 'vocab_dict' is not defined

In [None]:
# Spot-check the second scored pair: [word1, word2, cosine similarity].
print(result[1])

['tiger', 'cat', -0.12228242]


In [None]:
# Write each scored pair as one line, with its fields converted to strings
# and separated by tabs.
with open('output_sgns.txt', 'w') as file:
    file.writelines(
        '\t'.join(str(field) for field in item) + '\n'
        for item in result
    )


In [147]:
class WordEmbeddingDataset(Dataset):
    """Skip-gram samples with negative sampling, one per corpus position.

    Same as a plain skip-gram dataset, except that the centre word is returned
    as a 1-element tensor, so a batch of centre words has shape [batch_size, 1].
    """

    def __init__(self, text, word2idx, word_freqs, window, K):
        ''' text: a list of words, all text from the training dataset
            word2idx: the dictionary from word to index; must contain '<UNK>'
            word_freqs: the sampling weight of each word, indexed like word2idx
            window: number of context words taken on each side of the centre
            K: number of negative words drawn per context word
        '''
        super(WordEmbeddingDataset, self).__init__()
        # Encode the corpus as indices; words missing from the dictionary map
        # to '<UNK>'. The '<UNK>' index is looked up once, not once per token.
        unk_index = word2idx['<UNK>']
        self.text_encoded = torch.LongTensor([word2idx.get(word, unk_index) for word in text])
        self.word2idx = word2idx
        self.word_freqs = torch.Tensor(word_freqs)
        self.window = window
        self.K = K

    def __len__(self):
        # One item per token of the corpus.
        return len(self.text_encoded)

    def __getitem__(self, idx):
        ''' Returns the data used for one training step:
            - the centre word, shape [1]
            - the positive words around it, shape [2 * window]
            - K negative words per positive word, shape [2 * window * K]
        '''
        # A fresh 1-element tensor holding the centre word index.
        center_word = torch.LongTensor([int(self.text_encoded[idx])])
        # Positions of the `window` words on each side of the centre.
        pos_indices = list(range(idx - self.window, idx)) + list(range(idx + 1, idx + self.window + 1))
        # Wrap positions with a modulo so they never fall outside the corpus;
        # at either end of the text the window therefore wraps around.
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]
        # Work on a private copy of the weights so the shared table stays
        # intact. Zero weight for the context words and the centre word
        # guarantees they are never drawn as negatives.
        select_weight = self.word_freqs.clone()
        select_weight[pos_words] = 0
        select_weight[center_word] = 0
        # Draw with replacement; a larger weight means a higher probability.
        # The returned values are indices into the weight table (word ids).
        neg_words = torch.multinomial(select_weight, self.K * pos_words.shape[0], True)
        return center_word, pos_words, neg_words

In [178]:
class SGNS(nn.Module):
    """Skip-gram with negative sampling using sparse-gradient embeddings.

    embedding_v is looked up for centre words, embedding_u for context and
    negative words.
    """

    def __init__(self, vocab_size, embed_size):
        super(SGNS, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.embedding_v = nn.Embedding(self.vocab_size, self.embed_size, sparse=True)
        self.embedding_u = nn.Embedding(self.vocab_size, self.embed_size, sparse=True)

    def forward(self, center_word, target_word, negative_word):
        ''' center_word: center words, [batch_size, 1]
            target_word: positive words, [batch_size, (window_size * 2)]
            negative_word: negative words, [batch_size, (window_size * 2 * K)]

            A single un-batched sample (1-D target_word, as returned by
            indexing the dataset directly) is accepted too and treated as a
            batch of one.

            return: loss, [batch_size]
        '''
        if target_word.dim() == 1:
            # Un-batched sample: add the missing batch axis.
            target_word = target_word.unsqueeze(0)
        batch_size = target_word.shape[0]
        # Normalise shapes: one centre word per sample, negatives flattened
        # into a single axis per sample.
        center_word = center_word.reshape(batch_size, 1)
        negative_word = negative_word.reshape(batch_size, -1)

        emb_v = self.embedding_v(center_word)      # [batch_size, 1, embed_size]
        emb_u = self.embedding_u(target_word)      # [batch_size, (window * 2), embed_size]
        emb_neg = self.embedding_u(negative_word)  # [batch_size, (window * 2 * K), embed_size]

        # Dot product of every context / negative vector with its centre
        # vector; the size-1 axis of emb_v broadcasts over the words.
        pos_score = torch.sum(torch.mul(emb_u, emb_v), dim=2)    # [batch_size, (window * 2)]
        neg_score = torch.sum(torch.mul(emb_neg, emb_v), dim=2)  # [batch_size, (window * 2 * K)]

        # Sum over the words of each sample. The positive and negative terms
        # have different widths, so they can only be added after this
        # reduction (adding them element-wise is a shape error).
        log_pos = F.logsigmoid(pos_score).sum(1)
        log_neg = F.logsigmoid(-1 * neg_score).sum(1)
        loss = log_pos + log_neg
        return -loss

    def input_embedding(self):
        """Return the centre-word (input) embedding table as a NumPy array."""
        return self.embedding_v.weight.detach().cpu().numpy()

In [148]:
# Training dataset: 2 context words on each side, 15 negatives per context word.
dataset = WordEmbeddingDataset(text, word2idx, word_freqs, window=2, K=15)

In [149]:
import copy
# Show the first training sample: (centre word, context words, negative words).
print(dataset[0])


(tensor([5233]), tensor([  15,   72, 3080,   11]), tensor([[ 433,   77, 5937,    7,   24, 7952, 5292,   14, 1530, 3741, 8741,  670,
         1921,  112,    4],
        [ 499, 2089,  786, 5164,   18, 6964, 2594, 9847, 4073,    3,   74, 5299,
         2371,    2, 2485],
        [   1,    0, 1354,  821, 4075,   24, 4855,    5,  175,   73, 4699,  936,
          307, 9016, 7076],
        [3636, 2822, 1257,  292,  157,  853,   76, 9999,  725,  573, 7469, 8666,
         1303,  253, 3942]]))


In [152]:
def build_vocab(path='lmtraining.txt', max_vocab_size=None):
    """Read a corpus and derive the vocabulary tables and sampling weights.

    Args:
        path: UTF-8 text file to read. Defaults to 'lmtraining.txt', the file
            that used to be hard-coded.
        max_vocab_size: None (default) keeps every distinct word, exactly as
            before. An integer >= 1 keeps the (max_vocab_size - 1) most
            frequent words and pools all others into a final '<UNK>' entry.

    Returns:
        A tuple (text, vocab_dict, word2idx, idx2word, word_freqs):
        text is the lower-cased, whitespace-split corpus; vocab_dict maps
        word -> count; word2idx / idx2word map between words and indices;
        word_freqs holds the counts raised to the 3/4 power, normalised to
        sum to 1.

    Raises:
        ValueError: if max_vocab_size is given and smaller than 1.

    Note:
        With the default max_vocab_size=None the result has no '<UNK>' key,
        while WordEmbeddingDataset in this notebook looks up word2idx['<UNK>'].
    """
    if max_vocab_size is not None and max_vocab_size < 1:
        raise ValueError("max_vocab_size must be at least 1, got {}".format(max_vocab_size))
    with open(path, 'r', encoding='utf-8') as file:
        text = file.read().lower().split()
    counts = Counter(text)
    if max_vocab_size is None:
        vocab_dict = dict(counts)
    else:
        # Most frequent words first; everything else is counted under '<UNK>'.
        vocab_dict = dict(counts.most_common(max_vocab_size - 1))
        vocab_dict['<UNK>'] = len(text) - sum(vocab_dict.values())
    word2idx = {word: i for i, word in enumerate(vocab_dict)}
    idx2word = {i: word for word, i in word2idx.items()}
    word_counts = np.array(list(vocab_dict.values()), dtype=np.float32)
    word_freqs = word_counts ** (3. / 4.)
    word_freqs = word_freqs / np.sum(word_freqs)
    return text, vocab_dict, word2idx, idx2word, word_freqs
# MAX_VOCAB_SIZE is not passed to build_vocab() below (original note: total=253854).
MAX_VOCAB_SIZE = 10000
(text, vocab_dict, word2idx, idx2word, word_freqs) = build_vocab()
# Number of distinct entries in the vocabulary that was just built.
vocab_size = len(vocab_dict)


In [176]:
# Recompute the vocabulary size from the current vocab_dict and show it.
vocab_size=len(vocab_dict)
print(vocab_size)

20000


In [2]:
# Smoke test: push one batch through the model and inspect the loss shape.
dataloader = DataLoader(dataset, 1)
# The second argument is the embedding dimension, so EMBEDDING_SIZE belongs
# here; MAX_VOCAB_SIZE (a vocabulary cap) would create 10000-dimensional
# embeddings.
model = SGNS(vocab_size, EMBEDDING_SIZE)
# Take the sample from the DataLoader so every tensor carries the batch axis
# that SGNS.forward documents, instead of indexing the dataset directly.
input_labels, pos_labels, neg_labels = next(iter(dataloader))
loss = model(input_labels, pos_labels, neg_labels)
print(loss.shape)

NameError: name 'dataset' is not defined