In [45]:
from sympy.printing.pretty.pretty_symbology import center

sentence1 = 'Jane wants to go to Shenzhen .'
sentence2 = 'Bob wants to go to Shanghai , me too .'
token1 = sentence1.split(' ')
token2 = sentence2.split(' ')
print("token1 is {}".format(token1))
print("token2 is {}".format(token2))

token1 is ['Jane', 'wants', 'to', 'go', 'to', 'Shenzhen', '.']
token2 is ['Bob', 'wants', 'to', 'go', 'to', 'Shanghai', ',', 'me', 'too', '.']


In [46]:
# 向量化
def vectorize_sentence(tokens, filtered_vocab):
    vector = []
    for w in filtered_vocab:
        vector.append(tokens.count(w))
    return vector


# 去重
def unique(sequence):
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]


# 停用词
stopwords = ["to", "is", "a"]
# 标点符号
special_chars = [",", ":", ";", ".", "?"]

In [47]:
filtered_tokens = []
for w in unique(token1 + token2):
    if w not in stopwords and w not in special_chars:
        filtered_tokens.append(w)
filtered_tokens

['Jane', 'wants', 'go', 'Shenzhen', 'Bob', 'Shanghai', 'me', 'too']

In [48]:
vector1 = vectorize_sentence(token1, filtered_tokens)
vector2 = vectorize_sentence(token2, filtered_tokens)
print(vector1)
print(vector2)

[1, 1, 1, 1, 0, 0, 0, 0]
[0, 1, 1, 0, 1, 1, 1, 1]


词袋模式 api

In [49]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentence1 = 'Jane wants to go to Shenzhen .'
sentence2 = 'Bob wants to go to Shanghai , me too .'
count_vec = CountVectorizer(ngram_range=(1, 1))
#transform
feature = count_vec.fit_transform([sentence1, sentence2])

#create dataframe
df = pd.DataFrame(feature.toarray(), columns=count_vec.get_feature_names_out())
df

Unnamed: 0,bob,go,jane,me,shanghai,shenzhen,to,too,wants
0,0,1,1,0,0,1,2,0,1
1,1,1,0,1,1,0,2,1,1


TF - IDF

In [50]:
import jieba

sentence1 = '李四爱去深圳'
sentence2 = '张三很爱去上海，我也是'
contents = [sentence1, sentence2]
vec = TfidfVectorizer(tokenizer=jieba.lcut,
                      stop_words=stopwords,
                      norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
feature = vec.fit_transform(contents)  #直接对文档进行转换提取tfidf特征
#一步就得到了tfidf向量
print(feature.toarray())
#create dataframe
df = pd.DataFrame(feature.toarray(), columns=vec.get_feature_names_out())
df

[[0.         0.         0.         0.         0.         0.
  0.         0.57735027 0.57735027 0.         0.57735027 0.        ]
 [0.33333333 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333
  0.33333333 0.         0.         0.33333333 0.         0.33333333]]




Unnamed: 0,上海,也,去,张三,很,我,是,李四,深圳,爱,爱去,，
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,0.0,0.333333,0.0,0.333333


word2vec

In [51]:
import json


def get_open(data_file):
    f = open(data_file, 'r', encoding='utf-8')
    lines = f.readlines()
    sentence = []
    for line in lines:
        line = json.loads(line.strip())
        sentence.append(line['sentence'])
    return sentence

In [52]:
train_data = get_open('./data/tnews/train.json')
test_data = get_open('./data/tnews/test.json')
dev_data = get_open('./data/tnews/dev.json')

In [53]:
sentences = train_data + test_data + dev_data
words = [list(jieba.cut(sentence)) for sentence in sentences]

In [54]:
from gensim.models.word2vec import LineSentence
from gensim.models import word2vec
import gensim

import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [55]:
from gensim.models import FastText

model = FastText(words, vector_size=4, window=3, min_count=1, workers=4, epochs=10)

2024-10-24 11:15:01,088 : INFO : collecting all words and their counts
2024-10-24 11:15:01,089 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-24 11:15:01,104 : INFO : PROGRESS: at sentence #10000, processed 129141 words, keeping 25698 word types
2024-10-24 11:15:01,119 : INFO : PROGRESS: at sentence #20000, processed 258779 words, keeping 39123 word types
2024-10-24 11:15:01,136 : INFO : PROGRESS: at sentence #30000, processed 387255 words, keeping 49324 word types
2024-10-24 11:15:01,157 : INFO : PROGRESS: at sentence #40000, processed 515598 words, keeping 57864 word types
2024-10-24 11:15:01,178 : INFO : PROGRESS: at sentence #50000, processed 644895 words, keeping 65382 word types
2024-10-24 11:15:01,201 : INFO : PROGRESS: at sentence #60000, processed 775338 words, keeping 74072 word types
2024-10-24 11:15:01,225 : INFO : PROGRESS: at sentence #70000, processed 906119 words, keeping 80406 word types
2024-10-24 11:15:01,235 : INFO : collected 823

In [56]:
model1 = word2vec.Word2Vec(words, vector_size=4, window=3, min_count=1, workers=4, epochs=10)

2024-10-24 11:15:38,118 : INFO : collecting all words and their counts
2024-10-24 11:15:38,119 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-24 11:15:38,134 : INFO : PROGRESS: at sentence #10000, processed 129141 words, keeping 25698 word types
2024-10-24 11:15:38,151 : INFO : PROGRESS: at sentence #20000, processed 258779 words, keeping 39123 word types
2024-10-24 11:15:38,167 : INFO : PROGRESS: at sentence #30000, processed 387255 words, keeping 49324 word types
2024-10-24 11:15:38,183 : INFO : PROGRESS: at sentence #40000, processed 515598 words, keeping 57864 word types
2024-10-24 11:15:38,198 : INFO : PROGRESS: at sentence #50000, processed 644895 words, keeping 65382 word types
2024-10-24 11:15:38,214 : INFO : PROGRESS: at sentence #60000, processed 775338 words, keeping 74072 word types
2024-10-24 11:15:38,231 : INFO : PROGRESS: at sentence #70000, processed 906119 words, keeping 80406 word types
2024-10-24 11:15:38,240 : INFO : collected 823

In [57]:
model.wv.most_similar('法律', topn=10)

[('新貌', 0.9997027516365051),
 ('八德', 0.9996379613876343),
 ('巴音', 0.9995285272598267),
 ('武勇', 0.9993701577186584),
 ('风采录', 0.9991586804389954),
 ('名作', 0.9991145730018616),
 ('淅川县', 0.9990727305412292),
 ('饭团', 0.9990687370300293),
 ('广播操', 0.9990463256835938),
 ('劝导', 0.9987432956695557)]

In [58]:
model1.wv.most_similar('法律', topn=10)

[('伊', 0.9996498227119446),
 ('制造', 0.9993472099304199),
 ('生产线', 0.9990198612213135),
 ('渠道', 0.9988608360290527),
 ('骗局', 0.9987649321556091),
 ('分销', 0.9985112547874451),
 ('唱空', 0.9984816312789917),
 ('第一步', 0.9983130693435669),
 ('举措', 0.998151421546936),
 ('操作系统', 0.9981495141983032)]

In [65]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.nn.parameter import Parameter
from collections import Counter
import numpy as np
import random
import math
import pandas as pd
import scipy  # SciPy是基于NumPy开发的高级模块，它提供了许多数学算法和函数的实现
import sklearn
from sklearn.metrics.pairwise import cosine_similarity  # 余弦相似度函数

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
K = 100  # number of negative samples 负样本随机采样数量
C = 3  # nearby words threshold 指定周围三个单词进行预测
NUM_EPOCHS = 3  # The number of epochs of training 迭代轮数，default=10
MAX_VOCAB_SIZE = 30000  # the vocabulary size 词汇表多大
BATCH_SIZE = 32  # the batch size 每轮迭代1个batch的数量
LEARNING_RATE = 0.2  # the initial learning rate #学习率
EMBEDDING_SIZE = 100  # 词向量维度
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [66]:
def word_tokenize(text):
    return text.split()


with open("data/nietzsche.txt", "r") as fin:  # 读入文件
    text = fin.read()

print('text: ', text[:500])

text = [w for w in word_tokenize(text.lower())]
# 分词，在这里类似于text.split()

vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
# 字典格式，把（MAX_VOCAB_SIZE-1）个最频繁出现的单词取出来，-1是留给不常见的单词

vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
# unk表示不常见单词数=总单词数-常见单词数
# 这里计算的到vocab["<unk>"]=29999

idx_to_word = [word for word in vocab.keys()]
# 取出字典的所有单词key

word_to_idx = {word: i for i, word in enumerate(idx_to_word)}
# 取出所有单词的单词和对应的索引，索引值与单词出现次数相反，最常见单词索引为0。

word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
# 所有单词的频数values

word_freqs = word_counts / np.sum(word_counts)
# 所有单词的频率

word_freqs = word_freqs ** (3. / 4.)
# 论文里乘以3/4次方

word_freqs = word_freqs / np.sum(word_freqs)  # 用来做 negative sampling
# 重新计算所有单词的频率

VOCAB_SIZE = len(idx_to_word)  # 词汇表单词数30000=MAX_VOCAB_SIZE
VOCAB_SIZE

text:  PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to Truth, have been unskilled and unseemly methods for
winning a woman? Certainly she has never allowed herself to be won; and
at present every kind of dogma stands with sad and discouraged mien--IF,
indeed, it s


17683

In [67]:
class WordDataset(data.Dataset):
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        super(WordDataset, self).__init__()
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE - 1) for t in text]
        # 取出text里每个单词word_to_idx字典里对应的索引,不在字典里返回"<unk>"的索引
        self.text_encoded = torch.Tensor(self.text_encoded).long()
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)  # 所有单词的总数

    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]
        # 中心词索引
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        # 周围词索引的索引，比如idx=0时。pos_indices = [-3, -2, -1, 1, 2, 3]
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        # range(idx+1, idx+C+1)超出词汇总数时，需要特别处理，取余数
        pos_words = self.text_encoded[pos_indices]
        # 周围词索引，就是希望出现的正例单词
        # print(pos_words)
        neg_words = torch.multinomial(
            self.word_freqs, K * pos_words.shape[0], True)
        # 负例采样单词索引，torch.multinomial作用是对self.word_freqs做K * pos_words.shape[0]次取值，
        # 输出的是self.word_freqs对应的下标。
        # 取样方式采用有放回的采样，并且self.word_freqs数值越大，取样概率越大。
        # 每个正确的单词采样K个，pos_words.shape[0]是正确单词数量
        # print(neg_words)
        return center_word, pos_words, neg_words


dataset = WordDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [68]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(Embedding, self).__init__()
        self.vocab_size = vocab_size  # 字典大小
        self.embedding_size = embedding_size  # 编码维度
        init = 0.5 / self.embedding_size
        self.out_embed = nn.Embedding(
            self.vocab_size, self.embedding_size, sparse=False)
        #模型输出nn.Embedding(30000, 100)
        self.out_embed.weight.data.uniform_(-init, init)
        # 权重初始化的一种方法

        self.in_embed = nn.Embedding(
            self.vocab_size, self.embedding_size, sparse=False)
        #模型输入nn.Embedding(30000, 100)
        self.in_embed.weight.data.uniform_(-init, init)
        # 权重初始化的一种方法

    def forward(self, input_labels, pos_labels, neg_labels):
        """
        input_labels: 中心词, [batch_size]
        pos_labels: 中心词周围 context window 出现过的单词 [batch_size * (window_size * 2)]
        neg_labels: 中心词周围没有出现过的单词，从 negative sampling 得到 [batch_size, (window_size * 2 * K)]
        """
        batch_size = input_labels.size(0)

        input_embedding = self.in_embed(input_labels)
        # B * embedding_size
        # 这里估计进行了运算：（128,30000）*（30000,100）= 128(B) * 100 (embedding_size)

        pos_embedding = self.out_embed(pos_labels)  # B * (2*C) * embedding_size
        # 同上，增加了维度(2*C)，表示一个batch有B组周围词单词，一组周围词有(2*C)个单词，每个单词有embedding_size个维度。

        # B * (2*C * K) * embedding_size
        neg_embedding = self.out_embed(neg_labels)
        # 同上，增加了维度(2*C*K)

        # torch.bmm()为batch间的矩阵相乘（b,n.m)*(b,m,p)=(b,n,p)
        log_pos = torch.bmm(
            pos_embedding, input_embedding.unsqueeze(2)).squeeze()  # B * (2*C)
        log_neg = torch.bmm(
            neg_embedding, -input_embedding.unsqueeze(2)).squeeze()  # B * (2*C*K)
        # unsqueeze(2)指定位置升维，.squeeze()压缩维度。

        # 下面loss计算就是论文里的公式
        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)  # batch_size
        loss = log_pos + log_neg

        return -loss

    def input_embeddings(self):  # 取出self.in_embed数据参数
        return self.in_embed.weight.data.cpu().numpy()

In [69]:
model = Embedding(VOCAB_SIZE, EMBEDDING_SIZE).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [70]:
for epoch in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("epoch: {}, iter: {}, loss: {}".format(epoch, i, loss.item()))
embedding_weights = model.input_embeddings()

epoch: 0, iter: 0, loss: 420.04742431640625
epoch: 0, iter: 100, loss: 251.5990753173828
epoch: 0, iter: 200, loss: 247.44834899902344
epoch: 0, iter: 300, loss: 174.37144470214844
epoch: 0, iter: 400, loss: 164.48870849609375
epoch: 0, iter: 500, loss: 131.84848022460938
epoch: 0, iter: 600, loss: 149.32656860351562
epoch: 0, iter: 700, loss: 130.55337524414062
epoch: 0, iter: 800, loss: 116.36502075195312
epoch: 0, iter: 900, loss: 84.2267074584961
epoch: 0, iter: 1000, loss: 121.55711364746094
epoch: 0, iter: 1100, loss: 123.77498626708984
epoch: 0, iter: 1200, loss: 93.50875854492188
epoch: 0, iter: 1300, loss: 88.81300354003906
epoch: 0, iter: 1400, loss: 122.85433959960938
epoch: 0, iter: 1500, loss: 60.804752349853516
epoch: 0, iter: 1600, loss: 122.99179077148438
epoch: 0, iter: 1700, loss: 83.84132385253906
epoch: 0, iter: 1800, loss: 122.31605529785156
epoch: 0, iter: 1900, loss: 125.36405944824219
epoch: 0, iter: 2000, loss: 96.53458404541016
epoch: 0, iter: 2100, loss: 85.0

In [71]:
import os


def evaluate(filename, embedding_weights):
    if not os.path.isfile(filename):
        return
    if filename.endswith(".csv"):
        data = pd.read_csv(filename, sep=",")
    else:
        data = pd.read_csv(filename, sep="\t")
    print(data.head())
    human_similarity = []
    model_similarity = []
    for i in data.iloc[:, 0:2].index:
        word1, word2 = data.iloc[i, 0], data.iloc[i, 1]
        if word1 not in word_to_idx or word2 not in word_to_idx:
            continue
        else:
            word1_idx, word2_idx = word_to_idx[word1], word_to_idx[word2]
            word1_embed, word2_embed = embedding_weights[[
                word1_idx]], embedding_weights[[word2_idx]]
            model_similarity.append(
                float(sklearn.metrics.pairwise.cosine_similarity(word1_embed, word2_embed)))
            human_similarity.append(float(data.iloc[i, 2]))

    # model_similarity
    return scipy.stats.spearmanr(human_similarity, model_similarity)


def find_nearest(word):
    if word not in word_to_idx:
        return
    index = word_to_idx.get(word, 0)
    embedding = embedding_weights[index]
    cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding)
                        for e in embedding_weights])
    return [idx_to_word[i] for i in cos_dis.argsort()[:10]]


print("simlex-999", evaluate("data/en-simlex-999.txt", embedding_weights))

     old          new  1.58
0  smart  intelligent  9.20
1   hard    difficult  8.77
2  happy     cheerful  9.55
3   hard         easy  0.95
4   fast        rapid  8.75
simlex-999 SignificanceResult(statistic=0.08471268321334338, pvalue=0.11261446417779314)


In [72]:
for word in ["good", "green", "like", "america", "chicago", "work", "computer", "language"]:
    print(word, find_nearest(word))

good ['good', 'other', 'life', 'bad', 'first', 'only', 'things', 'present', 'almost', 'being']
green ['green', 'infinite,--when', 'calmest', '_operari_,', 'dissection,', 'travesty', 'advantageous', 'selfishness', 'bath,', 'no-more-weeping']
like ['like', 'after', 'before', 'free', 'thereby', 'thus', 'against', 'again', 'call', 'makes']
america None
chicago None
work ['work', 'error', 'desires', 'schopenhauer', 'feelings', 'europe,', 'master', 'future', 'common', 'sacrifice']
computer None
language ['language', 'sense,', 'saint', '"good"', 'extraordinary', 'distinction', 'beautiful', 'hatred', 'path', 'significance']


In [73]:
man_idx = word_to_idx["man"]
king_idx = word_to_idx["king"]
woman_idx = word_to_idx["woman"]
embedding = embedding_weights[woman_idx] - embedding_weights[man_idx] + embedding_weights[king_idx]
cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
for i in cos_dis.argsort()[:20]:
    print(idx_to_word[i])

gained--and
omit
aristophanes--that
possess--and
circumlocution:
different?
preachers
move,
break,
signify,
loved?
little.
hunger,
affected,
inspirited.
do!"--i
scoffers
dig
wiser
health).
