In [4]:
import torch
import torch.nn as nn

a = torch.tensor([
    [1, 2, 3],
    [3, 4, 4]
])

a.sum(dim=0, keepdim=True).shape  # sum of each column

torch.Size([1, 3])

In [None]:
import torch

a = torch.tensor([
    [1, 2],
    [3, 4]
])
b = torch.tensor([
    [5, 6],
    [7, 8]
])
c = torch.matmul(a, b)
print(c)

In [6]:
s = set()
s.update([1, 2, 3], {4, 5, 6}, range(7, 10))
s

{1, 2, 3, 4, 5, 6, 7, 8, 9}

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

# 示例语料库
corpus = [
    "我们 都 是 好 朋友",
    "你们 也 是 我们 的 朋友",
    "他们 是 新 同学",
    "我们 欢迎 新 同学"
]

# 参数设置
window_size = 2
embedding_dim = 10
epochs = 100
learning_rate = 0.01

# 构建词汇表
words = set()
for sentence in corpus:
    words.update(sentence.split())
word_list = sorted(list(words))
word_to_idx = {w: idx for idx, w in enumerate(word_list)}
idx_to_word = {idx: w for w, idx in word_to_idx.items()}
vocab_size = len(word_list)

# 生成训练数据（中心词和上下文词对）


def generate_skipgram_data(corpus, window_size):
    data = []
    for sentence in corpus:
        tokens = sentence.split()
        for i, center_word in enumerate(tokens):
            context_indices = list(range(max(0, i - window_size), i)) + \
                list(range(i + 1, min(len(tokens), i + window_size + 1)))
            for j in context_indices:
                context_word = tokens[j]
                data.append((center_word, context_word))

    return data


training_data = generate_skipgram_data(corpus, window_size)

# 转换为索引表示
input_indices = [word_to_idx[pair[0]] for pair in training_data]
output_indices = [word_to_idx[pair[1]] for pair in training_data]
output_indices

[5,
 5,
 11,
 11,
 11,
 7,
 7,
 7,
 7,
 4,
 4,
 4,
 8,
 8,
 2,
 2,
 0,
 0,
 0,
 7,
 7,
 7,
 7,
 5,
 5,
 5,
 5,
 10,
 10,
 10,
 8,
 8,
 1,
 1,
 7,
 7,
 7,
 6,
 6,
 6,
 3,
 3,
 5,
 5,
 9,
 9,
 9,
 6,
 6,
 6,
 3,
 3]

In [33]:
import torch
out = torch.tensor([1, 2, 3, 4, 5])
torch.topk(out, 4).indices

tensor([4, 3, 2, 1])

In [16]:
import torch

torch.tensor([1], dtype=torch.long) == torch.LongTensor([1])

tensor([True])

In [29]:
a = dict({})
a['3'] = 3
a['2'] = 2
a['4'] = 4

a

{'3': 3, '2': 2, '4': 4}

In [31]:
import numpy as np

np.random.choice([1, 2, 3], size=2)

array([2, 1])

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import random
import numpy as np

# 定义 Vocabulary 类


class Vocabulary:
    def __init__(self, corpus, min_count=5):
        self.word2idx = {}
        self.idx2word = {}
        self.word_freq = {}
        self.total_words = 0
        self.build_vocab(corpus, min_count)
        self.vocab_size = len(self.word2idx)
        self.word_probs = self.get_unigram_table()

    def build_vocab(self, corpus, min_count):
        word_counts = {}
        for line in corpus:
            for word in line.strip().split():
                word_counts[word] = word_counts.get(word, 0) + 1
                self.total_words += 1
        idx = 0
        for word, count in word_counts.items():
            if count >= min_count:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                self.word_freq[idx] = count
                idx += 1

    def get_unigram_table(self):
        # 构建用于负采样的表
        power = 0.75
        norm = sum([freq ** power for freq in self.word_freq.values()])
        table_size = 1e8  # 根据需要调整
        table = []

        for idx in self.word_freq:
            prob = (self.word_freq[idx] ** power) / norm
            count = int(prob * table_size)
            table.extend([idx] * count)
        return np.array(table)

# 自定义 Dataset


class SkipGramDataset(Dataset):
    def __init__(self, corpus, vocab, window_size=5, negative_samples=5):
        self.corpus = corpus
        self.vocab = vocab
        self.window_size = window_size
        self.negative_samples = negative_samples

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        sentence = self.corpus[idx]
        words = sentence.strip().split()
        word_indices = [self.vocab.word2idx[word]
                        for word in words if word in self.vocab.word2idx]
        pairs = []
        for i, center in enumerate(word_indices):
            window = random.randint(1, self.window_size)
            context_indices = word_indices[max(
                0, i - window): i] + word_indices[i + 1: i + window + 1]
            for context in context_indices:
                pairs.append((center, context))
        return pairs

    def collate_fn(self, batch):
        centers = []
        contexts = []
        negatives = []
        for pairs in batch:
            for center, context in pairs:
                centers.append(center)
                contexts.append(context)
                neg_samples = np.random.choice(
                    self.vocab.word_probs, size=self.negative_samples).tolist()
                negatives.append(neg_samples)
        return torch.LongTensor(centers), torch.LongTensor(contexts), torch.LongTensor(negatives)

# 定义 Skip-Gram 模型


class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_words, context_words, negative_words):
        center_embeds = self.in_embeddings(
            center_words)  # (batch_size, embedding_dim)
        context_embeds = self.out_embeddings(
            context_words)  # (batch_size, embedding_dim)
        # (batch_size, negative_samples, embedding_dim)
        neg_embeds = self.out_embeddings(negative_words)

        # 正样本得分
        pos_score = torch.mul(center_embeds, context_embeds).sum(dim=1)
        pos_loss = torch.log(torch.sigmoid(pos_score))

        # 负样本得分
        neg_score = torch.bmm(neg_embeds, center_embeds.unsqueeze(2)).squeeze()
        neg_loss = torch.log(torch.sigmoid(-neg_score)).sum(dim=1)

        # 总损失
        loss = - (pos_loss + neg_loss).mean()
        return loss


# 参数设置
embedding_dim = 100
batch_size = 2
epochs = 5
learning_rate = 0.01

# 读取大语料


def corpus_reader(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            yield line.strip()

# 假设语料文件路径为 'large_corpus.txt'
# corpus = corpus_reader('large_corpus.txt')


# 为演示，使用小语料代替
corpus = [
    "我们 都 是 好 朋友",
    "你们 也 是 我们 的 朋友",
    "他们 是 新 同学",
    "我们 欢迎 新 同学"
    # ... 更多句子
]

# 初始化词汇表和数据集
vocab = Vocabulary(corpus, min_count=1)
dataset = SkipGramDataset(corpus, vocab)
dataloader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=True, collate_fn=dataset.collate_fn, num_workers=4)

# 初始化模型和优化器
model = SkipGramModel(vocab.vocab_size, embedding_dim)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# 训练模型
for epoch in range(1, epochs + 1):
    total_loss = 0
    for centers, contexts, negatives in dataloader:
        optimizer.zero_grad()
        loss = model(centers, contexts, negatives)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss / len(dataloader):.4f}")

# 保存模型
# torch.save(model.state_dict(), 'skipgram_model.pth')

# 词向量提取
# word_embeddings = model.in_embeddings.weight.data

ERROR:tornado.general:SEND Error: Host unreachable
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.9/3.9.19/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/homebrew/Cellar/python@3.9/3.9.19/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'SkipGramDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [37]:
a = [1, 3, 2]


def sorted_list(a):
    return sorted(a, reverse=True)


sorted_list(a)

[3, 2, 1]

In [52]:
import numpy as np

a = np.array([
    [1, 2],
    [3, 4]
])

b = np.array([
    [2, 3],
    [4, 5]
])

# 向量内积
np.matmul(a, b), np.dot(a, b)

(array([[10, 13],
        [22, 29]]),
 array([[10, 13],
        [22, 29]]))

In [9]:
import numpy as np
a = np.array([2.5, 1.2, 0.8])
np.exp(a) / np.sum(np.exp(a))

array([0.68718353, 0.18727936, 0.12553711])