In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from collections import Counter
import numpy as np
%matplotlib inline

In [4]:
class NPLM(nn.Module):
    """Feed-forward neural probabilistic language model.

    Each context token is embedded, the embeddings are concatenated, passed
    through a single ReLU hidden layer, and projected to one logit per
    vocabulary entry.
    """

    def __init__(self, vocab_size=1000, emb_dim=64, context_size=3, hidden_dim=128):
        """Create the layers and initialise their weight matrices.

        Args:
            vocab_size: number of distinct tokens (embedding rows / output logits).
            emb_dim: width of each token embedding.
            context_size: number of preceding tokens fed to the model (n - 1).
            hidden_dim: width of the hidden layer.
        """
        super().__init__()

        # Width of the vector obtained by concatenating all context embeddings.
        concat_dim = context_size * emb_dim

        # Token id -> dense vector lookup table.
        self.embeddings = nn.Embedding(vocab_size, emb_dim)
        # Projection of the concatenated context into the hidden space.
        self.fc1 = nn.Linear(concat_dim, hidden_dim)
        # Non-linearity applied to the hidden projection.
        self.relu = nn.ReLU()
        # Hidden state -> unnormalised score for every vocabulary entry.
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

        # Weight initialisation (biases keep the nn.Linear defaults).
        nn.init.xavier_uniform_(self.embeddings.weight)
        nn.init.kaiming_normal_(self.fc1.weight)
        nn.init.normal_(self.fc2.weight, mean=0, std=0.1)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: token-index tensor; the docstring of the original code gives its
                shape as [batch_size, context_size].

        Returns:
            The output of the final linear layer, one score per vocabulary
            entry for each row of the batch (no softmax is applied).
        """
        batch_size = x.size(0)

        # Look up the embeddings and flatten everything after the batch axis.
        flat_context = self.embeddings(x).view(batch_size, -1)

        # Hidden layer followed by the output projection.
        activated = self.relu(self.fc1(flat_context))
        return self.fc2(activated)

In [8]:
# Toy corpus: three short sentences, tokenised on whitespace below.
text = """
the cat sat on the mat 
the dog chased the cat 
cats like fish and milk
"""

# Vocabulary: token -> frequency, keyed in order of first appearance.
# split() with no arguments already ignores leading/trailing whitespace.
words = text.split()
vocab = Counter(words)
vocab_size = len(vocab)

# Two-way lookup tables between tokens and their integer ids.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: i for i, word in idx_to_word.items()}

# Hyperparameters
CONTEXT_SIZE = 3  # three preceding words are used to predict the next one
EMB_DIM = 32
HIDDEN_DIM = 64
# Build the training samples
def make_dataset(words, context_size, vocab_index=None):
    """Slide a window over ``words`` and emit (context, target) tensor pairs.

    For every position ``i >= context_size`` the ``context_size`` tokens before
    ``i`` form the context and the token at ``i`` is the target. The window
    runs over the flat token list exactly as given.

    Args:
        words: sequence of tokens in corpus order.
        context_size: number of preceding tokens per sample; must be >= 1.
        vocab_index: optional token -> integer id mapping. When omitted, the
            notebook-level ``word_to_idx`` is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        A list of ``(context, target)`` tuples of ``torch.long`` tensors, where
        ``context`` has ``context_size`` elements and ``target`` is a scalar
        tensor. The list is empty when ``len(words) <= context_size``.

    Raises:
        ValueError: if ``context_size`` is smaller than 1.
        KeyError: if a token is missing from the mapping.
    """
    if context_size < 1:
        raise ValueError(f"context_size must be >= 1, got {context_size}")
    if len(words) <= context_size:
        # Not enough tokens for even one sample; nothing needs encoding.
        return []
    if vocab_index is None:
        # Resolved at call time so the default tracks the current notebook state.
        vocab_index = word_to_idx

    # Encode each token once instead of once per window it appears in.
    token_ids = [vocab_index[w] for w in words]

    return [
        (
            torch.tensor(token_ids[i - context_size:i], dtype=torch.long),
            torch.tensor(token_ids[i], dtype=torch.long),
        )
        for i in range(context_size, len(token_ids))
    ]

# Materialise the (context, target) pairs once and report how many there are.
dataset = make_dataset(words=words, context_size=CONTEXT_SIZE)
print(f"生成 {len(dataset)} 个训练样本")

生成 13 个训练样本


In [11]:
# %% [markdown]
# ## 词向量训练过程可视化
# %%
import matplotlib.animation as animation
from sklearn.decomposition import PCA
from IPython.display import HTML

# Recorders filled in while training
embedding_history = []  # one copy of the embedding matrix per recorded epoch
similarity_history = {  # (word_a, word_b) -> cosine similarity per recorded epoch
    ('cat', 'dog'): [],
    ('cat', 'fish'): []
}

# Seed PyTorch's global RNG before the model is built so that the random
# weight initialisation (and therefore the loss curve and the animation)
# is the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Model and optimiser
model = NPLM(vocab_size, EMB_DIM, CONTEXT_SIZE, HIDDEN_DIM)

# Cross-entropy: measures the gap between the predicted distribution over the
# vocabulary and the true next-word label.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Snapshot helper called from the training loop
def record_embeddings(epoch):
    """Append the current embeddings and tracked similarities to the recorders.

    A copy of the embedding matrix is appended to ``embedding_history``, and
    for every ``(word_a, word_b)`` key of ``similarity_history`` the cosine
    similarity between the two words' current embeddings is appended to that
    key's list.

    Args:
        epoch: accepted for the caller's convenience; not used here. The
            position of a snapshot in the history lists is its only index.

    Raises:
        KeyError: if a tracked word is not in ``word_to_idx``. Nothing is
            appended in that case, so the recorders stay the same length.
    """
    # detach() drops autograd tracking; cpu() keeps the NumPy conversion valid
    # even if the model has been moved to another device.
    weights = model.embeddings.weight.detach().cpu()

    # Resolve every tracked pair first so a missing word cannot leave the
    # recorders partially updated.
    similarities = {
        (word_a, word_b): torch.cosine_similarity(
            weights[word_to_idx[word_a]], weights[word_to_idx[word_b]], dim=0
        ).item()
        for (word_a, word_b) in similarity_history
    }

    # numpy() shares memory with the tensor, so copy() is what freezes the snapshot.
    embedding_history.append(weights.numpy().copy())
    for pair, value in similarities.items():
        similarity_history[pair].append(value)

# Training loop with periodic snapshots
def _train_one_epoch():
    """Run one pass over ``dataset``, one sample per optimiser step.

    Returns:
        The sum of the per-sample loss values for this pass.
    """
    running = 0
    for ctx, tgt in dataset:
        optimizer.zero_grad()
        # unsqueeze(0) adds the batch axis expected by the model and the loss.
        sample_loss = criterion(model(ctx.unsqueeze(0)), tgt.unsqueeze(0))
        sample_loss.backward()
        optimizer.step()
        running += sample_loss.item()
    return running


for epoch in range(300):  # small number of epochs, enough for the demo
    total_loss = _train_one_epoch()

    # Only every 10th epoch is recorded and reported.
    if epoch % 10 != 0:
        continue
    record_embeddings(epoch)
    print(f"Epoch {epoch}, Loss: {total_loss/len(dataset):.4f}")

# %% [markdown]
# ## 动态可视化词向量变化
# %%
# Shared state for the animation: which words to label and the 2-D projector
words_to_plot = ['the', 'cat', 'dog', 'fish', 'mat', 'sat']
pca = PCA(n_components=2)

# Canvas: left panel for the embedding scatter, right panel for similarity curves
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
ax1, ax2 = axes

def update(frame):
    """Redraw both panels for animation frame ``frame``.

    Left panel: the 2-D PCA projection of the embeddings recorded in
    ``embedding_history[frame]``, showing only the words in ``words_to_plot``.
    Right panel: the similarity curves from ``similarity_history`` up to and
    including this frame.

    Args:
        frame: index into ``embedding_history``; the x-axis and title assume
            snapshots were taken every 10 epochs.

    Returns:
        The two axes that were redrawn.
    """
    ax1.clear()
    ax2.clear()

    # Fit the projection once, on all snapshots stacked together, and reuse it
    # for every frame. Refitting per frame would give each frame its own
    # principal axes (arbitrary sign/rotation), so point movement between
    # frames would not reflect how the embeddings actually changed.
    if not hasattr(pca, "components_"):
        pca.fit(np.concatenate(embedding_history, axis=0))
    reduced = pca.transform(embedding_history[frame])

    # Embedding scatter: row i of the matrix belongs to the i-th vocab entry.
    for i, word in enumerate(vocab):
        if word in words_to_plot:
            ax1.scatter(reduced[i, 0], reduced[i, 1], alpha=0.5)
            ax1.text(reduced[i, 0] + 0.02, reduced[i, 1] + 0.02, word, fontsize=9)
    ax1.set_title(f"Epoch {frame*10} Word Embeddings")

    # Similarity curves: one line per tracked pair, truncated at this frame.
    for (word_a, word_b), history in similarity_history.items():
        shown = history[:frame + 1]
        recorded_epochs = [10 * k for k in range(len(shown))]
        ax2.plot(recorded_epochs, shown, label=f"{word_a}-{word_b}")
    ax2.set_xlabel("Epoch")
    ax2.set_ylabel("Cosine Similarity")
    ax2.legend()
    ax2.set_title("Semantic Relationship Evolution")

    return ax1, ax2

# Build the animation: one frame per recorded snapshot, 800 ms between frames
ani = animation.FuncAnimation(
    fig,
    update,
    frames=len(embedding_history),
    interval=800,
)
# Close the current figure (presumably so the notebook does not also show a
# static copy of it below the animation).
plt.close()

# Render as a JavaScript player; this must stay the cell's last expression
# so the notebook displays it.
animation_html = ani.to_jshtml()
HTML(animation_html)

Epoch 0, Loss: 2.4611
Epoch 10, Loss: 1.1230
Epoch 20, Loss: 0.2354
Epoch 30, Loss: 0.0660
Epoch 40, Loss: 0.0298
Epoch 50, Loss: 0.0169
Epoch 60, Loss: 0.0109
Epoch 70, Loss: 0.0076
Epoch 80, Loss: 0.0056
Epoch 90, Loss: 0.0042
Epoch 100, Loss: 0.0033
Epoch 110, Loss: 0.0027
Epoch 120, Loss: 0.0022
Epoch 130, Loss: 0.0018
Epoch 140, Loss: 0.0015
Epoch 150, Loss: 0.0013
Epoch 160, Loss: 0.0011
Epoch 170, Loss: 0.0010
Epoch 180, Loss: 0.0008
Epoch 190, Loss: 0.0007
Epoch 200, Loss: 0.0006
Epoch 210, Loss: 0.0006
Epoch 220, Loss: 0.0005
Epoch 230, Loss: 0.0005
Epoch 240, Loss: 0.0004
Epoch 250, Loss: 0.0004
Epoch 260, Loss: 0.0003
Epoch 270, Loss: 0.0003
Epoch 280, Loss: 0.0003
Epoch 290, Loss: 0.0002


