In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import re
from collections import Counter
import random

from typing import List

# Seed Python's and PyTorch's random number generators with one shared value
# so that repeated runs of the notebook produce the same numbers.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f07740bf690>

In [2]:
# Tokenizer: converts raw text into vocabulary indices.
class Tokenizer:
    """Whitespace tokenizer backed by a frequency-ranked vocabulary.

    The vocabulary is built once, at construction time, from ``texts``.
    Index 0 is reserved for ``<pad>`` and index 1 for ``<unk>``; every other
    word receives the next free index in descending order of frequency.

    Args:
        texts: Corpus used to build the vocabulary.
        min_freq: Minimum number of occurrences a word needs to be kept.
        max_vocab_size: Upper bound on the vocabulary size, including the two
            reserved tokens.
    """

    def __init__(self, texts: List[str], min_freq: int=1, max_vocab_size: int=10000):
        self.max_vocab_size = max_vocab_size
        self.min_freq = min_freq
        self.texts = texts
        # Must come last: build_vocab() reads the three attributes above.
        self.vocab = self.build_vocab()

    def preprocess(self, text: str) -> List[str]:
        """Lower-case ``text``, drop every character that is not an ASCII
        letter, a digit or whitespace, and split the result on whitespace."""
        lowered = text.lower()
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
        return cleaned.split()

    def build_vocab(self) -> dict:
        """Count the tokens of ``self.texts`` and return a word -> index dict."""
        word_counts = Counter(
            token for text in self.texts for token in self.preprocess(text)
        )
        # Two slots of the size budget are taken by the reserved tokens.
        budget = self.max_vocab_size - 2
        kept_words = [
            word
            for word, freq in word_counts.most_common(budget)
            if freq >= self.min_freq
        ]
        vocab = {"<pad>": 0, "<unk>": 1}
        vocab.update((word, index) for index, word in enumerate(kept_words, start=2))
        return vocab

    def text_to_sequence(self, text: str) -> List[int]:
        """Encode ``text`` as a list of vocabulary indices.

        Tokens that are not in the vocabulary map to the ``<unk>`` index.
        """
        indices = []
        for token in self.preprocess(text):
            indices.append(self.vocab.get(token, self.vocab['<unk>']))
        return indices

In [3]:
# Example corpus used to build the vocabulary.
texts = [
    "The quick brown fox jumps over the lazy dog",
    "PyTorch is widely used for deep learning tasks",
    "Natural language processing enables complex interactions",
    "This example demonstrates text embedding in PyTorch",
]

# Build the tokenizer from the corpus; the bare expression on the last line
# displays the resulting word -> index mapping as the cell output.
tokenizer = Tokenizer(texts=texts)
tokenizer.vocab

{'<pad>': 0,
 '<unk>': 1,
 'the': 2,
 'pytorch': 3,
 'quick': 4,
 'brown': 5,
 'fox': 6,
 'jumps': 7,
 'over': 8,
 'lazy': 9,
 'dog': 10,
 'is': 11,
 'widely': 12,
 'used': 13,
 'for': 14,
 'deep': 15,
 'learning': 16,
 'tasks': 17,
 'natural': 18,
 'language': 19,
 'processing': 20,
 'enables': 21,
 'complex': 22,
 'interactions': 23,
 'this': 24,
 'example': 25,
 'demonstrates': 26,
 'text': 27,
 'embedding': 28,
 'in': 29}

In [4]:
# Define a sample sentence and convert it into a sequence of vocabulary indices.
sample_text = "The quick brown fox"
text_sequence = tokenizer.text_to_sequence(sample_text)
text_sequence

[2, 4, 5, 6]

In [5]:
# Embedding layer: converts vocabulary indices into embedding vectors.
class TextEmbedding(nn.Module):
    """Thin wrapper around ``nn.Embedding``.

    Args:
        vocab_size: Number of rows in the embedding table (one per index).
        embedding_dim: Size of each embedding vector.
        padding_idx: Optional index whose embedding is initialised to zeros
            and receives no gradient updates (for example the ``<pad>``
            index). Defaults to ``None``, which keeps the behaviour of a
            plain ``nn.Embedding`` with no padding row.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Look up the embedding of every index in ``x``.

        Args:
            x: Integer tensor of vocabulary indices.

        Returns:
            A tensor with the shape of ``x`` plus a trailing dimension of
            size ``embedding_dim``.
        """
        return self.embedding(x)

In [6]:
# Hyperparameters and the device everything runs on.
EMBEDDING_DIM = 8
VOCAB_SIZE = len(tokenizer.vocab)  # one embedding row per vocabulary entry
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# Instantiate the embedding layer, then move it to the target device.
embedding_layer = TextEmbedding(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM)
embedding_layer = embedding_layer.to(DEVICE)

# Wrap the index sequence in a batch of size one, created directly on the
# target device, and run it through the embedding layer.
text_tensor = torch.tensor([text_sequence], dtype=torch.long, device=DEVICE)
embedded_output = embedding_layer(text_tensor)

print("嵌入层输出的形状：", embedded_output.shape)
print("嵌入向量：", embedded_output)

嵌入层输出的形状： torch.Size([1, 4, 8])
嵌入向量： tensor([[[ 1.6423, -0.1596, -0.4974,  0.4396, -0.7581,  1.0783,  0.8008,
           1.6806],
         [-1.3847, -0.8712, -0.2234,  1.7174,  0.3189, -0.4245,  0.3057,
          -0.7746],
         [-1.5576,  0.9956, -0.8798, -0.6011, -1.2742,  2.1228, -1.2347,
          -0.4879],
         [-0.9138, -0.6581,  0.0780,  0.5258, -0.4880,  1.1914, -0.8140,
          -0.7360]]], device='cuda:0', grad_fn=<EmbeddingBackward0>)


In [8]:
# Training example for the embedding layer: a mean-squared-error objective,
# minimised with Adam over the embedding layer's parameters.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=embedding_layer.parameters(), lr=0.01)

In [9]:
# Assume the training target is another randomly generated embedding tensor
# with the same shape as the embedding layer's output.
# torch.rand is called without a device argument and the result is moved to
# DEVICE afterwards.
# NOTE(review): presumably this keeps the sampled values tied to the seeded
# default (CPU) generator regardless of DEVICE - confirm before changing where
# the tensor is created.
target_embedding = torch.rand(embedded_output.shape).to(DEVICE)

In [10]:
# One optimisation step on the embedding layer.
# The forward pass is recomputed inside this cell instead of reusing the
# `embedded_output` produced by an earlier cell: calling backward() frees the
# autograd graph, so re-running the cell against the old tensor would raise
# "Trying to backward through the graph a second time", and after any weight
# update the old tensor would no longer reflect the current weights.
embedded_output = embedding_layer(text_tensor)
loss = criterion(embedded_output, target_embedding)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# The value printed is the loss that was measured before this step's update.
print("训练后损失：", loss.item())

训练后损失： 1.5542532205581665


In [12]:
# Backpropagation and optimisation: 100 gradient steps that pull the embedded
# sentence towards the target tensor.
for iter_cnt in range(100):
    # Clear the gradients left over from the previous step first.
    optimizer.zero_grad()
    # Fresh forward pass with the current weights.
    embedded_output = embedding_layer(text_tensor)
    loss = criterion(embedded_output, target_embedding)
    # Backward pass followed by the parameter update.
    loss.backward()
    optimizer.step()
    # Reports the loss that was measured before this step's update.
    print(f"训练后损失{iter_cnt}：", loss.item())

训练后损失0： 1.330237865447998
训练后损失1： 1.311125636100769
训练后损失2： 1.2922273874282837
训练后损失3： 1.2735439538955688
训练后损失4： 1.255075216293335
训练后损失5： 1.2368210554122925
训练后损失6： 1.21878182888031
训练后损失7： 1.200957179069519
训练后损失8： 1.183347463607788
训练后损失9： 1.1659528017044067
训练后损失10： 1.148773431777954
训练后损失11： 1.1318092346191406
训练后损失12： 1.1150603294372559
训练后损失13： 1.098526120185852
训练后损失14： 1.0822062492370605
训练后损失15： 1.0660998821258545
训练后损失16： 1.050205945968628
训练后损失17： 1.034523367881775
训练后损失18： 1.0190508365631104
训练后损失19： 1.0037868022918701
训练后损失20： 0.9887295961380005
训练后损失21： 0.9738776087760925
训练后损失22： 0.9592286348342896
训练后损失23： 0.9447810649871826
训练后损失24： 0.9305331707000732
训练后损失25： 0.9164826273918152
训练后损失26： 0.9026280641555786
训练后损失27： 0.8889673948287964
训练后损失28： 0.87549889087677
训练后损失29： 0.8622208833694458
训练后损失30： 0.8491311073303223
训练后损失31： 0.8362277746200562
训练后损失32： 0.8235086798667908
训练后损失33： 0.8109720945358276
训练后损失34： 0.7986156344413757
训练后损失35： 0.7864373922348022
训练后损失36： 0.7744

In [20]:
# Show part of the embedding layer's weight matrix: its first five rows.
weight_preview = embedding_layer.embedding.weight[:5]
print("嵌入层的权重矩阵：\n", weight_preview)

嵌入层的权重矩阵：
 tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784, -1.2345, -0.0431, -1.6047],
        [-0.7521,  1.6487, -0.3925, -1.4036, -0.7279, -0.5594, -0.7688,  0.7624],
        [ 1.6323, -0.1496, -0.4874,  0.4296, -0.7481,  1.0683,  0.7908,  1.6706],
        [ 1.2791,  1.2964,  0.6105,  1.3347, -0.2316,  0.0418, -0.2516,  0.8599],
        [-1.3747, -0.8612, -0.2134,  1.7074,  0.3289, -0.4145,  0.3157, -0.7646]],
       device='cuda:0', grad_fn=<SliceBackward0>)
