In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import jieba  # 用于中文分词

# 读取周杰伦歌词文件，假设文件名为jay_chou_lyrics.txt，每行是一首歌词
def read_lyrics(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lyrics = f.readlines()
    return lyrics

# 构建数据集类
class LyricsDataset(Dataset):
    def __init__(self, lyrics, word2idx, seq_length):
        self.lyrics = lyrics
        self.word2idx = word2idx
        self.seq_length = seq_length
        self.vocab_size = len(word2idx)

    def __len__(self):
        return len(self.lyrics)

    def __getitem__(self, idx):
        lyric = self.lyrics[idx]
        words = jieba.lcut(lyric) # 分词。输出分词后的列表
        # 将歌词转换为索引序列
        indices = [self.word2idx.get(word, 0) for word in words]
        print('indices:',indices)
        # 生成输入和目标序列
        inputs = indices[:-1]
        targets = indices[1:]

        # 对输入序列进行填充或截断
        if len(inputs) < self.seq_length:
            inputs = [0] * (self.seq_length - len(inputs)) + inputs
        else:
            inputs = inputs[-self.seq_length:]
        inputs = torch.tensor(inputs, dtype=torch.long)

        # 对目标序列进行填充或截断
        if len(targets) < self.seq_length:
            targets = [0] * (self.seq_length - len(targets)) + targets
        else:
            targets = targets[-self.seq_length:]
        targets = torch.tensor(targets, dtype=torch.long)

        return inputs, targets


# 构建词表
def build_vocab(lyrics):
    word2idx = {"<PAD>": 0}
    idx = 1
    for lyric in lyrics:
        words = jieba.lcut(lyric)
        for word in words:
            if word not in word2idx:
                word2idx[word] = idx
                idx += 1
    return word2idx

# 训练函数
def train(model, dataloader, optimizer, criterion, num_epochs):
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, targets in dataloader:
            print('input:',inputs.size())
            optimizer.zero_grad()
            h_prev = model.init_hidden(inputs.size(0))
            print('init h_prev:',h_prev.size())
            output, h_prev = model(inputs, h_prev)
            print('output',output.size(),'targets',model.embedding(targets[:,0]).size())
            loss = criterion(output, model.embedding(targets))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

# 主函数
if __name__ == "__main__":
    file_path = "/home/ma-user/work/dev/easy-nlp-main/docs/chapter1/test.txt"
    lyrics = read_lyrics(file_path)
    print(lyrics)
    word2idx = build_vocab(lyrics)
    print(word2idx)
    seq_length = 20
    dataset = LyricsDataset(lyrics, word2idx, seq_length)
    print(dataset[0])
    print(dataset[1])
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    vocab_size = dataset.vocab_size
    hidden_size = 256
    num_layers = 2
    output_size = vocab_size
    model = CustomRNN(vocab_size, hidden_size, num_layers, output_size)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    train(model, dataloader, optimizer, criterion, num_epochs=10)

['第二部分 架构师的视角\n', '1 访问远程服务\n', '1.1 远程服务调用\n', '进程间通信（IPC）的解决方法\n', '管道/具名管道：用于解决进程间传递少量字符流或字节流\n', '信号：用于通知目标进程有某种事件发生\n', '信号量：用于在两个进程之间同步协作\n', '消息队列：用于进程间传递数据量较多的通信\n', '共享内存（效率最高）：运行多个进程访问同一块公共内存空间\n', '本地套接字接口：用于不同机器之间的进程通信\n', '1.2 三个基本问题\n', '如何表示数据：序列化和反序列化\n', '如何传递数据：Wire Protocol，传输协议\n', '如何表示方法：接口描述语言（Android接口定义语言、Web服务描述语言、JSON Web服务协议）\n', '1.3 RPC的发展\n', '面向对象：RMI、.NET Remoting\n', '性能：gRPC（支持多路复用和Header压缩）、Thrift（基于传输层的TCP协议）\n', '简化：JSON-RPC\n', '1.4 REST设计风格\n', '术语定义\n', '\n', '资源\n', '表征：不同的形式\n', '状态：在特定语境中的上下文信息\n', '转移：服务端将资源表征从一个状态转移到另一个状态\n', '统一接口：GET、HEAD、POST、PUT、DELETE、TRACE、OPTION\n', '超文本驱动：通过超文本内部的链接进行跳转\n', '自描述信息：Content-Type\n', 'RESTful系统特点\n', '\n', '客户端与服务端分离：用户界面所关注的逻辑和数据存储的逻辑分离\n', '无状态：每次从客户端发送的请求中，仅包含必要的上下文信息\n', '可缓存：将部分客户端的应答缓存\n', '分层系统：客户端一般不需要知道是否直接连接到最终的服务器（透明访问）\n', '统一接口：面向资源编程\n', '按需代码：将可执行的软件程序从服务端发送到客户端，WebAssembly\n', 'RMM（Richardson成熟度模型）\n', '\n', '第0级：完全不REST\n', '第1级：开始引入资源的概念，使用资源ID进行请求\n', '第2级：引入统一接口，映射到HTTP协议的方法上，把不同业务需

TypeError: forward() takes 2 positional arguments but 3 were given

In [26]:
pip install jieba

Looking in indexes: http://repo.myhuaweicloud.com/repository/pypi/simple
Collecting jieba
  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz (19.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.2/19.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: jieba
  Building wheel for jieba (setup.py) ... [?25ldone
[?25h  Created wheel for jieba: filename=jieba-0.42.1-py3-none-any.whl size=19314458 sha256=cb1778a85057dd0ed71a1cc72dea68c8831b9ea18506d6544669bfdb061d3aa3
  Stored in directory: /home/ma-user/.cache/pip/wheels/2d/22/9e/9af7e8c2773513ac75905acfb75073922bcc1aa176f730a0c9
Successfully built jieba
Installing collected packages: jieba
Successfully installed jieba-0.42.1
You should consider upgrading via the '/home/ma-user/anaconda3/envs/PyTorch-

In [30]:
class CustomRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # 取最后一个时间步的输出
        return out
    def init_hidden(self, batch_size):
        return torch.zeros( batch_size, self.hidden_size)

In [22]:
class RNNLayer(nn.Module):
    def __init__(self,input_size, hidden_size, num_layers=1, batch_first=True):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.input_size = input_size
        super().__init__()
        self.W_ih = nn.Parameter(torch.rand(self.input_size, self.hidden_size))
        self.W_hh = nn.Parameter(torch.rand(self.hidden_size, self.hidden_size))
        self.b_ih = nn.Parameter(torch.zeros(self.hidden_size))
        self.b_hh = nn.Parameter(torch.zeros(self.hidden_size))
        
    def forward(self,x_t,h_prev=None):
        if h_prev == None:
             h_prev = torch.zeros( x_t.size(0), self.hidden_size)
        output = torch.tanh(torch.matmul(x_t, self.W_ih) + self.b_ih + torch.matmul(h_prev, self.W_hh) + self.b_hh)
        return output,output[:,-1,:].unsqueeze(0)
        

In [23]:
# 单向、单层rnn
single_rnn = RNNLayer(input_size=4, hidden_size=3, num_layers=1, batch_first=True) # batch_first=True表示输入数据的维度为[batch_size, seq_len, input_size]
input = torch.randn(1, 5, 4) # 输入数据维度为[batch_size, seq_len, input_size]
output,h_n = single_rnn(input) # output维度为[batch_size, seq_len, hidden_size=3]，h_n维度为[num_layers=1, batch_size, hidden_size=3]
print(output, output.shape, h_n, h_n.shape,  sep='\n')

tensor([[[ 0.9024,  0.9291,  0.7738],
         [-0.2708,  0.5224,  0.6002],
         [-0.9924, -0.9147, -0.8783],
         [-0.3688,  0.0709, -0.6016],
         [ 0.4797, -0.0872, -0.5128]]], grad_fn=<TanhBackward0>)
torch.Size([1, 5, 3])
tensor([[[ 0.4797, -0.0872, -0.5128]]], grad_fn=<UnsqueezeBackward0>)
torch.Size([1, 1, 3])


In [76]:
# 自定义 RNN 模型
class CustomRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, output_size):
        super(CustomRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.W_ih = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.W_hh = nn.Parameter(torch.rand(hidden_size, hidden_size))
        self.b_ih = nn.Parameter(torch.zeros(hidden_size))
        self.b_hh = nn.Parameter(torch.zeros(hidden_size))
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h_prev):
        print('x',x.size())
        batch_size, seq_length = x.size()
        embedded = self.embedding(x)
        print('embedded',embedded.size())
        hiddens = []
        for t in range(seq_length):
            x_t = embedded[:, t, :]
            print(x_t.size(),self.W_ih.size(),self.b_ih.size() ,h_prev.size(), self.W_hh.size(),self.b_hh.size())
            h_t = torch.tanh(torch.mm(x_t, self.W_ih) + self.b_ih + torch.mm(h_prev, self.W_hh) + self.b_hh)
            hiddens.append(h_t)
            h_prev = h_t
        
        h_final = hiddens[-1]
        print('h_final',h_final.size())
        output = self.fc(h_final)
        return output, h_prev

    def init_hidden(self, batch_size):
        return torch.zeros( batch_size, self.hidden_size)


In [68]:
class CustomRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(CustomRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.input_size = input_size
        self.W_ih = nn.Parameter(torch.rand(self.input_size, self.hidden_size))
        self.W_hh = nn.Parameter(torch.rand(self.hidden_size, self.hidden_size))
        self.b_ih = nn.Parameter(torch.zeros(self.hidden_size))
        self.b_hh = nn.Parameter(torch.zeros(self.hidden_size))
        self.fc = nn.Linear(self.hidden_size, output_size)

    def forward(self, x, h_prev):
        print(x.size())
        batch_size, seq_length, _ = x.size()
        hiddens = []
        for t in range(seq_length):
            x_t = x[:, t, :]
            h_t = torch.tanh(torch.mm(x_t, self.W_ih) + self.b_ih + torch.mm(h_prev, self.W_hh) + self.b_hh)
            hiddens.append(h_t)
            h_prev = h_t
        h_final = hiddens[-1]
        print('h_final',h_final.size())
        output = self.fc(h_final)
        return output, h_prev

    def init_hidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)





In [73]:
# 假设输入歌词维度、隐藏层维度、层数、输出维度等
input_size = 100
hidden_size = 256
num_layers = 1
output_size = 100
rnn = CustomRNN(input_size, hidden_size, num_layers, output_size)

# 模拟输入数据（实际要根据歌词进行词向量等转换），这里假设一批次2条数据，序列长度5，维度为input_size
x = torch.randn(2, 5, input_size)
optimizer = optim.Adam(rnn.parameters(), lr=0.001)
criterion = nn.MSELoss()

for epoch in range(10):
    h_prev = rnn.init_hidden(2)
    print('init h_prev:',h_prev.size())
    output, h_prev = rnn(x, h_prev)
    print('output',output.size())
    print(torch.randn(2, output_size).size())
    loss = criterion(output, torch.randn(2, output_size))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch}, Loss: {loss.item()}')


init h_prev: torch.Size([2, 256])
torch.Size([2, 5, 100])
h_final torch.Size([2, 256])
output torch.Size([2, 100])
torch.Size([2, 100])
Epoch 0, Loss: 1.334328055381775
init h_prev: torch.Size([2, 256])
torch.Size([2, 5, 100])
h_final torch.Size([2, 256])
output torch.Size([2, 100])
torch.Size([2, 100])
Epoch 1, Loss: 1.228675365447998
init h_prev: torch.Size([2, 256])
torch.Size([2, 5, 100])
h_final torch.Size([2, 256])
output torch.Size([2, 100])
torch.Size([2, 100])
Epoch 2, Loss: 0.9169263243675232
init h_prev: torch.Size([2, 256])
torch.Size([2, 5, 100])
h_final torch.Size([2, 256])
output torch.Size([2, 100])
torch.Size([2, 100])
Epoch 3, Loss: 1.086367130279541
init h_prev: torch.Size([2, 256])
torch.Size([2, 5, 100])
h_final torch.Size([2, 256])
output torch.Size([2, 100])
torch.Size([2, 100])
Epoch 4, Loss: 1.2416033744812012
init h_prev: torch.Size([2, 256])
torch.Size([2, 5, 100])
h_final torch.Size([2, 256])
output torch.Size([2, 100])
torch.Size([2, 100])
Epoch 5, Loss: 0.