In [2]:
import torch
import torch.nn as nn
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

In [3]:
# Mini-batch size shared by both loaders.
batch_size = 128

# ToTensor is the only transform in the pipeline.
trans = transforms.Compose([transforms.ToTensor()])

# MNIST train / test splits stored under ./data (download requested).
train_data = MNIST(root='./data', train=True, transform=trans, download=True)
test_data = MNIST(root='./data', train=False, transform=trans, download=True)

# Only the training loader shuffles its samples.
trainLoader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
testLoader = DataLoader(dataset=test_data, batch_size=batch_size)

In [4]:
# Define a hand-written RNN layer.

# Data dimensions.
seq_len = 28
input_size = 28
hidden_size = 128
output_size = 10

# Trainable weights, each drawn from a normal distribution with mean 0 and std 1.
# Draw order (Wxh, Whh, Why) matters for the random stream and is kept as is.
Wxh = nn.Parameter(torch.normal(mean=0.0, std=1.0, size=(input_size, hidden_size)))
Whh = nn.Parameter(torch.normal(mean=0.0, std=1.0, size=(hidden_size, hidden_size)))
Why = nn.Parameter(torch.normal(mean=0.0, std=1.0, size=(hidden_size, output_size)))

# Trainable biases, initialised to zero.
bxh = nn.Parameter(torch.zeros(hidden_size))
bhy = nn.Parameter(torch.zeros(output_size))

# Flat parameter list in the order [Wxh, Whh, Why, bxh, bhy].
weights = [Wxh, Whh, Why]
bias = [bxh, bhy]
params = [*weights, *bias]

# Step a single-layer RNN through the sequence and return the last step's output.
def rnn(X, params, hidden_size, output_size):
	"""Run a single-layer sigmoid RNN over X and return the final-step output.

	Args:
		X: Tensor whose shape unpacks as (batch_size, seq_len, input_size).
		params: Sequence whose first five entries are, in order,
			Wxh (input_size, hidden_size), Whh (hidden_size, hidden_size),
			Why (hidden_size, out_dim), bxh (hidden_size,), bhy (out_dim,).
		hidden_size: Width of the hidden state; must match Wxh / Whh.
		output_size: Not used by the computation (the output width comes from
			Why); kept so existing call sites keep working.

	Returns:
		Tensor of shape (batch_size, Why.shape[1]) equal to
		tanh(h_last @ Why + bhy), where h_last is the hidden state after the
		final time step. When seq_len is 0 the hidden state stays all-zero.
	"""
	batch_size, seq_len, _ = X.shape
	Wxh, Whh, Why, bxh, bhy = params[:5]

	# Allocate the initial state with X's dtype and device so that float64
	# inputs or parameters living on an accelerator do not hit a matmul
	# dtype/device mismatch.
	h = X.new_zeros((batch_size, hidden_size))
	for t in range(seq_len):
		h = torch.sigmoid(torch.matmul(X[:, t, :], Wxh) + torch.matmul(h, Whh) + bxh)

	# Only the last step's output is returned, so project once after the loop
	# instead of computing and storing an output for every step.
	return torch.tanh(torch.matmul(h, Why) + bhy)


# Example: feed a random batch of 5 sequences through the layer; the cell displays the result.
X = torch.normal(mean=0.0, std=1.0, size=(5, seq_len, input_size))
rnn(X, params, hidden_size, output_size)


tensor([[ 0.9981,  0.9658,  1.0000,  1.0000,  1.0000, -0.9997, -0.9998,  1.0000,
          0.9862, -0.6995],
        [ 1.0000,  1.0000,  0.9999,  0.9165,  1.0000, -0.9890, -0.9999,  1.0000,
          0.9699, -0.9949],
        [ 0.9999, -0.9895,  1.0000,  1.0000,  1.0000,  0.9995,  0.9996,  0.9995,
         -1.0000,  0.9986],
        [ 0.9988,  0.2226,  1.0000,  1.0000,  1.0000, -0.9999, -0.1644,  1.0000,
         -1.0000, -0.6687],
        [-0.9095,  0.9515,  1.0000,  1.0000,  0.9999, -0.6369,  0.4715,  1.0000,
         -0.7136, -0.9976]], grad_fn=<TanhBackward0>)

In [5]:
# import torch
# import torch.nn as nn

# # 定义数据维度
# seq_len = 28
# input_size = 28
# hidden_size = 128
# output_size = 10

# # 定义参数，权重和偏置（使用更合理的初始化）
# Wxh = nn.Parameter(torch.normal(0, 0.01, (input_size, hidden_size)))
# Whh = nn.Parameter(torch.normal(0, 0.01, (hidden_size, hidden_size)))
# Why = nn.Parameter(torch.normal(0, 0.01, (hidden_size, output_size)))

# bxh = nn.Parameter(torch.zeros(hidden_size))
# bhy = nn.Parameter(torch.zeros(output_size))



# weights = [Wxh, Whh, Why]
# bias = [bxh, bhy]
# params = weights + bias

# # 修正后的RNN函数
# def rnn(X, params, hidden_size, output_size):
# 	batch_size, seq_len, input_size = X.shape
	
# 	# 提取参数
# 	Wxh, Whh, Why = params[:3]
# 	bxh, bhy = params[3:]
	
# 	# 初始化隐藏状态
# 	h = torch.zeros((batch_size, hidden_size), device=X.device)
	
# 	# 按时间步处理序列
# 	for t in range(seq_len):
# 		# 计算当前时间步的隐藏状态
# 		h = torch.sigmoid(torch.matmul(X[:, t, :], Wxh) + 
# 						 torch.matmul(h, Whh) + 
# 						 bxh)
		
# 		# 计算当前时间步的输出（可选，这里只保留最后一个）
# 		out = torch.matmul(h, Why) + bhy
	
# 	# 返回最后一个时间步的输出
# 	return out

# lr = 1e-3
# epochs = 5
# train_loss = []
# train_acc = []
# optimizer = torch.optim.SGD(params, lr=lr)
# for epoch in range(epochs):
# 	trainLossSum = 0
# 	trainACCSum = 0
# 	train_batch_cnt = 0
# 	train_sample_cnt = 0
# 	for X, y in trainLoader:
# 		train_batch_cnt += 1
# 		train_sample_cnt += len(X)
# 		X = X.squeeze()
# 		y_hat = rnn(X, params, hidden_size, output_size)
# 		# loss = nn.CrossEntropyLoss()(y_hat, y)
# 		loss = F.cross_entropy(y_hat, y, reduction='sum')

# 		optimizer.zero_grad()
# 		loss.backward()
# 		# for i, param in enumerate(params):
# 		# 	if param.grad is not None:
# 		# 		print(f"Param {i} grad norm: {param.grad.norm().item():.6f}")
# 		# 	else:
# 		# 		print(f"Param {i} has no gradient!")
# 		optimizer.step()

# 		trainLossSum += loss.item()
# 		trainACCSum += (torch.argmax(y_hat, dim=1) == y).sum().item()
# 	train_loss.append(trainLossSum / train_sample_cnt)
# 	train_acc.append(trainACCSum / train_sample_cnt)
# 	print(f"Train Loss: {trainLossSum / train_sample_cnt: 0.4f}\tTrain ACC: {trainACCSum / train_sample_cnt: 0.4f}")


In [6]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# trans = transforms.Compose([
# 	transforms.ToTensor()
# ])

# train_data = MNIST(root='./data', train=True, download=True, transform=trans)
# test_data = MNIST(root='./data', train=False, download=True, transform=trans)

# batch_size = 128
# trainLoader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# testLoader = DataLoader(test_data, batch_size=batch_size)

# # 定义数据维度
# seq_len = 28
# input_size = 28
# hidden_size = 128
# output_size = 10

# # 定义参数，权重和偏置（使用更合理的初始化）
# Wxh = torch.normal(0, 0.01, (input_size, hidden_size), requires_grad=True)
# Whh = torch.normal(0, 0.01, (hidden_size, hidden_size), requires_grad=True)
# Why = torch.normal(0, 0.01, (hidden_size, output_size), requires_grad=True)

# bxh = torch.zeros(hidden_size)
# bhy = torch.zeros(output_size)
# # bxh.requires_grad_()
# # bhy.requires_grad_()

# import torch.nn.init as init

# # Xavier 正态初始化
# Wxh = init.xavier_normal_(Wxh)
# init.xavier_normal_(Whh)
# init.xavier_normal_(Why)

# # 对于偏置，通常初始化为 0
# init.zeros_(bxh)
# init.zeros_(bhy)


# weights = [Wxh, Whh, Why]
# bias = [bxh, bhy]
# params = weights + bias

# # 修正后的RNN函数
# def rnn(X, params, hidden_size, output_size):
# 	batch_size, seq_len, input_size = X.shape
	
# 	# 提取参数
# 	Wxh, Whh, Why = params[:3]
# 	bxh, bhy = params[3:]
	
# 	# 初始化隐藏状态
# 	h = torch.zeros((batch_size, hidden_size))
	
# 	# 按时间步处理序列
# 	for t in range(seq_len):
# 		# 计算当前时间步的隐藏状态
# 		h = torch.sigmoid(torch.matmul(X[:, t, :], Wxh) + 
# 						 torch.matmul(h, Whh) + 
# 						 bxh)
		
# 	# 最后一个时间步的输出
# 	out = torch.matmul(h, Why) + bhy
# 	return out

# lr = 1e-2
# epochs = 5
# train_loss = []
# train_acc = []
# optimizer = torch.optim.SGD(params, lr=lr)

# for epoch in range(epochs):
# 	trainLossSum = 0
# 	trainACCSum = 0
# 	train_batch_cnt = 0
# 	train_sample_cnt = 0
# 	for X, y in trainLoader:
# 		train_batch_cnt += 1
# 		train_sample_cnt += len(X)
# 		X = X.squeeze()  # 确保 X 的形状是 (batch_size, 28, 28)
# 		y_hat = rnn(X, params, hidden_size, output_size)
		
# 		# 计算损失
# 		loss = F.cross_entropy(y_hat, y, reduction='mean')

# 		optimizer.zero_grad()
# 		loss.backward()
# 		optimizer.step()

# 		trainLossSum += loss.item() * len(X)
# 		trainACCSum += (torch.argmax(y_hat, dim=1) == y).sum().item()

# 	# 计算平均损失和准确率
# 	train_loss.append(trainLossSum / train_sample_cnt)
# 	train_acc.append(trainACCSum / train_sample_cnt)
# 	print(f"Epoch [{epoch+1}/{epochs}], Loss: {trainLossSum / train_sample_cnt: 0.4f}, ACC: {trainACCSum / train_sample_cnt: 0.4f}")

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Rebuild the datasets and loaders inside this cell (same settings as the
# earlier data-loading cell).

# Mini-batch size shared by both loaders.
batch_size = 128

# ToTensor is the only transform in the pipeline.
trans = transforms.Compose([transforms.ToTensor()])

# MNIST train / test splits stored under ./data (download requested).
train_data = MNIST(root='./data', train=True, transform=trans, download=True)
test_data = MNIST(root='./data', train=False, transform=trans, download=True)

# Only the training loader shuffles its samples.
trainLoader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
testLoader = DataLoader(dataset=test_data, batch_size=batch_size)

# Data dimensions.
seq_len = 28
input_size = 28
hidden_size = 128
output_size = 10

# Trainable weights drawn from a normal distribution with mean 0 and std 1.
# nn.Parameter already marks its tensor as requiring gradients, so no extra
# requires_grad flag is passed to torch.normal.
# Draw order (Wxh, Whh, Why) matters for the random stream and is kept as is.
Wxh = nn.Parameter(torch.normal(mean=0.0, std=1.0, size=(input_size, hidden_size)))
Whh = nn.Parameter(torch.normal(mean=0.0, std=1.0, size=(hidden_size, hidden_size)))
Why = nn.Parameter(torch.normal(mean=0.0, std=1.0, size=(hidden_size, output_size)))

# Trainable biases, initialised to zero.
bxh = nn.Parameter(torch.zeros(hidden_size))
bhy = nn.Parameter(torch.zeros(output_size))

# Flat parameter list in the order [Wxh, Whh, Why, bxh, bhy].
weights = [Wxh, Whh, Why]
bias = [bxh, bhy]
params = [*weights, *bias]

# Revised RNN function: returns raw (unsquashed) outputs for use as logits.
def rnn(X, params, hidden_size, output_size):
	"""Run a single-layer sigmoid RNN over X and return the final-step output.

	Args:
		X: Tensor whose shape unpacks as (batch_size, seq_len, input_size).
		params: Sequence laid out as [Wxh, Whh, Why, bxh, bhy]: the first
			three entries are the weights, everything from index 3 on must
			unpack into exactly the two biases.
		hidden_size: Width of the hidden state; must match Wxh / Whh.
		output_size: Not used by the computation (the output width comes from
			Why); kept so existing call sites keep working.

	Returns:
		Tensor of shape (batch_size, Why.shape[1]) equal to
		h_last @ Why + bhy, where h_last is the hidden state after the final
		time step. No activation is applied to the result.
	"""
	batch_size, seq_len, _ = X.shape

	# Unpack the parameters.
	Wxh, Whh, Why = params[:3]
	bxh, bhy = params[3:]

	# Initial hidden state: allocated with X's dtype and device so float64
	# inputs or accelerator-resident parameters do not hit a matmul
	# dtype/device mismatch.
	h = X.new_zeros((batch_size, hidden_size))

	# Walk the sequence one time step at a time, updating the hidden state.
	for t in range(seq_len):
		h = torch.sigmoid(
			torch.matmul(X[:, t, :], Wxh)
			+ torch.matmul(h, Whh)
			+ bxh
		)

	# Output of the last time step.
	return torch.matmul(h, Why) + bhy

# Training configuration and per-epoch history.
epochs = 20
train_loss = []
train_acc = []
# Adam over the hand-built parameter list; lr is Adam's default, written out explicitly.
optimizer = torch.optim.Adam(params, lr=1e-3)

for epoch in range(epochs):
	# Running totals for this epoch.
	trainLossSum = 0
	trainACCSum = 0
	train_batch_cnt = 0
	train_sample_cnt = 0
	for X, y in trainLoader:
		train_batch_cnt += 1
		train_sample_cnt += len(X)
		# Drop only the channel axis: (batch, 1, 28, 28) -> (batch, 28, 28).
		# A bare squeeze() would also remove the batch axis whenever a batch
		# holds a single sample, which breaks the 3-way shape unpack in rnn.
		X = X.squeeze(1)
		y_hat = rnn(X, params, hidden_size, output_size)

		# Mean cross-entropy over the batch.
		loss = F.cross_entropy(y_hat, y, reduction='mean')

		optimizer.zero_grad()
		loss.backward()
		# torch.nn.utils.clip_grad_norm_(params, max_norm=1)
		optimizer.step()

		# Re-weight the batch mean by batch size so the epoch figure is a
		# per-sample average even when the last batch is smaller.
		trainLossSum += loss.item() * len(X)
		trainACCSum += (torch.argmax(y_hat, dim=1) == y).sum().item()

	# Per-sample average loss and accuracy for the epoch.
	train_loss.append(trainLossSum / train_sample_cnt)
	train_acc.append(trainACCSum / train_sample_cnt)
	print(f"Epoch [{epoch+1}/{epochs}], Loss: {trainLossSum / train_sample_cnt: 0.4f}, ACC: {trainACCSum / train_sample_cnt: 0.4f}")

Epoch [1/20], Loss:  2.7543, ACC:  0.2268
Epoch [2/20], Loss:  1.6415, ACC:  0.4147
Epoch [3/20], Loss:  1.3650, ACC:  0.5089
Epoch [4/20], Loss:  1.1867, ACC:  0.5694
Epoch [5/20], Loss:  1.0464, ACC:  0.6301
Epoch [6/20], Loss:  0.9286, ACC:  0.6794
Epoch [7/20], Loss:  0.8300, ACC:  0.7124
Epoch [8/20], Loss:  0.7609, ACC:  0.7359
Epoch [9/20], Loss:  0.7085, ACC:  0.7530
Epoch [10/20], Loss:  0.6704, ACC:  0.7670
Epoch [11/20], Loss:  0.6350, ACC:  0.7786
Epoch [12/20], Loss:  0.6117, ACC:  0.7861
Epoch [13/20], Loss:  0.5815, ACC:  0.7963
Epoch [14/20], Loss:  0.5618, ACC:  0.8035
Epoch [15/20], Loss:  0.5435, ACC:  0.8094
Epoch [16/20], Loss:  0.5281, ACC:  0.8147
Epoch [17/20], Loss:  0.5025, ACC:  0.8244
Epoch [18/20], Loss:  0.4875, ACC:  0.8284
Epoch [19/20], Loss:  0.4736, ACC:  0.8354
Epoch [20/20], Loss:  0.4527, ACC:  0.8435
