In [1]:
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import Subset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

class CrossEntropyLoss:
    def forward(self, logits, labels):
        # 计算softmax
        self.logits = logits
        self.labels = labels
        self.probs = self.softmax(logits)
        self.loss = -np.sum(labels * np.log(self.probs + 1e-9)) / logits.shape[0]
        return self.loss
    
    def backward(self):
        d_logits = (self.probs - self.labels) / self.logits.shape[0]
        return d_logits
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

class ReLU:
    def __init__(self):
        pass

    def forward(self, inputs):
        self.inputs = inputs
        return np.maximum(0, inputs)

    def backward(self, d_out):
        return d_out * (self.inputs > 0)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

class Conv2D():
    def __init__(self,  input_channels, num_filters, kernel_size, stride=1, padding=0):
        self.num_filters = num_filters
        self.kernel_size = kernel_size
        self.input_channels = input_channels
        self.stride = stride
        self.padding = padding
        self.count = 0
        # 初始化滤波器和偏置
        self.filters = np.random.randn(num_filters, input_channels, kernel_size, kernel_size) / (kernel_size * kernel_size)
        self.biases = np.zeros(num_filters)
    
    def forward(self, input):
        self.input = input
        self.input_padded = np.pad(input, ((0, 0), (0, 0), (self.padding, self.padding), (self.padding, self.padding)), mode='constant')
        
        self.output_height = (self.input_padded.shape[2] - self.kernel_size) // self.stride + 1
        self.output_width = (self.input_padded.shape[3] - self.kernel_size) // self.stride + 1
        self.output_shape = (self.num_filters, self.output_height, self.output_width)
        self.output = np.zeros((input.shape[0],) + self.output_shape)

        for i in range(self.output_height):
            for j in range(self.output_width):
                for f in range(self.num_filters):
                    vertical_start = i * self.stride
                    vertical_end = vertical_start + self.kernel_size
                    horizontal_start = j * self.stride
                    horizontal_end = horizontal_start + self.kernel_size
                    region = self.input_padded[:, :, vertical_start:vertical_end, horizontal_start:horizontal_end]
                    self.output[:, f, i, j] = np.sum(region * self.filters[f], axis=(1, 2, 3)) + self.biases[f]
        
        return self.output

    def backward(self, d_out, learning_rate):
        self.count += 1
        d_filters = np.zeros(self.filters.shape)
        d_biases = np.zeros(self.biases.shape)
        d_input_padded = np.zeros(self.input_padded.shape)

        for i in range(self.output_height):
            for j in range(self.output_width):
                for f in range(self.num_filters):
                    vertical_start = i * self.stride
                    vertical_end = vertical_start + self.kernel_size
                    horizontal_start = j * self.stride
                    horizontal_end = horizontal_start + self.kernel_size

                    region = self.input_padded[:, :, vertical_start:vertical_end, horizontal_start:horizontal_end]

                    for b in range(d_out.shape[0]):
                        d_filters[f] += d_out[b, f, i, j] * region[b]
                        d_biases[f] += d_out[b, f, i, j]
                        d_input_padded[b, :, vertical_start:vertical_end, horizontal_start:horizontal_end] += d_out[b, f, i, j] * self.filters[f]
        
        d_input = d_input_padded[:, :, self.padding:self.input_padded.shape[2]-self.padding, self.padding:self.input_padded.shape[3]-self.padding]

        # 更新权重和偏置
        self.filters -= learning_rate * d_filters
        self.biases -= learning_rate * d_biases

        return d_input

class BatchNorm:
    def __init__(self, num_features, momentum=0.9, epsilon=1e-5):
        self.num_features = num_features
        self.epsilon = epsilon
        self.momentum = momentum
        
        # 初始化缩放和平移参数
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)
        
        # 运行均值和方差
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)
        
    def forward(self, X, training=True):
        if training:
            self.batch_mean = np.mean(X, axis=0) # 计算均值 三维
            self.batch_var = np.var(X, axis=0) # 计算方差 三维
            
            self.X_normalized = (X - self.batch_mean) / np.sqrt(self.batch_var + self.epsilon)
            self.out = self.gamma * self.X_normalized + self.beta
            
            # 更新运行均值和方差
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.batch_mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.batch_var
        else:
            self.X_normalized = (X - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            self.out = self.gamma * self.X_normalized + self.beta
        
        return self.out
    
    def backward(self, d_out ,learning_rate = 0.001):
        N, D = d_out.shape
        
        X_mu = self.X_normalized * np.sqrt(self.batch_var + self.epsilon)
        
        dbeta = np.sum(d_out, axis=0)
        dgamma = np.sum(d_out * self.X_normalized, axis=0)
        
        dX_normalized = d_out * self.gamma
        dvar = np.sum(dX_normalized * X_mu * -0.5 * (self.batch_var + self.epsilon) ** -1.5, axis=0)
        dmean = np.sum(dX_normalized * -1 / np.sqrt(self.batch_var + self.epsilon), axis=0) + dvar * np.mean(-2 * X_mu, axis=0)
        
        dX = (dX_normalized / np.sqrt(self.batch_var + self.epsilon)) + (dvar * 2 * X_mu / N) + (dmean / N)
        
        # 更新参数
        self.gamma -= learning_rate * dgamma
        self.beta -= learning_rate * dbeta
        
        return dX

class MaxPooling2D:
    def __init__(self, pool_size, stride=None):
        self.pool_size = pool_size
        self.stride = stride if stride is not None else pool_size

    def forward(self, inputs):
        self.inputs = inputs
        batch_size, num_channels, input_height, input_width = inputs.shape
        self.output_height = (input_height - self.pool_size) // self.stride + 1
        self.output_width = (input_width - self.pool_size) // self.stride + 1
        self.output = np.zeros((batch_size, num_channels, self.output_height, self.output_width))

        for i in range(self.output_height):
            for j in range(self.output_width):
                input_slice = inputs[:, :, i*self.stride:i*self.stride+self.pool_size, j*self.stride:j*self.stride+self.pool_size]
                self.output[:, :, i, j] = np.max(input_slice, axis=(2, 3))

        return self.output

    def backward(self, d_out):
        d_input = np.zeros_like(self.inputs)
        for i in range(self.output_height):
            for j in range(self.output_width):
                input_slice = self.inputs[:, :, i*self.stride:i*self.stride+self.pool_size, j*self.stride:j*self.stride+self.pool_size]
                mask = (input_slice == np.max(input_slice, axis=(2, 3))[:, :, None, None])
            d_input[:, :, i*self.stride:i*self.stride+self.pool_size, j*self.stride:j*self.stride+self.pool_size] += mask * d_out[:, :, i, j][:, :, None, None]

        return d_input

class Flatten:
    def forward(self, inputs):
        self.input_shape = inputs.shape
        return inputs.reshape(self.input_shape[0], -1)

    def backward(self, d_out):
        return d_out.reshape(self.input_shape)

class Dense:
    def __init__(self, input_units, output_units):
        self.weights = np.random.randn(input_units, output_units) * np.sqrt(2.0 / input_units)
        self.biases = np.zeros((1, output_units))

    def forward(self, inputs):
        self.inputs = inputs
        return np.dot(inputs, self.weights) + self.biases

    def backward(self, d_out, learning_rate):
        d_input = np.dot(d_out, self.weights.T)
        self.weight_gradients = np.dot(self.inputs.T, d_out)
        self.bias_gradients = np.sum(d_out, axis=0, keepdims=True)
        self.weights -= learning_rate * self.weight_gradients
        self.biases -= learning_rate * self.bias_gradients
        return d_input

class Dropout:
    def __init__(self, dropout_prob):
        self.dropout_prob = dropout_prob

    def forward(self, inputs, is_training=True):
        if is_training:
            self.mask = (np.random.rand(*inputs.shape) < self.dropout_prob) / self.dropout_prob
            self.output = inputs * self.mask
        else:
            self.output = inputs
        return self.output

    def backward(self, d_out):
        return d_out * self.mask
    


# Define the CNN model with BatchNorm and Dropout
class CNN():
    def __init__(self,num_conv_layers, num_dense_layers):

        self.num_conv_layers = num_conv_layers
        self.num_dense_layers = num_dense_layers
        self.sequence_conv_layers = [] #卷积层列表
        input_channels = 3

        if self.num_conv_layers == -1:#Alexnet模型
            """ self.conv1 = Conv2D(input_channels, 6, kernel_size=11, stride=4, padding=0)
            self.relu1 = ReLU()
            self.maxpool1 = MaxPooling2D(pool_size=3, stride=2)
            self.conv2 = Conv2D(6, 8, kernel_size=5, stride=1, padding=2)
            self.relu2 = ReLU()
            self.maxpool2 = MaxPooling2D(pool_size=3, stride=2)
            self.conv3 = Conv2D(8, 8, kernel_size=3, stride=1, padding=1)
            self.relu3 = ReLU()
            self.conv4 = Conv2D(8, 8, kernel_size=3, stride=1, padding=1)
            self.relu4 = ReLU()
            self.conv5 = Conv2D(8, 10, kernel_size=3, stride=1, padding=1)
            self.relu5 = ReLU()
            self.maxpool3 = MaxPooling2D(pool_size=3, stride=2)
            self.flatten = Flatten() """
            self.sequence_conv_layers.append(Conv2D(input_channels, 96, kernel_size=11, stride=4, padding=0))  #55*55
            self.sequence_conv_layers.append(ReLU()) 
            self.sequence_conv_layers.append(MaxPooling2D(pool_size=3, stride=2))    #27*27
            self.sequence_conv_layers.append(Conv2D(96, 256, kernel_size=5, stride=1, padding=2))  #27*27
            self.sequence_conv_layers.append(ReLU())
            self.sequence_conv_layers.append(MaxPooling2D(pool_size=3, stride=2))  #13*13
            self.sequence_conv_layers.append(Conv2D(256, 384, kernel_size=3, stride=1, padding=1))  #13*13
            self.sequence_conv_layers.append(ReLU())
            self.sequence_conv_layers.append(Conv2D(384, 384, kernel_size=3, stride=1, padding=1))  #13*13
            self.sequence_conv_layers.append(ReLU())
            self.sequence_conv_layers.append(Conv2D(384, 256, kernel_size=3, stride=1, padding=1))  #13*13
            self.sequence_conv_layers.append(ReLU())
            self.sequence_conv_layers.append(MaxPooling2D(pool_size=3, stride=2))  #6*6
            self.sequence_conv_layers.append(Flatten())

        else:
            for i in range(num_conv_layers):
                print('输入第', i+1, '层卷积层后是否有池化层：\n1.有\n2.无')  
                choice = int(input())
                print('输入num_filters：')
                num_filters = int(input())
                print('输入kernel_size：')
                kernel_size = int(input())
                print('输入stride：')
                stride = int(input())
                print('输入padding：')
                padding = int(input())
                self.sequence_conv_layers.append(Conv2D(input_channels, num_filters, kernel_size, stride, padding))
                
                self.sequence_conv_layers.append(ReLU())
                if choice == 1:
                    self.sequence_conv_layers.append(MaxPooling2D(pool_size=2, stride=2))
                input_channels = num_filters
                if i == num_conv_layers-1:
                    self.sequence_conv_layers.append(Flatten())

        self.sequence_dense_layers = [] #全连接层列表
        if self.num_dense_layers == -1:#Alexnet模型
            """ self.dense1 = Dense(10*6*6, 100)
            self.bn6 = BatchNorm(100)
            self.relu6 = ReLU()
            self.dropout1 = Dropout(0.5)
            self.dense2 = Dense(100, 100)
            self.bn7 = BatchNorm(100)
            self.relu7 = ReLU()
            self.dropout2 = Dropout(0.5)
            self.dense3 = Dense(100, 10) """

            self.sequence_dense_layers.append(Dense(256*6*6, 4096))
            self.sequence_dense_layers.append(BatchNorm(4096))
            self.sequence_dense_layers.append(ReLU())
            self.sequence_dense_layers.append(Dropout(0.2))
            self.sequence_dense_layers.append(Dense(4096, 4096))
            self.sequence_dense_layers.append(BatchNorm(4096))
            self.sequence_dense_layers.append(ReLU())
            self.sequence_dense_layers.append(Dropout(0.2))
            self.sequence_dense_layers.append(Dense(4096, 10)) 
            

        else:
            num_units = 0
            num_pre = 5*6*6
            for i in range(num_dense_layers):
                if i == num_dense_layers - 1:
                    self.sequence_dense_layers.append(Dense(num_pre, 10))
                    break
                print('输入第', i+1, '层全连接层输出的神经元个数')  
                num_units = int(input())
                self.sequence_dense_layers.append(Dense(num_pre, num_units))
                self.sequence_dense_layers.append(BatchNorm(num_units))
                self.sequence_dense_layers.append(ReLU())
                self.sequence_dense_layers.append(Dropout(0.2))
                num_pre = num_units
        
            

    def forward(self, inputs, is_training=True):
        self.inputs = inputs
        
        """ inputs = self.conv1.forward(inputs)
        
        inputs = self.relu1.forward(inputs)
        inputs = self.maxpool1.forward(inputs)
        inputs = self.conv2.forward(inputs)
        
        inputs = self.relu2.forward(inputs)
        inputs = self.maxpool2.forward(inputs)
        inputs = self.conv3.forward(inputs)
        
        inputs = self.relu3.forward(inputs)
        inputs = self.conv4.forward(inputs)
        
        inputs = self.relu4.forward(inputs)
        inputs = self.conv5.forward(inputs)
        
        inputs = self.relu5.forward(inputs)
        inputs = self.maxpool3.forward(inputs)
        inputs = self.flatten.forward(inputs)
        inputs = self.dense1.forward(inputs)
        inputs = self.bn6.forward(inputs, is_training)
        inputs = self.relu6.forward(inputs)
        inputs = self.dropout1.forward(inputs, is_training)
        inputs = self.dense2.forward(inputs)
        inputs = self.bn7.forward(inputs, is_training)
        inputs = self.relu7.forward(inputs)
        inputs = self.dropout2.forward(inputs, is_training)
        inputs = self.dense3.forward(inputs) """

        for fun in self.sequence_conv_layers:
            if isinstance(fun, Conv2D):
                inputs = fun.forward(inputs)     
            elif isinstance(fun, BatchNorm):
                inputs = fun.forward(inputs, is_training)
            elif isinstance(fun, ReLU):
                inputs = fun.forward(inputs)
            elif isinstance(fun, MaxPooling2D):
                inputs = fun.forward(inputs)
            elif isinstance(fun, Flatten):
                inputs = fun.forward(inputs)
                
                
        
        for fun in self.sequence_dense_layers:
            if isinstance(fun, Dense):
                inputs = fun.forward(inputs)
            elif isinstance(fun, BatchNorm):
                inputs = fun.forward(inputs, is_training)
            elif isinstance(fun, ReLU):
                inputs = fun.forward(inputs)
            elif isinstance(fun, Dropout):
                inputs = fun.forward(inputs, is_training) 

        x = inputs
        return x

    def backward(self, d_out, learning_rate):
        
        
        for fun in reversed(self.sequence_dense_layers):
            if isinstance(fun, Dense):
                d_out = fun.backward(d_out, learning_rate)
            elif isinstance(fun, BatchNorm):
                d_out = fun.backward(d_out, learning_rate)   
            elif isinstance(fun, ReLU):
                d_out = fun.backward(d_out)
            elif isinstance(fun, Dropout):
                d_out = fun.backward(d_out)
                   

        for fun in reversed(self.sequence_conv_layers):
            if isinstance(fun, Conv2D):
                
                d_out = fun.backward(d_out, learning_rate)
            elif isinstance(fun, BatchNorm):
                d_out = fun.backward(d_out, learning_rate)
            elif isinstance(fun, ReLU):
                d_out = fun.backward(d_out)
            elif isinstance(fun, MaxPooling2D):
                d_out = fun.backward(d_out)
            elif isinstance(fun, Flatten):
                d_out = fun.backward(d_out) 

        """ d_out = self.dense3.backward(d_out, learning_rate)
        d_out = self.dropout2.backward(d_out)
        d_out = self.relu7.backward(d_out)
        d_out = self.bn7.backward(d_out, learning_rate)
        d_out = self.dense2.backward(d_out, learning_rate)
        d_out = self.dropout1.backward(d_out)
        d_out = self.relu6.backward(d_out)
        d_out = self.bn6.backward(d_out, learning_rate)
        d_out = self.dense1.backward(d_out, learning_rate)
        d_out = self.flatten.backward(d_out)
        d_out = self.maxpool3.backward(d_out)
        d_out = self.relu5.backward(d_out)
        d_out = self.conv5.backward(d_out, learning_rate)
        d_out = self.relu4.backward(d_out)
        d_out = self.conv4.backward(d_out, learning_rate)
        d_out = self.relu3.backward(d_out)
        d_out = self.conv3.backward(d_out, learning_rate)
        d_out = self.maxpool2.backward(d_out)
        d_out = self.relu2.backward(d_out)
        d_out = self.conv2.backward(d_out, learning_rate)
        d_out = self.maxpool1.backward(d_out)
        d_out = self.relu1.backward(d_out)
        d_out = self.conv1.backward(d_out, learning_rate) """
        
        
        return d_out
    
    def train(self, X_train, y_train, epochs, learning_rate):
        loss_fn = CrossEntropyLoss()
        step = 0
        
        for epoch in range(epochs):
            running_loss = 0.0
            for i in range(0, X_train.shape[0], 16):
                inputs = X_train[i:i+16]
                labels = y_train[i:i+16]
                outputs = self.forward(inputs)
                loss = loss_fn.forward(outputs, labels)


                running_loss += loss
                d_loss = loss_fn.backward()
                self.backward(d_loss, learning_rate)

                if step % 40 == 0:
                    print(f'Step {step}, Loss: {loss:.4f}')
                    
                step += 1
            print(f'Epoch {epoch + 1}, Loss: {running_loss / (X_train.shape[0] // 16):.4f}')

Using device: cuda


In [2]:
print('构造模型：1.自定义 2.Alexnet')
model_type = int(input())
if model_type == 1:
    print('请输入卷积层数量：')
    num_conv_layers = int(input())
    print('请输入全连接层数量：')
    num_dense_layers = int(input())
else:
    num_conv_layers = -1
    num_dense_layers = -1

model = CNN(num_conv_layers, num_dense_layers)

# 定义数据预处理
transform = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# 加载 CIFAR-10 训练和测试数据集
full_trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
full_testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

subset_indices = list(range(2000))
trainset = Subset(full_trainset, subset_indices)
testset = Subset(full_testset, subset_indices)
print('划分完成')

# 将数据转换为 NumPy 数组
X_train = np.array([np.array(trainset[i][0]) for i in range(len(trainset))])
y_train = np.array([trainset[i][1] for i in range(len(trainset))])

X_test = np.array([np.array(testset[i][0]) for i in range(len(testset))])
y_test = np.array([testset[i][1] for i in range(len(testset))])

# 将标签转换为 one-hot 编码
y_train_one_hot = np.zeros((y_train.size, 10))
for i in range(y_train.size):
    y_train_one_hot[i, y_train[i]] = 1

y_test_one_hot = np.zeros((y_test.size, 10))
for i in range(y_test.size):
    y_test_one_hot[i, y_test[i]] = 1

num_epochs = 3
model.train(X_train, y_train_one_hot, num_epochs, learning_rate=0.001)

构造模型：1.自定义 2.Alexnet
请输入卷积层数量：
请输入全连接层数量：
输入第 1 层卷积层后是否有池化层：
1.有
2.无
输入num_filters：
输入kernel_size：
输入stride：
输入padding：
输入第 2 层卷积层后是否有池化层：
1.有
2.无
输入num_filters：
输入kernel_size：
输入stride：
输入padding：
输入第 1 层全连接层输出的神经元个数
Files already downloaded and verified
Files already downloaded and verified
划分完成
Step 0, Loss: 4.4879
Step 40, Loss: 3.7385
Step 80, Loss: 2.9870
Step 120, Loss: 2.3948
Epoch 1, Loss: 3.3403
Step 160, Loss: 1.9174
Step 200, Loss: 1.5957
Step 240, Loss: 1.3087
Epoch 2, Loss: 1.7285
Step 280, Loss: 1.0727
Step 320, Loss: 0.8744
Step 360, Loss: 0.7065
Epoch 3, Loss: 0.9270


In [8]:
print('compared with pytorch')
import torch.optim as optim

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=16, shuffle=False, num_workers=2)

# 定义卷积神经网络
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=10, stride=2, padding=0)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=5, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(5 * 6 * 6, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # 展平层
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# 实例化模型
cnn = SimpleCNN()

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters(), lr=0.001)

for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data

            # 梯度置零
            optimizer.zero_grad()

            # 前向传播
            outputs = cnn(inputs)
            loss = criterion(outputs, labels)

            # 反向传播和优化
            loss.backward()
            optimizer.step()

            # 打印统计信息
            running_loss += loss.item()
            if i % 40 == 0:    
                print(f'Epoch {epoch + 1}, Batch {i + 1}, Loss: {running_loss / 100:.4f}')
                running_loss = 0.0

print('Finished Training')

compared with pytorch
Files already downloaded and verified
Files already downloaded and verified
Epoch 1, Batch 1, Loss: 0.0231
Epoch 1, Batch 41, Loss: 0.9080
Epoch 1, Batch 81, Loss: 0.8452
Epoch 1, Batch 121, Loss: 0.8174
Epoch 1, Batch 161, Loss: 0.7897
Epoch 1, Batch 201, Loss: 0.7794
Epoch 1, Batch 241, Loss: 0.7482
Epoch 1, Batch 281, Loss: 0.7554
Epoch 1, Batch 321, Loss: 0.7507
Epoch 1, Batch 361, Loss: 0.7463
Epoch 1, Batch 401, Loss: 0.7012
Epoch 1, Batch 441, Loss: 0.6946
Epoch 1, Batch 481, Loss: 0.6964
Epoch 1, Batch 521, Loss: 0.6668
Epoch 1, Batch 561, Loss: 0.6932
Epoch 1, Batch 601, Loss: 0.6801
Epoch 1, Batch 641, Loss: 0.6587
Epoch 1, Batch 681, Loss: 0.6706
Epoch 1, Batch 721, Loss: 0.6698
Epoch 1, Batch 761, Loss: 0.6497
Epoch 1, Batch 801, Loss: 0.6439
Epoch 1, Batch 841, Loss: 0.6609
Epoch 1, Batch 881, Loss: 0.6296
Epoch 1, Batch 921, Loss: 0.6304
Epoch 1, Batch 961, Loss: 0.6697
Epoch 1, Batch 1001, Loss: 0.6351
Epoch 1, Batch 1041, Loss: 0.6224
Epoch 1, Batc