# 实验 Cross Entropy Loss的向量化实现

## 读取MNIST数据集，并将其划分为train/val/test数据集

### MNIST数据集读取

In [3]:
import gzip
import os
import struct
import numpy as np

def load_mnist(path, kind='train'):
    """Load one image/label pair of gzip-compressed IDX files (MNIST format).

    Inputs:
    - path: Directory containing the ``.gz`` files.
    - kind: File-name prefix; the files read are
      ``{kind}-labels-idx1-ubyte.gz`` and ``{kind}-images-idx3-ubyte.gz``.

    Returns a tuple of:
    - images: uint8 numpy array of shape (N, rows * cols), one flattened
      image per row (784 columns for 28x28 MNIST images).
    - labels: uint8 numpy array of shape (N,).

    Both arrays are created with ``np.frombuffer`` and are therefore read-only.

    Raises:
    - ValueError: if a magic number is wrong, or the header counts disagree
      with each other or with the amount of payload data.
    """
    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    with gzip.open(labels_path, 'rb') as lbpath:
        # Label header: two big-endian uint32 -> magic number, item count.
        magic, num_labels = struct.unpack('>II', lbpath.read(8))
        if magic != 2049:
            raise ValueError(
                f'{labels_path}: bad label-file magic number {magic} (expected 2049)')
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8)
    if labels.size != num_labels:
        raise ValueError(
            f'{labels_path}: header declares {num_labels} labels but file holds {labels.size}')

    with gzip.open(images_path, 'rb') as imgpath:
        # Image header: four big-endian uint32 -> magic, image count, rows, cols.
        magic, num_images, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        if magic != 2051:
            raise ValueError(
                f'{images_path}: bad image-file magic number {magic} (expected 2051)')
        pixels = np.frombuffer(imgpath.read(), dtype=np.uint8)
    if num_images != num_labels:
        raise ValueError(
            f'image count ({num_images}) does not match label count ({num_labels})')
    if pixels.size != num_images * rows * cols:
        raise ValueError(
            f'{images_path}: header declares {num_images}x{rows}x{cols} pixels '
            f'but file holds {pixels.size}')

    # Flatten every image to one row; the size comes from the header rather
    # than a hard-coded 784 so non-28x28 IDX files load correctly too.
    images = pixels.reshape(num_images, rows * cols)

    return images, labels



# Dataset split
def data_split(images, labels, ratio):
    """Split a dataset into a training part and a validation part.

    The first ``int(N * ratio)`` examples become the validation set and the
    remaining examples become the training set; no shuffling is performed.

    Inputs:
    - images: numpy array of shape (N, ...) holding one example per row.
    - labels: sequence of length N with the label of each example.
    - ratio: fraction of the data (0 <= ratio <= 1) assigned to validation.

    Returns a tuple of:
    - train_img, train_lb, val_img, val_lb (slices of the inputs).

    Raises:
    - ValueError: if ratio is outside [0, 1] or images/labels differ in length.
    """
    # A negative ratio would yield a negative offset that slices from the end,
    # and a ratio above 1 would silently leave the training set empty.
    if not 0 <= ratio <= 1:
        raise ValueError(f'ratio must be within [0, 1], got {ratio}')

    total_len = images.shape[0]
    if len(labels) != total_len:
        raise ValueError(
            f'images and labels differ in length: {total_len} vs {len(labels)}')

    offset = int(total_len * ratio)

    val_img, val_lb = images[:offset], labels[:offset]
    train_img, train_lb = images[offset:], labels[offset:]

    return train_img, train_lb, val_img, val_lb

### 按照 5:1:1 划分为训练集，验证集，测试集


In [49]:
# Load the raw training and test data from disk
images, labels = load_mnist('./MNIST', kind='train')
test_img, test_lb = load_mnist('./MNIST', kind='test')

# Carve the validation set (1/6 of the training data) out of the training data
train_img, train_lb, val_img, val_lb = data_split(images, labels, 1/6)


# Print the array shapes of every split as a sanity check
print('训练集图像格式为:', train_img.shape, '训练集标签格式为:', train_lb.shape)
print('验证集图像格式为:', val_img.shape, '验证集标签格式为:', val_lb.shape)
print('测试集图像格式为:', test_img.shape, '测试集标签格式为:', test_lb.shape)

训练集图像格式为: (50000, 784) 训练集标签格式为: (50000,)
验证集图像格式为: (10000, 784) 验证集标签格式为: (10000,)
测试集图像格式为: (10000, 784) 测试集标签格式为: (10000,)


## 两种方式计算Cross Entropy Loss

### 用for循环计算Cross Entropy Loss

In [66]:
def loss_softmax(X, y, W):
    """
    Mean softmax cross-entropy loss of a linear classifier, computed with an
    explicit Python loop over the examples (reference implementation).

    Inputs have dimension D, there are C classes, and we operate on N examples.

    Inputs:
    - X: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a sequence of length N; y[i] = c means
         that X[i] has label c, where 0 <= c < C.
    - W: Weights of the linear classifier, a numpy array of shape (D, C).

    Returns:
    - Average cross-entropy loss over the N examples, as a single float.

    Raises:
    - ValueError: if X contains no examples (the mean would be undefined).
    - IndexError: if a label is >= C.
    """
    num_train = X.shape[0]
    if num_train == 0:
        raise ValueError("X must contain at least one example")

    loss = 0.0
    for i in range(num_train):
        # scores: (C,) class scores of the i-th example
        scores = W.T.dot(X[i])

        # Shift by the max score so the largest exponent is exp(0) = 1 and
        # np.exp can never overflow.
        shifted = scores - np.max(scores)

        # -log(softmax(scores)[y_i]) written in log-sum-exp form:
        #     log(sum_j exp(shifted_j)) - shifted[y_i]
        # The sum is >= 1, so the log is always finite; taking the log of the
        # normalized probability instead yields -inf once that probability
        # underflows to 0.
        loss += np.log(np.sum(np.exp(shifted))) - shifted[y[i]]

    # average the per-example losses
    return float(loss / num_train)

### 用向量化计算Cross Entropy Loss

In [50]:
def vectorized_loss_softmax(X, y, W):
    """
    Mean softmax cross-entropy loss of a linear classifier, computed with
    vectorized numpy operations (no Python loop over examples).

    Inputs have dimension D, there are C classes, and we operate on N examples.

    Inputs:
    - X: Training images, a numpy array of shape (N, D) containing a minibatch of data.
    - y: Training labels, a sequence of length N; y[i] = c means
         that X[i] has label c, where 0 <= c < C.
    - W: Weights of the linear classifier, a numpy array of shape (D, C).

    Returns:
    - Average cross-entropy loss over the N examples, as a single float.

    Raises:
    - ValueError: if X contains no examples (the mean would be undefined).
    - IndexError: if a label is >= C.
    """
    num_train = X.shape[0]
    if num_train == 0:
        raise ValueError("X must contain at least one example")

    # scores: (C, N), one column of class scores per example
    scores = W.T.dot(X.T)

    # Shift every column by its max so the largest exponent is exp(0) = 1
    # and np.exp can never overflow.
    shifted = scores - np.max(scores, axis=0, keepdims=True)

    # log_norm: (N,), log of the softmax denominator of each example.
    # Each column sum is >= 1, so the log is always finite.
    log_norm = np.log(np.sum(np.exp(shifted), axis=0))

    # true_shifted: (N,), shifted score of the correct class of each example,
    # picked with integer-array indexing instead of a one-hot mask. Multiplying
    # a one-hot mask by log(prob) produces 0 * -inf = nan as soon as any
    # non-target probability underflows to 0.
    true_shifted = shifted[y, np.arange(num_train)]

    # -log(prob[y_i]) = log_norm_i - true_shifted_i, averaged over the batch
    return float(np.sum(log_norm - true_shifted) / num_train)

### 比较向量化和for循环的运行时间

In [67]:
import time

# Benchmark inputs: the full training split and small random weights
X = train_img
y = train_lb
W = np.random.rand(784, 10) * 0.0001

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() can jump when the system clock is
# adjusted. Printing happens outside the timed region so that only the loss
# computation itself is measured.
start = time.perf_counter()
loop_loss = loss_softmax(X, y, W)
timea = time.perf_counter() - start
print("for的损失值loss:", loop_loss)

start = time.perf_counter()
vectorized_loss = vectorized_loss_softmax(X, y, W)
timeb = time.perf_counter() - start
print("矩阵的损失值loss:", vectorized_loss)

print("for用时:", timea)
print("矩阵用时:", timeb)

for的损失值loss: 2.300610152181934
矩阵的损失值loss: 2.300610152181918
for用时: 2.722450017929077
矩阵用时: 0.3175334930419922
