In [40]:
import numpy as np

In [3]:
!ls ./cifar-10-batches-py


batches.meta data_batch_2 data_batch_4 readme.html
data_batch_1 data_batch_3 data_batch_5 test_batch


In [22]:
file = './cifar-10-batches-py/test_batch'
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo,encoding='bytes')
    return dict

dd = unpickle(file)

In [30]:
dd.keys()

dict_keys([b'labels', b'data', b'filenames', b'batch_label'])

In [77]:
raw = dd[b'data']
ydata = dd[b'labels']

## 损失函数

In [70]:
def L(X, y, W):
    """
    fully-vectorized implementation :
    - X holds all the training examples as columns (e.g. 3073 x 50,000 in CIFAR-10)
    - y is array of integers specifying correct class (e.g. 50,000-D array)
    - W are weights (e.g. 10 x 3073)
    """
    delta = 1.0
    scores = W.dot(X.T)
    lidx = range(0, len(y))
    margins = np.maximum(0, scores - scores[y, lidx] + delta)
    margins[y, lidx] = 0
    loss = np.sum(margins)/(len(y))
    return loss

In [107]:

xdata =[]
for x in raw:
    xdata.append(np.insert(x,0,1))
xdata = np.array(xdata)

1 loop, best of 3: 164 ms per loop


In [108]:

# 每个样本增加一个bias 1

# 法一
# xdata =[]
# for x in raw:
#     xdata.append(np.insert(x,0,1))
# xdata = np.array(xdata)

#法二
xdata = np.column_stack((raw, np.ones(len(raw))))

# 法三
# xdata = np.ones((raw.shape[0],raw.shape[1]+1))
# xdata[:,:-1] = raw

In [109]:
xdata.shape,raw.shape

((10000, 3073), (10000, 3072))

In [113]:
W = np.random.randn(10, 3073) * 0.0001

L(xdata,ydata,W)

10.081358115448854

## 优化

In [114]:
X_train = xdata
Y_train = ydata

In [118]:
# 随机搜索
bestloss = float("inf") # Python assigns the highest possible float value
for num in range(100):
    W = np.random.randn(10, 3073) * 0.0001 # generate random parameters
    loss = L(X_train, Y_train, W) # get the loss over the entire training set
    if loss < bestloss: # keep track of the best solution
        bestloss = loss
        bestW = W
print('in attempt %d the loss was %f, best %f' % (num, loss, bestloss))

in attempt 99 the loss was 8.983980, best 8.983980


In [155]:
#随机下降法
W = np.random.randn(10, 3073) * 0.001 # generate random starting W
bestloss = float("inf")
for i in range(1002):
    step_size = 0.00001
    Wtry = W + np.random.randn(10, 3073) * step_size
    loss = L(X_train, Y_train, Wtry)
    if loss < bestloss:
        W = Wtry
        bestloss = loss
    if i%100 == 0:
        print('iter %d loss is %f' % (i, bestloss))

iter 0 loss is 41.519405
iter 100 loss is 37.031914
iter 200 loss is 32.567393
iter 300 loss is 27.555161
iter 400 loss is 23.722530
iter 500 loss is 21.058190
iter 600 loss is 19.341223
iter 700 loss is 18.505415
iter 800 loss is 17.961653
iter 900 loss is 17.513660
iter 1000 loss is 17.251243


In [149]:
# 梯度下降法
def eval_numerical_gradient(f, x):
    """ 
    a naive implementation of numerical gradient of f at x 
    - f should be a function that takes a single argument
    - x is the point (numpy array) to evaluate the gradient at
    """ 

    fx = f(x) # evaluate function value at original point
    grad = np.zeros(x.shape)
    h = 0.00001

    # iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:

        # evaluate function at x+h
        ix = it.multi_index
        old_value = x[ix]
        x[ix] = old_value + h # increment by h
        fxh = f(x) # evalute f(x + h)
        x[ix] = old_value # restore to previous value (very important!)

        # compute the partial derivative
        grad[ix] = (fxh - fx) / h # the slope
        it.iternext() # step to next dimension

    return grad

# 10000个太多了,测试1000个
k=100
def CIFAR10_loss_fun(W):
    return L(X_train[:k], Y_train[:k], W)

In [150]:
%%time
W = np.random.rand(10, 3073) * 0.001 # random weight vector

df = eval_numerical_gradient(CIFAR10_loss_fun, W) # get the gradient

CPU times: user 18.6 s, sys: 216 ms, total: 18.8 s
Wall time: 19.3 s


In [153]:

%%time
loss_original = CIFAR10_loss_fun(W) # the original loss
print('original loss: %f' % (loss_original, ))

for step_size_log in [-10, -9, -8, -7, -6, -5,-4,-3,-2,-1]:
    step_size = 10 ** step_size_log
    W_new = W - step_size * df # new position in the weight space
    loss_new = CIFAR10_loss_fun(W_new)
    print('for step size %f new loss: %f' % (step_size, loss_new))

original loss: 18.754741
for step size 0.000000 new loss: 18.740895
for step size 0.000000 new loss: 18.616471
for step size 0.000000 new loss: 17.402123
for step size 0.000000 new loss: 10.438134
for step size 0.000001 new loss: 124.150568
for step size 0.000010 new loss: 1328.548602
for step size 0.000100 new loss: 13372.797059
for step size 0.001000 new loss: 133815.281625
for step size 0.010000 new loss: 1338240.127288
for step size 0.100000 new loss: 13382488.583912
CPU times: user 15.1 ms, sys: 2.49 ms, total: 17.6 ms
Wall time: 15.2 ms


**结论:**要选择大小合适的stepsize

- 梯度方向其实是函数增加最快的方向
- 梯度下降法是沿着负梯度方向