# Try to use gpu 

## 1 Import package

In [1]:
import numpy
import minpy.numpy
import cupy
import pandas
import matplotlib.pyplot as plt
import random
from scipy.io import loadmat
from scipy.optimize import minimize
from sklearn.preprocessing import OneHotEncoder
from scipy.special import expit

[31mW1024 21:34:27 2758 minpy.dispatch.registry:register:47][0m Type MXNet for name reshape has already existed


## 2 Choose whether to use gpu

In [2]:
np = numpy # Only use cpu

## 3 Determine the network structure

In [3]:
num_units = 5 # the CNN ' size
in_size = 20 # input size is (20, 20)
k_size = 5 # the filtter size is (5, 5)
c_size = in_size - k_size + 1 # the convolution result's size is (16, 16) 
pf_size = 2 # the pooling fillters' size is (2, 2)
p_size = c_size // pf_size # the pooling results' size is (8, 8) 
output_size = 10

weights_size = (k_size * k_size + 1 +# w and b of convolution layer
                p_size * p_size * output_size) * num_units + output_size # w of output layer
params = (np.random.random(size=weights_size) - 0.5) * 0.25 # all weights
params.shape

(3340,)

## 4 Initializate data set

In [4]:
data = loadmat("ex4data1.mat")
X = data["X"]
m = X.shape[0]
X = X.reshape((m, in_size, in_size))
y = data["y"]

training_set_scale = 0.7
tr_m = int(m * training_set_scale)
tr_X = np.array(X[:tr_m])
ts_m = m - tr_m
ts_X = np.array(X[tr_m:])
onehot_encoder = OneHotEncoder(sparse=False, categories="auto")
y_onehot = onehot_encoder.fit_transform(y)
tr_y = np.array(y_onehot[:tr_m]).reshape((tr_m, output_size, 1))
ts_y = np.array(y[tr_m:])

tr_X.shape, tr_y.shape, ts_X.shape, ts_y.shape

((3500, 20, 20), (3500, 10, 1), (1500, 20, 20), (1500, 1))

## 5 Initializate weights

In [5]:
weights = (np.random.random(size=weights_size) - 0.5) * 0.25
weights.shape

(3340,)

## 6 Encode and decode weights

In [143]:
def encode(theta1, theta2, b1, b2):
    return np.concatenate((theta1.ravel(), theta2.ravel(), b1.ravel(), b2.ravel()))
def decode(weights, num_units, k_size, p_size, output_size):
    theta1 = weights[:num_units*k_size*k_size].reshape((num_units, k_size, k_size))
    theta2 = weights[num_units*k_size*k_size:-output_size-num_units].reshape((num_units, p_size, p_size, output_size))
    b1 = weights[-output_size-num_units:-output_size].reshape((num_units, 1))
    b2 = weights[-output_size:].reshape((output_size, 1))
    return theta1, theta2, b1, b2

In [144]:
theta1, theta2, b1, b2 = decode(weights, num_units, k_size, p_size, output_size)
theta1.shape, b1.shape, theta2.shape, b2.shape

((5, 5, 5), (5, 1), (5, 8, 8, 10), (10, 1))

In [30]:
encode(theta1, b1, theta2, b2).shape

(3340,)

In [31]:
theta1.size + b1.size + theta2.size + b2.size

3340

## 7 Convolution

In [38]:
def convolution(X, w, k_size, c_size):
    res = np.zeros((c_size, c_size))
    for i in range(c_size):
        for j in range(c_size):
            res[i,j] = np.sum(w * X[i:i+k_size,j:j+k_size])
    return res # (16, 16)

## 8 Pooling

In [39]:
def maxPooling(conv, c_size, pf_size, p_size):
    res = np.zeros((p_size, p_size))
    grad = np.zeros((c_size, c_size))
    for i in range(0, c_size, pf_size):
        for j in range(0, c_size, pf_size):
            res[i//pf_size,j//pf_size] = np.max(conv[i:i+pf_size,j:j+pf_size])
            idx = np.argmax(conv[i:i+pf_size,j:j+pf_size])
            grad[i+idx//pf_size,j+idx%pf_size] = 1
    return res, grad

In [40]:
#res, grad = maxPooling(a, crow, ccol, pfrow, pfcol, prow, pcol)
a = np.array([i for i in range(36)]).reshape((6,6))
b = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
c = convolution(a, b, 3, 4)
res, grad = maxPooling(c, 4, 2, 2)

## 9 Sigmod

In [145]:
sigmod = expit

## 10 Forward propagate

In [146]:
def forwardPropagate(X, theta1, b1, theta2, b2, num_units, k_size,
                    c_size, p_size, output_size):
    a1 = X # (20, 20)
    z2 = np.zeros((num_units, c_size, c_size)) # (5, 16, 16)
    a2 = z2.copy() # (5, 16, 16)
    pooling_grad = z2.copy() # (5, 16, 16)
    a3 = np.zeros((num_units, p_size, p_size)) # (5, 8, 8)
    z4 = np.zeros((output_size, 1)) # (10, 1)
    a4 = z4.copy() # (10, 1)
    
    for i in range(num_units):
        z2[i] = convolution(X, theta1[i], k_size, c_size) + b1[i] # (16, 16)
    a2 = sigmod(z2) # (5, 16, 16)
    for i in range(num_units):
        a3[i], pooling_grad[i] = maxPooling(a2[i], c_size, pf_size, p_size)
    
    temp_theta2 = theta2.reshape((output_size, num_units * p_size * p_size)) # (10, 5*8*8)
    temp_a3 = a3.reshape((num_units * p_size * p_size, 1)) #(5*8*8, 1)
    
    z4 = temp_theta2 @ temp_a3 + b2# (10, 1)
    a4 = sigmod(z4)
    return a1, z2, a2, pooling_grad, a3, z4, a4
a = forwardPropagate(X[0], theta1, b1, theta2, b2, num_units, k_size,
                    c_size, p_size, output_size)

In [147]:
def cost(weights, X, num_units, k_size,
                    c_size, p_size, output_size, lam=0.):
    theta1, theta2, b1, b2 = decode(weights, num_units, k_size, p_size, output_size)
    m = X.shape[0]
    J = 0.
    for i in range(m):
        a1, z2, a2, pooling_grad, a3, z4, a4 = forwardPropagate(X[0], theta1, b1, theta2, b2, 
        num_units, k_size, c_size, p_size, output_size)
        first_term = y[i] * np.log(a4)
        second_term = (1 - y[i]) * np.log(1 - a4)
        J += -np.sum(first_term + second_term)
    J /= m
    
    J += (float(lam) / (2 * m)) * \
        (np.sum(theta1 **2) + np.sum(theta2 ** 2))
    return J
a = cost(weights, tr_X[:1], num_units, k_size,
                    c_size, p_size, output_size)
a

2300.2960952125304

In [86]:
%%time
a = forwardPropagate(X[0], theta1, b1, theta2, b2, num_units, k_size,
                    c_size, p_size, output_size)
for i in a:
    print(i.shape)

(20, 20)
(5, 16, 16)
(5, 16, 16)
(5, 16, 16)
(5, 8, 8)
(10, 1)
(10, 1)
CPU times: user 28.4 ms, sys: 12.6 ms, total: 41 ms
Wall time: 29.5 ms


In [43]:
def tencode(theta1, theta2, b1, b2):
    return np.concatenate((theta1.flatten(), theta2.flatten(), b1.flatten(), b2.flatten()))
def tdecode(params, krow, kcol, hrow, hcol, num_units, output_size):
    theta1 = params[: krow * kcol * num_units].reshape((num_units, krow, kcol)) # (5, 5, 5)
    theta2 = params[krow * kcol * num_units: 
                   krow * kcol * num_units + 
                    hrow * hcol * output_size * num_units].reshape((num_units, output_size, hrow, hcol)) # (5, 10, 8,8)
    b1 = params[-num_units - output_size: - output_size].reshape((num_units, 1))
    b2 = params[-output_size:].reshape((output_size, 1))
    return theta1, theta2, b1, b2
tt1, tt2, tb1, tb2 = tdecode(weights,k_size, k_size, p_size, p_size, num_units, output_size)
tt1.shape, tt2.shape, tb1.shape, tb2.shape



((5, 5, 5), (5, 10, 8, 8), (5, 1), (10, 1))

In [44]:
def tconvolution(Xi, kernal, xrow, xcol, krow, kcol, rrow, rcol): #(20, 20) (5, 5)
    xrow, xcol = Xi.shape
    krow, kcol = kernal.shape
    rrow, rcol = xrow - krow + 1, xcol - kcol + 1
    res = np.zeros((rrow, rcol))    
    for i in range(rrow):
        for j in range(rcol):
            res[i][j] = np.sum(Xi[i:i+krow, j:j+kcol] * kernal)
    return res # (16, 16)

def tmaxPooling(conv):
    # use 2*2 pooling
    row, col = conv.shape
    res = np.zeros((row // 2, col // 2))
    pooling_grad = np.zeros(conv.shape)
    for i in range(0, row, 2):
        for j in range(0, col, 2):
            m = conv[i, j]
            mr, mc = i, j
            for p in range(i, i + 2):
                for q in range(j, j + 2):
                    if conv[p, q] > m:
                        m = conv[p, q]
                        mr, mc = p, q
            res[i // 2, j // 2] = m
            pooling_grad[mr, mc] = 1
    return res, pooling_grad

def tforwardPropagate(Xi, theta1, theta2, b1, b2, num_units, inrow, incol, krow, 
                     kcol, conrow, concol, hrow, hcol):
    a1 = Xi.reshape(inrow, incol) # (20, 20)
    z2 = np.zeros((num_units, conrow, concol)) # (5, 16, 16) 
    a2 = np.zeros((num_units, conrow, concol)) # (5, 16, 16)
    pooling_grad = np.zeros((num_units, conrow, concol)) # (5, 16, 16)
    a3 = np.zeros((num_units, hrow, hcol)) # (5, 8, 8) z3 = a3
    z4 = np.zeros((output_size, 1)) # (10, 1)
    a4 = np.zeros((output_size, 1)) # (10, 1)
    
    for i in range(num_units):
        z2[i] = tconvolution(a1, theta1[i], inrow, incol, krow, 
                            kcol, conrow, concol) + b1[i] # (16, 16)
        a2[i] = sigmod(z2[i]) # (16, 16)
        a3[i], pooling_grad[i] = tmaxPooling(a2[i]) # (8, 8) (16, 16)
        for j in range(output_size):
            z4[j] += np.sum(a3[i] * theta2[i,j])
    for i in range(output_size):
        z4[i] += b2[i]
    a4 = sigmod(z4)
    return a1, z2, a2, pooling_grad, a3, z4, a4

b = tforwardPropagate(X[0], tt1, tt2, tb1, tb2, num_units, in_size, in_size, k_size, k_size, c_size, c_size, p_size, p_size)


In [93]:
a[5] == b[5]

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False]])

In [62]:
b[5]

array([[-0.40618021],
       [-0.15752495],
       [ 0.57749211],
       [-0.17013181],
       [ 0.05604191],
       [-0.47571607],
       [ 1.31990801],
       [ 0.24938998],
       [ 1.16883429],
       [-0.04913199]])

## 11 Predict

In [None]:
def predict(X, theta1, b1, theta2, b2, num_units, krow, kcol,
                    crow, ccol, prow, pcol, output_size):
    *t, h = forwardPropagate(X, theta1, b1, theta2, b2, num_units, krow, kcol,
                    crow, ccol, prow, pcol, output_size)
    return np.argmax(h) + 1

## 12 Comupte accuracy

In [None]:
def computeAccuracy(X, y, theta1, b1, theta2, b2, num_units, krow, kcol,
                    crow, ccol, prow, pcol, output_size):
    m = X.shape[0]
    correct = 0
    for i in range(m):
        ans = predict(X[i], theta1, b1, theta2, b2, num_units, krow, kcol,
                    crow, ccol, prow, pcol, output_size)
        correct += ans == y[i]
    return f"m:{m} correct:{correct} accuracy:{100 * correct / m}%"
#computeAccuracy(X, y, theta1, b1, theta2, b2, num_units, krow, kcol,
#                    crow, ccol, prow, pcol, output_size)


### The accuracy in all data

In [None]:
%%time
computeAccuracy(X, y, theta1, b1, theta2, b2, num_units, krow, kcol,
                    crow, ccol, prow, pcol, output_size)

## 13 Sigmod gradient

In [None]:
def sigmodGradient(z):
    t = expit(z)
    return t * (1 - t)

## 14 Backpropagation

In [150]:
def backPropagate(weights,  X, num_units, k_size,
                    c_size, pf_size, p_size, output_size, lam=0.):
    m = X.shape[0]
    theta1, theta2, b1, b2 = decode(weights, num_units, 
                                    k_size, p_size, output_size)
    J = 0.
    
    theta1_grad = np.zeros(theta1.shape) # (5, 5, 5)
    b1_grad = np.zeros(b1.shape) # (5, 1)
    theta2_grad = np.zeros(theta2.shape) # (5, 8, 10, 10)
    b2_grad = np.zeros(b2.shape) # (10, 1)
    
    for i in range(m):
        a1, z2, a2, pooling_grad, a3, z4, a4 = forwardPropagate(X[i], 
                    theta1, b1, theta2, b2, num_units, k_size,
                    c_size, p_size, output_size)
        J += -np.sum(y[i] * np.log(a4) + 
                    (1 - y[i]) * np.log(1 - a4)) # cost
        print(J)
        dt2 = a4 - y[i] # (10, 1)
        b2_grad += dt2 # (10, 1)
        temp_dt2 = dt2.reshape((1, output_size)) # (1, 10)
        temp_grad = a3.reshape((num_units * p_size * p_size, 1)) * temp_dt2  # (5*8*8, 10)
        theta2_grad += temp_grad.reshape((num_units, p_size, p_size, output_size))

        
        temp = theta2.reshape((num_units * p_size * p_size, output_size
                               )) @ dt2
        temp = temp.reshape((num_units, p_size, p_size))
        temp2 = np.zeros((num_units, c_size, c_size)) # (5, 16, 16)
        
        for j in range(num_units): #
            for p in range(0, c_size, pf_size):
                for q in range(0, c_size, pf_size):
                    temp2[j,p:p+pf_size,q:q+pf_size] = temp[j,p//pf_size,q//pf_size]
                    
        dt1 = temp2 * pooling_grad * z2 * (1 - z2) # (5, 16, 16)
        
        for j in range(num_units):
            b1_grad[j] = np.sum(dt1[j])
            for p in range(k_size):
                for q in range(k_size):
                    theta1_grad[j,p,q] += np.sum(dt1[j] * a1[p:p+c_size,q:q+c_size])
    
    J /= m
    theta1_grad /= m
    b1_grad /= m
    theta2_grad /=m
    b2_grad /= m
    
    #Regulation
        
    J += (float(lam) / (2 * m)) * (np.sum(theta1 ** 2) + np.sum(theta2 ** 2))
    theta1_grad += theta1 * lam / m
    theta2_grad += theta2 * lam / m
    
    return J, encode(theta1, b1, theta2, b2)
J, grad = backPropagate(weights,tr_X[:1], num_units, k_size,
                        c_size, pf_size, p_size, output_size)
J

2300.2960952125304


2300.2960952125304

NameError: name 'krow' is not defined

## 15 Gradient checking

In [151]:
def checkGradient(weights, X, num_units, k_size,
                    c_size, pf_size, p_size, output_size, lam=0.):
    eps = 1e-4
    n = len(weights)
    J, grad = backPropagate(weights, X, num_units, k_size,
                        c_size, pf_size, p_size, output_size)
    print(J)
    for i in range(10):
        x = random.randint(0, n - 1)
        epsvec = np.zeros(n)
        epsvec[x] = eps
        cost_high, t = backPropagate(weights + epsvec, X, num_units, k_size,
                        c_size, pf_size, p_size, output_size)
        cost_low, t = backPropagate(weights - epsvec, X, num_units, k_size,
                        c_size, pf_size, p_size, output_size)
        num_grad = (cost_high - cost_low) / (2 * eps)
        print(f"Element:{x} Num grad = {num_grad} BP grad = {grad[x]}")

In [152]:
%%time
checkGradient(weights, X[:1], num_units, k_size,
                    c_size, pf_size, p_size, output_size, 1.)

2300.2960952125304
2300.2960952125304
2300.306797223025
2300.285393282415
Element:2566 Num grad = 107.01970305035502 BP grad = 0.03433589467208856
2300.303047301531
2300.289143277596
Element:2306 Num grad = 69.52011967541694 BP grad = -0.029535759210391033
2300.303846912844
2300.2883436960765
Element:1744 Num grad = 77.51608383841813 BP grad = 0.0019928855592327988
2300.3019672715977
2300.2902233207487
Element:2837 Num grad = 58.719754244975775 BP grad = 0.11424388560255339
2300.302580655592
2300.2896099507243
Element:279 Num grad = 64.85352433855951 BP grad = -0.10762683928961711
2300.301934103046
2300.290256505109
Element:3255 Num grad = 58.38798968397896 BP grad = -0.0625046525154217
2300.3080309551956
2300.284159610813
Element:990 Num grad = 119.35672191384583 BP grad = -0.10171301664518845
2300.308153311204
2300.284037215896
Element:2449 Num grad = 120.58047653908943 BP grad = -0.07828921373543868
2300.3058961042275
2300.28629441587
Element:830 Num grad = 98.00844178698753 BP grad

In [113]:
a = np.array([1, 2, 3, 4])
b = np.ones((1, 4))
a * b

array([[1., 2., 3., 4.]])

In [154]:
!curl www.google.com

<html>
<head>
 <title>500 Internal Privoxy Error</title>
 <link rel="shortcut icon" href="http://config.privoxy.org/error-favicon.ico" type="image/x-icon"></head>
<body>
<h1>500 Internal Privoxy Error</h1>
<p>Privoxy encountered an error while processing your request:</p>
<p><b>Could not load template file <code>no-server-data</code> or one of its included components.</b></p>
<p>Please contact your proxy administrator.</p>
<p>If you are the proxy administrator, please put the required file(s)in the <code><i>(confdir)</i>/templates</code> directory.  The location of the <code><i>(confdir)</i></code> directory is specified in the main Privoxy <code>config</code> file.  (It's typically the Privoxy install directory).</p>
</body>
</html>
