# Data Preparation (MNIST)

In [1]:
import os
# Directory holding mnist.pkl.gz. Set the MNIST_DATA_DIR environment variable to run
# this notebook on another machine; the original local path is kept as the fallback.
DATA_DIR = os.environ.get(
    "MNIST_DATA_DIR",
    "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/BASIC_TOPICS/ML_GENERAL/PYTHON_IMPL/DATA",
)
os.chdir(DATA_DIR)

In [2]:
import cPickle, gzip
import numpy as np
# Load the pickled (train, dev, test) splits. The `with` block closes the archive even
# if unpickling raises (GzipFile supports the `with` statement from Python 2.7 on).
# NOTE: unpickling can execute arbitrary code -- only load a mnist.pkl.gz you trust.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    data_train, data_dev, data_test = cPickle.load(f)

In [6]:
import theano
import theano.tensor as T

In [14]:
def shared_dataset(data):
    """Wrap one (X, Y) data split in Theano shared variables.

    Args:
        data: a pair (X, Y); each element is converted with np.asarray.

    Returns:
        A pair (sharedX, labels): sharedX is a shared variable holding X as
        theano.config.floatX; labels is the result of casting the shared
        floatX copy of Y to 'int32'.
    """
    features, labels = data
    floatType = theano.config.floatX
    sharedFeatures = theano.shared(np.asarray(features, dtype=floatType))
    sharedLabels = theano.shared(np.asarray(labels, dtype=floatType))
    # Labels are stored as floatX but handed back through an int32 cast,
    # since they are class labels and are used as integers downstream.
    return sharedFeatures, T.cast(sharedLabels, 'int32')

In [15]:
# SPLIT DATA AS SHARED (FOR PUBLIC ACCESS LATER)
# One shared (X, Y) pair per split, built in train / dev / test order.
(X_train, Y_train), (X_dev, Y_dev), (X_test, Y_test) = map(
    shared_dataset, (data_train, data_dev, data_test))

In [16]:
print X_train.shape.eval(), Y_train.shape.eval()
print X_dev.shape.eval(), Y_dev.shape.eval()
print X_test.shape.eval(), Y_train.shape.eval()

[50000   784] [50000]
[10000   784] [10000]
[10000   784] [50000]


In [26]:
# ACCESS A BATCH
batchSize = 500
thirdBatchX = X_train[2*batchSize : 3*batchSize]
thirdBatchY = Y_train[2*batchSize : 3*batchSize]
print thirdBatchX.shape.eval(), thirdBatchY.shape.eval()
thirdBatchY[1].eval()

[500 784] [500]


array(7, dtype=int32)

# Learning

### A. Loss Functions

NB: $k$ can be interpreted as the index for *label*, $i$ as the index for *data point*.

**a. Zero-One Loss (nondifferentiable)**
* $ L = \sum_{i=0}^{|D|-1} I_{f(x_i)\neq y_i} $, where $ f(x) = \arg\max_k P(y=k|x,\theta) $

**b. Negative Log-Likelihood Loss (differentiable)**
* $ L = -\sum_{i=0}^{|D|-1} \log P(Y = y_i | x_i, \theta) $, where $P \in [0,1] \Rightarrow \log(P) \in [-\infty,0] \Rightarrow -\log(P) \in [0, \infty]$

In [None]:
# ZERO-ONE LOSS
# Counts the data points whose predicted label differs from the true label y.
# Assumes pYgivenX is laid out as [data point, label] (the NLL cell indexes it that way).
# axis=1 takes the most probable label per data point; without an axis, argmax
# runs over the flattened matrix and yields a single index instead of one per row.
zeroOneLoss = T.sum(T.neq(T.argmax(pYgivenX, axis=1), y))

In [None]:
# NEG LL LOSS
# Sums -log P(correct label) over all data points. The probability matrix is named
# pYgivenX, matching the zero-one loss cell (it was misspelled pYgiveX here).
nll = -T.sum(T.log(pYgivenX)[T.arange(y.shape[0]), y]) # retrieve the logP of the correct y labels.
    # vector-indexing: pairing a row-index vector with a column-index vector
    # picks one entry per (row, column) pair, e.g.
    #  m = np.array(range(12)).reshape(3,4)
    #  array([[ 0,  1,  2,  3],
    #         [ 4,  5,  6,  7],
    #         [ 8,  9, 10, 11]])
    #  m[[0,1,2],[0,1,2]]
    #  array([ 0,  5, 10]) <= m[0,0], m[1,1], m[2,2]

### B. Learning Algorithms

**a. Vanilla Gradient Descent**
* Update: $ \theta_{k+1} = \theta_k - \eta\frac{\partial C(\theta_k)}{\partial \theta_k} $
* Cycle: Grand update after each run of the entire dataset. 

**b. Single Stochastic Gradient Descent**
* Update: $ \theta_{k+1} = \theta_k - \eta\frac{\partial L(\theta_k, data_i)}{\partial\theta_k} $
* Cycle: Update for each data point.

**c. Batch Stochastic Gradient Descent**
* Update: $ \theta_{k+1} = \theta_k - \eta\frac{1}{m}\sum_{i=0}^{m-1}\frac{\partial L(\theta_k, data_i)}{\partial\theta_k} $
* Cycle: Update (w/ average gradient) for each batch of data points.

# Optimization

### A. Momentum

* Tuning: Controlling the extent to which each update keeps moving along the previous update's direction (inertia).
* $ \Delta\theta_{k+1} = \alpha\Delta\theta_k + (1-\alpha)\frac{\partial L(\theta_k,data_i)}{\partial\theta_k} $

### B. Regularization

* Tuning: Controlling the magnitude of the weights to avoid overfitting.
* $ L = L + \lambda\parallel\theta\parallel_p^p $, where $\parallel\theta\parallel_p = (\sum_{j=0}^{|\theta|-1} |\theta_j|^p)^\frac{1}{p}$
* L1 / L2 Regularization: $p = 1$ / $p = 2$

### C. Early Stopping

* Tuning: Avoid overfitting by stopping when the model's performance ceases to improve sufficiently (by some threshold) on the development set.

# I/O

In [46]:
# SAVE
# Write W and then B to the same file. The `with` block flushes and closes the file
# even if a dump raises, so the pickles are fully on disk afterwards.
with open('path', 'wb') as saveFile:
    cPickle.dump(W.get_value(borrow=True), saveFile, cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(B.get_value(borrow=True), saveFile, cPickle.HIGHEST_PROTOCOL)
    # borrow=True: occupied memory is available for use (more efficient when keeping original space is not necessary).
    # cPickle.HIGHEST_PROTOCOL (same as passing -1): newest pickle protocol, more efficient storage than the default.

In [None]:
# READ
# Restore W and then B, in the same order the SAVE cell dumped them.
# The file is opened in binary mode ('rb') because the pickles were written in binary
# mode with a binary protocol; the `with` block closes it even if loading fails.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('path', 'rb') as saveFile:
    W.set_value(cPickle.load(saveFile), borrow=True)
    B.set_value(cPickle.load(saveFile), borrow=True)