In [127]:
import numpy as np
import pandas as pd
def show_head(data): return pd.DataFrame(data).head()
from __future__ import division

Example: $X \rightarrow \hat{Y}$ where $X \in \mathbb{R}^{10}$ and $Y, \hat{Y} \in \mathbb{R}^{3}$

In [39]:
n_obs, n_attr, y_dim = 300, 10, 3

In [50]:
X = np.random.normal(size=(n_obs, n_attr))
labels = ['x%i'%(i+1) for i in range(n_attr)]
pd.DataFrame(X, columns=labels).head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10
0,-0.873972,-0.508249,-0.164328,-0.385385,2.05062,-0.197253,-1.169218,-1.03284,0.516665,0.605408
1,0.906717,1.124489,-0.497818,1.11825,-0.456531,1.86817,-0.569813,-1.51004,-1.998865,-0.768858
2,0.204375,0.427753,-0.830439,0.584823,-2.238915,-0.959731,-0.1775,0.724976,0.98541,0.075389
3,-1.49864,0.079875,0.45931,1.070876,1.713725,1.538438,-0.169546,0.22995,-0.664938,-2.320227
4,-0.32367,-0.571552,1.205599,1.27448,0.584872,-2.165623,0.492239,-0.038556,-0.403038,-0.940987


In [59]:
Y = np.random.normal(size=(n_obs, y_dim))
labels = ['y%i'%(i+1) for i in range(y_dim)]
pd.DataFrame(Y, columns=labels).head()

Unnamed: 0,y1,y2,y3
0,0.93038,-2.276331,1.346365
1,-0.570485,-0.953893,0.618086
2,0.193665,1.784209,-0.541683
3,1.239913,0.880638,0.011122
4,-0.699231,1.948209,0.766983


## With NumPy

In [55]:
W = np.random.normal(size=(n_attr, y_dim))
b = np.random.normal(size=(y_dim,))
Y_pred = np.dot(X, W) + b
pd.DataFrame(Y_pred).head()

Unnamed: 0,0,1,2
0,-3.067476,-6.160288,-1.66345
1,-2.236858,5.705564,11.82366
2,-2.432078,0.432405,0.033637
3,2.310404,0.918866,1.196588
4,-6.564663,-1.050074,-2.371812


In [75]:
loss = ((Y - Y_pred)**2).ravel().mean()
print 'Loss:', loss

Loss: 15.354680543742713


## With tensorflow

In [76]:
import tensorflow as tf

In [105]:
X_ph = tf.placeholder(tf.float32, shape=[None, n_attr])
Y_ph = tf.placeholder(tf.float32, shape=[None, y_dim])

W_tf = tf.Variable(tf.random.normal(shape=[n_attr, y_dim], stddev=1))
b_tf = tf.Variable(tf.random.normal(shape=[y_dim], stddev=1))

Y_pred_tf = tf.matmul(X_ph, W_tf) + b
loss_tf = tf.reduce_mean((Y_pred_tf - Y_ph)**2)

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    inputs = {X_ph: X, Y_ph: Y}
    outputs = [loss_tf, Y_pred_tf]
    loss_val, ypred = session.run(outputs, feed_dict=inputs)

In [108]:
pd.DataFrame(ypred).head()

Unnamed: 0,0,1,2
0,-1.596406,-0.189595,2.490952
1,-5.221771,0.621098,-1.117203
2,1.652274,1.918136,1.571005
3,-3.568698,3.566676,-2.138563
4,-3.141964,-4.376909,-1.213816


In [107]:
print 'Loss:', loss_val

Loss: 12.082802


### TF with Gradients

In [109]:
import tensorflow as tf

X_ph = tf.placeholder(tf.float32, shape=[None, n_attr])
Y_ph = tf.placeholder(tf.float32, shape=[None, y_dim])

W_tf = tf.Variable(tf.random.normal(shape=[n_attr, y_dim], stddev=1))
b_tf = tf.Variable(tf.random.normal(shape=[y_dim], stddev=1))

Y_pred_tf = tf.matmul(X_ph, W_tf) + b
loss_tf = tf.reduce_mean((Y_pred_tf - Y_ph)**2)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss_tf)
n_epochs = 100

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    inputs = {X_ph: X, Y_ph: Y}
    outputs = [train_op, Y_pred_tf]
    for i in range(n_epochs):
        _, ypred = session.run(outputs, feed_dict=inputs)

In [118]:
show_head(ypred)

Unnamed: 0,0,1,2
0,-2.159438,0.511755,0.890475
1,-1.949515,0.071186,0.585485
2,-1.499175,0.609335,1.028393
3,-1.933118,-0.015615,0.9701
4,-2.036771,0.518409,1.24355


## With PyTorch

In [170]:
import torch
import torch.nn as nn
import torch.optim as optim

In [191]:
# Parameters to tune
W = nn.Parameter(torch.randn(n_attr, y_dim)).double()
b = nn.Parameter(torch.randn(y_dim)).double()

In [175]:
# Set optimizer, here Stochastic Gradient Descent
# with 0.1 as learning rate
optimizer = optim.SGD([W, b], lr=0.1)
n_epochs = 25

In [192]:
for _ in range(n_epochs):
    # Reset gradient
    optimizer.zero_grad() 
    
    # Perform matmul
    Y_pred = torch.matmul(torch.from_numpy(X), W) + b
    
    # Compute loss
    loss = torch.mean((Y_pred - torch.from_numpy(Y)**2))
    
    # Backpropagate
    loss.backward()
    
    # Tune parameters: W, b
    optimizer.step()

In [205]:
ypred_np = Y_pred.data.numpy()
((ypred_np - Y)**2).mean()

## Keras and MNIST

In [129]:
mnist = tf.keras.datasets.mnist

(x_train,y_train), (x_test, y_test) = mnist.load_data()

In [131]:
# Normalize pixel values
x_train = x_train / 255
x_test = x_test / 255

In [135]:
img_shape = x_train.shape[1:]

In [137]:
# Hyper parameters
params = [
    tf.keras.layers.Flatten(input_shape=img_shape),
    tf.keras.layers.Dense(512, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)]

In [138]:
# Define model
m = tf.keras.models.Sequential(params)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [141]:
# Check if the model compiles
m.compile(optimizer='adam', 
          loss='sparse_categorical_crossentropy',
          metrics=['accuracy'])

In [142]:
# Run model
m.fit(x_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x152d92650>

In [143]:
m.evaluate(x_test, y_test)



[0.06668182282214985, 0.9801]

The model produces a probability disitribution when predicting, due to the softmax. This means the output for each prediction is a vector of size 10, with each entry being the probability for that index being the correct one.

In [147]:
y_pred = m.predict(x_test)

In [155]:
# Ex, first entry
print 'y_pred size:', y_pred[0].shape
print 'sum(y_pred):', y_pred[0].sum()

y_pred size: (10,)
sum(y_pred): 1.0000001


In [158]:
print 'argmax(pred):', y_pred[0].argmax()
print 'ground_truth:', y_test[0]

argmax(pred): 7
ground_truth: 7
