In [1]:
import tensorflow as tf
import keras
import numpy as np

In [2]:
# Instantiate a Keras fully connected layer with 512 units and a ReLU activation.
x = keras.layers.Dense(512, activation='relu')

In [3]:
# Display the concrete class of the layer object bound to x.
type(x)

keras.layers.core.dense.Dense

In [4]:
# Rank-3 integer tensor made of two 3x3 matrices.
# Note: the name `input` shadows the Python builtin of the same name in this notebook.
input = np.array([
    [[-3, 3, -3],
     [3, -3, 3],
     [3, 3, -3]],
    [[4, 4, 4],
     [4, 4, 4],
     [4, 4, 4]],
])

# Cell output: the tensor's shape.
input.shape

(2, 3, 3)

In [5]:
def naive_relu(x):
    """Element-wise ReLU of a rank-2 array, written with explicit Python loops.

    Works on a copy, so the argument is left unchanged.

    Raises:
        AssertionError: if ``x`` is not rank 2.
    """
    assert len(x.shape) == 2
    result = x.copy()
    n_rows, n_cols = result.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # Clamp negative entries to zero, keep everything else.
            result[row, col] = max(result[row, col], 0)
    return result

In [6]:
def naive_add(x, y):
    """Element-wise sum of two rank-2 arrays of identical shape, using explicit loops.

    Both arguments are copied first, so neither input is modified.

    Raises:
        AssertionError: if ``x`` is not rank 2 or the shapes differ.
    """
    assert len(x.shape) == 2
    assert x.shape == y.shape

    total, addend = x.copy(), y.copy()
    n_rows, n_cols = total.shape[0], total.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            total[row, col] = total[row, col] + addend[row, col]

    return total

In [7]:
# Random operands for the broadcasting example: a (32, 10) matrix and a length-10 vector.
X = np.random.random(size=(32, 10))
y = np.random.random(size=(10,))

In [8]:
# Prepend a length-1 axis so y goes from shape (10,) to (1, 10).
# Note: y is rebound from its own previous value, so re-running this cell adds another axis.
y = np.expand_dims(y, axis=0)
y.shape

(1, 10)

In [9]:
# Repeat y 32 times along axis 0 so that Y has the same shape as X, (32, 10).
Y = np.concatenate([y] * 32, axis=0)
Y.shape

(32, 10)

In [10]:
def naive_add_matrix_vector(x,y):
    """Add a rank-1 array ``y`` to every row of a rank-2 array ``x`` using explicit loops.

    Returns a modified copy of ``x``; the argument itself is not changed.

    Raises:
        AssertionError: if ``x`` is not rank 2, ``y`` is not rank 1, or the
            row length of ``x`` differs from the length of ``y``.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 1
    assert x.shape[1] == y.shape[0]

    out = x.copy()
    # Visit every (row, col) position in row-major order.
    for row, col in np.ndindex(out.shape[0], out.shape[1]):
        out[row, col] += y[col]

    return out

In [11]:
# Broadcasting demo: np.maximum combines x of shape (64, 3, 32, 10) with y of shape
# (32, 10); the result has the shape of x.
x = np.random.random((64, 3, 32, 10))
y = np.random.random((32, 10))
z = np.maximum(x,y)
z.shape

(64, 3, 32, 10)

In [12]:
def naive_vector_dot(x,y):
    """Dot product of two rank-1 arrays, written with an explicit Python loop.

    Args:
        x: rank-1 array.
        y: rank-1 array with the same length as ``x``.

    Returns:
        The sum of the element-wise products (``0.`` for empty vectors).

    Raises:
        AssertionError: if either argument is not rank 1 or the lengths differ.
    """
    # Same precondition style as the other naive_* helpers. Without the length
    # check a longer ``y`` would be silently truncated to the length of ``x``.
    assert len(x.shape) == 1
    assert len(y.shape) == 1
    assert x.shape[0] == y.shape[0]

    z = 0.
    for i in range(x.shape[0]):
        z += x[i] * y[i]
    return z

In [13]:
# Matrix product of a (5, 4) array with a (4, 6) array; the printed result has 5 rows and 6 columns.
x = np.random.random((5,4))
y = np.random.random((4,6))

print(np.matmul(x,y))

[[0.10314316 0.41923467 0.50320051 0.72602344 0.27777207 0.17651648]
 [0.16083233 0.53805927 0.67390177 0.97285118 0.31434391 0.29593489]
 [0.25718475 0.69835304 0.92038317 0.49299886 0.44024203 0.15946456]
 [0.24948926 1.0610131  1.38097855 1.17197083 0.84085289 0.17317685]
 [0.25351602 0.81330438 0.91775186 0.90994554 0.49426909 0.21996989]]


In [15]:
def naive_matrix_vector_dot(x, y):
    """Matrix-vector product computed with explicit Python loops.

    Args:
        x: rank-2 array or nested list.
        y: rank-1 array or list whose length equals the number of columns of ``x``.

    Returns:
        A rank-1 NumPy array with one entry per row of ``x``; entry ``i`` is the
        sum over ``j`` of ``x[i, j] * y[j]``.

    Raises:
        AssertionError: if the ranks are wrong or the inner dimensions differ.
    """
    # Accept plain (nested) lists as well as arrays: lists have no ``.shape`` and
    # do not support ``x[i, j]`` indexing, so normalise both operands first.
    x = np.asarray(x)
    y = np.asarray(y)

    assert len(x.shape) == 2
    assert len(y.shape) == 1
    assert x.shape[1] == y.shape[0]
    z = np.zeros(x.shape[0])

    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            z[i] += x[i, j] * y[j]

    return z

In [16]:
def naive_matrix_dot(x, y):
    """Matrix-matrix product built from row-by-column dot products.

    Args:
        x: rank-2 array of shape (n, k).
        y: rank-2 array of shape (k, m).

    Returns:
        A float NumPy array of shape (n, m) where entry ``[i, j]`` is the dot
        product of row ``i`` of ``x`` with column ``j`` of ``y``.

    Raises:
        AssertionError: if either argument is not rank 2 or the inner
            dimensions differ.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 2
    assert x.shape[1] == y.shape[0]

    # np.zeros takes the shape as ONE tuple argument. Passing the two sizes as
    # separate positional arguments makes NumPy read the second as a dtype,
    # which raises TypeError.
    z = np.zeros((x.shape[0], y.shape[1]))

    for i in range(x.shape[0]):
        for j in range(y.shape[1]):
            z[i, j] = naive_vector_dot(x[i, :], y[:, j])

    return z

In [20]:
# Linear Transforms

from math import cos, sin

def counter_clockwise_rotate(vector, theta):
    """Rotate a 2-D vector counter-clockwise by ``theta`` radians.

    Args:
        vector: length-2 array or list.
        theta: rotation angle in radians (it is fed to ``math.cos`` / ``math.sin``).

    Returns:
        The rotated vector, as returned by ``naive_matrix_vector_dot``.
    """
    # naive_matrix_vector_dot reads ``.shape`` and indexes with ``[i, j]``, so the
    # rotation matrix (and the vector) must be NumPy arrays, not nested lists.
    R = np.array([[cos(theta), -sin(theta)],
                  [sin(theta), cos(theta)]])

    return naive_matrix_vector_dot(R, np.asarray(vector))

def scale(horiz_factor, vert_factor, vector):
    """Scale a 2-D vector independently along each axis.

    Args:
        horiz_factor: multiplier applied to the first component.
        vert_factor: multiplier applied to the second component.
        vector: length-2 array or list.

    Returns:
        The scaled vector, as returned by ``naive_matrix_vector_dot``.
    """
    # naive_matrix_vector_dot reads ``.shape`` and indexes with ``[i, j]``, so the
    # diagonal scaling matrix (and the vector) must be NumPy arrays, not nested lists.
    S = np.array([[horiz_factor, 0],
                  [0, vert_factor]])

    return naive_matrix_vector_dot(S, np.asarray(vector))

In [21]:
# Affine transforms: a linear transform followed by a translation (W·x + b)

In [22]:
def sgd_momentum(current_parameters, past_velocity, momentum, learning_rate):
    """Sketch of gradient descent with momentum.

    Reads the 'w', 'loss' and 'gradient' entries of ``current_parameters`` on every
    iteration, computes a momentum-adjusted weight and passes it to
    ``update_parameters``. Loops while the loss is above 0.01; returns nothing.

    NOTE(review): ``update_parameters`` is not defined anywhere in the visible
    notebook, and nothing in this function modifies ``current_parameters``.
    Unless that helper refreshes the mapping, 'loss' never changes and the loop
    does not terminate — confirm before running.
    """
    loss = current_parameters['loss']
    while loss > 0.01:
        w = current_parameters['w']
        loss = current_parameters['loss']
        gradient = current_parameters['gradient']

        # The velocity blends the previous step (scaled by momentum) with the current gradient step.
        velocity = past_velocity * momentum - learning_rate * gradient
        # The weight moves by the momentum-scaled velocity plus a plain gradient step.
        w = w + momentum * velocity - learning_rate * gradient
        past_velocity = velocity
        update_parameters(w)

In [25]:
def grad(x,y):
    """Placeholder with an empty body; always returns None.

    Presumably stands for "gradient of the first argument with respect to the
    second", judging by how chain_diff calls it — it computes nothing.
    """
    pass

# f, g, h and j are identity functions standing in for the stages of a composed function.
def f(x):
    return x

def g(x):
    return x

def h(x):
    return x

def j(x):
    return x

def fg(x):
    """Composition of two stages: f(g(x))."""
    return f(g(x))

def fghj(x):
    """Composition of four stages: f(g(h(j(x))))."""
    return f(g(h(j(x))))

def chain_diff(x):
    """Write out the chain rule for the composition ``fghj`` as an equality.

    NOTE(review): this is notation rather than working code. The result of the
    ``==`` comparison is discarded, so the function returns None; and because
    ``grad`` is a stub that returns None, the product on the right-hand side
    raises TypeError if this function is actually called.
    """
    y = fghj(x)
    # d(y)/d(x) expressed as the product of the per-stage derivatives.
    grad(y, x) == grad(y, g(h(j(x)))) * grad(g(h(j(x))), h(j(x))) * grad(h(j(x)), j(x)) * grad(j(x), x)

In [26]:
# Scalar example: differentiate y = 2x + 3 with respect to the variable x.
x = tf.Variable(0.)

# The computation of y happens inside the tape context so it can be differentiated afterwards.
with tf.GradientTape() as tape:
    y = 2 * x + 3

grad_of_y_wrt_x = tape.gradient(y, x)

print(grad_of_y_wrt_x)

Metal device set to: Apple M1 Max

systemMemory: 64.00 GB
maxCacheSize: 24.00 GB

tf.Tensor(2.0, shape=(), dtype=float32)


2023-07-04 10:21:44.569990: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-07-04 10:21:44.570407: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [27]:
# Same computation with a 2x2 variable: the gradient has the same shape as x.
x = tf.Variable(tf.random.uniform((2,2)))

with tf.GradientTape() as tape:
    y = 2 * x + 3

grad_of_y_wrt_x = tape.gradient(y, x)

print(grad_of_y_wrt_x)

tf.Tensor(
[[2. 2.]
 [2. 2.]], shape=(2, 2), dtype=float32)


In [32]:
# Affine layer example: y = x @ W + b with trainable W and b; x is a plain tensor.
W = tf.Variable(tf.random.uniform((2,2)))
b = tf.Variable(tf.zeros((2,)))
x = tf.random.uniform((2,2))

with tf.GradientTape() as tape:
    y = tf.matmul(x, W) + b

# Passing a list of variables yields a list of gradients, one per variable, in the same order.
grad_of_y_wrt_W_and_b = tape.gradient(y, [W,b])
print(grad_of_y_wrt_W_and_b)

[<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.2833612, 0.2833612],
       [1.2526826, 1.2526826]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 2.], dtype=float32)>]


In [40]:
# Load the MNIST train/test splits bundled with Keras.
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

# Flatten each 28x28 image into a 784-element float32 row and divide the pixel values by 255.
train_images = train_images.reshape((60000, 28 * 28)).astype('float32') / 255.
test_images = test_images.reshape((10000, 28 * 28)).astype('float32') / 255. 

In [41]:
# Two-layer classifier: a 512-unit ReLU layer followed by a 10-unit softmax layer.
# NOTE(review): the container comes from the standalone `keras` import while the layers
# come from `tf.keras`; presumably both resolve to the same Keras install — confirm.
model = keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [42]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss,
# and accuracy as the reported metric.
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [43]:
# Train the Keras model for 5 epochs using mini-batches of 128 samples.
model.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5


2023-07-04 10:45:47.724494: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-07-04 10:45:47.995650: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x286eb9940>

In [44]:
class NaiveDense:
    """Minimal dense layer computing ``activation(inputs @ w + b)``.

    Attributes set by the constructor:
        activation: callable applied to the affine output.
        w: tf.Variable of shape (input_size, output_size), initialised with
           uniform random values between 0 and 0.1.
        b: tf.Variable of shape (output_size,), initialised to zeros.
    """

    def __init__(self, input_size, output_size, activation) -> None:
        self.activation = activation

        # Kernel: small uniform random starting values.
        self.w = tf.Variable(
            tf.random.uniform((input_size, output_size), minval=0., maxval=1e-1)
        )

        # Bias: one zero per output unit.
        self.b = tf.Variable(tf.zeros((output_size, )))

    def __call__(self, inputs):
        """Apply the affine transform followed by the activation."""
        pre_activation = tf.matmul(inputs, self.w) + self.b
        return self.activation(pre_activation)

    @property
    def weights(self):
        """The layer's variables as a list: kernel first, then bias."""
        kernel, bias = self.w, self.b
        return [kernel, bias]

In [51]:
class NaiveSequential:
    """Chain of callable layers applied one after another."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, inputs):
        """Feed ``inputs`` through every layer in order and return the final output."""
        activations = inputs
        for apply_layer in self.layers:
            activations = apply_layer(activations)
        return activations

    @property
    def weights(self):
        """Flat list of every layer's weights, in layer order."""
        return [weight for layer in self.layers for weight in layer.weights]

In [52]:
# 784 -> 512 -> 10 network assembled from the hand-written layer classes.
# Note: this rebinds `model`, replacing the Keras model bound to that name earlier in the notebook.
model = NaiveSequential([
    NaiveDense(input_size=28 * 28, output_size=512, activation=tf.nn.relu),
    NaiveDense(input_size=512, output_size=10, activation=tf.nn.softmax)
])

# Two layers, each contributing a kernel and a bias.
len(model.weights)

4

In [53]:
import math

class BatchGenerator:
    """Hands out consecutive (images, labels) slices of ``batch_size`` items.

    ``num_batches`` is the number of ``next()`` calls needed to cover the data;
    the last batch may be shorter than ``batch_size``.
    """

    def __init__(self, images, labels, batch_size=128):
        assert len(images) == len(labels)
        self.images, self.labels = images, labels
        self.batch_size = batch_size
        self.index = 0
        self.num_batches = math.ceil(len(images) / batch_size)

    def next(self):
        """Return the next batch and advance the cursor by ``batch_size``."""
        window = slice(self.index, self.index + self.batch_size)
        batch_images = self.images[window]
        batch_labels = self.labels[window]
        # Advance the cursor only after both slices were taken.
        self.index = window.stop

        return batch_images, batch_labels

In [64]:
def update_weights(gradients, weights, learning_rate=1e-3, use_optimizer=True):
    """Apply one gradient-descent step to ``weights`` in place.

    Args:
        gradients: one gradient per entry of ``weights``, in the same order.
        weights: variables to update.
        learning_rate: step size.
        use_optimizer: when True, a fresh Keras SGD optimizer applies the step;
            otherwise each variable is decremented by ``gradient * learning_rate``.
    """
    if not use_optimizer:
        for gradient, variable in zip(gradients, weights):
            # assign_sub is the TensorFlow Variable equivalent of "-=".
            variable.assign_sub(gradient * learning_rate)
        return

    sgd = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    sgd.apply_gradients(zip(gradients, weights))

In [72]:
def one_training_step(model, images_batch, labels_batch):
    """Run one forward/backward pass on a batch and update the model's weights.

    Returns:
        The mean sparse categorical cross-entropy over the batch.
    """
    loss_fn = tf.keras.losses.sparse_categorical_crossentropy

    # The forward pass and the loss are computed inside the tape so they can be differentiated.
    with tf.GradientTape() as tape:
        average_loss = tf.reduce_mean(loss_fn(labels_batch, model(images_batch)))

    update_weights(
        tape.gradient(average_loss, model.weights),
        model.weights,
        use_optimizer=True,
    )

    return average_loss

In [73]:
def fit(model, images, labels, epochs, batch_size=128):
    """Train ``model`` for ``epochs`` passes over the data, one mini-batch at a time.

    Prints the epoch index at the start of each pass and the batch loss every
    100 batches.
    """
    for epoch_i in range(epochs):
        print(f'Epoch {epoch_i}')
        # A fresh generator per epoch restarts the cursor at the first sample.
        batches = BatchGenerator(images, labels, batch_size)

        for batch_i in range(batches.num_batches):
            batch = batches.next()
            loss = one_training_step(model, images_batch=batch[0], labels_batch=batch[1])
            if batch_i % 100 != 0:
                continue
            print(f'loss at batch {batch_i}: {loss:.2f}')

In [74]:
# Train the hand-written model for 10 epochs using mini-batches of 128 samples.
fit(model, train_images, train_labels, epochs=10, batch_size=128)

Epoch 0
loss at batch 0: 0.45
loss at batch 100: 0.43
loss at batch 200: 0.37
loss at batch 300: 0.45
loss at batch 400: 0.55
Epoch 1
loss at batch 0: 0.44
loss at batch 100: 0.42
loss at batch 200: 0.36
loss at batch 300: 0.44
loss at batch 400: 0.54
Epoch 2
loss at batch 0: 0.43
loss at batch 100: 0.41
loss at batch 200: 0.36
loss at batch 300: 0.43
loss at batch 400: 0.54
Epoch 3
loss at batch 0: 0.43
loss at batch 100: 0.40
loss at batch 200: 0.35
loss at batch 300: 0.42
loss at batch 400: 0.53
Epoch 4
loss at batch 0: 0.42
loss at batch 100: 0.39
loss at batch 200: 0.34
loss at batch 300: 0.42
loss at batch 400: 0.53
Epoch 5
loss at batch 0: 0.41
loss at batch 100: 0.38
loss at batch 200: 0.34
loss at batch 300: 0.41
loss at batch 400: 0.52
Epoch 6
loss at batch 0: 0.41
loss at batch 100: 0.38
loss at batch 200: 0.33
loss at batch 300: 0.41
loss at batch 400: 0.52
Epoch 7
loss at batch 0: 0.40
loss at batch 100: 0.37
loss at batch 200: 0.33
loss at batch 300: 0.40
loss at batch 40

In [75]:
# Run the model on the whole test set and convert the output tensor to a NumPy array.
predictions = model(test_images).numpy()
# The predicted class is the index of the largest value in each row.
pred_labels = np.argmax(predictions, axis=1)
# Boolean array marking where the prediction equals the true label; its mean is the accuracy.
matches = pred_labels == test_labels
print(f'accuracy: {matches.mean():.2f}')

accuracy: 0.88
