<a href="https://colab.research.google.com/github/woodRock/grokking-deep-learning/blob/main/chapter_14_learning_to_write_like_shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

class Tensor (object):

    def __init__(self,data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):

        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None
        if(id is None):
            self.id = np.random.randint(0,100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}

        if(creators is not None):
            for c in creators:
                if(self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def all_children_grads_accounted_for(self):
        for id,cnt in self.children.items():
            if(cnt != 0):
                return False
        return True

    def backward(self,grad=None, grad_origin=None):
        if(self.autograd):

            if(grad is None):
                grad = Tensor(np.ones_like(self.data))

            if(grad_origin is not None):
                if(self.children[grad_origin.id] == 0):
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[grad_origin.id] -= 1

            if(self.grad is None):
                self.grad = grad
            else:
                self.grad += grad

            # grads must not have grads of their own
            assert grad.autograd == False

            # only continue backpropping if there's something to
            # backprop into and if all gradients (from children)
            # are accounted for override waiting for children if
            # "backprop" was called on this variable directly
            if(self.creators is not None and
               (self.all_children_grads_accounted_for() or
                grad_origin is None)):

                if(self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)

                if(self.creation_op == "sub"):
                    self.creators[0].backward(Tensor(self.grad.data), self)
                    self.creators[1].backward(Tensor(self.grad.__neg__().data), self)

                if(self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new , self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)

                if(self.creation_op == "mm"):
                    c0 = self.creators[0]
                    c1 = self.creators[1]
                    new = self.grad.mm(c1.transpose())
                    c0.backward(new)
                    new = self.grad.transpose().mm(c0).transpose()
                    c1.backward(new)

                if(self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())

                if("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.expand(dim,
                                                               self.creators[0].data.shape[dim]))

                if("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))

                if(self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())

                if(self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))

                if(self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))

                if(self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))

                if(self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))

    def __add__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        if(self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if(self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        if(self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim,copies):

        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if(self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_"+str(dim))
        return Tensor(new_data)

    def transpose(self):
        if(self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")

        return Tensor(self.data.transpose())

    def mm(self, x):
        if(self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self,x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def sigmoid(self):
        if(self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if(self.autograd):
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    def index_select(self, indices):

        if(self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def softmax(self):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape)-1,
                                       keepdims=True)
        return softmax_output

    def cross_entropy(self, target_indices):

        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape)-1,
                                       keepdims=True)

        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t),-1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()

        if(self.autograd):
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        return Tensor(loss)


    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())

class Layer(object):

    def __init__(self):
        self.parameters = list()

    def get_parameters(self):
        return self.parameters

class Tanh(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.tanh()

class Sigmoid(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.sigmoid()


class SGD(object):

    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0

    def step(self, zero=True):

        for p in self.parameters:

            p.data -= p.grad.data * self.alpha

            if(zero):
                p.grad.data *= 0


class Linear(Layer):

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        W = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        self.parameters.append(self.weight)
        self.parameters.append(self.bias)

    def forward(self, input):
        return input.mm(self.weight)+self.bias.expand(0,len(input.data))


class Sequential(Layer):

    def __init__(self, layers=list()):
        super().__init__()

        self.layers = layers

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params


class Embedding(Layer):

    def __init__(self, vocab_size, dim):
        super().__init__()

        self.vocab_size = vocab_size
        self.dim = dim

        # this random initialiation style is just a convention from word2vec
        self.weight = Tensor((np.random.rand(vocab_size, dim) - 0.5) / dim, autograd=True)

        self.parameters.append(self.weight)

    def forward(self, input):
        return self.weight.index_select(input)


class Tanh(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.tanh()


class Sigmoid(Layer):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.sigmoid()


class CrossEntropyLoss(object):

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        return input.cross_entropy(target)


class RNNCell(Layer):

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        if(activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif(activation == 'tanh'):
            self.activation == Tanh()
        else:
            raise Exception("Non-linearity not found")

        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)

        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size,self.n_hidden)), autograd=True)

In [None]:
import sys
import random
import math
from collections import Counter
import numpy as np

# Freeze the seed for reproducability.
np.random.seed(0)

f = open("shakespeare.txt", "r")
raw = f.read()
f.close()

vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x: word2index[x], raw)))

In [None]:
# Hyperparameters
alpha = 0.05
epochs = 1_000
batch_size = 32
input_dim = len(vocab)
output_dim = len(vocab)
bptt = 16
n_batches = int((indices.shape[0] / (batch_size)))

trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int((n_batches-1)/ bptt)
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)


embed = Embedding(vocab_size=input_dim, dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=output_dim)

criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=alpha)


In [None]:
print(f"raw[0:5]: {raw[0:5]}")
print(f"indices[0:5]: {indices[0:5]}")
print(f"batched_indices[0:5]: {batched_indices[0:5]}")
print(f"input_batches[0][0:5]: {input_batches[0][0:5]}")
print(f"target_batches[0][0:5]: {target_batches[0][0:5]}")

raw[0:5]: THE S
indices[0:5]: [ 8 49  9 32 50]
batched_indices[0:5]: [[ 8 55 32 38 33 32 28 28 32 53 30 32 14 33 14 10 41 28 35 32 33 32 32 51
  45 53 53 32 32 10  0  4]
 [49 15 51 32 35 21  0 58 60 32 38 45 41  4 53 28 32 58 45 39 20 53 39 30
  38 20 58 39 45 28 48 32]
 [ 9 28 33 45 33 33 38 48 30 14 32  0 32 32 32 58 51  6  0 45 32 45 28 33
  32 32 51 10  0 58 58 53]
 [32 30 32  0 30 55 33 10 14 30  0 32 41 10  5 47 30 10 48 47 21 47 58 33
  45 14 28 33 32 48 45 39]
 [50 32 38  5 32 27  0 32 45 33 28 36 28 45 47  4 14 33 17 47 45 33 47  4
  21  0 30  0 14 10 53 33]]
input_batches[0][0:5]: [[ 8 55 32 38 33 32 28 28 32 53 30 32 14 33 14 10 41 28 35 32 33 32 32 51
  45 53 53 32 32 10  0  4]
 [49 15 51 32 35 21  0 58 60 32 38 45 41  4 53 28 32 58 45 39 20 53 39 30
  38 20 58 39 45 28 48 32]
 [ 9 28 33 45 33 33 38 48 30 14 32  0 32 32 32 58 51  6  0 45 32 45 28 33
  32 32 51 10  0 58 58 53]
 [32 30 32  0 30 55 33 10 14 30  0 32 41 10  5 47 30 10 48 47 21 47 58 33
  45 14 28 33 32 48 45 39

In [None]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    input = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
        # Temperature for sampling: higher = greedier
        output.data *= 10
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()

        # Samples from pred
        m = (temp_dist > np.random.rand()).argmax()
        c = vocab[m]
        input = Tensor(np.array([m]))
        s += c

    return s

def train(iterations=100):
    for j in range(iterations):
        total_loss = 0
        n_loss = 0
        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):
            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            losses = list()
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if (t == 0):
                    loss = batch_loss
                else:
                    loss = loss + batch_loss
            for loss in losses:
                ""
            loss.backward()
            optim.step()
            total_loss += loss.data
            loss_str = np.exp(total_loss) / (batch_i + 1)
            log = f"Iter: {j} - Batch: {batch_i+1}/{len(input_batches)} - Loss: {loss_str}"

            if (batch_i == 0):
                log += " - " + generate_sample(70, '\n').replace('\n', ' ')
            if (batch_i % 10 == 0 or batch_i - 1 == len(input_batches)):
                print(log)
        optim.alpha *= 0.99

train()


Iter: 0 - Batch: 1/184 - Loss: 26.92697147965263 - eoeoeo ono oooooooooonoonooooonoooooooooonoonononooooooooooonoooooonoo
Iter: 0 - Batch: 11/184 - Loss: 2530290404938072.5
Iter: 0 - Batch: 21/184 - Loss: 4.490789308467799e+30
Iter: 0 - Batch: 31/184 - Loss: 5.3173478202778674e+44
Iter: 0 - Batch: 41/184 - Loss: 1.5989553238068635e+58
Iter: 0 - Batch: 51/184 - Loss: 4.2376314221215725e+71
Iter: 0 - Batch: 61/184 - Loss: 1.3088443310339625e+85
Iter: 0 - Batch: 71/184 - Loss: 1.0314156214186484e+99
Iter: 0 - Batch: 81/184 - Loss: 8.921351753514044e+112
Iter: 0 - Batch: 91/184 - Loss: 5.5622244364080484e+125
Iter: 0 - Batch: 101/184 - Loss: 1.7614908128182694e+139
Iter: 0 - Batch: 111/184 - Loss: 2.033891852604758e+151
Iter: 0 - Batch: 121/184 - Loss: 3.788865056903866e+164
Iter: 0 - Batch: 131/184 - Loss: 4.768255224929501e+177
Iter: 0 - Batch: 141/184 - Loss: 1.3091859933680031e+190
Iter: 0 - Batch: 151/184 - Loss: 1.596731312878909e+202
Iter: 0 - Batch: 161/184 - Loss: 9.26427925072226

KeyboardInterrupt: 

In [25]:
sigmoid = lambda x: 1 / (1 + np.exp(-x))
relu = lambda x: (x>0).astype(float) * x

weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1,0,0.1]))

print("Sigmoid activations:")
activations = list()
for iter in range(10):
    activation = sigmoid(activation.dot(weights))
    activations.append(activation)
    print(activation)

print("Sigmoid gradients:")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = (activation * (1 - activation) * gradient)
    gradient = gradient.dot(weights.transpose())
    print(gradient)

print("Relu activations:")
activations = list()
for iter in range(10):
    activation = relu(activation.dot(weights))
    activations.append(activation)
    print(activation)

print("Relu gradients:")
gradient = np.ones_like(activation)
for activation in reversed(activations):
      gradient = (activation > 0) * gradient
      gradient = gradient.dot(weights.transpose())
      print(gradient)

Sigmoid activations:


ValueError: shapes (3,) and (2,2) not aligned: 3 (dim 0) != 2 (dim 0)

In [35]:
class Linear(Layer):

    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        W = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.bias = bias

        if (self.bias):
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        self.parameters.append(self.weight)

        if (self.bias):
            self.parameters.append(self.bias)

    def forward(self, input):
        if (self.bias):
            return input.mm(self.weight)+self.bias.expand(0,len(input.data))
        return input.mm(self.weight)

class LSTMCell (Layer):

    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)
        self.hf = Linear(n_hidden, n_hidden, bias=False)
        self.hi = Linear(n_hidden, n_hidden, bias=False)
        self.ho = Linear(n_hidden, n_hidden, bias=False)
        self.hc = Linear(n_hidden, n_hidden, bias=False)

        self.w_ho = Linear(n_hidden, n_output, bias = False)

        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()
        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()

        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]

        f = (self.xf.forward(input) + self.hf.forward(prev_hidden)).sigmoid()
        i = (self.xi.forward(input) + self.hi.forward(prev_hidden)).sigmoid()
        o = (self.xo.forward(input) + self.ho.forward(prev_hidden)).sigmoid()
        g = (self.xc.forward(input) + self.hc.forward(prev_hidden)).tanh()
        c = (f * prev_cell) + (i * g)
        h = o * c.tanh()
        output = self.w_ho.forward(h)
        return output, (h,c)

    def init_hidden(self, batch_size=1):
        h = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        c = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        h.data[:,0] += 1
        c.data[:,0] += 1
        return (h,c)

In [None]:
import sys
import random
import math
from collections import Counter
import numpy as np

# Freeze the seed for reproducability.
np.random.seed(0)

# Prepare the dataset.
f = open("shakespeare.txt", "r")
raw = f.read()
f.close()

vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x: word2index[x], raw)))

# Hyperparameters
alpha = 0.05
epochs = 100
batch_size = 16
input_dim = len(vocab)
output_dim = len(vocab)
hidden_dim = 512
bptt = 25
min_loss = 1_000

# Batch the dataset
n_batches = int((indices.shape[0] / (batch_size)))

trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int((n_batches-1)/ bptt)
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)

# Initialize the model.
embed = Embedding(vocab_size=input_dim, dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=output_dim)
# This seemed to help training
model.w_ho.weight.data *= 0

# Loss function
criterion = CrossEntropyLoss()
# Optimizer
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=alpha)

# Training loop
for j in range(epochs):
        total_loss = 0
        n_loss = 0
        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):
            # Detach hidden state from previous graph
            hidden = (Tensor(hidden[0].data, autograd=True), Tensor(hidden[1].data, autograd=True))
            batch_loss = 0
            for t in range(bptt):
                input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)

                target = Tensor(target_batches[batch_i][t], autograd=True)
                loss = criterion.forward(output, target)

                loss.backward()
                optim.step()

                # Detach hidden state
                hidden = (Tensor(hidden[0].data, autograd=True), Tensor(hidden[1].data, autograd=True))

                batch_loss += loss.data

            total_loss += batch_loss / bptt
            epoch_loss = np.exp(total_loss / (batch_i+1))
            log = f"Iter: {j} - Batch: {batch_i+1}/{len(input_batches)} - Loss: {epoch_loss}"

            if (batch_i == 0):
                log += " - " + generate_sample(70, '\n').replace('\n', ' ')
            if (batch_i % 10 == 0 or batch_i - 1 == len(input_batches)):
                print(log)
        optim.alpha *= 0.99

Iter: 0 - Batch: 1/235 - Loss: 60.65867496848204 - nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn nnnnnnnnnnnnnnnnn
Iter: 0 - Batch: 11/235 - Loss: 26.485573399719428
Iter: 0 - Batch: 21/235 - Loss: 20.637684070316354
Iter: 0 - Batch: 31/235 - Loss: 17.610659804462472
Iter: 0 - Batch: 41/235 - Loss: 15.697698161018572
Iter: 0 - Batch: 51/235 - Loss: 14.584184372991414
Iter: 0 - Batch: 61/235 - Loss: 13.68921950422112
Iter: 0 - Batch: 71/235 - Loss: 12.998557024774252
Iter: 0 - Batch: 81/235 - Loss: 12.471154658312816
Iter: 0 - Batch: 91/235 - Loss: 12.05421103329198
Iter: 0 - Batch: 101/235 - Loss: 11.665176993495805
Iter: 0 - Batch: 111/235 - Loss: 11.346881896201026
Iter: 0 - Batch: 121/235 - Loss: 11.063402353078773
Iter: 0 - Batch: 131/235 - Loss: 10.777177018962998
Iter: 0 - Batch: 141/235 - Loss: 10.516338419410673
Iter: 0 - Batch: 151/235 - Loss: 10.32922678346088
Iter: 0 - Batch: 161/235 - Loss: 10.15114488385659
Iter: 0 - Batch: 171/235 - Loss: 9.987193874227888
Iter: 0 