In [120]:
import torch, numpy as np

In [121]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [122]:
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [123]:
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [124]:
learning_rate = 1e-6

In [125]:
for i in range(500):
    h = x.dot(w1)
    h_relu = np.maximum(0,h)
    y_pred = h_relu.dot(w2)
    
    loss = np.square(y_pred - y).sum()
    
    if i % 100 == 0 : print(i, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 27593631.785217494
100 568.2650642251199
200 3.6413915402158183
300 0.041659720527441935
400 0.0005919630245118249


In [126]:
import torch, time

In [127]:
dtype = torch.float
device = torch.device("cuda:0")

In [128]:
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [129]:
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [130]:
learning_rate = 1e-6

In [131]:


for i in range(500):
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred - y).pow(2).sum().item()
    if i%100 == 0 : print(i, loss)
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29846228.0
100 279.85955810546875
200 1.0986402034759521
300 0.007404740434139967
400 0.0001923736126627773


In [132]:
dtype = torch.float
device = torch.device('cpu')

In [133]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [134]:
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [135]:
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [136]:
learning_rate = 1e-6

In [137]:
for i in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    if i%100 == 0 : print(i, loss.item())
    
    loss.backward()
    
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        w1.grad.zero_()
        w2.grad.zero_()

0 30922402.0
100 646.9390258789062
200 3.579761505126953
300 0.034081801772117615
400 0.0006646442925557494


In [138]:
import tensorflow as tf, warnings

In [139]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [140]:
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

In [141]:
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    w1 = tf.Variable(tf.random_normal((D_in, H)))
    w2 = tf.Variable(tf.random_normal((H, D_out)))

In [142]:
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

loss = tf.reduce_sum((y - y_pred) ** 2.0)

grad_w1, grad_w2 = tf.gradients(loss, [w1,w2])
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate*grad_w1)
new_w2 = w2.assign(w2 - learning_rate*grad_w2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    x_val = np.random.randn(N, D_in)
    y_val = np.random.randn(N, D_out)
    
    for i in range(500):
        loss_val, _, _ = sess.run([loss, new_w1, new_w2], feed_dict={x: x_val, y: y_val})
        if i % 100 == 0 : print(loss_val)



32318466.0
532.0097
1.9431864
0.013704651
0.00031014593


In [143]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for i in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    
    if i % 100 == 0 : print(i, loss)
    
    model.zero_grad()
    
    loss.backward()
    
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate*param.grad
        

0 tensor(649.2838, grad_fn=<MseLossBackward>)
100 tensor(2.5249, grad_fn=<MseLossBackward>)
200 tensor(0.0506, grad_fn=<MseLossBackward>)
300 tensor(0.0031, grad_fn=<MseLossBackward>)
400 tensor(0.0003, grad_fn=<MseLossBackward>)


In [144]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out)
)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr =learning_rate)
for i in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    
    if i % 100 == 0 : print(i, loss)
    
    model.zero_grad()
    
    loss.backward()
    
    optimizer.step()

0 tensor(690.7722, grad_fn=<MseLossBackward>)
100 tensor(50.1065, grad_fn=<MseLossBackward>)
200 tensor(0.8835, grad_fn=<MseLossBackward>)
300 tensor(0.0117, grad_fn=<MseLossBackward>)
400 tensor(0.0002, grad_fn=<MseLossBackward>)


In [152]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [153]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for i in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if i % 100 == 0 : print(i, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 649.64111328125
100 1.672514796257019
200 0.02133091725409031
300 0.0007520373910665512
400 4.049967901664786e-05


In [168]:
import random
class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0,3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred

In [169]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [170]:
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

In [171]:
model = DynamicNet(D_in, H, D_out)

In [172]:
criterion = torch.nn.MSELoss(reduction='sum')

In [173]:
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-4, momentum=0.9)

In [174]:
for i in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if i % 100 == 0 : print(i, loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 tensor(655.6825, grad_fn=<MseLossBackward>)
100 tensor(25.0975, grad_fn=<MseLossBackward>)
200 tensor(1.6326, grad_fn=<MseLossBackward>)
300 tensor(1.0169, grad_fn=<MseLossBackward>)
400 tensor(2.1358, grad_fn=<MseLossBackward>)
