<a href="https://colab.research.google.com/github/weilainicolas/Deep-Learning/blob/master/L8_fully_connected.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor

In [0]:
#export
def get_data():
  MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'
  path=datasets.download_data(MNIST_URL,ext='.gz'); 
  with gzip.open(path,'rb') as f:
    ((x_train, y_train),(x_valid, y_valid),_)=pickle.load(f, encoding='latin-1')
  return map(tensor,(x_train, y_train, x_valid,y_valid))

def normalize(x,m,s): return (x-m)/s

In [0]:
x_train, y_train, x_valid,y_valid =get_data()

In [5]:
train_mean, train_std= x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [0]:
x_train= normalize(x_train,train_mean, train_std)
x_valid= normalize(x_valid, train_mean, train_std)

In [0]:
x_valid.mean(), x_valid.std()

(tensor(-0.4425), tensor(3.2296))

In [0]:
def near (a, b): return torch.allclose(a,b, rtol=1e-3, atol=1e-5)

In [8]:
r,c=x_train.shape
cat=y_train.max()+1
r,c,cat

(50000, 784, tensor(10))

# **Foundation Version**


In [0]:
# hidden layer: number of input 
rh = 50

In [0]:
# kaiming he init
w1=torch.randn(c,rh)*math.sqrt(2./c)
b1=torch.zeros(rh)
w2=torch.randn(rh,1)
b2=torch.zeros(1)

In [0]:
def lin(x,w,b): return x@w+b

In [0]:
def relu(x): return x.clamp_min(0.)-0.5

In [0]:
t=relu(lin(x_train, w1,b1))

In [13]:
t.mean(), t.std()

(tensor(0.0407), tensor(0.8250))

In [11]:
torch.nn.init.kaiming_normal_(w1,mode='fan_out')

tensor([[ 0.0014, -0.0064,  0.0898,  ..., -0.0093, -0.0257,  0.0457],
        [-0.0338,  0.0107,  0.0939,  ..., -0.0457,  0.0727,  0.0074],
        [ 0.0561,  0.0029, -0.0080,  ..., -0.0301, -0.0467, -0.0097],
        ...,
        [-0.0128,  0.0221,  0.0638,  ..., -0.0459, -0.0086,  0.0314],
        [ 0.0335,  0.0029,  0.0062,  ...,  0.0156, -0.0094,  0.0027],
        [ 0.0332,  0.0704, -0.0305,  ..., -0.0167, -0.0400,  0.0309]])

# **Model**

In [0]:
def model(x):
  l1=lin(x,w1,b1)
  l2=relu(l1)
  l3=lin(l2,w2,b2)
  return l3

In [16]:
%time result=model(x_valid)

CPU times: user 27.5 ms, sys: 115 µs, total: 27.6 ms
Wall time: 35.8 ms


In [17]:
result.shape

torch.Size([10000, 1])

#**Loss Function**

In [0]:
def mse(output,targ): return (output.squeeze(-1)-targ).pow(2).mean()

In [0]:
y_train, y_valid= y_train.float(), y_valid.float()

In [0]:
preds=model(x_train)

In [21]:
preds.shape

torch.Size([50000, 1])

In [22]:
mse(preds,y_train)

tensor(57.0966)

#**Gradient & Backward pass**

In [0]:
def mse_grad(inp,targ):
  inp.g=2.*(inp.squeeze()-targ).unqueeze(-1)/inp.shape[0]

In [0]:
def relu_grad(inp,out):
  inp.g=(inp>0).float()*out.g

In [0]:
def lin_grad(inp, out, w, b):
  inp.g= out.g@ w.t() 
  w.g= (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
  b.g= out.g.sum(0)

In [0]:
def f_b(inp,targ):
  l1=lin(inp, w1,b1)
  l2=relu(l1)
  out=lin(l2,w2,b2)
  loss= mse(out, targ)

  mse_grad(out,targ)
  lin_grad(l2, out, w2, b2)
  relu_grad(l1,l2)
  lin_grad(inp, l1, w1, b1)

#**refactory to classes**

In [0]:
class relu():
  def __call__(self,inp):
    self.inp=inp
    self.out=inp.clamp_min(0.)-0.5
    return self.out
  
  def backward(self): 
    self.inp.g=(self.int>0).float()*self.out.g

In [0]:
class linear():
  def __init__(slef, w, b):
    self.w = w
    self.b = b 
  def __call__(self, inp): 
    self.inp=inp
    self.out= inp @ self.w + self.b 
    return self.out
  def backward(self): 
    self.inp.g= self.out.g @ self.w.t()
    self.w.g= (self.inp.unsqueeze(-1) * self.out.g.unsqeeze(-1)).sum(0)
    self.b.g= self.out.g.sum(0)


In [0]:
class mse(): 
  def __call__(self,inp, targ):
    self.inp = inp
    self.targ = targ
    self.out = (self.inp.unsqueeze()-self.targ).pow(2).mean()
  def backward(self):
    self.inp.g= 2*(self.inp.unqueeze()-self.targ).unsqueeze(-1)/self.targ.shape[0]


In [0]:
class model ():
  def __init__(self, w1,b1,w2,b2):
    self.layer= [lin(w1,b1),relu(), lin(w2,b2)]
    self.loss=mse()
  def __call__(self, x, targ):
    for l in self.layer: x=l(x)
    return self.loss(x, targ)
  def backward(self):
    self.loss.backards()
    for l in reversed(self.layer): 
      l.backward()

# modulize 

In [0]:
class Module ():
  def __call__(self, *args):
    self.args= args
    self.out=self.forward(*args)
    return self.out
  def forward(self): raise Exception('Not Implemented')
  def backward(self): self.bwd(self.out, *self.args)

In [0]:
class relu(Module):
  def forward(self, inp): return inp.clamp_min(0.)-0.5
  def bwd(self, out, inp): inp.g=(inp>0).float()*out.g

In [0]:
class lin(Module):
  def __init__(self,w,b): self.w, self.b= w,b
  def forward(self, inp): return inp@self.w + self.b
  def bwd(self, out, inp): 
    inp.g= out.g @ self.w.t()
    self.w.g = inp.t()@ out.g
    self.b.g = out.g.sum(0)

In [0]:
class mse(Module):
  def forward(self, inp, targ): return (inp.squeeze()-targ).pow(2).mean()
  def bwd(self, out, inp, targ): inp.g= 2*(inp.squeeze()-targ).unsqueeze(-1)/targ.shape[0]

In [0]:
class model():
  def __init__(self,w1,b1,w2,b2): 
    self.layers =[lin(w1,b1),relu(),lin(w2,b2)]
    self.loss=mse()
  def __call__(self, x, targ):
    for l in self.layers: x= l(x)
    return self.loss(x,targ)
  def backward(self): 
    self.loss.backward()
    for l in reversed(self.layers): l.backward()

In [0]:
w1.g, b1.g, w2.g, b2.g =[None]*4
m1=model(w1,b1,w2,b2)

In [18]:
%time loss=m1(x_train,y_train)

CPU times: user 120 ms, sys: 2.32 ms, total: 122 ms
Wall time: 130 ms


In [19]:
%time m1.backward()

CPU times: user 230 ms, sys: 641 µs, total: 231 ms
Wall time: 235 ms


#Test the results

In [0]:
w1g=w1.g.clone()
b1g=b1.g.clone()
w2g=w2.g.clone()
b2g=b2.g.clone()
ig=x_train.g.clone()

In [0]:
xt2= x_train.clone().requires_grad_(True)
w12= w1.clone().requires_grad_(True)
b12= b1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [0]:
def mse_t(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()

In [0]:
def forward(inp, targ):
  l1=inp @ w12 + b12
  l2= l1.clamp_min(0.)-0.5
  out = l2@ w22 + b22
  return mse_t(out,targ)

In [0]:
loss2=forward(xt2, y_train)

In [0]:
loss2.backward()

In [35]:
near(w1g,w12.grad)
near(b1g,b12.grad)
near(w2g,w22.grad)
near(b2g,b22.grad)
near(ig,xt2.grad)

True