In [31]:
%matplotlib inline
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
from torch import nn

In [5]:
def get_dataloader_workers(): #@save
    """Use 4 processes to read the data."""
    return 4

In [6]:
def load_data_fashion_mnist(batch_size, resize=None): #@save
    """Download the Fashion-MNIST dataset and then load it into memory."""
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(root="../data",
                                                    train=True,
                                                    transform=trans,
                                                    download=True)
    mnist_test = torchvision.datasets.FashionMNIST(root="../data",
                                                    train=False,
                                                    transform=trans,
                                                    download=True)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
            num_workers=get_dataloader_workers()))


In [7]:
batch_size = 256
train_iter, test_iter = load_data_fashion_mnist(batch_size)

In [8]:
'''
Each example in the raw dataset is a 28×28 image. In this section, we will flatten each image,
treating them as vectors of length 784.
Because our
dataset has 10 classes, our network will have an output dimension of 10. Consequently, our weights
will constitute a 784 × 10 matrix and the biases will constitute a 1 × 10 row vector. 
'''
num_inputs = 784
num_outputs = 10
'''
Returns a tensor of random numbers drawn from separate normal distributions 
whose mean and standard deviation are given.
'''



In [58]:
net=nn.Sequential(nn.Flatten(),nn.Linear(num_inputs,num_outputs))

In [59]:
y = torch.tensor([0, 2])
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y_hat[[0, 1], y]

tensor([0.1000, 0.5000])

In [60]:
def init_weights(layer_type):
    if layer_type==nn.Linear:
        nn.init.normal_(layer_type.weight,std=0.01)
        
net.apply(init_weights)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=10, bias=True)
)

In [61]:
optimizer=torch.optim.SGD(net.parameters(),lr=0.1)
loss=nn.CrossEntropyLoss()


In [63]:
'''
model.train() tells your model that you are training the model. So effectively layers like dropout, batchnorm etc. which behave different on the train and test procedures know what is going on and hence can behave accordingly.
More details: It sets the mode to train (see source code). You can call either model.eval() or model.train(mode=False) to tell that you are testing. It is somewhat intuitive to expect train function to train model but it does not do that.
It just sets the mode.

loss.backward() computes dloss/dx for every parameter x which has requires_grad=True '''

'\nmodel.train() tells your model that you are training the model. So effectively layers like dropout, batchnorm etc. which behave different on the train and test procedures know what is going on and hence can behave accordingly.\nMore details: It sets the mode to train (see source code). You can call either model.eval() or model.train(mode=False) to tell that you are testing. It is somewhat intuitive to expect train function to train model but it does not do that.\nIt just sets the mode.\n\nloss.backward() computes dloss/dx for every parameter x which has requires_grad=True '

In [71]:
def accuracy(y_hat,y):
    if y_hat.shape[0]>0 and y_hat.shape==y.shape:
        pass   
    y_hat=y_hat.argmax(1)
    correct_pred= y_hat.type(y.dtype)==y
    
    return float(correct_pred.type(y.dtype).sum())

In [75]:
def train(net,train_iter,loss,optim):
    
    if isinstance(net,torch.nn.Module):
        net.train()
#     acc=Accumulator()
    data_count,total_loss,acc=0,0,0
    for X,y in train_iter:
        data_count+=y.numel()
        y_hat=net(X)
        acc+=accuracy(y_hat,y)
        l=loss(y_hat,y)
        total_loss=+l
        if isinstance(optim,torch.optim.Optimizer):
            optim.zero_grad()
            l.backward()
            optim.step()
            
           
        
    return total_loss,data_count,acc

In [76]:
num_epochs=10
for i in range(num_epochs):
    total_loss,data_count,acc=train(net,train_iter,loss,optimizer)
#     print(net[1].weight)
    print('EPoch loss-',total_loss/data_count,acc/data_count)

EPoch loss- tensor(8.5561e-06, grad_fn=<DivBackward0>) 0.85095
EPoch loss- tensor(5.6701e-06, grad_fn=<DivBackward0>) 0.8519833333333333
EPoch loss- tensor(7.0816e-06, grad_fn=<DivBackward0>) 0.8528
EPoch loss- tensor(6.7737e-06, grad_fn=<DivBackward0>) 0.8533833333333334
EPoch loss- tensor(7.2044e-06, grad_fn=<DivBackward0>) 0.8540666666666666
EPoch loss- tensor(5.6699e-06, grad_fn=<DivBackward0>) 0.8541833333333333
EPoch loss- tensor(6.8782e-06, grad_fn=<DivBackward0>) 0.85565
EPoch loss- tensor(4.7540e-06, grad_fn=<DivBackward0>) 0.85695
EPoch loss- tensor(7.3248e-06, grad_fn=<DivBackward0>) 0.8569666666666667
EPoch loss- tensor(8.4440e-06, grad_fn=<DivBackward0>) 0.8571333333333333


In [47]:
train(net,train_iter,loss,optimizer)

tensor(2.1482, grad_fn=<NllLossBackward>)
tensor(2.1482, grad_fn=<NllLossBackward>)


In [9]:
'''

Given
a matrix X we can sum over all elements (by default) or only over elements in the same axis, i.e.,
the same column (axis 0) or the same row (axis 1).
'''

X=torch.tensor([[1.0,2.0,3.0],[4.0,5.0,6.0]])

In [15]:
print(X.sum(0,keepdim=True).shape) #torch.Size([1, 3])

print(X.sum(0,keepdim=False).shape) #torch.Size([3])

torch.Size([1, 3])
torch.Size([3])


In [16]:
'''
We are now ready to implement the softmax operation. Recall that softmax consists of three steps:
(i) we exponentiate each term (using exp); 
(ii) we sum over each row (we have one row per example
in the batch) to get the normalization constant for each example; 
(iii) we divide each row by its
normalization constant, ensuring that the result sums to 1. Before looking at the code, let us recall
how this looks expressed as an equation:

'''

def softmax(X):
    X_exp=torch.exp(X)
    normalization=X_exp.sum(1,keepdim=True)
    return X_exp/normalization

X = torch.normal(0, 1, (2, 5))
X_prob = softmax(X)
X_prob, X_prob.sum(1)

(tensor([[0.4651, 0.0576, 0.0860, 0.1832, 0.2081],
         [0.1316, 0.1910, 0.4990, 0.0988, 0.0798]]),
 tensor([1.0000, 1.0000]))

In [27]:
def net(X):
    return softmax(torch.matmul(X.T,W)+b)

tensor([-1.1405, -1.3529])

In [26]:
\

tensor(-2.6700)