# PyTorch
a brief intro
### How to install
Create some sort of virtual environment (preferably Conda or Poetry). Then follow the instructions on PyTorch's [website](https://pytorch.org/) to install the package.

In [5]:
# some standard imports
import torch
import numpy

In [15]:
# PyTorch Tensors
# A tensor is an array with an arbitrary number of dimensions (a matrix is a 2-tensor)
my_tensor = torch.tensor([1,2,3])       # we can create a tensor from a python array
print(my_tensor.shape)                  # we can see it's shape by calling .shape (which is very useful for debugging)
print(torch.tensor(numpy.ones((2,3))))  # tensors can be created from numpy arrays as well
torch.ones((2,3))                       # in fact torch can do most things numpy can
print(torch.randn(3, 3))                # it can also create normally distributed tensors
print(torch.ones((1,2), dtype=torch.float32))  # all tensor creation functions have this dtype optional parameter
print(torch.ones((1,2), dtype=torch.float16))  # which tells torch what C++ primitive to represent it as 
print(torch.ones((1,2), dtype=torch.int32))    # (float 32 being a 32 bit IEEE 754 float, the most common for ML)

torch.Size([3])
tensor([[1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[-0.4194, -0.6726,  0.6190],
        [ 0.0851, -1.7131, -0.1297],
        [-0.8416,  1.8870,  1.9356]])
tensor([[1., 1.]])
tensor([[1., 1.]], dtype=torch.float16)
tensor([[1, 1]], dtype=torch.int32)


In [37]:
# PyTorch indexing
a = torch.tensor([
    [1,2,3],
    [4,5,6],
    [7,8,9]
])
# if we want the second row first column element we can write the following (indicies are 0 indexed)
print(a[1][0])              # remeber arrays are indexed row, then column (notice this is still a tensor)
print(a[1][0].item())       # if we want a Python integer we need to call item on a scalar (0-tensor)
# we can also use array slicing
print(a[:][0])              # we can also take all rows and take the first of those rows
print(a[:, 0])              # this means take the first element of all rows note: it's diffrent from a[:][0]!!!
print(a[:2, 0])             # take the first element of all the rows until the third row (not inclusive)
print(a[1:, 0])             # take the first element of all rows after the second row (inclusive)
# we can also specify that we want all elements from an arbitrary number of dimensions with ...
b = torch.ones(1,2,3,4,5,6)
print(b[0, ..., 1].shape)   # we take the first element of the first dimension, all elements of dimensions 2-5
                            # and then the second element of the last dimension which causes the tensor to flatten out

tensor(4)
4
tensor([1, 2, 3])
tensor([1, 4, 7])
tensor([1, 4])
tensor([4, 7])
torch.Size([2, 3, 4, 5])


In [41]:
# PyTorch Arithmatic
a = torch.tensor([[1,1],[2,2]])
b = torch.tensor([[3,3], [4,4]])
# if we wanted to add 2 to every element in a we could do a for loop
a_plus_two = torch.empty_like(a)
for i in range(a.shape[0]):
    for j in range(a.shape[1]):
        a_plus_two[i][j] = a[i][j] + 2
# however this is very slow and doesn't allow us to change a's shape (for example if we want to add another dimension)
# so instead we can let PyTorch handle this using something called "broadcasting"
# which is where we let PyTorch's C++ engine do the math for us which is *much* faster than we can achieve in python
# here is the correct way to add 2 to every element:
print(a + 2)                   # pytorch supports element wise operations with standard opperands
print(b + a)                   # pytorch also supports *element-wise* operations between tensors for most opperands
print(a * b)                   # this is *element-wise* multiplication
print(a @ b)                   # to do *matrix* multiplication we need this @ sign

# PyTorch also allows us to operate on tensors of diffrent sizes given their last n dimensions are the same shape 
# it does this by "casting" the smaller tensor to a higher dimension by repeatedly copying it and then doing the operation
a = torch.ones((2,3,4,5,6))
b = torch.ones((4,5,6)) * 2
print((a*b).shape)

tensor([[3, 3],
        [4, 4]])
tensor([[4, 4],
        [6, 6]])
tensor([[3, 3],
        [8, 8]])
tensor([[ 7,  7],
        [14, 14]])
torch.Size([2, 3, 4, 5, 6])


In [63]:
# auto-grad
# here we tell x to track it's gradients (note that torch can only track floating point tensor gradients)
# when this is set torch will automatically track the computational graph for x 
x = torch.tensor([1.,2.,3.], requires_grad=True)
y = (x * 2).mean()  # torch can only calculate gradients from a scalar (0-tensor) output
y.backward()        # this tells torch to calculate the upstream gradients from y, note that backward has no "s"
print(x.grad)

# updating parameters
# here we tell our optimizer (stochasitic gradient decent) to modify x with a gradient multiplier of 1
optimizer = torch.optim.SGD([x], lr=1)  # all optimizers are classes so here it is initialized
optimizer.step()                        # this tells the optimizer to update it's tracked parameters (ie x -= x.grad*lr)
print(x)                                # note that x is now x -= x.grad*lr

# zero-ing gradient
print(x.grad)           # note that x is still storing it's gradient, so if we want to calculate a new gradient
optimizer.zero_grad()   # we have to zero it's gradient through the optimizer   
print(x.grad)

tensor([0.6667, 0.6667, 0.6667])
tensor([0.3333, 1.3333, 2.3333], requires_grad=True)
tensor([0.6667, 0.6667, 0.6667])
None


In [93]:
# lets look at a standard dense network implmentation
# try removing some pieces of this network to see how the output changes
from torch import nn

class MyNetwork(nn.Module):   # all NN's must inherit from nn.Module since it defines some important behvior 
    def __init__(self,layer_sizes: tuple[int, ...]) -> None:
        super(MyNetwork, self).__init__()   # first we initialize nn.Module
        self.layers = []
        for in_size, out in zip(layer_sizes[:-1], layer_sizes[1:]): # this makes in and out offset by 1
            self.layers.append(nn.Linear(in_size, out))     # nn.Linear is a dense layer
            self.layers.append(nn.ReLU())                   # here we have our activation function
        self.layers[-1] = nn.Sigmoid()                      # we replace the last ReLU with sigmoid for classification
        self.layers = nn.ModuleList(self.layers)            # then we register these layers with nn.Module using this (try removing this)

    def forward(self, x) -> torch.Tensor:  
        """
        This function is called whenever some input is passed into the model
        """
        for layer in self.layers:   # we pass x through each layer
            x = layer(x)
        return x
    
# here you can see an example network by printing out all of the registered layers
a = MyNetwork([10,32,64,8,2])
print(a)
# as a sanity check lets try passing in a vector 
x = torch.ones((32, 10))    # note this is of shape [batch, input], pytorch handels batches implicitly
print(x.shape)
print(a(x).shape)      # since our layer sizes goes from 10 to 2 we should expect the output to be [32,2]

MyNetwork(
  (layers): ModuleList(
    (0): Linear(in_features=10, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=8, bias=True)
    (5): ReLU()
    (6): Linear(in_features=8, out_features=2, bias=True)
    (7): Sigmoid()
  )
)
torch.Size([32, 10])
torch.Size([32, 2])


In [113]:
# optimizing a network
a = MyNetwork([2,3]) # lets create a simple network
optimizer = torch.optim.Adam(a.parameters(), lr=1e-2) # to access our model's trainable parameters we use .parameters()
criterion = torch.nn.CrossEntropyLoss()               # our loss function is also a class
x = torch.ones((2, 2))
y = a(x)                    # here we passed x through our network
label = torch.ones_like(y)  # lets say our target is to get our model to output 1's
loss = criterion(y, label)  # cross entropy takes predictions, then target -> int
loss.backward()             # remeber this is how we backprop
print("first layer grad:\n", a.layers[0].weight.grad, end="\n\n") # this lets us see the gradients on the weight of the first layer
print("first layer weight:\n", a.layers[0].weight, end="\n\n")
optimizer.step()
print("first layer weight post update:\n", a.layers[0].weight, end="\n\n")
optimizer.zero_grad()

first layer grad:
 tensor([[-0.0059, -0.0059],
        [-0.0359, -0.0359],
        [ 0.0417,  0.0417]])

first layer weight:
 Parameter containing:
tensor([[-0.5906,  0.3870],
        [-0.1813, -0.5382],
        [ 0.0853,  0.1262]], requires_grad=True)

first layer weight post update:
 Parameter containing:
tensor([[-0.5806,  0.3970],
        [-0.1713, -0.5282],
        [ 0.0753,  0.1162]], requires_grad=True)



In [121]:
# using hardware accelerators
# cuda is for nvidia graphics, mps for m-series mac chips, and cpu runs on your cpu
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.ones((2,3))
# we can use .to to make a *copy* of the vector on the device
x_on_device = x.to(device) 
print(x)            # this should look the same as it originally did
print(x_on_device)  # but if you have a device this one will say device='cuda:0' (or mps)

a = MyNetwork([2,3])
print(next(a.parameters()).device)
a.to(device)       # models, however, are moved to the new device rather than copied
print(next(a.parameters()).device)

tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')
cpu
cuda:0


### So What Can PyTorch Do?
PyTorch is incredibly powerful and, better yet, it has amazing documentation. To get a better idea of what it can do I reccomend looking at the documentation for some of the most vital parts. In particular [torch.optim](https://pytorch.org/docs/stable/optim.html), [torch.nn](https://pytorch.org/docs/stable/nn.html), and [torch.nn loss functions](https://pytorch.org/docs/stable/nn.html) are a good place to start.