# What is a tensor?

Based on: https://docs.pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html

In [2]:
import torch
import numpy as np

### Initilizing  Tensors

In [3]:
# initializ a tensor from data
data = [[1, 2],[3, 4]]
x_data = torch.tensor(data)
print("x_data: \n", x_data)
print("Shape of tensor x_data: ", x_data.shape)
print(f"Device tensor is stored on: {x_data.device}")


x_data: 
 tensor([[1, 2],
        [3, 4]])
Shape of tensor x_data:  torch.Size([2, 2])
Device tensor is stored on: cpu


In [None]:
# initlizing tensors from NumPy array
np_array = np.array(data)
print("\nnp_array: \n", np_array)
print("Shape of np_array: ", np_array.shape)
print("Datatype of np_array: ", np_array.dtype)

x_np = torch.from_numpy(np_array)
print("\nx_np: \n", x_np)
print("Shape of tensor x_np: ", x_np.shape)
print("Datatype of tensor x_np: ", x_np.dtype)


np_array: 
 [[1 2]
 [3 4]]
Shape of np_array:  (2, 2)
Datatype of np_array:  int64

x_np: 
 tensor([[1, 2],
        [3, 4]])
Shape of tensor x_np:  torch.Size([2, 2])
Datatype of tensor x_np:  torch.int64


In [5]:
# we can also create a tensor from another tensor
x_zeros = torch.zeros_like(x_data) # retains the properties of x_data - shape, datatype
print("\nx_zeros: \n", x_zeros)

# rand_like returns a tensor with the same size as input that is filled with random numbers from a uniform distribution on the interval [0, 1)
# the input of rand_like is a tensor
# https://docs.pytorch.org/docs/stable/generated/torch.rand_like.html
x_rand = torch.rand_like(input=x_data, dtype=torch.float) # overrides the datatype of x_data
print("\nx_rand: \n", x_rand)
print("Datatype of tensor x_rand: ", x_rand.dtype)


x_zeros: 
 tensor([[0, 0],
        [0, 0]])

x_rand: 
 tensor([[0.9151, 0.6225],
        [0.1912, 0.4041]])
Datatype of tensor x_rand:  torch.float32


In [6]:
# we can create tensors using random or constant values
# rand returns a tensor filled with random numbers from a uniform distribution on the interval [0, 1)
# difference between rand and rand_like is that rand requires the shape of the tensor to be passed as argument
# https://docs.pytorch.org/docs/stable/generated/torch.rand.html
x_random = torch.rand(2,3) # random tensor of shape 2x3
print("\nx_random: \n", x_random)
ones_tensor = torch.ones(2,3) # tensor of ones of shape 2x3
print("\nones_tensor: \n", ones_tensor)
zeros_tensor = torch.zeros(2,3) # tensor of zeros of shape 2x3
print("\nzeros_tensor: \n", zeros_tensor)

# shape can also be a tuple
shape = (3,4)
rand_tensor = torch.rand(size=shape)
print("\nRandom Tensor of shape ", shape, ": \n", rand_tensor)


x_random: 
 tensor([[0.6013, 0.7835, 0.3469],
        [0.8089, 0.8830, 0.7346]])

ones_tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]])

zeros_tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])

Random Tensor of shape  (3, 4) : 
 tensor([[0.7980, 0.9271, 0.8042, 0.4433],
        [0.3317, 0.8314, 0.0136, 0.9851],
        [0.8103, 0.1372, 0.4657, 0.2968]])


### Tensor Operations

In [None]:
# all tensor operations found here: https://docs.pytorch.org/docs/stable/torch.html
torch_data = torch.tensor(data=([1,2], [3,4]))

# We move our tensor to the GPU if available
if torch.cuda.is_available():
  tensor = torch_data.to('cuda')
  print(f"Device tensor is stored on: {tensor.device}") # Device tensor is stored on: cuda:0
else:
  print(f"Device tensor is stored on: {torch_data.device}")

Device tensor is stored on: cpu


In [9]:
# indexing and slicing
x_data = torch.rand(3,4)
print("\nx_data: \n", x_data)

# we take the first two rows and columns 1 and 2
tensor_slice = x_data[0:2, 1:3]
print("\ntensor_slice: \n", tensor_slice)


x_data: 
 tensor([[0.6720, 0.5291, 0.3283, 0.4763],
        [0.4882, 0.9211, 0.8340, 0.4412],
        [0.3542, 0.8879, 0.4125, 0.6154]])

tensor_slice: 
 tensor([[0.5291, 0.3283],
        [0.9211, 0.8340]])


In [10]:
# we can also change values using indexing
x_data[:,1] = 0
print("\nx_data after setting column 1 to zeros: \n", x_data)


x_data after setting column 1 to zeros: 
 tensor([[0.6720, 0.0000, 0.3283, 0.4763],
        [0.4882, 0.0000, 0.8340, 0.4412],
        [0.3542, 0.0000, 0.4125, 0.6154]])


In [11]:
# we can join tensors
# dim=0 refers to rows
# https://docs.pytorch.org/docs/stable/generated/torch.cat.html
t1 = torch.cat([x_data, x_data], dim=0) # concatenate along rows
print("\nt1 - concatenated along rows: \n", t1)

# dim=1 refers to columns
t2 = torch.cat([x_data, x_data], dim=1) # concatenate along columns
print("\nt2 - concatenated along columns: \n", t2)


t1 - concatenated along rows: 
 tensor([[0.6720, 0.0000, 0.3283, 0.4763],
        [0.4882, 0.0000, 0.8340, 0.4412],
        [0.3542, 0.0000, 0.4125, 0.6154],
        [0.6720, 0.0000, 0.3283, 0.4763],
        [0.4882, 0.0000, 0.8340, 0.4412],
        [0.3542, 0.0000, 0.4125, 0.6154]])

t2 - concatenated along columns: 
 tensor([[0.6720, 0.0000, 0.3283, 0.4763, 0.6720, 0.0000, 0.3283, 0.4763],
        [0.4882, 0.0000, 0.8340, 0.4412, 0.4882, 0.0000, 0.8340, 0.4412],
        [0.3542, 0.0000, 0.4125, 0.6154, 0.3542, 0.0000, 0.4125, 0.6154]])


In [12]:
# we can also use stack to join tensors
# dim refers to the dimension along which the tensors will be stacked
# dim (int, optional) – dimension to insert. Has to be between 0 and the number of dimensions of concatenated tensors (inclusive). Default: 0
# https://docs.pytorch.org/docs/stable/generated/torch.stack.html
x_data = torch.rand(3,4)
print("\nx_data: \n", x_data)
print("Shape of tensor x_data: ", x_data.shape, "\n")

t3 = torch.stack([x_data, x_data], dim=0) # stacks tensors along a new dimension
print("Shape of stacked tensor t3 with dim=0: ", t3.shape)
print("t3: \n", t3, "\n")

t4 = torch.stack([x_data, x_data], dim=1) # stacks tensors along a new dimension
print("Shape of stacked tensor t4 with dim=1: ", t4.shape)
print("t4: \n", t4, "\n")

t5 = torch.stack([x_data, x_data], dim=2) # stacks tensors along a new dimension
print("Shape of stacked tensor t5 with dim=2: ", t5.shape)
print("t5: \n", t5, "\n")


x_data: 
 tensor([[0.8882, 0.2182, 0.5246, 0.0376],
        [0.4700, 0.8707, 0.7633, 0.5844],
        [0.1757, 0.1779, 0.9806, 0.4966]])
Shape of tensor x_data:  torch.Size([3, 4]) 

Shape of stacked tensor t3 with dim=0:  torch.Size([2, 3, 4])
t3: 
 tensor([[[0.8882, 0.2182, 0.5246, 0.0376],
         [0.4700, 0.8707, 0.7633, 0.5844],
         [0.1757, 0.1779, 0.9806, 0.4966]],

        [[0.8882, 0.2182, 0.5246, 0.0376],
         [0.4700, 0.8707, 0.7633, 0.5844],
         [0.1757, 0.1779, 0.9806, 0.4966]]]) 

Shape of stacked tensor t4 with dim=1:  torch.Size([3, 2, 4])
t4: 
 tensor([[[0.8882, 0.2182, 0.5246, 0.0376],
         [0.8882, 0.2182, 0.5246, 0.0376]],

        [[0.4700, 0.8707, 0.7633, 0.5844],
         [0.4700, 0.8707, 0.7633, 0.5844]],

        [[0.1757, 0.1779, 0.9806, 0.4966],
         [0.1757, 0.1779, 0.9806, 0.4966]]]) 

Shape of stacked tensor t5 with dim=2:  torch.Size([3, 4, 2])
t5: 
 tensor([[[0.8882, 0.8882],
         [0.2182, 0.2182],
         [0.5246, 0.5246],
 

In [13]:
# multiplying tensors
# https://docs.pytorch.org/docs/stable/generated/torch.randint.html
x_data = torch.randint(low=0, high=10, size=(2, 3))
y_data = torch.randint(low=0, high=10, size=(2, 3))
print("x_data: \n", x_data)
print("y_data: \n", y_data)

# element-wise multiplication
z_data = x_data * y_data
print("\nz_data (element-wise multiplication): \n", z_data)

# alternate syntax
z_data_alt = torch.mul(x_data, y_data)
print("\nz_data_alt (element-wise multiplication using torch.mul): \n", z_data_alt)

x_data: 
 tensor([[9, 9, 4],
        [1, 2, 0]])
y_data: 
 tensor([[5, 1, 1],
        [6, 8, 7]])

z_data (element-wise multiplication): 
 tensor([[45,  9,  4],
        [ 6, 16,  0]])

z_data_alt (element-wise multiplication using torch.mul): 
 tensor([[45,  9,  4],
        [ 6, 16,  0]])


In [14]:
# matrix multiplication
x_data = torch.randint(low=0, high=10, size=(2, 3))
y_data = torch.randint(low=0, high=10, size=(3, 2))
print("\nx_data: \n", x_data)
print("y_data: \n", y_data)

z_data = torch.matmul(x_data, y_data)
print("\nz_data (matrix multiplication using torch.matmul): \n", z_data)

# using Transpose for matrix multiplication
y_data = torch.randint(low=0, high=10, size=(2, 3))

# using .t() method for transpose
y_dataT = torch.t(y_data) 
z_data_alt = torch.matmul(x_data, y_dataT)
print("\nz_data_alt (matrix multiplication using torch.mm and transpose): \n", z_data_alt)

# alternative for transpose using .T attribute
z_data_alt = torch.matmul(x_data, y_data.T)
print("\nz_data_alt (matrix multiplication using torch.mm and alternative transpose): \n", z_data_alt)

# alternately we can use the @ operator for matrix multiplication
z_data_at = x_data @ y_dataT
print("\nz_data_at (matrix multiplication using @ operator): \n", z_data_at)


x_data: 
 tensor([[2, 9, 4],
        [8, 6, 8]])
y_data: 
 tensor([[8, 8],
        [8, 9],
        [7, 7]])

z_data (matrix multiplication using torch.matmul): 
 tensor([[116, 125],
        [168, 174]])

z_data_alt (matrix multiplication using torch.mm and transpose): 
 tensor([[ 53, 108],
        [ 54, 136]])

z_data_alt (matrix multiplication using torch.mm and alternative transpose): 
 tensor([[ 53, 108],
        [ 54, 136]])

z_data_at (matrix multiplication using @ operator): 
 tensor([[ 53, 108],
        [ 54, 136]])


In [None]:
# we can also convert tensor to numpy
tensor = torch.rand(3,4)
print("\ntensor: \n", tensor)
print("Shape of tensor: ", tensor.shape)
print("Datatype of tensor: ", tensor.dtype)

numpy_array = tensor.numpy()
print("\nnumpy_array converted from tensor: \n", numpy_array)
print("Shape of numpy_array: ", numpy_array.shape)
print("Datatype of numpy_array: ", numpy_array.dtype)


tensor: 
 tensor([[0.4962, 0.0981, 0.4911, 0.7548],
        [0.3078, 0.8141, 0.4804, 0.9459],
        [0.2776, 0.7519, 0.4350, 0.5456]])
Shape of tensor:  torch.Size([3, 4])
Datatype of tensor:  torch.float32

numpy_array converted from tensor: 
 [[0.49616057 0.09806406 0.49114263 0.75475955]
 [0.3078339  0.8140762  0.48040283 0.94590724]
 [0.2775588  0.7518571  0.43497956 0.54560083]]
Shape of numpy_array:  (3, 4)
Datatype of numpy_array:  float32


In [None]:
# or change numpy array to tensor
numpy_array = np.random.rand(3,4)
print("\nnumpy_array: \n", numpy_array)
print("Shape of numpy_array: ", numpy_array.shape)
print("Datatype of numpy_array: ", numpy_array.dtype)

tensor_from_numpy = torch.from_numpy(numpy_array)
print("\ntensor_from_numpy: \n", tensor_from_numpy)
print("Shape of tensor_from_numpy: ", tensor_from_numpy.shape)
print("Datatype of tensor_from_numpy: ", tensor_from_numpy.dtype)


numpy_array: 
 [[0.40100405 0.78795929 0.15280297 0.86562914]
 [0.86783988 0.63264958 0.21388667 0.73238266]
 [0.36621513 0.78463168 0.41143303 0.7874434 ]]
Shape of numpy_array:  (3, 4)
Datatype of numpy_array:  float64

tensor_from_numpy: 
 tensor([[0.4010, 0.7880, 0.1528, 0.8656],
        [0.8678, 0.6326, 0.2139, 0.7324],
        [0.3662, 0.7846, 0.4114, 0.7874]], dtype=torch.float64)
Shape of tensor_from_numpy:  torch.Size([3, 4])
Datatype of tensor_from_numpy:  torch.float64


In [None]:
# we use in_place operations to save memory
x = torch.rand(3,4)
print("Original tensor x: \n", x)
print("Memory address of x before in-place operation: ", hex(id(x)))
x.add_(5)  # in-place addition
print("\nTensor x after in-place addition of 5: \n", x)
print("Memory address of x after in-place operation: ", hex(id(x)))
# other similar are copy_ or zero_ or fill_ or t_
# "In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss of history. Hence, their use is discouraged."

Original tensor x: 
 tensor([[0.4424, 0.2747, 0.9263, 0.8910],
        [0.5453, 0.8486, 0.7580, 0.5367],
        [0.2699, 0.9591, 0.8523, 0.2504]])
Memory address of x before in-place operation:  0x74d1aeba3070

Tensor x after in-place addition of 5: 
 tensor([[5.4424, 5.2747, 5.9263, 5.8910],
        [5.5453, 5.8486, 5.7580, 5.5367],
        [5.2699, 5.9591, 5.8523, 5.2504]])
Memory address of x after in-place operation:  0x74d1aeba3070


## Difference between torch.mm, torch.matmul, torch.mul and torch.bmm

### torch.mul

In [None]:
# 1. torch.mul - Element-wise multiplication (works on any shape, no matrix multiplication)
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[5, 6], [7, 8]])
print("a:\n", a, "\n")
print("b:\n", b, "\n")
print("torch.mul(a, b):\n", torch.mul(a, b), "\n")
print("Same as a * b:\n", a * b)

a:
 tensor([[1, 2],
        [3, 4]]) 

b:
 tensor([[5, 6],
        [7, 8]]) 

torch.mul(a, b):
 tensor([[ 5, 12],
        [21, 32]]) 

Same as a * b:
 tensor([[ 5, 12],
        [21, 32]])


### torch.mm

In [26]:
# torch.mm - Strict 2D matrix multiplication (only works with 2D tensors)

x = torch.tensor([[1, 2, 3], [4, 5, 6]])  # 2x3
print("x (2x3):\n", x, "\n")
print(f"shape of x: {x.shape}, dimensions: {x.dim()}")
print(f"x is a {x.dim()}D tensor\n")
print("y (3x2):\n", y, "\n")

y = torch.tensor([[7, 8], [9, 10], [11, 12]])  # 3x2
print(f"shape of y: {y.shape}, dimensions: {y.dim()}")
print(f"y is a {y.dim()}D tensor\n")
print("torch.mm(x, y) (2x2):\n", torch.mm(x, y), "\n")

# torch.mm does NOT work with 1D or 3D tensors
print("\ntorch.mm only works with 2D tensors")
try:
    vec = torch.tensor([1, 2, 3])
    torch.mm(vec, vec)
except RuntimeError as e:
    print(f"shape of vec: {vec.shape}, dimensions: {vec.dim()}")
    print(f"vec is a {vec.dim()}D tensor")
    print(f"Error with 1D tensors: {e}")

x (2x3):
 tensor([[1, 2, 3],
        [4, 5, 6]]) 

shape of x: torch.Size([2, 3]), dimensions: 2
x is a 2D tensor

y (3x2):
 tensor([[ 7,  8],
        [ 9, 10],
        [11, 12]]) 

shape of y: torch.Size([3, 2]), dimensions: 2
y is a 2D tensor

torch.mm(x, y) (2x2):
 tensor([[ 58,  64],
        [139, 154]]) 


torch.mm only works with 2D tensors
shape of vec: torch.Size([3]), dimensions: 1
vec is a 1D tensor
Error with 1D tensors: self must be a matrix


### torch.matmul

In [None]:
# torch.matmul - General matrix multiplication (supports broadcasting)
x = torch.tensor([[1, 2, 3], [4, 5, 6]])  # 2x3
y = torch.tensor([[7, 8], [9, 10], [11, 12]])  # 3x2
print("torch.matmul(x, y):\n", torch.matmul(x, y))

print("\nWorks with 1D vectors (dot product):")
v1 = torch.tensor([1, 2, 3])
v2 = torch.tensor([4, 5, 6])
print("v1:", v1)
print(f"shape of v1: {v1.shape}, dimensions: {v1.dim()}")
print("v2:", v2)
print(f"shape of v2: {v2.shape}, dimensions: {v2.dim()}")
print("torch.matmul(v1, v2):", torch.matmul(v1, v2), "\n")

# works with higher dimensions and broadcasts:
# batch of matrices multiplied by single matrix
batch = torch.randn(3, 2, 4)  # 3 matrices of size 2x4 
mat = torch.randn(4, 5)  # single matrix of size 4x5

# The single matrix (4x5) is broadcast and multiplied with each of the 3 matrices (2x4), giving 3 result matrices (2x5)
result = torch.matmul(batch, mat)  # broadcasts to 3x2x5

print("Batch: \n", batch)
print(f"batch shape: {batch.shape}, dimensions: {batch.dim()}", "\n")
print("mat: \n", mat)
print(f"mat shape: {mat.shape}, dimensions: {mat.dim()}")
print(f"\n torch.matmul(batch, mat) shape: {result.shape}")

torch.matmul(x, y):
 tensor([[ 58,  64],
        [139, 154]])

Works with 1D vectors (dot product):
v1: tensor([1, 2, 3])
shape of v1: torch.Size([3]), dimensions: 1
v2: tensor([4, 5, 6])
shape of v2: torch.Size([3]), dimensions: 1
torch.matmul(v1, v2): tensor(32) 

Batch: 
 tensor([[[ 0.4326, -1.2180, -1.6467, -0.7321],
         [ 0.7143, -0.1420, -1.1062,  0.3012]],

        [[-2.0892,  1.0409, -0.3907, -0.8132],
         [-0.0974, -1.1037,  0.8534, -0.1865]],

        [[ 1.6558, -1.2848, -1.7281,  0.8026],
         [-0.6132,  0.4482, -1.3220, -0.7041]]])
batch shape: torch.Size([3, 2, 4]), dimensions: 3 

mat: 
 tensor([[ 1.2829,  0.7644,  1.5887,  1.3469, -0.4371],
        [-0.7535,  2.4394,  0.9881, -0.9113, -1.1043],
        [-0.7483,  1.8146, -1.3206,  0.3291,  1.3107],
        [-0.6737, -0.1068,  0.1839, -0.7033,  1.5295]])
mat shape: torch.Size([4, 5]), dimensions: 2

 torch.matmul(batch, mat) shape: torch.Size([3, 2, 5])


### torch.bmm

In [41]:
# torch.bmm requires EXACTLY 3D tensors with same batch size
batch1 = torch.randn(10, 3, 4) # 10 matrices of size 3x4
batch2 = torch.randn(10, 4, 5) # 10 matrices of size 4x5
result = torch.bmm(batch1, batch2)  # results in 10x3x5
print(f"batch1 shape: {batch1.shape}")
print(f"batch2 shape: {batch2.shape}")
print(f"torch.bmm(batch1, batch2) shape: {result.shape}")

# Example where matmul works but bmm doesn't (broadcasting)
batch_a = torch.randn(5, 2, 3)  # 5 matrices of 2x3
batch_b = torch.randn(3, 4)     # single matrix of 3x4

print(f"\nbatch_a shape: {batch_a.shape}, batch_b shape: {batch_b.shape}")
print("torch.matmul can broadcast:", torch.matmul(batch_a, batch_b).shape)
try:
    torch.bmm(batch_a, batch_b)
except RuntimeError as e:
    print(f"torch.bmm cannot broadcast: {e}")

batch1 shape: torch.Size([10, 3, 4])
batch2 shape: torch.Size([10, 4, 5])
torch.bmm(batch1, batch2) shape: torch.Size([10, 3, 5])

batch_a shape: torch.Size([5, 2, 3]), batch_b shape: torch.Size([3, 4])
torch.matmul can broadcast: torch.Size([5, 2, 4])
torch.bmm cannot broadcast: batch2 must be a 3D tensor


## Broadcasting

In [None]:
# Broadcasting - Operating on tensors with different shapes
# Broadcasting allows PyTorch to automatically expand tensors to compatible shapes

# Scalar broadcasting
a = torch.tensor([[1, 2, 3], [4, 5, 6]])
scalar = 10
print("a (2x3):\n", a)
print("scalar:", scalar)
print("\na + scalar (scalar is broadcast to match a's shape):\n", a + scalar) #scalar 10 becomes [[10, 10, 10], [10, 10, 10]]


a (2x3):
 tensor([[1, 2, 3],
        [4, 5, 6]])
scalar: 10

a + scalar (scalar is broadcast to match a's shape):
 tensor([[11, 12, 13],
        [14, 15, 16]])


In [None]:
# 1D to 2D broadcasting
matrix = torch.tensor([[1, 2, 3], [4, 5, 6]])  # 2x3
vector = torch.tensor([10, 20, 30])  # 3
print("matrix (2x3):\n", matrix)
print("vector (3,):", vector)
print("\nmatrix + vector:\n", matrix + vector)

#  vector [10, 20, 30] is broadcast to:
# [[10, 20, 30],
#  [10, 20, 30]]

matrix (2x3):
 tensor([[1, 2, 3],
        [4, 5, 6]])
vector (3,): tensor([10, 20, 30])

matrix + vector:
 tensor([[11, 22, 33],
        [14, 25, 36]])


In [44]:
# Column vector broadcasting
matrix = torch.tensor([[1, 2, 3], [4, 5, 6]])  # 2x3
col_vector = torch.tensor([[10], [20]])  # 2x1
print("matrix (2x3):\n", matrix)
print("col_vector (2x1):\n", col_vector)
print("\nmatrix + col_vector:\n", matrix + col_vector)

# col_vector [[10], [20]] is broadcast to:
# [[10, 20, 30],
#  [10, 20, 30]]

matrix (2x3):
 tensor([[1, 2, 3],
        [4, 5, 6]])
col_vector (2x1):
 tensor([[10],
        [20]])

matrix + col_vector:
 tensor([[11, 12, 13],
        [24, 25, 26]])


In [None]:
# "Two tensors are broadcastable if:"
# "1. Each tensor has at least one dimension, AND"
# "2. When iterating over dimensions from right to left:
#     - Dimensions are equal, OR"
#     - One of them is 1, OR"
#     - One of them doesn't exist"

# Example of compatible shapes
a = torch.randn(3, 1, 4)
b = torch.randn(1, 5, 4)
c = a + b
print(f"\na shape: {a.shape} (3, 1, 4)")
print(f"b shape: {b.shape} (1, 5, 4)")
print(f"Result shape: {c.shape}")

#   a: (3, 1, 4)
#   b: (1, 5, 4)
#       ↑  ↑  ↑
#     dim0 dim1 dim2

# Dim 2 (rightmost): a[4] vs b[4] → 4 == 4 ✓ Result: 4
# Dim 1 (middle): a[1] vs b[5] → 1 can broadcast to 5 ✓ Result: 5
#     - When one dimension is 1, it stretches/repeats to match the other
# Dim 0 (leftmost): a[3] vs b[1] → 1 can broadcast to 3 ✓ Result: 3
#     - Again, 1 broadcasts to match 3

# Final result: (3, 5, 4)

# What actually happens:
# - Tensor a (3, 1, 4): The middle dimension [1] is repeated 5 times
# - Tensor b (1, 5, 4): The first dimension [1] is repeated 3 times

# So effectively:
# - a becomes (3, 5, 4) by copying its single column 5 times
# - b becomes (3, 5, 4) by copying its single matrix 3 times


a shape: torch.Size([3, 1, 4]) (3, 1, 4)
b shape: torch.Size([1, 5, 4]) (1, 5, 4)
Result shape: torch.Size([3, 5, 4])


In [48]:
# Example 5: Incompatible shapes
x = torch.randn(3, 4)
y = torch.randn(3, 5)
print(f"x shape: {x.shape} (3, 4)")
print(f"y shape: {y.shape} (3, 5)")
print("\nDimension by dimension:")
print("  Dim 1: 4 != 5 and neither is 1 ✗")
try:
    z = x + y
except RuntimeError as e:
    print(f"\nError: {e}")


#   x: (3, 4)
#   y: (3, 5)
#       ↑  ↑
#      dim0 dim1

# 1. Dim 1 (rightmost): x[4] vs y[5] → 4 ≠ 5 and neither is 1 ✗ FAIL

# Broadcasting stops here!

# Why it fails:
# - For broadcasting to work, dimensions must be:
# - Equal (e.g., 4 == 4), OR
# - One of them is 1 (e.g., 1 can broadcast to any size), OR
# - One doesn't exist (e.g., (3,) can broadcast with (3, 4))

# Since 4 ≠ 5 and neither dimension is 1, PyTorch cannot automatically expand either tensor to make them compatible.

# If we wanted them to work:
# - Change y to (3, 4) → dimensions match
# - Change y to (3, 1) → the 1 can broadcast to 4
# - Change x to (3, 1) → the 1 can broadcast to 5

# But with 4 and 5, there's no way to broadcast - PyTorch won't guess which one you want!

x shape: torch.Size([3, 4]) (3, 4)
y shape: torch.Size([3, 5]) (3, 5)

Dimension by dimension:
  Dim 1: 4 != 5 and neither is 1 ✗

Error: The size of tensor a (4) must match the size of tensor b (5) at non-singleton dimension 1
