In [None]:
import math
import torch
import torch.nn.functional as F
from torch import nn

##### Example 1

In [None]:
x = [
    torch.tensor(1),
    torch.tensor(2),
    torch.tensor(3),
    torch.tensor(4)
]

In [None]:
output = torch.cat(x, -1)

RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated

In [None]:
x = torch.arange(1, 9).reshape(2, 4)

In [None]:
x

tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])

In [None]:
torch.cat(x, dim=-1)

TypeError: cat() received an invalid combination of arguments - got (Tensor, dim=int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


- `seq_len` (sequence length) is 3
- `d_h` (head dimensionality) is 2
- `D` (model dimensionality) is 6

In [None]:
output_per_head = [
    [
        [0.00, 0.01],
        [0.10, 0.11],
        [0.20, 0.21]
    ],    
    [
        [1.00, 1.01],
        [1.10, 1.11],
        [1.20, 1.21]
    ],
    [
        [2.00, 2.01],
        [2.10, 2.11],
        [2.20, 2.21]
    ]
]

In [None]:
output_per_head = torch.tensor(output_per_head).float()

In [None]:
output_per_head

tensor([[[0.0000, 0.0100],
         [0.1000, 0.1100],
         [0.2000, 0.2100]],

        [[1.0000, 1.0100],
         [1.1000, 1.1100],
         [1.2000, 1.2100]],

        [[2.0000, 2.0100],
         [2.1000, 2.1100],
         [2.2000, 2.2100]]])

In [None]:
torch.cat(output_per_head, dim=-1)

TypeError: cat() received an invalid combination of arguments - got (Tensor, dim=int), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)


`shape(expected_output)` = `[seq_lengh x D]`

In [None]:
expected_output = [
    [0.00, 0.01, 1.00, 1.01, 2.00, 2.01],
    [0.10, 0.11, 1.10, 1.11, 2.10, 2.11],
    [0.20, 0.21, 1.20, 1.21, 2.20, 2.21]
]

##### Example

In [None]:
from torch import nn

Write the initialize part in `MultiHeadAttention`. Explain each variable and line

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model=4, num_heads=2):
        super().__init__()
        
        # calculate the dimensionality per head
        self.d_h = d_model // num_heads
        
        assert self.d_h * num_heads == d_model
        
        self.d_model = d_model
        self.num_heads = num_heads
        
        # go from d_model to d per head
        self.linear_qs = nn.ModuleList([
            nn.Linear(d_model, self.d_h) for _ in range(num_heads)
        ])
        self.linear_ks = nn.ModuleList([
            nn.Linear(d_model, self.d_h) for _ in range(num_heads)
        ])        
        self.linear_vs = nn.ModuleList([
            nn.Linear(d_model, self.d_h) for _ in range(num_heads)
        ])

**Explain**

Variables
- `d_model`: the dimension of a embedding vector that represent each word
- `num_heads`: the number of heads
- `d_head`: the dimension
- `linear_qs`: a list of `num_heads` matrix transformation maps the embedding vector to `Q`

Code
- `nn.Linear(d_model, self.d_h) for _ in range(num_heads)`
    + For each head, create a matrix 

It will take data with `[batch_size x seq_len x d_model]`

In [None]:
attention = MultiHeadAttention(d_model=6, num_heads=3)

##### Example 4

In [None]:
attention_weight_1 = torch.tensor([[0.1, 0.2, 0.3],
                                   [1.1, 1.2, 1.3],
                                   [2.1, 2.2, 2.3]])
attention_weight_2 = torch.tensor([[3.1, 3.2, 3.3],
                                   [4.1, 4.2, 4.3],
                                   [5.1, 5.2, 5.3]])
attention_weight_3 = torch.tensor([[6.1, 6.2, 6.3],
                                   [7.1, 7.2, 7.3],
                                   [8.1, 8.2, 8.3]])

In [None]:
import torch

These are attention weights from each head

In [None]:
attention_weight_1.shape, attention_weight_2.shape, attention_weight_3.shape

(torch.Size([3, 3]), torch.Size([3, 3]), torch.Size([3, 3]))

Compute the attention weight of multi-head attention. And what is its final shape?

In [None]:
final_attention_weights = torch.stack([
    attention_weight_1,
    attention_weight_2,
    attention_weight_3
])

In [None]:
final_attention_weights.shape

torch.Size([3, 3, 3])

##### Example 5

In [None]:
attention_weight_1 = torch.tensor([[0.1, 0.2, 0.3],
                                   [1.1, 1.2, 1.3],
                                   [2.1, 2.2, 2.3]])

In [None]:
attention_weight_2 = torch.tensor([[3.1, 3.2, 3.3, 3.4],
                                   [4.1, 4.2, 4.3, 4.4],
                                   [5.1, 5.2, 5.3, 5.4],
                                   [6.1, 6.2, 6.3, 6.4]])

In [None]:
attention_weight_3 = torch.tensor([[7.1, 7.2, 7.3, 7.4],
                                   [8.1, 8.2, 8.3, 8.4],
                                   [9.1, 9.2, 9.3, 9.4],
                                   [10.1, 10.2, 10.3, 10.4]])

In [None]:
attention_weight_1.shape, attention_weight_2.shape, attention_weight_3.shape

(torch.Size([3, 3]), torch.Size([4, 4]), torch.Size([4, 4]))

In [None]:
attention_weight_1 = torch.nn.functional.pad(attention_weight_1, (1, 1, 1, 1), mode="constant", value=0)

In [None]:
attention_weight_2 = torch.nn.functional.pad(attention_weight_2, (0, 0, 1, 1), mode="constant", value=0)


In [None]:
attention_weight_1.shape, attention_weight_2.shape, attention_weight_3.shape

(torch.Size([5, 5]), torch.Size([6, 4]), torch.Size([4, 4]))

In [None]:
attention_weights = torch.stack([attention_weight_1, attention_weight_2, attention_weight_3])

RuntimeError: stack expects each tensor to be equal size, but got [3, 3] at entry 0 and [7, 7] at entry 1

##### Example 5

In [None]:
x = torch.tensor([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]])

In [None]:
import torch.nn.functional as F

In [None]:
x

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

Create a tensor `y` from `x` as bellow

In [None]:
y = F.pad(x, (1, 1, 1, 1))

In [None]:
y

tensor([[0, 0, 0, 0, 0],
        [0, 1, 2, 3, 0],
        [0, 4, 5, 6, 0],
        [0, 7, 8, 9, 0],
        [0, 0, 0, 0, 0]])

In [None]:
y.shape

torch.Size([5, 5])

In [None]:
# Import the math module to use math functions
import math

# Define the diameter of the flywheel
diameter = 0.36

# Define the angular position formula
def theta(t):
  return (2.0 * t**3)

# Calculate theta at t1
t1 = 2.0
theta_t1 = theta(t1)

# Calculate theta at t2
t2 = 5.0
theta_t2 = theta(t2)

# Calculate the distance the particle moves
distance = diameter * (theta_t2 - theta_t1)

# Calculate the average angular velocity
avg_angular_velocity = (theta_t2 - theta_t1) / (t2 - t1)

# Calculate the instantaneous angular velocities at t1 and t2
instantaneous_angular_velocity_t1 = 3 * (2.0 * t1**2)
instantaneous_angular_velocity_t2 = 3 * (2.0 * t2**2)

# Print the results
print("Theta at t1 (radians):", theta_t1)
print("Theta at t1 (degrees):", math.degrees(theta_t1))
print("Theta at t2 (radians):", theta_t2)
print("Theta at t2 (degrees):", math.degrees(theta_t2))
print("Distance moved:", distance)
print("Average angular velocity (rad/s):", avg_angular_velocity)
print("Average angular velocity (rev/min):", (avg_angular_velocity / (2 * math.pi)) * 60)
print("Instantaneous angular velocity at t1 (rad/s):", instantaneous_angular_velocity_t1)
print("Instantaneous angular velocity at t1 (rev/min):", (instantaneous_angular_velocity_t1 / (2 * math.pi)) * 60)
print("Instantaneous angular velocity at t2 (rad/s):", instantaneous_angular_velocity_t2)
print("Instantaneous angular velocity at t2 (rev/min):", (instantaneous_angular_velocity_t2 / (2 * math.pi)) * 60)


Theta at t1 (radians): 16.0
Theta at t1 (degrees): 916.7324722093172
Theta at t2 (radians): 250.0
Theta at t2 (degrees): 14323.94487827058
Distance moved: 84.24
Average angular velocity (rad/s): 78.0
Average angular velocity (rev/min): 744.8451336700701
Instantaneous angular velocity at t1 (rad/s): 24.0
Instantaneous angular velocity at t1 (rev/min): 229.1831180523293
Instantaneous angular velocity at t2 (rad/s): 150.0
Instantaneous angular velocity at t2 (rev/min): 1432.394487827058
