# HW 3.1  

In [10]:
import torch.nn as nn
import torch



inputs = torch.tensor(
    [[0.43, 0.15, 0.89],
     [0.55, 0.87, 0.66],
     [0.57, 0.85, 0.64],
     [0.22, 0.58, 0.33],
     [0.77, 0.25, 0.10],
     [0.05, 0.80, 0.55]]
    )
d_in = inputs.shape[1]
d_out = 2


In [11]:
class SelfAttention_v1(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Parameter(torch.rand(d_in, d_out))
        self.w_key = nn.Parameter(torch.rand(d_in, d_out))
        self.w_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.w_key
        queries = x @ self.w_query
        values = x @ self.w_value
        atten_score = queries @ keys.T
        atten_weight = torch.softmax( atten_score / keys.shape[-1]**0.5, dim = -1)
        context_vactor = atten_weight @ values
        return context_vactor





class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)
        atten_score = queries @ keys.T
        atten_weight = torch.softmax( atten_score / keys.shape[-1]**0.5, dim = 1)
        context_vactor = atten_weight @ values
        return context_vactor


In [12]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
sa_v2 = SelfAttention_v2(d_in, d_out)

In [13]:
sa_v2(inputs)


tensor([[0.5085, 0.3508],
        [0.5084, 0.3508],
        [0.5084, 0.3506],
        [0.5074, 0.3471],
        [0.5076, 0.3446],
        [0.5077, 0.3493]], grad_fn=<MmBackward0>)

In [14]:
sa_v1.w_key = nn.Parameter(sa_v2.w_key.weight.T)
sa_v1.w_query = nn.Parameter(sa_v2.w_query.weight.T)
sa_v1.w_value = nn.Parameter(sa_v2.w_value.weight.T)

sa_v1(inputs)


tensor([[0.5085, 0.3508],
        [0.5084, 0.3508],
        [0.5084, 0.3506],
        [0.5074, 0.3471],
        [0.5076, 0.3446],
        [0.5077, 0.3493]], grad_fn=<MmBackward0>)

.weight 是 nn.Linear 這類層中儲存權重參數的屬性。  
sa_v2.W_query.weight 時，你其實是取出 W_query 這個線性層（通常是 nn.Linear）裡的核心參數矩陣，也就是那個負責將輸入張量做線性轉換的矩陣。

# HW3.2  

In [15]:
class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        attn_scores = queries @ keys.transpose(1,2)
        attn_scores.masked_fill_(
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores/ keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = attn_weights @ values
        return context_vec

        

In [24]:


class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self, d_in, d_out, context_lenght, dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList(
            [CausalAttention(d_in, d_out, context_lenght, dropout, qkv_bias)
             for _ in range(num_heads)]
        )

    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)
    

### d_in指輸入一次數據的維度>> 3, d_out指輸出一次數據的維度 >>1, context_vec的維度取決d_out x num_heads=2 >> 1x2 =2

In [None]:
torch.manual_seed(123)
batch = torch.stack((inputs, inputs), dim = 0) # 要注意
print(batch)
context_lenght = batch.shape[1]
print(context_lenght, batch.shape)
# context_lenght = 6
d_in, d_out = 3, 1 ### this
mha = MultiHeadAttentionWrapper(d_in, d_out, context_lenght, 0.0, num_heads=2)
context_vec = mha(batch)

print(context_vec, context_vec.shape)



tensor([[[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]],

        [[0.4300, 0.1500, 0.8900],
         [0.5500, 0.8700, 0.6600],
         [0.5700, 0.8500, 0.6400],
         [0.2200, 0.5800, 0.3300],
         [0.7700, 0.2500, 0.1000],
         [0.0500, 0.8000, 0.5500]]])
6 torch.Size([2, 6, 3])
tensor([[[-0.5740,  0.2216],
         [-0.7320,  0.0155],
         [-0.7774, -0.0546],
         [-0.6979, -0.0817],
         [-0.6538, -0.0957],
         [-0.6424, -0.1065]],

        [[-0.5740,  0.2216],
         [-0.7320,  0.0155],
         [-0.7774, -0.0546],
         [-0.6979, -0.0817],
         [-0.6538, -0.0957],
         [-0.6424, -0.1065]]], grad_fn=<CatBackward0>) torch.Size([2, 6, 2])


In [58]:
# Real MHA 
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias = False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by nim_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out//num_heads
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys = self.w_key(x)
        queries = self.w_query(x)
        values = self.w_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1,2)
        values = values.transpose(1,2)
        queries = queries.transpose(1,2)

        attn_scores = queries @ keys.transpose(2,3)
        mask_bool= self.mask.bool() [:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores/ keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1,2)
        context_vec = context_vec.contiguous().view(b,num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)
        return context_vec


In [72]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],
     [0.55, 0.87, 0.66],
     [0.57, 0.85, 0.64],
     [0.22, 0.58, 0.33],
     [0.77, 0.25, 0.10],
     [0.05, 0.80, 0.55]]
    )

torch.manual_seed(123)
batch = torch.stack((inputs, inputs), dim = 0) # 要注意
batch_size, context_lenght, d_in = batch.shape # torch.Size([2, 6, 3])
print(batch.shape)
d_out = 2 ### this
mha = MultiHeadAttention(d_in, d_out, context_lenght, 0.0, num_heads=2)
context_vec = mha(batch)

print(context_vec, context_vec.shape)

torch.Size([2, 6, 3])
tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]],

        [[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],
         [0.2693, 0.3873],
         [0.2639, 0.3928],
         [0.2575, 0.4028]]], grad_fn=<ViewBackward0>) torch.Size([2, 6, 2])


# HW3.3

In [70]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken as tken


class GPTDataset_v1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.targeet_ids = []
        token_ids = tokenizer.encode(text)

        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+1+max_length] 
            self.input_ids.append(torch.tensor(input_chunk))
            self.targeet_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):   # 輸入指定位置的資料時，回傳相應的input與target
        return self.input_ids[index], self.targeet_ids[index]
    

def crate_dataloader_v1(text, batch_size=4, max_length=1024, stride=1024, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tken.get_encoding("gpt2")
    dataset = GPTDataset_v1(text, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=0
        )
    return dataloader
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text=f.read()

dataloader = crate_dataloader_v1(raw_text, batch_size=4, max_length=1024, stride=1024, shuffle=False)
data_iter= iter(dataloader)
inputs, targets = next(data_iter)

vocab_size = 50257
output_dim =768
torch.manual_seed(123)

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim) #初始化設定embedding layer
token_embeddings = token_embedding_layer(inputs) #輸入需嵌入的資料
print(token_embeddings.shape)

max_length = 1024
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([4, 1024, 768])
torch.Size([1024, 768])
torch.Size([4, 1024, 768])


In [76]:
batch_size, context_lenght, d_in = input_embeddings.shape 
d_out = d_in
num_heads=12
print(input_embeddings.shape, num_heads)
mha = MultiHeadAttention(d_in, d_out, context_lenght, 0.0, num_heads)
context_vec = mha(batch)

print(context_vec, context_vec.shape)

torch.Size([4, 1024, 768]) 12


RuntimeError: mat1 and mat2 shapes cannot be multiplied (12x3 and 768x768)