In [169]:
with open("/Users/vinithlankireddy/Projects/LLMs/GPT-2/f/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

raw_text[:100]

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

In [170]:
len(raw_text)

20479

In [171]:
import re

text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"\'] | -- | \s)', text)
# result = [item for item in result ]
result

['Hello', ', ', 'world', '. ', 'Is this-- a test?']

In [172]:
result = [item.strip() for item in result if item.strip()]
result

['Hello', ',', 'world', '.', 'Is this-- a test?']

In [173]:
# preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# preprocessed

In [174]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [175]:
len(preprocessed)

4690

In [176]:
all_words = sorted(set(preprocessed))
vocab_len = len(all_words)
vocab_len

1130

In [177]:
vocab = {token : i for i,token in enumerate(all_words)}

In [178]:
for i,item in enumerate(vocab.items()):
    print(item)
    if i>=10:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


In [179]:
int_to_str = {i:s for s,i in vocab.items()}
for i,item in enumerate(int_to_str.items()):
    print(item)
    if i>=10:
        break

(0, '!')
(1, '"')
(2, "'")
(3, '(')
(4, ')')
(5, ',')
(6, '--')
(7, '.')
(8, ':')
(9, ';')
(10, '?')


In [180]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [ item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        text = re.sub(r'\s+([,.?"()\'])', r'\1', text)
        return text


In [181]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [182]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [183]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [184]:
len(vocab.items())

1132

In [185]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [186]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [187]:
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [188]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

In [189]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integer = tokenizer.encode(text, allowed_special="all")
integer

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 1659,
 617,
 34680,
 27271,
 13]

In [190]:
string = tokenizer.decode(integer)
string

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [191]:
with open("/Users/vinithlankireddy/Projects/LLMs/GPT-2/f/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
enc_text = tokenizer.encode(raw_text)
enc_text[:11]

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257]

In [192]:
len(enc_text)

5145

In [193]:
context_size = 5

enc_sample = enc_text[:20]

for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

I ---->  H
I H ----> AD
I HAD ---->  always
I HAD always ---->  thought
I HAD always thought ---->  Jack


In [194]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt ,allowed_special={"<|endoftext|>"})

        for i in range(0,len(token_ids)- max_length , stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1 : i+max_length +1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]



In [195]:
tokenizer = tiktoken.get_encoding("gpt2")

dataset = GPTDatasetV1(text, tokenizer, 3,2)
dataset[:10]

([tensor([15496,    11,   466]),
  tensor([466, 345, 588]),
  tensor([ 588, 8887,   30]),
  tensor([   30,   220, 50256]),
  tensor([50256,   554,   262]),
  tensor([  262,  4252, 18250]),
  tensor([18250,  8812,  2114]),
  tensor([2114, 1659,  617]),
  tensor([  617, 34680, 27271])],
 [tensor([ 11, 466, 345]),
  tensor([ 345,  588, 8887]),
  tensor([8887,   30,  220]),
  tensor([  220, 50256,   554]),
  tensor([ 554,  262, 4252]),
  tensor([ 4252, 18250,  8812]),
  tensor([8812, 2114, 1659]),
  tensor([ 1659,   617, 34680]),
  tensor([34680, 27271,    13])])

In [196]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [197]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=2, max_length=5, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807],
        [ 367, 2885, 1464, 1807, 3619]]), tensor([[ 367, 2885, 1464, 1807, 3619],
        [2885, 1464, 1807, 3619,  402]])]


In [198]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[2885, 1464, 1807, 3619,  402],
        [1464, 1807, 3619,  402,  271]]), tensor([[ 1464,  1807,  3619,   402,   271],
        [ 1807,  3619,   402,   271, 10899]])]


In [199]:
input_ids = torch.tensor([2, 3, 5, 1])

In [200]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [201]:
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [202]:
embedding_layer(input_ids)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)

In [243]:
vocab_size = 50257
output_dim =256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [244]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets= next(data_iter)

In [245]:
len(dataloader)

160

In [246]:
len(data_iter)

160

In [247]:
inputs.shape

torch.Size([8, 4])

In [248]:
print(inputs[0])

tensor([  40,  367, 2885, 1464])


In [249]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [250]:
query = inputs[1]

attntion_score2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attntion_score2[i] = torch.dot(x_i,query)

print(attntion_score2)

tensor([ 2956967., 16597455., 29439276., 30712224., 23737832.,  2581577.,
        23769278., 12535636.])


In [211]:
attntion_weight2_tmp  = attntion_score2/attntion_score2.sum()

print(attntion_weight2_tmp)
attntion_weight2_tmp.sum()

tensor([0.0208, 0.1166, 0.2068, 0.2158, 0.1668, 0.0181, 0.1670, 0.0881])


tensor(1.)

In [212]:
def softmax_naive(x):
    return torch.exp(x)/torch.exp(x).sum(dim=0)

attntion_weight2_naive = softmax_naive(attntion_score2)
attntion_weight2_naive.sum()

tensor(nan)

In [213]:
# def softmax_naive(x, dim=0):
#     # x = x - x.max(dim=dim, keepdim=True).values
#     exp_x = torch.exp(x)
#     return exp_x / exp_x.sum(dim=dim, keepdim=True)

# attntion_weight2_naive = softmax_naive(attntion_score2)
# attntion_weight2_naive.sum()


In [214]:
attntion_weight2_naive

tensor([nan, nan, nan, nan, nan, nan, nan, nan])

In [215]:
print(attntion_score2)

tensor([ 2956967., 16597455., 29439276., 30712224., 23737832.,  2581577.,
        23769278., 12535636.])


In [216]:
attn_weights_2 = torch.softmax(attntion_score2, dim=0)
attn_weights_2

tensor([0., 0., 0., 1., 0., 0., 0., 0.])

In [217]:
query = inputs[1] 

context_vec_2 = torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i]*x_i

print(context_vec_2)

tensor([15632.,   438.,  2016.,   257.])


In [218]:
attn_scores = inputs @ inputs.T
print(attn_scores)

tensor([[ 10602810,   2956967,  12248115,   6978434,   7386869,   2167885,
           3789053,   2173569],
        [  2956967,  16597455,  29439275,  30712225,  23737833,   2581577,
          23769278,  12535636],
        [ 12248115,  29439275, 172789970, 173633406,  26126256,  11545183,
          27807641,  10279714],
        [  6978434,  30712225, 173633406, 248681573,  20282744,   9945629,
          19716576,   6538361],
        [  7386869,  23737833,  26126256,  20282744,  38229585,   3396994,
          36833363,  20132377],
        [  2167885,   2581577,  11545183,   9945629,   3396994,    993378,
           3057054,   1406905],
        [  3789053,  23769278,  27807641,  19716576,  36833363,   3057054,
          37133686,  20023897],
        [  2173569,  12535636,  10279714,   6538361,  20132377,   1406905,
          20023897,  10978278]])


In [232]:
attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)

RuntimeError: "softmax_lastdim_kernel_impl" not implemented for 'Long'

In [242]:
x=torch.randn(1,3)
print(x)
s = torch.softmax(x, dim=-1)
s

tensor([[ 2.2139,  0.8846, -1.2506]])


tensor([[0.7716, 0.2042, 0.0241]])

In [236]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):
    def __init__(self,d_in, d_out):
        super().__init__()
        self.W_query=nn.Parameter(torch.rand(d_in, d_out))
        self.W_key=nn.Parameter(torch.rand(d_in, d_out))
        self.W_value=nn.Parameter(torch.rand(d_in, d_out))

    def forward(self,x):
        queries = x @ self.W_query
        keys = x@ self.W_key 
        values = x@self.W_value 

        attention_score = queries @ keys.T
        attention_weights = torch.softmax(
            attention_score/keys.shape[-1]**0.5 , dim=-1
        )
        context_vec = attention_weights @ values
        return context_vec



In [237]:
import torch.nn as nn

class SelfAttention_v2(nn.Module):
    def __init__(self,d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query=nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_key=nn.Linear(d_in, d_out,bias = qkv_bias)
        self.W_value=nn.Linear(d_in, d_out,bias = qkv_bias)

    def forward(self,x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        attention_score = queries @ keys.T
        attention_weights = torch.softmax(
            attention_score/keys.shape[-1]**0.5 , dim=-1
        )
        context_vec = attention_weights @ values
        return context_vec



In [238]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

RuntimeError: expected m1 and m2 to have the same dtype, but got: long long != float

In [None]:
class CausalAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length,dropout qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Embedding(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Embedding(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Embedding(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length,context_length),1))


    def forward(self, x):
        b, num_tokens , d_in = x.shape
        query = x @ self.W_query
        key = x @ self.W_key
        value = x @ self.W_value

        attn_socres = query @ key.transpose(1,2)
        attn_scores.masked_fill_(
            self.mask.bool()[: num_tokens, : num_tokens], -torch.inf
        )
        attn_weights = torch.softmax(
            attn_scores/key.shape[-1]**0.5, dim=-1
        )
        context_vec = attn_weights @ value
        return context_vec

        



In [None]:
class MultiHeadAttentionWrapper(nn.Module):
    def __init__(self,d_in, d_out, context_length,dropout,num_heads,qkv_bias = False):
        super().__init__()
        self.heads =nn.ModuleList(
        [CausalAttention(d_in,d_out,context_length,dropout) 
         for _ in range(num_heads)]
        )
    def forward(self,x):
        return torch.cat([head(x) for head in self.heads], dim=-1)
        

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, din, dout, context_len,dropout,num_heads , qkv_bias = False):
        super().__init__()
        assert(dout%num_heads==0),\
            "d_out must be divisible by num_heads"
        self.dout =dout
        self.num_heads =num_heads
        self.head_dim = dout//num_heads

        self.W_query = nn.Linear(din,dout, bias=qkv_bias)
        self.W_key = nn.Linear(din, dout, bias=qkv_bias)
        self.W_value = nn.Linear(din,dout,bias=qkv_bias)
        self.out_proj = nn.Linear(dout,dout)
        self.dropout = nn.Dropout(dropout)
        self.mask = torch.triu(torch.ones(context_len,context_len), -1)
        # self.register_buffer(
        #     "mask",
        #     torch.triu(torch.ones(context_len,context_len),
        #     diagonal =1)                
        #     )
        
    def forward(self,x):
        b, num_tokens, din = x.shape

        keys = self.W_key(x)
        values = self.W_value(x)
        queries = self.W_query(x)

        keys =keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b,num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens , self.num_heads, self.head_dim)

        keys =keys.transpose(1,2)
        values = values.transpose(1,2)
        queries = queries.transpose(1,2)

        attn_score = queries @ keys.transpose(2,3)

        mask_bool = self.mask.bool()[:num_tokens, : num_tokens]

        attn_score.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_score/keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec = (attn_weights @ values).transpose(1,2)

        context_vec = context_vec.contiguous().view(b,num_tokens, self.dout)
        context_vec  = self.out_proj(context_vec)

        return context_vec


In [None]:
import torch
import torch.nn as nn

class batchNorm(nn.Module):
    def __init__(self, num_features, eps =1e-5, momentum=0.1):
        super().__init__()
        self.beta = nn.Parameter(torch.zeros(num_features))
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.eps =eps
        self.momentum =momentum
        self.running_mean = torch.zeros(num_features)
        self.running_var = torch.ones(num_features)

    def forward(self, x):
        if self.training:
            mean = torch.mean(x, dim=0)
            var = torch.var(x, dim =0)

            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * var
  
        else:
            mean = self.running_mean
            var =self.running_var
        
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma*x_hat + self.beta

In [None]:
class layerNormCNN(nn.Model):
    def __init__(self, num_channels, eps =1e-5):
        self.num_channels = num_channels
        self.eps = eps
        self.beta = nn.Parameter(torch.zeros(num_channels))
        self.gamma = nn.Parameter(torch.ones(num_channels))

    def forward(self, x):
        # (C, H, W)
        mean = x.mean(dim = (1,2,3), bias =False)
        var= x.var(dim =(1,2,3), bias = False)

        x_hat = (x - mean)/torch.sqrt(var + self.eps)
        return self.gamma* x_hat + self.beta

In [None]:
class layerNormTransform(nn.Model):
    def __init__(self, normalized_shape, eps =1e-5):
        self.eps = eps
        self.beta = nn.Parameter(torch.zeros(normalized_shape))
        self.gamma = nn.Parameter(torch.ones(normalized_shape))

    def forward(self, x):
        # (B, seq_len, embed)
        mean = x.mean(dim =-1, bias =False)
        var= x.var(dim =-1, bias = False)

        x_hat = (x - mean)/torch.sqrt(var + self.eps)
        return self.gamma* x_hat + self.beta

In [None]:
class InstanceNorm(nn.Model):
    def __init__(self, num_channels, eps =1e-5):
        self.eps = eps
        self.beta = nn.Parameter(torch.zeros(num_channels))
        self.gamma = nn.Parameter(torch.ones(num_channels))

    def forward(self, x):
        # (B, seq_len, embed)
        mean = x.mean(dim =-1, bias =False)
        var= x.var(dim =-1, bias = False)

        x_hat = (x - mean)/torch.sqrt(var + self.eps)
        return self.gamma* x_hat + self.beta

In [None]:
class InstanceNorm(nn.Module):
    def __init__(self, num_channels, eps=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(num_channels))
        self.beta = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        # x: (N, C, H, W)
        mean = x.mean(dim=(2, 3), keepdim=True)
        var = x.var(dim=(2, 3), keepdim=True, unbiased=False)
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma.view(1, -1, 1, 1) * x_hat + self.beta.view(1, -1, 1, 1)
