In [1]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import math
from torch.nn import functional as F

In [None]:
from scipy.stats import binom

# Cumulative probability P(X <= 9) for X ~ Binomial(n=200, p=0.05),
# evaluated through a frozen distribution object.
prob = binom(200, 0.05).cdf(9)
print(prob)

In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters shared by every module of the GPT model in this file.

    The field order defines the positional signature of the generated
    ``__init__``, so it must not be rearranged.
    """
    # Side length of the causal mask and number of rows in the position-embedding table.
    block_size: int = 254
    # Number of rows in the token-embedding table and output width of the LM head.
    vocab_size: int = 65
    # Number of transformer blocks stacked in the model.
    n_layer: int = 6
    # Embedding (channel) width used throughout the model.
    n_embd: int = 384
    # Number of attention heads the embedding is split across.
    n_head: int = 8

class CasualSelfAttention(nn.Module):
    """Multi-head causal (masked) self-attention.

    Queries, keys and values for all heads are produced by one fused linear
    projection. Each position may only attend to itself and to earlier
    positions; this is enforced with a lower-triangular mask stored in the
    ``bias`` buffer.
    """

    def __init__(self, config: "GPTConfig"):
        """Create the projections and the causal mask.

        Args:
            config: Object exposing ``n_embd``, ``n_head`` and ``block_size``.

        Raises:
            AssertionError: If ``n_embd`` is not divisible by ``n_head``.
        """
        super().__init__()
        # Every head gets an equal slice of the embedding, so the width must
        # divide evenly by the number of heads (not the number of layers).
        assert config.n_embd % config.n_head == 0, "n_embd must be divisible by n_head"
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Lower-triangular causal mask; called "bias" following OpenAI/HF naming.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (B, T, C) where C equals ``n_embd`` and T does
                not exceed the ``block_size`` the mask was built with.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)

        # Calculate query, key, value for all heads in batch and move the head
        # dimension forward so it acts like a batch dimension.
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head = 12, hs = 64, so nh * hs = C = 768 channels
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)

        # attention (materializes the large (T, T) matrix for all queries and keys)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Positions above the diagonal are in the future: set them to -inf so
        # softmax assigns them zero weight.
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # Re-assemble all head outputs side by side, then mix them.
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4x n_embd, apply GELU, project back."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        # Sub-modules are created in the same order as before so parameter
        # registration (and RNG consumption during init) is unchanged.
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate="tanh")  # approximate="none" is the other option
        self.c_proj = nn.Linear(hidden, width)

    def forward(self, x):
        """Return c_proj(gelu(c_fc(x)))."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class Block(nn.Module):
    """One transformer block: attention and MLP, each wrapped in a residual.

    LayerNorm is applied to the input of each sub-layer, and the sub-layer
    output is added back onto the un-normalized input.
    """

    def __init__(self, config: GPTConfig):
        """Build the two LayerNorms, the attention module and the MLP.

        Args:
            config: Model hyperparameters; ``n_embd`` sets the LayerNorm width
                and the whole object is forwarded to the sub-modules.
        """
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        # The attention class is defined in this file, not in torch.nn.
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return x after the attention and MLP residual updates."""
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
       
class GPT(nn.Module):
    """Container for the GPT sub-modules: embeddings, blocks, final norm, LM head."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config

        # Sub-modules are instantiated in the same order as the ModuleDict
        # entries so parameter initialization order is unchanged.
        token_embedding = nn.Embedding(config.vocab_size, config.n_embd)  # weights of token embedding
        position_embedding = nn.Embedding(config.block_size, config.n_embd)  # weights of position embedding
        blocks = nn.ModuleList(Block(config) for _ in range(config.n_layer))
        final_norm = nn.LayerNorm(config.n_embd)

        self.transformer = nn.ModuleDict({
            "wte": token_embedding,
            "wpe": position_embedding,
            "h": blocks,
            "ln_f": final_norm,
        })
        # Bias-free projection from embedding width to vocabulary size.
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)