In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
#------------------------
import os,sys
import numpy as np
#sys.path.append('D:/projects/base/app/modules') 
dir_path = os.path.abspath("")
print(f"DIRECTORY:\t\t<{dir_path}>")
sys.path.append(dir_path[:-14])
from fun_colors import *
#------------------------


# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

BINdir=getDrive()+"book\\gutenburg_BIN\\char_64"
dirlist=os.listdir(BINdir)
print( BINdir+'/'+dirlist[0] )
train_data = np.fromfile(BINdir+'/'+dirlist[0], dtype=np.int64)

train_data_torch = torch.from_numpy(train_data).type(torch.long)
print( 'train_data_torch.dtype', train_data_torch.dtype )
print( 'train_data_torch.type()', train_data_torch.type() )

# print( dict_arr )
# print( 'train_data', train_data[:200] )

#meta
#meta = { 'vocab_size': len(arr), 'itos': itos, 'stoi': stoi, 'uint': 32 }
with open(getDrive()+"book/gutenburg_BIN\metas\gutenburg_bin-RBT-char_meta_int64.pkl", 'rb') as f: meta = pickle.load(f)

stoi = meta['stoi']
itos = meta['itos']
vocab_size = meta['vocab_size']

DIRECTORY:		<c:\Users\jump3\Desktop\TextGen-ML-SHREC-SURG24\PyTorch-Model>
D:\book\gutenburg_BIN\char_64/GB_pg1.bin
train_data_torch.dtype torch.int64
train_data_torch.type() torch.LongTensor


In [4]:
def get_batch():
    try:
        # generate a small batch of data of inputs x and targets y
        data = train_data_torch# if split == 'train' else dict_arr
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])
        x, y = x.to(device), y.to(device)
        return x, y
    except Exception as e:
        # print('SPLITTTT', split)
        # print('ix',ix)
        # t=[data[i:i+block_size] for i in ix]
        # for i in t: print(i.dtype,i)
        # print('pre-x',[data[i:i+block_size] for i in ix])
        # print('x',x)
        # print('y',y)
        print("\n\n============================\nDATA======");print(data)
        print("\n\n============================\nix======");print(ix)
        print("\n\n============================\npre-x======")
        t=[data[i:i+block_size] for i in ix]
        for i in t: print(i.dtype,i)
        print(e)

@torch.no_grad()
def estimate_loss():
    model.eval()
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
        X, Y = get_batch()
        logits, loss = model(X, Y)
        losses[k] = loss.item()
    out = losses.mean()
    model.train()
    return out

In [5]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [7]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# ------------------
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses:.4f}")

    # sample a batch of data
    xb, yb = get_batch()

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# ------------------
# save model
print("\n\n///////////////////////////////////////////////////////////////\nSAVING...\n")
import datetime
dstr=f"{datetime.datetime.now().date()}_{datetime.datetime.now().hour}_{datetime.datetime.now().minute}"
torch.save(model, dir_path+f'/Models/PyTorch_{dstr}.pt')

1.223669 M parameters
step 0: train loss 9.1307
step 100: train loss 2.9235
step 200: train loss 2.6447
step 300: train loss 2.5173
step 400: train loss 2.4129
step 500: train loss 2.3214
step 600: train loss 2.2276
step 700: train loss 2.1626
step 800: train loss 2.0851
step 900: train loss 2.0315
step 1000: train loss 1.9739
step 1100: train loss 1.9247
step 1200: train loss 1.8793
step 1300: train loss 1.8537
step 1400: train loss 1.8168
step 1500: train loss 1.7610
step 1600: train loss 1.7547
step 1700: train loss 1.7325
step 1800: train loss 1.7098
step 1900: train loss 1.6838
step 2000: train loss 1.6587
step 2100: train loss 1.6539
step 2200: train loss 1.6348
step 2300: train loss 1.6247
step 2400: train loss 1.5884
step 2500: train loss 1.5584
step 2600: train loss 1.5806
step 2700: train loss 1.5499
step 2800: train loss 1.5201
step 2900: train loss 1.5258
step 3000: train loss 1.5059
step 3100: train loss 1.5014
step 3200: train loss 1.4922
step 3300: train loss 1.4742
step

In [9]:
# ------------------
# generate from the model
print("\n\n///////////////////////////////////////////////////////////////\nGENERATING...\n")
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(fun_decode(m.generate(context, max_new_tokens=2000)[0].tolist(),itos))



///////////////////////////////////////////////////////////////
GENERATING...

 has I desricts of pretendence

is not explore any chain time of permission will license consequel to that establish and enemies in 20, discopution
which the nating currence of the
gravestify respectivide shall nearly as great under Cases to abalien which us to likely may.
The Foundation is Compensation, rece, be oppressedy. But discerrent'en used the sugberne in possible Union, "Wo, who pass a certain
and must nearly executive by the Congress, the supreme, rivided the original raising our
the Desirent United States. Thite0s
 Section 9, 12sice Bich, our Fortting, belonger, with calendod;



**TIS DEhecu glo find remend abisk out mark on removed,
but what the Recess, but present
resort. If a believers bets, houfor the executed, Madember, 1974
co: "


Let be reduceed in any our Provided, and
muritive the copyright hold to the Junish shall be terms; be laid edyuuely ca biliting Congerest.
We disparged to ligh

In [None]:
# ------------------
#reload model test
print("\n\n///////////////////////////////////////////////////////////////\nLOADING...\n")
model2 = torch.load(dir_path+f'/Models/PyTorch_{dstr}.pt')
model2.eval()

print("\n\n///////////////////////////////////////////////////////////////\nGENERATING2...\n")
m2 = model2.to(device)
print(fun_decode(m2.generate(context, max_new_tokens=2000)[0].tolist(),itos))

In [None]:
print("\n\n///////////////////////////////////////////////////////////////\nLOADING...\n")
model = torch.load(dir_path+f'/Models/PyTorch_2024-06-10_2_18.pt')
model.eval()
m = model.to(device)


# ------------------
print("\n\n///////////////////////////////////////////////////////////////\nTRAINING...\n")
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses:.4f}")
    # sample a batch of data
    xb, yb = get_batch()
    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    
# ------------------
# save model
print("\n\n///////////////////////////////////////////////////////////////\nSAVING...\n")
import datetime
dstr=f"{datetime.datetime.now().date()}_{datetime.datetime.now().hour}_{datetime.datetime.now().minute}"
torch.save(model, dir_path+f'/Models/PyTorch_{dstr}.pt')


# ------------------
print("\n\n///////////////////////////////////////////////////////////////\nGENERATING...\n")
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(fun_decode(m.generate(context, max_new_tokens=2000)[0].tolist(),itos))