In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local gpt / data / rotary_embedding files) are picked up
# when a cell is re-run, without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [2]:
import os

import einops
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from torch.utils.tensorboard import SummaryWriter
from tqdm import trange

import data
import gpt
import rotary_embedding

# Select the GPU only when CUDA is actually usable at runtime.
# torch.cuda.is_available() checks for a working driver/device, whereas the
# deprecated torch.has_cuda flag only reports whether this PyTorch build was
# compiled with CUDA support (so it can be True on a machine with no GPU).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
# Dataset setup: data._init_data() yields two values, unpacked here as the
# vocabulary size and an `encode` object (presumably the text -> token encoder;
# confirm against data.py).
vocab_size, encode = data._init_data()

# Hyper-parameters are read from the model's argument container.
config = gpt.ModelArgs()
lr = config.lr
bs = config.batch_size

In [4]:
# Build the model for the dataset's vocabulary and move it to the chosen device.
model = gpt.gpt_model(vocab_size=vocab_size)
model = model.to(device)

# Report the total parameter count in millions.
n_params = sum(p.numel() for p in model.parameters())
print(n_params/1e6, 'M parameters.')

28.482137 M parameters.


In [5]:
# Despite its name, this string is passed to wandb.init() as the *run* name
# and is also used as the TensorBoard log sub-directory under runs/.
project_name = 'GPT with max trainable rotary emb dim'

# sync_tensorboard=True asks W&B to mirror what is written through the
# TensorBoard writer, so wandb.init() is called before the writer is created.
wandb.init(
    project='Rotary Embedding',
    entity='uuzall',
    sync_tensorboard=True,
    name=project_name,
)

# Use the explicitly imported SummaryWriter: `import torch` alone does not load
# the torch.utils.tensorboard submodule, so reaching it by attribute access only
# works if some other library happened to import it first.
writer = SummaryWriter(f'runs/{project_name}')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
2023-09-09 10:12:36.555083: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[34m[1mwandb[0m: Currently logged in as: [33muuzall[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016667934200086165, max=1.0…

In [6]:
def estimate_val_loss(model, num_batches=100):
    """Return the mean cross-entropy of `model` over `num_batches` validation batches.

    Batches are drawn with `data.dataloader('val')`. The model is switched to
    eval mode for the measurement (so train-only layers such as dropout, if the
    model has any, do not perturb the estimate) and restored to its previous
    mode afterwards. No gradients are tracked.
    """
    was_training = model.training
    model.eval()
    batch_losses = torch.zeros(num_batches)
    try:
        with torch.no_grad():
            for j in range(num_batches):
                x, y = data.dataloader('val')
                logits = model(x.to(device))
                B, T, C = logits.shape
                batch_losses[j] = F.cross_entropy(
                    logits.view(B * T, C), y.view(B * T).to(device)
                ).item()
    finally:
        # Always hand the model back in the mode the caller had it in.
        model.train(was_training)
    return batch_losses.mean().item()


optimizer = optim.AdamW(model.parameters(), lr=lr)

# Per-step training losses and per-evaluation validation losses.
losses, val_losses = [], []
val_loss = 0
# inf (rather than an arbitrary large number) guarantees the first evaluation
# is recorded as the best so far.
best_val_loss, best_val_iter = float('inf'), 0

max_iters = 10000
eval_interval = 512   # run validation every this many training steps
loss_dim = 100        # number of validation batches averaged per evaluation
checkpoint_path = 'models/gpt_best_performing.pth'
# torch.save does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

global_step = 0
loop = trange(max_iters)
for step in loop:  # `step`, not `iter`, to avoid shadowing the builtin
    x, y = data.dataloader('train')

    # Flatten (B, T, C) logits and (B, T) targets for token-level cross-entropy.
    logits = model(x.to(device))
    B, T, C = logits.shape
    loss = F.cross_entropy(logits.view(B * T, C), y.view(B * T).to(device))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Read the scalar once; each .item() call forces a device -> host sync.
    train_loss = loss.item()
    losses.append(train_loss)
    writer.add_scalar('train_losses/loss', train_loss, global_step)

    if step % eval_interval == 0:
        val_loss = estimate_val_loss(model, loss_dim)
        val_losses.append(val_loss)
        writer.add_scalar('test_losses/loss', val_loss, global_step)
        # Checkpoint only when validation improves.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_iter = step
            torch.save(model.state_dict(), checkpoint_path)

    loop.set_description(f'Iterations: {step+1}/{max_iters}')
    loop.set_postfix(loss=train_loss, val_loss=val_loss, best_val_loss=best_val_loss, best_iter=best_val_iter)
    global_step += 1

# Push any buffered scalars to disk so the logs are complete once training ends.
writer.flush()

2023-09-09 10:12:42.742867: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-09 10:12:42.787399: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-09 10:12:42.788900: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Iterations: 10000/10000: 100%|██████████| 10000/10000 [1:35:17<00:00,  1.75it/s, best_iter=6144, best_val_loss=0.974, loss=0.736, val_loss=1]    


Best validation loss per run (lower is better), by rotary embedding dimension:

1. Best performing: 0.995 (iteration 7000), Dim: 32
2. Best performing: 0.973 (iteration 6000), Dim: max (192)
3. Best performing: 0.974 (iteration 6144), Dim: max + trainable