In [None]:
# Shell escape: install the `zip` package with apt; `-y` auto-confirms the install prompt.
!apt install zip -y

Reading package lists... 0%

# Init

In [2]:
import torch
from datasets import load_dataset
import transformers

# Cap this process at about 11 GiB of GPU memory, or at the whole card when it is smaller.
total_memory_bytes = torch.cuda.get_device_properties('cuda').total_memory
max_memory_gib = total_memory_bytes / 2 ** 30
memory_fraction = min(1.0, 11 / max_memory_gib)
torch.cuda.set_per_process_memory_fraction(memory_fraction)
print(f"Setting memory limit to {memory_fraction * 100:.2f}%")

# WikiText-2 training split plus the pretrained GPT-2 Large tokenizer and LM-head model.
model_name = 'gpt2-large'
data = load_dataset("wikitext", "wikitext-2-v1")['train']
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = transformers.GPT2LMHeadModel.from_pretrained(model_name)

Setting memory limit to 92.36%


Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

# Training

In [3]:
# Load IPython's autoreload extension so the `%autoreload` magic becomes available.
%load_ext autoreload

In [4]:
# Mode 2: reload all imported modules before each cell runs, so edits to the
# local helper modules imported below take effect without a kernel restart.
%autoreload 2

from simple_trainer import train_epoch, to, forw_backw
from wandb_logger import WanDBWriter
from config import Config
from itertools import repeat
from dataset import BatchDataset
from torch.utils.data import DataLoader

In [5]:
device = 'cuda:0'
# measure_speed=True selects the benchmark path: batch size 1 and no
# gradient checkpointing (see the guard further down).
measure_speed = True
batch_size = 2

if measure_speed:
    batch_size = 1

config = Config(
    device=device,
    # Chosen so that grad_accum_steps * batch_size == 64. Floor division keeps
    # the step count an int; `64 / batch_size` produced the float 64.0.
    grad_accum_steps=64 // batch_size,
    batch_size=batch_size,
    overfit_batch=False
)

model = model.to(device)

# Drop the empty strings from the dataset's text column.
texts = [text for text in data['text'] if len(text) != 0]
dataset = BatchDataset(texts, config.batch_size, tokenizer)
# batch_size=None turns off the DataLoader's automatic batching; BatchDataset
# receives the batch size itself, so it presumably yields ready-made batches
# -- TODO confirm in dataset.py.
dataloader = DataLoader(dataset, batch_size=None, sampler=None)

if not measure_speed:
    model.gradient_checkpointing_enable()

if config.overfit_batch:
    # Overfit mode: yield the very first batch forever.
    dataloader = repeat(next(iter(dataloader)))

In [6]:
# GPU memory currently allocated to tensors, converted from bytes to GiB.
torch.cuda.memory_allocated() / 2 ** 30

2.98956298828125

In [7]:
# Build the project's logging wrapper (imported from wandb_logger) from the run config.
logger = WanDBWriter(config)

[34m[1mwandb[0m: Currently logged in as: [33mtimothyxp[0m (use `wandb login --relogin` to force relogin)


In [8]:
# Split the model's parameters, in named_parameters() order, into consecutive
# groups and give each group its own Adam optimizer. A new group starts at the
# first parameter of every transformer block whose index is in group_borders.
# The trailing 100 acts as a sentinel border (the original loop relied on it
# never matching).
group_borders = [9, 18, 27, 100]
param_groups = [[]]
cur_group = 0

for name, param in model.named_parameters():
    # The trailing '.' makes the match exact on the block index: without it,
    # 'transformer.h.9' is also a prefix of 'transformer.h.90' .. 'h.99'.
    # The bounds check prevents an IndexError once every border is consumed.
    if (
        cur_group < len(group_borders)
        and name.startswith(f'transformer.h.{group_borders[cur_group]}.')
    ):
        param_groups.append([])
        cur_group += 1

    param_groups[-1].append(param)

# One optimizer per parameter group.
opts = [torch.optim.Adam(param_group, lr=1e-4) for param_group in param_groups]

# Allocated GPU memory in GiB; the recorded value matches the reading taken
# before the optimizers were created.
torch.cuda.memory_allocated() / 1024 ** 3

2.98956298828125

In [9]:
# Benchmark mode calls forw_backw (no optimizers are passed to it); otherwise
# train_epoch runs with the per-group optimizers.
# NOTE: the recorded benchmark run stopped with a CUDA out-of-memory error
# right after the first batch of 512 tokens was printed.
if measure_speed:
    forw_backw(model, dataloader, config, logger)
else:
    train_epoch(model, opts, dataloader, config, logger)

  0%|          | 0/23766 [00:00<?, ?it/s]

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 256])
batch torch.Size([1, 512])


RuntimeError: CUDA out of memory. Tried to allocate 98.00 MiB (GPU 0; 11.91 GiB total capacity; 10.84 GiB already allocated; 16.81 MiB free; 11.00 GiB allowed; 10.99 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# GPU memory currently allocated to tensors, in GiB.
torch.cuda.memory_allocated() / 2 ** 30

In [None]:
# GPU memory held by PyTorch's caching allocator (in use or cached), in GiB.
torch.cuda.memory_reserved() / 2 ** 30

In [None]:
# Release cached, currently unused blocks held by PyTorch's caching allocator.
torch.cuda.empty_cache()

In [None]:
# Drop every gradient tensor (set_to_none=True replaces them with None rather
# than zero-filling), then read the allocated GPU memory in GiB.
# The notebook defines no single `optimizer`; the optimizers live in the
# `opts` list, one per parameter group, so each of them is cleared.
for opt in opts:
    opt.zero_grad(set_to_none=True)
torch.cuda.memory_allocated() / 1024 ** 3

In [None]:
# One forward pass under autocast, then read the allocated GPU memory in GiB.
with torch.cuda.amp.autocast():
    # A DataLoader is iterable but not itself an iterator, so calling
    # next(dataloader) directly raises TypeError. iter() also works when
    # `dataloader` is the repeat(...) iterator used in overfit mode.
    out = model(**to(next(iter(dataloader)), device))

torch.cuda.memory_allocated() / 1024 ** 3

In [None]:
# Peak GPU memory allocated to tensors so far, in GiB.
torch.cuda.max_memory_allocated() / 2 ** 30

In [None]:
# Backpropagate from the loss of the previous forward pass, then read the
# allocated GPU memory in GiB.
out.loss.backward()
torch.cuda.memory_allocated() / 2 ** 30

In [None]:
# Drop the reference to the model output, release the allocator's cached
# blocks, and read the allocated GPU memory in GiB once more.
del out
torch.cuda.empty_cache()
torch.cuda.memory_allocated() / 2 ** 30