In [1]:
import os
# Point the Hugging Face cache at the workspace volume. This is set before
# `transformers` / `datasets` are imported below -- presumably so those libraries
# see the value when they are first loaded (TODO confirm it is read at import time).
os.environ['HF_HOME'] = '/workspace/cache/huggingface/'
# Switch into the project's src/ directory; presumably this is what lets the
# `models.*` and `data.*` imports further down resolve -- verify against sys.path.
os.chdir('/workspace/FutureGPT2/src/')

import numpy as np
from torch import optim, nn, Tensor
from torch.nn import functional as F
import torch
import wandb
from transformers import GPT2Config, GPT2Model, AutoTokenizer
import transformers
import lightning as L
from inspect import signature, _ParameterKind
import copy
import gc
import datasets
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import WandbLogger
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from itertools import repeat

from models.gpt_model import *
from data.arithmetic import *
from models.myopic_model import *

%load_ext autoreload
%autoreload 2

In [2]:
# Opt in to the faster 'high' float32 matmul precision on GPUs whose major
# compute capability is >= 8. The `is_available()` guard makes this cell a
# no-op on CPU-only machines instead of raising from `get_device_capability()`.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    torch.set_float32_matmul_precision('high')

In [3]:
# Read the W&B API key from the environment rather than passing a literal string;
# a missing WANDB_API_KEY fails loudly with a KeyError before any network call.
wandb.login(key=os.environ['WANDB_API_KEY'], relogin=True)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/wwu/.netrc


True

In [4]:
# Load the pretrained tokenizer for the base checkpoint.
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inverse vocabulary: the values of `get_vocab()` become keys and its keys become values.
Token = dict(
    (token_id, token_str)
    for token_str, token_id in tokenizer.get_vocab().items()
)

In [6]:
# --- Data -------------------------------------------------------------------
# Both operands of the multiplication dataset share the same digit limit.
max_digits = 8
train = DataLoader(
    MultiplicationDataset(
        size=10_000_000,
        x_max_digits=max_digits,
        y_max_digits=max_digits,
        tokenizer=tokenizer,
    ),
    batch_size=512,
    num_workers=95,
)

# --- Logging / callbacks ----------------------------------------------------
NAME = f'ARITH_GPT2_MYOPIC_MAX{max_digits}_REVERSE_RANDINIT'
PROJ = 'LAISR_FUTURE_ARITH'
wandb_logger = WandbLogger(
    name=NAME,
    project=PROJ,
    log_model=False,   # Only save checkpoints locally
)
lr_monitor = LearningRateMonitor()
# Keep only the single best checkpoint as ranked by the logged `train_loss`.
checkpoint_callback = ModelCheckpoint(
    dirpath="/workspace/checkpoints",
    filename=NAME + "_{global_step}_{train_loss:.2f}",
    every_n_epochs=1,
    save_top_k=1,
    monitor='train_loss',
    mode='min',
)
# With patience=100000 this mainly acts as a divergence guard: training is
# stopped if `train_loss` exceeds `divergence_threshold`.
early_stop_callback = EarlyStopping(
    monitor='train_loss',
    divergence_threshold=10000,
    min_delta=0.00,
    patience=100000,
    verbose=False,
    mode='min',
)

# --- Trainer ----------------------------------------------------------------
trainer = L.Trainer(
    fast_dev_run=False,
    logger=wandb_logger,
    # NOTE: no validation dataloader is passed to `fit` below, so Lightning
    # skips the validation loop and this interval currently has no effect.
    val_check_interval=0.1,
    #check_val_every_n_epoch=5,
    callbacks=[checkpoint_callback, early_stop_callback, lr_monitor],
    max_epochs=1,
    enable_progress_bar=True,
)

# --- Model ------------------------------------------------------------------
# `AutoConfig` / `AutoModelForCausalLM` are referenced through the explicitly
# imported `transformers` module so this cell does not depend on those names
# leaking in via the `from models... import *` star imports.
# `from_config` builds the architecture with freshly initialised weights
# (no pretrained weights are loaded).
config = transformers.AutoConfig.from_pretrained(model_name)
myopic_model = transformers.AutoModelForCausalLM.from_config(config=config)
model = LitMyopicModel(
    myopic_model=myopic_model,
    orig_model=None,    # set to None (default) for cutgrad training [use own detached hidden state or kv]
    loss_type='myopic_loss',
    to_myopic=to_myopic_gpt2,
    from_kv=False,
    # One `None` placeholder per transformer block.
    layer_past=[None for _ in range(len(myopic_model.transformer.h))],
)

# --- Fit --------------------------------------------------------------------
wandb_logger.watch(model.myopic_model, log='all', log_graph=False)
trainer.fit(
    model=model,
    train_dataloaders=train,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/wwu/.local/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:198: Attribute 'myopic_model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['myopic_model'])`.
[34m[1mwandb[0m: Currently logged in as: [33mwilswu[0m. Use [1m`wandb login --relogin`[0m to force relogin


/home/wwu/.local/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:43: attribute 'myopic_model' removed from hparams because it cannot be pickled
/home/wwu/.local/lib/python3.10/site-packages/lightning/pytorch/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/wwu/.local/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /workspace/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name         | Type            | Params
-------------------------------------------------
0 | myopic_model | GPT2LMHeadModel | 124 M 
-------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)


NUM TRAINING STEPS 19532


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
