In [12]:
import os
os.environ['HF_HOME'] = '/workspace/cache/huggingface/'
os.chdir('/workspace/FutureGPT2/src/')


import numpy as np
from torch import optim, nn, Tensor
from torch.nn import functional as F
import torch
import wandb
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
import transformers
from inspect import signature, _ParameterKind
import copy
import gc
import datasets
from torch.utils.data import DataLoader
from datasets import load_dataset, load_from_disk
from matplotlib import pyplot as plt
from itertools import islice
from copy import deepcopy
from glob import glob

from models.myopic_model import *
from models.gpt_model import *

import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.utilities.deepspeed import convert_zero_checkpoint_to_fp32_state_dict


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# 5M examples sampled from the-pile. truncated to len 64
train = load_dataset(
    'EleutherAI/pile-deduped-pythia-random-sampled', 
    split='train'
)
train = train.rename_column('Tokens', 'input_ids')
train = train.remove_columns([c for c in train.column_names if c != 'input_ids'])
train = train.cast_column('input_ids', datasets.Sequence(datasets.Value('int64')))
train = train.with_format('torch')
train_loader = DataLoader(train, batch_size=32)#, num_workers=96)

In [13]:
train = load_from_disk('/workspace/corpus/the_pile/pile_PYTHIA_64tokens_20M')['test']
train = train.cast_column('input_ids', datasets.Sequence(datasets.Value('int64')))
train = train.with_format('torch')
train_loader = DataLoader(train, batch_size=32)#, num_workers=96)

Casting the dataset:   0%|          | 0/200001 [00:00<?, ? examples/s]

In [14]:
x = next(iter(train_loader))['input_ids'].to('cuda')

In [None]:
outs = []
for path in glob('/workspace/checkpoints/checkpoints_PYTHIA/checkpoints/*'):
    out = path.replace('/checkpoints_PYTHIA/checkpoints/', '/checkpoints_PYTHIA_UNSHARD/')
    print(path.split('/')[-1])
    convert_zero_checkpoint_to_fp32_state_dict(path, out)
    outs.append(out)

In [10]:
outs[0].split('-')[4]

'2.8b_lr_6.40e'

In [20]:
for out in glob('/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/*'):
    print(out)
    state = torch.load(out)['state_dict']
    state = {'.'.join(k.split('.')[1:]): v for k, v in state.items()}
    config = AutoConfig.from_pretrained('EleutherAI/pythia-' + out.split('-')[4].split('_')[0])
    model = AutoModelForCausalLM.from_config(config=config)
    model.load_state_dict(state)
    model = model.to('cuda')
    out = model(input_ids=x, labels=x)
    print(out.loss.item())

/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-1.4b_lr_8.00e-05_warmup_5.00e-02_global_step=1627.0_train_loss=2.71.ckpt
2.6146345138549805
/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-14m_lr_4.00e-04_warmup_5.00e-02_global_step=1627.0_train_loss=4.27.ckpt
4.178765296936035
/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-160m_lr_2.40e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.13.ckpt
3.055554151535034
/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-1b_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=2.78.ckpt
2.665820360183716
/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-31m_lr_4.00e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.84.ckpt
3.7549569606781006
/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE1

In [None]:

# lightning deepspeed has saved a directory instead of a file
prefix = '/workspace/checkpoints/checkpoints_PYTHIA/'
save_path = prefix+\
'PYTHIA-MYOPIC_model_name-pythia-1b_lr-0.00011999999999999999_global_step=1220.0_train_loss=3.01.ckpt'
output_path = save_path.replace('checkpoints_PYTHIA', 'checkpoints_PYTHIA_UNSHARD')
print(output_path)


In [4]:
path = '/workspace/checkpoints/checkpoints_PYTHIA_UNSHARD/PYTHIA-PILE10M64-MYOPIC-fp16_model_name_pythia-70m_lr_1.20e-04_warmup_5.00e-02_global_step=1627.0_train_loss=3.48.ckpt'

In [8]:
state = torch.load(path)['state_dict']
state = {'.'.join(k.split('.')[1:]): v for k, v in state.items()}
config = AutoConfig.from_pretrained('EleutherAI/pythia-' + path.split('-')[4].split('_')[0])
config.upcast_attn = True
model = AutoModelForCausalLM.from_config(config=config)
model.load_state_dict(state)
model = model.to('cuda')

In [15]:
with torch.autocast(dtype=torch.float16, device_type='cuda'):
    out = model(input_ids=x, labels=x)
    out.loss

In [16]:
out.loss

tensor(3.4142, device='cuda:0', grad_fn=<NllLossBackward0>)

In [7]:
config.upcast_attn

False

In [17]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-14m')
Token = {v: k for k, v in tokenizer.get_vocab().items()}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
''.join(Token[t.item()] for t in x[0])

'AvoidingĠprematurityĠinĠelectiveĠrepeatĠcesareanĠsection.ĠAĠroleĠforĠamnioticĠfluidĠphosphatidylglycerol.ĊAĠprospectiveĠstudyĠwasĠundertakenĠinĠ107ĠelectiveĠrepeatĠcesareanĠdeliveriesĠinĠwhichĠtheĠlecithin/sphingomyelinĠ(L/S)ĠratioĠwasĠpositive.ĠTheĠfrequencyĠofĠneonatalĠcomplications'