In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to the project's .py files
# take effect without restarting the kernel.
# NOTE(review): re-running this cell on a live kernel prints an
# "already loaded" notice; `%reload_ext autoreload` avoids that.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Build the two model variants that the training cell further down compares.
#
# Explicit imports replace the previous `from evo import *` so it is clear
# where each name comes from. `train_evolver` is imported here because the
# training cell below relied on the wildcard import to bring it into scope
# (assumed to live in `evo` next to `train_dagger` -- TODO confirm).
from evo import Evolver, PointerStyleEvolver, train_evolver

# Hyperparameters shared by both variants, passed straight to the constructors.
params = {
    'd_model': 512,
    'nhead': 8,
    'dim_feedforward': 2048,
    'dropout': 0.1,
    'encoder_layers': 3,
    'decoder_layers': 3,
    'max_len': 64
}

evolver = Evolver(**params)
ps_evolver = PointerStyleEvolver(**params)

In [None]:
# Build a tiny supervised training loader and an unsupervised eval loader.
#
# Explicit imports replace the previous `from data import *`; these three
# names are the only ones from `data` that this cell uses.
from transformers import BertTokenizer

from data import StratifiedInfiniteSampler, supervised_loader, unsupervised_loader

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Deliberately small settings (short max_len, small batch, limit=20) so the
# smoke test below runs quickly.
train_loader = supervised_loader(
    path='data/ud/ud_train_3.0.jsonl',
    max_len=10,
    tokenizer=tokenizer,
    batch_size=4,
    cache_prefix=None,
    all_tokens=True,
    limit=20,
    sampler=StratifiedInfiniteSampler
)

eval_loader = unsupervised_loader(
    path='data/toy/toy.jsonl',
    max_len=10,
    tokenizer=tokenizer,
    batch_size=4,
    sampler=StratifiedInfiniteSampler
)

In [None]:
from run import apply_edits

# Draw one batch from the training loader; only the first and third items
# (input ids and edit targets) are used, the other two are discarded.
traj_input_ids, _, traj_edit_tgts, _ = next(iter(train_loader))

# Take index 0 along dim 1 of the inputs and of every edit-target tensor
# (presumably the first trajectory step -- TODO confirm) and apply the edits.
apply_edits(traj_input_ids[:, 0], tuple(tgt[:, 0] for tgt in traj_edit_tgts))

In [None]:
from torch.optim import AdamW

# Settings shared by both runs: a single training step with evaluation after
# every step, so each call finishes quickly.
kwargs = dict(
    train_loader=train_loader,
    eval_loader=eval_loader,
    train_steps=1,
    eval_steps=2,
    grad_accum_steps=1,
    clip_gradients=False,
    checkpoint_at=20,
    eval_at=1,
)

# Each model gets its own fresh AdamW optimizer; the third positional argument
# is passed as None (its meaning is defined by train_evolver, not visible here).
print('STARTING REGULAR EVOLVER')
train_evolver(
    evolver,
    AdamW(evolver.parameters(), lr=3e-4),
    None,
    **kwargs
)

print('STARTING PS EVOLVER')
train_evolver(
    ps_evolver,
    AdamW(ps_evolver.parameters(), lr=3e-4),
    None,
    **kwargs
)

## multihead pointer

In [None]:
import torch
from trans import MultiheadPointer

# Arguments are presumably (d_model, nhead), matching the 512 / 8 used for
# the models earlier in the notebook -- TODO confirm.
pointer = MultiheadPointer(512, 8)

# Random memory (3 x 10 x 512) and target (3 x 5 x 512) activations.
mem = torch.randn(3, 10, 512)
tgt = torch.randn(3, 5, 512)

# Boolean mask over the 10 memory positions: the first 7 are False and the
# last 3 are True.
src_pad_mask = torch.zeros(3, 10, dtype=torch.bool)
src_pad_mask[:, 7:] = True

idx_weights = pointer(tgt, mem, key_padding_mask=src_pad_mask)
idx_weights

## regressions

In [None]:
# Regression run: launch evo.py as a subprocess with the toy `sup-toy` config,
# passing the script's `--local` flag.
!python evo.py --config=configs/toy/sup-toy.json --local

In [None]:
# Regression run: launch evo.py with the toy `ps-unsup-toy` config.
# NOTE(review): unlike some sibling regression cells this one omits `--local`;
# confirm that is intentional.
!python evo.py --config=configs/toy/ps-unsup-toy.json

In [None]:
# Regression run: launch evo.py with the toy `ps-sup-prefix-toy` config and
# `--local`. The non-prefix `ps-sup-toy` variant is kept commented out as an
# alternative.
# !python evo.py --config=configs/toy/ps-sup-toy.json
!python evo.py --config=configs/toy/ps-sup-prefix-toy.json --local

In [None]:
# Regression run: launch evo.py with the toy `den-toy` config.
# NOTE(review): no `--local` flag here, unlike some sibling cells; confirm.
!python evo.py --config=configs/toy/den-toy.json

In [None]:
# Regression run: launch evo.py with the toy `ar-d-toy` config.
# NOTE(review): no `--local` flag here, unlike some sibling cells; confirm.
!python evo.py --config=configs/toy/ar-d-toy.json

In [None]:
# Regression run: launch evo.py with the toy `ar-prefix-toy` config and
# `--local`. The non-prefix `ar-toy` variant is kept commented out as an
# alternative.
# !python evo.py --config=configs/toy/ar-toy.json --local
!python evo.py --config=configs/toy/ar-prefix-toy.json --local

## evolver sampling

In [None]:
import torch
from evo import PointerStyleEvolver

# Construct the model with pointer attention enabled; every other constructor
# argument is left at its default.
model = PointerStyleEvolver(pointer_attn=True)
# Load the weights stored under the checkpoint's 'model' key, mapping tensors
# to CPU so no GPU is needed.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('ps-sup-imdb-pattn_20240822_235245-9900.pt', map_location='cpu')['model'])
# Switch to eval mode; assigning to `_` keeps the module repr out of the output.
_ = model.eval()

In [None]:
from run import sample, sample_trajectory
from data import get_input_ids
from utils import BT
from const import *

# Two prompts: a short prefix and the empty string.
inputs = ['hello my name is', '']
input_ids = get_input_ids(inputs, tokenizer=BT, max_len=512)

# Sample a trajectory from the loaded model with verbose logging on; the
# positional `5` and the empty dict are forwarded unchanged to
# sample_trajectory.
traj_ids, traj_edits = sample_trajectory(
    model,
    input_ids,
    5,
    {},
    verbose=True,
)
# edit_tgts, src = sample(model, input_ids, None)

In [None]:
from data import elaborate

# Transpose the output of elaborate() so that the i-th items of each returned
# sequence are grouped together, and display the result as a list of tuples.
[*zip(*elaborate(traj_edits))]

## conditional generation

In [None]:
from utils import BT
from data import InfiniteSampler, SequenceDataset
from evo import Transformer

# Build a sequence dataset from the toy trajectories file with denoising
# turned off, using the shared BT tokenizer.
ds = SequenceDataset.from_trajectories(
    'data/toy/toy.jsonl',
    tokenizer=BT,
    max_len=10,
    batch_size=2,
    denoising=False,
    sampler=InfiniteSampler,
)

In [None]:
import torch
from torch.utils.data import DataLoader
from const import *

# Pull a single batch of (input, output) id tensors from the toy dataset.
loader = DataLoader(ds, batch_size=2)
input_ids, output_ids = next(iter(loader))

# Overwrite columns 1 and 2 of every row with fixed ids so the masking below
# has a known pattern to act on (this mutates the batch in place).
input_ids[:, 1] = 2
input_ids[:, 2] = 102
print(input_ids)

# Random stand-in "logits": same shape as output_ids minus the last column.
logits = torch.randn_like(output_ids, dtype=torch.float)[:, :-1]
print(logits)

# Keep only the entries whose shifted input position is neither PAD nor EOS.
logits[~(input_ids[:, 1:].eq(PAD_TOKEN_ID) | input_ids[:, 1:].eq(EOS_TOKEN_ID))]

## autoregressive sampling?

In [None]:
import torch
from data import get_input_ids
from evo import Transformer
from utils import BT

# Model built with 12 encoder layers and no decoder layers; weights come
# from the 'model' entry of the checkpoint, mapped onto CPU.
model = Transformer(decoder_layers=0, encoder_layers=12)
model.load_state_dict(
    torch.load('ar-imdb_20240831_231923-0.pt', map_location='cpu')['model']
)
_ = model.eval()

# Prompts of increasing length, starting from the empty string.
inputs = ['', 'hello', 'hello my']
input_ids = get_input_ids(inputs, max_len=512, tokenizer=BT)

input_ids

In [None]:
from run import sample_ar

# Sample continuations for the prompts prepared above, with limit=10 and
# verbose logging enabled.
output_ids = sample_ar(
    model,
    input_ids,
    verbose=True,
    limit=10,
)

In [None]:
from utils import BT
from data import SequenceDataset

# Build the IMDB training dataset from its trajectories file, with denoising
# disabled, and display the dataset object.
dataset = SequenceDataset.from_trajectories(
    path='data/imdb/imdb_train_4.jsonl',
    tokenizer=BT,
    max_len=512,
    denoising=False,
)

# loader = DataLoader(dataset, batch_size=1, shuffle=True)
dataset

In [None]:
from torch.utils.data import DataLoader

# Shuffled, one-example-per-batch loader over the IMDB dataset; grab the
# first (input_ids, output_ids) pair it yields.
loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=1,
)
input_ids, output_ids = next(iter(loader))

## gpt2 eval

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from run import compute_ppl

# Load the pretrained GPT-2 tokenizer and language-model head.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Score a single sentence on CPU; compute_ppl returns two values, unpacked
# here as `a` and `b`.
a, b = compute_ppl(
    model,
    tokenizer,
    'hello my name is tj',
    'cpu',
)

## dagger

In [None]:
# prompted sampling...
#
# Load a toy PointerStyleEvolver checkpoint and sample edits for a fixed
# prompt, with the prompt tokens marked by a prefix mask.

import torch
from evo import PointerStyleEvolver
from const import *
from run import sample
# Imported here so the cell does not depend on an earlier cell having
# brought `elaborate` into the namespace.
from data import elaborate

model = PointerStyleEvolver(
    d_model=64,
    nhead=8,
    max_len=10,
    dim_feedforward=256,
    dropout=0,
    encoder_layers=3,
    decoder_layers=3,
    pointer_attn=True
)

# map_location='cpu' matches the other checkpoint loads in this notebook and
# lets a checkpoint saved on a GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(
    torch.load(
        'checkpoints/ps-sup-prefix-toy_20240902_153226-500.pt',
        map_location='cpu'
    )['model']
)

_ = model.eval()

# Three identical rows of length 10: BOS, 'a', 'a', EOS, then padding.
input_ids = torch.zeros(3, 10, dtype=torch.long)
input_ids[:, 0] = BOS_TOKEN_ID
input_ids[:, 1] = 1037 # 'a'
input_ids[:, 2] = 1037
input_ids[:, 3] = EOS_TOKEN_ID
input_ids[:, 4:] = PAD_TOKEN_ID

# True for every position that is neither PAD nor EOS, i.e. BOS plus the
# prompt tokens.
prefix_mask = ~(input_ids.eq(PAD_TOKEN_ID) | input_ids.eq(EOS_TOKEN_ID))

edit_tgts, _  = sample(
    model, input_ids,
    src=None, M=5, threshold=0, resample_at=1e9,
    prefix_mask=prefix_mask, verbose=True
)

elaborate(edit_tgts)

In [41]:
# Scratch check of how str.find behaves when '[SEP]' is absent.
# NOTE(review): the decoded string is overwritten by the literal on the next
# line, so the BT.decode call does not influence the displayed result.
s1 = BT.decode(input_ids[1:-1])
s1 = 'hello'
# str.find returns -1 when the substring is not present (the recorded output).
s1.find('[SEP]')

# s1[:s1.find('[SEP]')-1]

-1

In [63]:
# Decode a single token id passed as a bare int. Elsewhere in this notebook
# id 2026 appears where the decoded text reads "my".
# NOTE(review): the recorded output for the bare-int call is 'm y', not 'my';
# confirm whether decode expects a sequence of ids here.
BT.decode(2026)

'm y'

In [71]:
from data import get_simalign_tgts, get_input_ids, elaborate, get_traj_edit_tgts
from utils import BT, ALIGN

# A source/target pair that differs only in word order.
s1 = 'the red cat'
s2 = 'the cat red'

# Tokenize both sentences in one call and unpack the two resulting rows.
input_ids, output_ids = get_input_ids(
    [s1, s2],
    tokenizer=BT,
    max_len=512,
)

# Compute alignment-based edit targets, skipping the first token of each row.
edit_tgts = get_simalign_tgts(input_ids[1:], output_ids[1:])

In [18]:
# Show the input and output id tensors, one per line.
print(input_ids, output_ids, sep='\n')

tensor([ 101, 7592, 2026, 2171, 2003, 1056, 3501,  102,    0,    0])
tensor([ 101, 1056, 3501, 2003, 2026, 2171,  102,    0,    0,    0])


In [3]:
# Decode the token ids back to text to inspect the special tokens and padding.
# A stray argument-less `BT.decode()` call that used to follow was removed:
# decode needs the ids to decode, so that call would raise instead of
# displaying the string shown in the recorded output.
BT.decode(input_ids)

'[CLS] hello my name is tj [SEP] [PAD] [PAD]'

In [3]:
# training loop
#
# Train a small PointerStyleEvolver on the toy data with train_dagger.
# NOTE(review): the recorded run of this cell ends with
# "ValueError: too many values to unpack (expected 2)" raised during the
# train_dagger call; the cause is not visible from this notebook.

import torch
from evo import train_dagger, PointerStyleEvolver
from data import unsupervised_loader, StratifiedInfiniteSampler
from utils import BT
from torch.optim import AdamW
from run import sample

# Toy-sized model with pointer attention enabled and dropout disabled.
model = PointerStyleEvolver(
    d_model=64,
    nhead=8,
    max_len=10,
    dim_feedforward=256,
    dropout=0,
    encoder_layers=3,
    decoder_layers=3,
    pointer_attn=True,
)

# NOTE(review): the model is put in eval mode right before training; with
# dropout=0 this may make no difference, but confirm it is intended.
model.eval()

optim = AdamW(model.parameters(), lr=3e-4)

train_loader = unsupervised_loader(
    path='data/toy/toy.jsonl',
    max_len=10,
    tokenizer=BT,
    batch_size=2,
    sampler=StratifiedInfiniteSampler
)

# Sampler settings handed to train_dagger as a single dict; the same key
# names are passed as keyword arguments to `sample` elsewhere in this notebook.
pf_params = {
    'M': 3,
    'threshold': 1,
    'resample_at': 20 # need to fix this...
}

# The positional values 100, 1e9, 1e9 are interpreted by train_dagger; their
# meaning is not visible here -- TODO document once confirmed.
train_dagger(model, optim, train_loader, 100, 1e9, 1e9, pf_params, max_iters=10)

  from .autonotebook import tqdm as notebook_tqdm
2024-09-13 22:22:28,916 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-uncased
tokenizing trajectories: 100%|██████████| 4/4 [00:00<00:00, 1893.80it/s]
INFO:train:step: 0


torch.Size([6, 10])
torch.Size([6, 1, 5])
torch.Size([6, 10])
torch.Size([6, 2, 5])
torch.Size([6, 10])
torch.Size([6, 3, 5])
torch.Size([6, 10])
torch.Size([6, 4, 5])
torch.Size([6, 10])
torch.Size([6, 5, 5])
torch.Size([6, 10])
torch.Size([6, 6, 5])
torch.Size([6, 10])
torch.Size([6, 7, 5])
torch.Size([6, 10])
torch.Size([6, 8, 5])
torch.Size([6, 10])
torch.Size([6, 9, 5])
torch.Size([2, 10])
torch.Size([2, 10, 5])
torch.Size([2, 1, 512, 5])
torch.Size([2, 10])
torch.Size([2, 1, 512, 5])


ValueError: too many values to unpack (expected 2)

In [81]:
# Scratch cell: an earlier experiment with get_one_hot_align_ids is kept
# commented out below.
# from run import get_one_hot_align_ids

# input_ids = torch.randint(1, 10, size=(1, 10))
# output_ids = torch.randint(1, 10, size=(1, 10))

# get_one_hot_align_ids(input_ids, output_ids)[0].shape

from data import get_simalign_tgts

# NOTE(review): this call passes no arguments and, per the recorded output,
# raises TypeError for the missing required positionals 's1' and 's2'.
# Supply both arguments (or remove the cell) before sharing the notebook.
get_simalign_tgts()

TypeError: get_simalign_tgts() missing 2 required positional arguments: 's1' and 's2'