In [None]:
%load_ext autoreload
%autoreload 2

import sys
if '..' not in sys.path:
    sys.path.append('..')

In [None]:
import conllu

# CoNLL-U treebanks are UTF-8 encoded. Pass the encoding explicitly so the
# read does not depend on the platform's default locale encoding.
with open('../data/ud/en_ewt-ud-train.conllu', 'r', encoding='utf-8') as f:
    sentences = conllu.parse(f.read())

In [None]:
!python ../dep.py ../data/ud/en_ewt-ud-train.conllu ../data/ud_train.jsonl --redundant=1 --weight=0.1

In [None]:
!python ../dep.py ../data/ud/en_ewt-ud-dev.conllu ../data/ud_dev.jsonl --redundant=3 --weight=0.1

In [None]:
from transformers import BertTokenizer
from data import TrajectoryDataset

# One pretrained tokenizer instance is shared by both splits.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Training split: no `limit` is passed.
train_dataset = TrajectoryDataset.from_disk(
    tokenizer=tokenizer,
    max_len=64,
    path='../data/ud/ud_train.jsonl',
)

# Dev split: same settings plus limit=100
# (presumably caps how many examples are loaded; confirm in TrajectoryDataset).
eval_dataset = TrajectoryDataset.from_disk(
    tokenizer=tokenizer,
    max_len=64,
    limit=100,
    path='../data/ud/ud_dev.jsonl',
)

In [None]:
from torch.utils.data import DataLoader
from data import StratifiedInfiniteSampler

# Training batches are drawn by the project's stratified sampler; the same
# value (2) is given to the sampler and to `batch_size`.
# The collate function transposes a batch of per-example tuples into a
# (single-use) zip iterator of per-field tuples.
train_loader = DataLoader(
    dataset=train_dataset,
    sampler=StratifiedInfiniteSampler(train_dataset, 2),
    collate_fn=lambda batch: zip(*batch),
    batch_size=2,
)

# Evaluation runs one example at a time with the default collate function.
# NOTE(review): shuffle=True on an evaluation loader is unusual; confirm intended.
eval_loader = DataLoader(
    dataset=eval_dataset,
    shuffle=True,
    batch_size=1,
)

In [None]:
from torch.optim import AdamW
from model import Evolver

# Model hyperparameters for a local CPU run. max_len=64 is the same value
# passed as `max_len` when the datasets are loaded.
evolver = Evolver(
    device='cpu',
    max_len=64,
    d_model=512,
    nhead=8,
    encoder_layers=6,
    decoder_layers=6,
)

# AdamW over every model parameter with a fixed learning rate.
optim = AdamW(params=evolver.parameters(), lr=3e-4)

In [None]:
from train import train_evolver

# Runs training for a single step on CPU (presumably a local smoke test:
# with train_steps=1 and checkpoint_at=2, confirm whether any checkpoint is
# expected to be written).
# The third positional argument is None; its meaning is defined by
# train_evolver and is not visible in this notebook.
train_evolver(
    evolver, optim, None,
    train_loader, eval_loader,
    device='cpu',
    prefix='test-local',
    train_steps=1,
    grad_accum_steps=1,
    eval_at=1,
    checkpoint_at=2,
    num_particles=5,
    threshold=2,
    temperature=1.0,
)

In [None]:
from train import evaluate_evolver

# Left as a bare expression so the notebook displays whatever
# evaluate_evolver returns for the dev loader on CPU.
evaluate_evolver(
    evolver,
    eval_loader,
    'cpu',
)

In [None]:
# Run the training script as a shell command on CPU, using ../configs/ud.json
# and the run prefix ud-1.0.0.
# NOTE(review): --eval is given a .conllu treebank while --train is given
# .jsonl trajectories; confirm that train.py expects different formats here.
!python ../train.py \
    --train ../data/ud/ud.jsonl \
    --eval ../data/ud/en_ewt-ud-dev.conllu \
    --config ../configs/ud.json \
    --prefix ud-1.0.0 \
    --device cpu

In [None]:
from transformers import BertTokenizer
from data import Seq2SeqDataset

# Build a sequence-to-sequence dataset from the trajectory file with
# denoising=True; a fresh tokenizer is instantiated for this call.
dataset = Seq2SeqDataset.from_trajectories(
    '../data/ud/ud.jsonl',
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
    max_len=64,
    denoising=True,
)

In [None]:
from torch.utils.data import DataLoader
from data import StratifiedInfiniteSampler

# The same value (128) is given to the sampler and to `batch_size`;
# the default collate function is used.
loader = DataLoader(
    dataset=dataset,
    sampler=StratifiedInfiniteSampler(dataset, 128),
    batch_size=128,
)