In [None]:
%load_ext autoreload
%autoreload 2

## sanity check: overfit a single batch

In [None]:
import torch.nn.functional as F
from de import DependencyEvolver
from transformers import BertTokenizer

# Pretrained BERT tokenizer. In this cell it is only used for its vocabulary
# size, which is passed to the model as `tok_v`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Very small model configuration (16-dim, 1 head, 1 encoder + 1 decoder layer,
# dropout disabled) -- presumably so the overfitting sanity check runs quickly.
# NOTE(review): the exact meaning of `N`, `rel_v` and `pos_v` is defined by
# de.DependencyEvolver, which is not visible here -- presumably the maximum
# sequence length and the relation / POS vocabulary sizes; confirm.
model = DependencyEvolver(
    d_model=16,
    dim_feedforward=8,
    nhead=1,
    dropout=0,
    N=5,
    encoder_layers=1,
    decoder_layers=1,
    tok_v=tokenizer.vocab_size,
    rel_v=2,
    pos_v=3
)

In [None]:
import torch
from torch.optim import AdamW

# A single hand-written example: a 2-tuple followed by six integer tensors,
# each of shape (1, 1, 5) (5 matches the `N=5` the model was built with).
# NOTE(review): what the leading tuple and each tensor represent is defined by
# DependencyEvolver._train, which is not visible here. -1 presumably marks
# positions to ignore -- confirm against the model code.
batch = (
    (8823, 2),
    [
        torch.tensor([[[1, 0, 2, 0, 3]]]),
        torch.tensor([[[-1, -1, 1, -1, -1]]]),
        torch.tensor([[[-1, 2, -1, 2, -1]]]),
        torch.tensor([[[-1, 0, -1, 1, -1]]]),
        torch.tensor([[[-1, 0, -1, 1, -1]]]),
        torch.tensor([[[-1, 2016, -1, 2833, -1]]])
    ]
)

# Both "loaders" are plain lists that repeat the very same batch object, so
# training and evaluation see identical data.
train_loader = [batch for _ in range(1000)]
eval_loader = [batch for _ in range(20)]
# AdamW over all model parameters, using the optimizer's default settings.
optim = AdamW(model.parameters())

In [None]:
# Train on the repeated batch.
# NOTE(review): every argument after the two loaders is positional and its
# meaning is defined by DependencyEvolver._train (not visible here); consider
# passing them by keyword once the parameter names are confirmed.
model._train(optim, train_loader, eval_loader, len(train_loader), 20, 1, 1e9, 100)

## full data creation pipeline

In [None]:
import conllu

# CoNLL-U files are UTF-8 by specification; pin the encoding so that reading
# the treebank does not depend on the platform's locale default.
with open('./data/ud/en_gum-ud-dev.conllu', 'r', encoding='utf-8') as f:
    sentences = conllu.parse(f.read())

### construct adjacent sequences

In [None]:
def family_tree(parsed):
    """Expand a dependency parse from its root into a list of growing sequences.

    Parameters
    ----------
    parsed : sequence of token mappings
        Each token must provide 'id', 'head', 'form', 'upos' and 'deprel'.
        'head' is 0 for the root and otherwise the 'id' of the parent token.

    Returns
    -------
    list of list of tuple, or None
        ``seqs[0]`` holds only the root. Every later sequence lists, in
        sentence order, the tokens that are part of the tree so far. Each
        entry is ``(form, upos, deprel, index, is_leaf, par)`` where

        * ``index``   is the token's position in ``parsed``,
        * ``is_leaf`` is True when the token's head is in the current frontier
          (i.e. the token was attached by this pass),
        * ``par``     is the position of the token's parent *within the same
          sequence*, or -1 for a token whose head is 0.

        None is returned when any token has ``head is None`` or when no token
        has ``head == 0`` (including an empty sentence).

    Tokens and parents are identified by their position / id, never by their
    surface features, so sentences that repeat a (form, upos, deprel) triple
    (for example two occurrences of "the") get the correct parent slots.
    """
    children = {tok['id']: [] for tok in parsed}
    for tok in parsed:
        if tok['head'] is None:
            return None
        if tok['head'] != 0:
            children[tok['head']].append(tok['id'])

    # Without a root there is nothing to expand; report the sentence as
    # unusable instead of leaking a bare StopIteration to the caller.
    root_pos = next((pos for pos, tok in enumerate(parsed) if tok['head'] == 0), None)
    if root_pos is None:
        return None
    root = parsed[root_pos]

    seqs = [[(root['form'], root['upos'], root['deprel'], root_pos, True, -1)]]
    frontier = {root['id']}   # ids whose children are attached in this pass
    expanded = {root['id']}   # every id that has ever been in the frontier

    while frontier:
        seq = []
        slot_of_id = {}       # token id -> position inside `seq`
        next_frontier = []

        for pos, tok in enumerate(parsed):
            if tok['id'] in expanded or tok['head'] in expanded:
                is_leaf = tok['head'] in frontier
                slot_of_id[tok['id']] = len(seq)
                seq.append((tok['form'], tok['upos'], tok['deprel'], pos, is_leaf, None))
                # The next frontier is made of the *children* of the tokens
                # attached in this pass, not of those tokens themselves. Those
                # ids are later included by id and their own children by head,
                # so a pass after the first can add two tree levels at once.
                # NOTE(review): confirm this stepping is intended.
                if is_leaf:
                    next_frontier.extend(children[tok['id']])

        # Fill in the parent slot now that every token's position is known.
        for slot, (form, upos, deprel, pos, is_leaf, _) in enumerate(seq):
            head = parsed[pos]['head']
            par = -1 if head == 0 else slot_of_id.get(head)
            seq[slot] = (form, upos, deprel, pos, is_leaf, par)

        seqs.append(seq)
        frontier = set(next_frontier)
        expanded.update(next_frontier)

    return seqs

### vocabularies (token / relation / POS id maps)

In [None]:
def _load_vocab(path, offset=0):
    """Read one vocabulary entry per line and map it to ``offset + line index``.

    Lines are whitespace-stripped. If the same entry occurs twice, the later
    line's id wins. The file is read as UTF-8 so that non-ASCII entries load
    the same way on every platform.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip(): offset + i for i, line in enumerate(f)}


# The three vocabularies share one id space: token ids come first, relation
# ids follow directly after them, and POS ids follow after the relations.
tok_map = _load_vocab('vocab/gum_tok.vocab')
rel_map = _load_vocab('vocab/rel.vocab', offset=len(tok_map))
pos_map = _load_vocab('vocab/pos.vocab', offset=len(tok_map) + len(rel_map))

In [48]:
# Sanity check: sizes of the token, relation and POS vocabularies, in that order.
print(len(tok_map), len(rel_map), len(pos_map))

17755 69 17


### get labels

In [None]:
from de import INS_ID, CPY_ID, PRO_ID, EOS_ID

def label(seqs, N):
    """Turn consecutive sequences from `family_tree` into padded label rows.

    For every pair ``(seqs[k], seqs[k + 1])`` with ``1 <= k <= len(seqs) - 2``
    one row is produced in each of six channels, returned in this order:
    ``[op, cpy, par, tok, pos, rel]``. The transition from ``seqs[0]`` to
    ``seqs[1]`` is not labelled.

    Row layout (each row is padded with -1 up to length ``N``):

    * slot 0 is -1 in every channel;
    * one slot for each of the first ``N - 2`` entries of the later sequence:
        - entry already present in the earlier sequence (same parsed index):
          ``op`` is PRO_ID if it was flagged as a leaf there, else CPY_ID;
          ``cpy`` is 1 + its position in the earlier sequence; the other
          channels are -1;
        - new entry: ``op`` is INS_ID, ``par`` is 1 + its parent slot,
          ``tok`` comes from ``tok_map`` (unknown forms get
          ``len(tok_map) + len(rel_map) + len(pos_map)``), ``rel`` / ``pos``
          come from ``rel_map`` / ``pos_map``; ``cpy`` is -1;
    * EOS_ID is then appended to the ``op`` row only.

    Raises TypeError when a new entry has ``par`` set to None, and KeyError
    when a relation or POS tag is missing from its map.

    NOTE(review): ``cpy`` and ``par`` values are not clamped, so they can point
    past ``N - 1`` when a sequence is longer than ``N - 2`` entries.
    """
    pad = -1
    traj = [[] for _ in range(6)]

    for step in range(1, len(seqs) - 1):
        before, after = seqs[step], seqs[step + 1]

        # Index the earlier sequence once: leaf flag and slot per parsed index.
        was_leaf = {}
        slot_in_before = {}
        for slot, (_, _, _, idx, leaf_flag, _) in enumerate(before):
            was_leaf[idx] = leaf_flag
            slot_in_before.setdefault(idx, slot)

        ops, cpys, pars, toks, poss, rels = ([pad] for _ in range(6))

        for form, upos, deprel, idx, _, par in after[:N - 2]:
            if idx in was_leaf:
                ops.append(PRO_ID if was_leaf[idx] else CPY_ID)
                cpys.append(slot_in_before[idx] + 1)
                for column in (pars, toks, rels, poss):
                    column.append(pad)
            else:
                ops.append(INS_ID)
                cpys.append(pad)
                pars.append(par + 1)
                toks.append(tok_map.get(form, len(tok_map) + len(rel_map) + len(pos_map)))
                rels.append(rel_map[deprel])
                poss.append(pos_map[upos])

        ops.append(EOS_ID)
        for channel, column in zip(traj, (ops, cpys, pars, toks, poss, rels)):
            column.extend([pad] * (N - len(column)))
            channel.append(column)

    return traj

In [None]:
from tqdm import tqdm

# Labelled trajectories, one entry per sentence that converts cleanly.
traj_list = []

# Skip counters:
#   type1 -- family_tree returned None for the sentence
#   type2 -- label raised TypeError for the sentence
type1 = type2 = 0

for parsed in tqdm(sentences):
    seqs = family_tree(parsed)
    if seqs is not None:
        try:
            traj_list.append(label(seqs, 64))
        except TypeError:
            type2 += 1
    else:
        type1 += 1

In [None]:
import pickle
# Persist the labelled trajectories collected in `traj_list` as a pickle file.
with open('data/gum/dev_1.pkl', 'wb') as f:
    pickle.dump(traj_list, f)