In [1]:
import os

from IPython.display import display, Markdown, Image

In [2]:
REPO_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
TASK_PATH = os.path.join(REPO_PATH, "tasks", "11-nn.md")
DATA_PATH = '/home/dima/Projects/UD_Ukrainian-IU'

In [3]:
def show_markdown(path):
    with open(path, 'r') as fh:
        content = fh.read()
    display(Markdown(content))

In [4]:
show_markdown(TASK_PATH)

# I. Нейромережі

У цьому завданні вам треба спробувати покращити з використанням нейромереж одну з двох ваших попередніх робіт:

1. Доробити парсер залежностей
2. Доробити класифікатор звернень до служби 1551

## Парсер залежностей

На основі FNN створіть класифікатор типу залежності. Для цього використайте:
- [UD-корпус для української мови](https://github.com/UniversalDependencies/UD_Ukrainian-IU/)
- парсер, який ви розробили в завданні 8 (або, якщо вам не вдалося реалізувати свій парсер, то можна взяти за основу [код із практичного заняття](../lectures/08-dep-parser-uk.ipynb))
- [векторні представлення слів для української мови](http://lang.org.ua/en/models/#anchor4)

Також переробіть свій парсер так, щоб замість використання ознак, визначених вручну, він покладався для вибору наступного переходу на передбачення LSTM-нейромережі, яка на вхід отримує поточні слова з тегами зі стеку та буферу (по 3 слова). Опис подібної мережі можна побачити у [цій статті](https://arxiv.org/pdf/1708.08959.pdf).

Обрахуйте якість класифікації та LAS для вашого парсера.

## Класифікатор звернень до служби 1551

Переробіть класифікатор звернень, який ви розробляли у завданні 10, так, щоб він використовував FNN на векторі документу та LSTM на векторах окремих слів. Порівняйте результати.

# II. Курсовий проєкт

Для свого курсового проєкту побудуйте рішення (чи кілька рішень), що перевершують по якості ваше базове рішення. Якість міряйте розробленими раніше метриками.
Код для курсового проєкту повинен бути у вашому репозиторії. У директорії `students/` в теці з вашим іменем збережіть файл з посиланням на код вашого рішення. Опишіть ваші результати.

# Оцінка

I. За виконання одного з завдань ви можете отримати 80 балів. Якщо бажаєте, то можете виконати обидва і отримати 120 балів :)

II. За покращене рішення з курсового проєкту ви можете отримати 20 балів.


## Dependancy parser

In [5]:
from collections import OrderedDict
from conllu import parse
from enum import Enum

In [6]:
%%time

with open(os.path.join(DATA_PATH, "uk_iu-ud-train.conllu"), "r") as f:
    train_trees = parse(f.read())

with open(os.path.join(DATA_PATH, "uk_iu-ud-dev.conllu"), "r") as f:
    test_trees = parse(f.read())

CPU times: user 3.02 s, sys: 136 ms, total: 3.15 s
Wall time: 3.16 s


In [7]:
print(len(train_trees), len(test_trees))

5496 672


In [8]:
def print_tree(tree):
    for node in tree:
        head = node["head"]
        print("{} <-- {}".format(node["form"],
                             tree[head - 1]["form"]
                             if head > 0 else "root"))

def check_tree(tree):
    for n in tree:
        if not isinstance(n["id"], int):
            return True
    return False

In [9]:
tree = train_trees[0]
print_tree(tree)

У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була


In [10]:
print("Bad trees: " )
print("Train:", len(list(filter(check_tree, train_trees))))
print("Test:", len(list(filter(check_tree, test_trees))))

Bad trees: 
Train: 197
Test: 16


In [11]:
clean_train_trees = list(filter(lambda t: not check_tree(t), train_trees))
clean_test_trees = list(filter(lambda t: not check_tree(t), test_trees))

print(len(clean_train_trees), len(clean_test_trees))

5299 656


In [12]:
def intersects(n1, n2):
    s1 = n1['id'] if n1['head'] > n1['id'] else n1['head']
    e1 = n1['head'] if n1['head'] > n1['id'] else n1['id']
    s2 = n2['id'] if n2['head'] > n2['id'] else n2['head']
    e2 = n2['head'] if n2['head'] > n2['id'] else n2['id']
    
    return (s1 < s2 and e1 > s2 and e2 > e1) or (s2 < s1 and e2 > s1 and e1 > e2)

def non_projective(tree):
    for n1 in tree:
        for n2 in tree:
            if n1['id'] < n2['id'] and intersects(n1, n2):
                return True
            
    return False

In [13]:
non_projective_train_trees = list(filter(non_projective, clean_train_trees))
non_projective_test_trees = list(filter(non_projective, clean_test_trees))

print(len(non_projective_train_trees), len(non_projective_test_trees))

414 57


In [14]:
np_tree = non_projective_train_trees[21]
print_tree(np_tree)

Звісно <-- було
не <-- було
було <-- root
жодного <-- способу
способу <-- було
дізнатись <-- способу
чи <-- спостерігають
спостерігають <-- дізнатись
за <-- вами
вами <-- спостерігають
саме <-- цей
у <-- проміжок
цей <-- проміжок
проміжок <-- спостерігають
часу <-- проміжок
. <-- було


In [15]:
projective_train_trees = list(filter(lambda t: not non_projective(t), clean_train_trees))
projective_test_trees = list(filter(lambda t: not non_projective(t), clean_test_trees))

print(len(projective_train_trees), len(projective_test_trees))

4885 599


### Design actions and the oracle

In [16]:
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

In [17]:
def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

In [18]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def trace_actions(tree, log=True):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        if log:
            print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
            print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
            print("Relations:", relations)
            print(action)
            print("========================")
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()           
        else:
            print("Unknown action.")
    if log:
        print("Gold relations:")
        print([(node["id"], node["head"]) for node in tree])
        print("Retrieved relations:")
        print(sorted(relations))

#### show prohectiva and non-projective trees result

In [19]:
# trace_actions(tree)

In [20]:
# trace_actions(np_tree)

### Feature extraction

In [21]:
import pandas as pd

In [22]:
def extract_features(stack, queue, relations):
    
    features = dict()
    
    if len(stack) > 1:        
        features["s0-word"] = stack[-2]["form"]
        features["s0-lemma"] = stack[-2]["lemma"]
        features["s0-tag"] = stack[-2]["upostag"]
        features["s0-rchildren-num"] = len([r for r in relations if r[1] == stack[-2]['id']])
        features["s0-lchildren-num"] = len([r for r in relations if r[0] == stack[-2]['id']])
        if stack[-2]["feats"]:
            for k, v in stack[-2]["feats"].items():
                features["s0-" + k] = v
    
    if len(stack) > 2:
        features["s1-word"] = stack[-3]["form"]
        features["s1-tag"] = stack[-3]["upostag"]
    
    if len(stack) > 3:
        features["s2-tag"] = stack[-4]["upostag"]
        
    if len(stack) > 4:
        features["s3-tag"] = stack[-5]["upostag"]
    
    if len(stack) > 1:
        queue_top = stack[-1]
        features["q0-word"] = stack[-1]["form"]
        features["q0-lemma"] = stack[-1]["lemma"]
        features["q0-tag"] = stack[-1]["upostag"]
        features["q0-rchildren-num"] = len([r for r in relations if r[1] == stack[-1]['id']])
        features["q0-lchildren-num"] = len([r for r in relations if r[0] == stack[-1]['id']])
        if stack[-1]["feats"]:
            for k, v in stack[-1]["feats"].items():
                features["q0-" + k] = v
    
    if len(queue) > 0:        
        features["q1-word"] = queue[0]["form"]
        features["q1-tag"] = queue[0]["upostag"]
    
    if len(queue) > 1:
        features["q2-tag"] = queue[1]["upostag"]
    
    if len(queue) > 2:
        features["q3-tag"] = queue[2]["upostag"]
       
    if len(stack) > 1:
        features["distance"] = stack[-1]["id"] - stack[-2]["id"]
    
    features['q-empty'] = not bool(queue)    
    
    return features


def extract_features_v2(stack, queue):
    features = dict()
    if len(stack) > 0:
        features["s0-word"] = stack[-1]["form"]
        features["s0-lemma"] = stack[-1]["lemma"]
        features["s0-tag"] = stack[-1]["upostag"]
    if len(stack) > 1:
        features["s1-tag"] = stack[-2]["upostag"]
    if queue:
        queue_top = queue[0]
        features["q0-word"] = queue[0]["form"]
        features["q0-lemma"] = queue[0]["lemma"]
        features["q0-tag"] = queue[0]["upostag"]
    if len(queue) > 1:
        queue_next = queue[1]
        features["q1-word"] = queue[1]["form"]
        features["q1-tag"] = queue[1]["upostag"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
    return features

In [23]:
def extract_data(tree):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(extract_features_v2(stack, queue))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

#### train data

In [24]:
train_features, train_labels = [], []
for tree in train_trees:
    tree_features, tree_labels = extract_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels

In [25]:
train_df = pd.DataFrame(train_features)
train_df['target'] = train_labels

In [26]:
print(train_df.shape)

(190298, 12)


#### test data

In [27]:
test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = extract_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

In [28]:
test_df = pd.DataFrame(test_features)
test_df['target'] = test_labels

In [29]:
print(test_df.shape)

(25820, 12)


### Train clasifier

In [30]:
RANDOM_STATE = 0
N_COMP = 500

In [31]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *

#### Logistic regression

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))


Total number of features:  111126


In [34]:
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

In [35]:
lr_clf = LogisticRegression(C=2, solver="sag", multi_class="multinomial", max_iter=1000, 
                            verbose=1, random_state=RANDOM_STATE)

In [36]:
lr_pipe = Pipeline([('vec', vec), ('lr_clf', lr_clf)])

In [37]:
lr_clf.fit(train_features_vectorized, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 332 epochs took 43 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.9s finished


LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='sag',
          tol=0.0001, verbose=1, warm_start=False)

In [38]:
predicted = lr_clf.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.86      0.86      0.86      6371
      reduce       0.85      0.78      0.81      6875
       right       0.75      0.79      0.76      5996
       shift       0.84      0.87      0.85      6578

   micro avg       0.82      0.82      0.82     25820
   macro avg       0.82      0.82      0.82     25820
weighted avg       0.82      0.82      0.82     25820



#### Feedforward neural network

In [39]:
import tensorflow as tf
from gensim.models.keyedvectors import KeyedVectors

In [40]:
label_mapping = dict(zip(np.unique(train_labels), range(4)))
label_mapping_inversed = {v: k for k, v in label_mapping.items()}

In [41]:
train_labels_ohe = tf.keras.utils.to_categorical(list(map(lambda x: label_mapping.get(x), train_labels)))
test_labels_ohe = tf.keras.utils.to_categorical(list(map(lambda x: label_mapping.get(x), test_labels)))

######################

In [92]:
import bz2

In [160]:
def read_embeddings(word_index=None):
    word_2_vec = {}
    with bz2.open("ubercorpus.lowercased.tokenized.300d.bz2", "rt") as f:
        words, ndim = map(int, f.readline().strip().split())
        for line in f:
            values = line.split()
            word = values[0]
            if words:
                if word in word_index:
                    vec = np.asarray(values[1:], dtype=np.float32)
                    word_2_vec[word] = vec
            else:
                vec = np.asarray(values[1:], dtype=np.float32)
                word_2_vec[word] = vec
    return word_2_vec, ndim, words

In [161]:
def get_childs(word, tree):
    return [w for w in tree if w["head"]==word["id"]]


In [162]:
def get_heights(tree, res=None, word=None, k=0):
    if res is None:
        res = {0: 0}
        k = 1
    if word is None:
        word = [w for w in tree if not w["head"]][0]
        res[word["id"]] = k
    for child in get_childs(word, tree):
        res[child["id"]] = k + 1
        get_heights(tree, res, child, k+1)
    return res


In [163]:
def build_vocabulary(trees, form="form"):
    word_index = {}
    pos_index = {}
    label_index = {}
    labels = []
    for tree in trees:
        tree = [t for t in tree if type(t["id"])==int]
        for word in tree:
            deprel = word["deprel"]
            word_id = len(word_index)+1
            pos_id = len(pos_index)+1
            word_t = word[form].lower()
            word_pos = word["upostag"]
            word_index[word_t] = word_index.get(word_t, word_id)
            pos_index[word_pos] = pos_index.get(word_pos, pos_id)
            if deprel not in label_index:
                label_id = len(label_index)
                label_index[deprel] = label_id
            else:
                label_id = label_index.get(deprel)
            labels.append(label_id)
    word_index[ROOT["form"]] = len(word_index)+1
    pos_index[ROOT["upostag"]] = len(pos_index)+1
    return word_index, pos_index, label_index, labels

In [164]:
def build_features(tree, word_index, pos_index, form="form"):
    label_index = {}
    records = []
    labels = []
    heights = get_heights(tree)
    tree = [t for t in tree if type(t["id"])==int]
    for word in tree:
        deprel = word["deprel"]
        word_t = word[form].lower()
        head = tree[word["head"]-1] if word["head"] else ROOT
        head_t = head[form].lower() if word["head"] else "ROOT"
        word_pos = word["upostag"]
        head_pos = head["upostag"]

        dist = word["id"] - head["id"]
        height_diff = heights.get(word["id"], 0) - heights.get(head["id"], 0)
        n_l_child_word = len([_ for w in tree if w["head"]==word["id"] and w["id"]<word["id"]])
        n_r_child_word = len([_ for w in tree if w["head"]==word["id"] and w["id"]>word["id"]])
        n_l_child_head = len([_ for w in tree if w["head"]==head["id"] and w["id"]<head["id"]])
        n_r_child_head = len([_ for w in tree if w["head"]==head["id"] and w["id"]>head["id"]])

        # children
        w_children = [w for w in tree if w["head"]==word["id"]]
        if w_children:
            lm = min(w_children, key=lambda x: x["id"])
            rm = max(w_children, key=lambda x: x["id"])
            w_lr = [word_index.get(lm[form].lower()), word_index.get(rm[form].lower())]
            pos_lr = [pos_index.get(lm["upostag"]), pos_index.get(rm["upostag"])]
        else:
            w_lr = [0, 0]
            pos_lr = [0, 0]
               
        # construct final feature vector
        num_features = [dist, height_diff, heights.get(word["id"], 0), 
                        n_l_child_word, n_r_child_word, 
                        n_r_child_head, n_l_child_head
                       ]
        words = [word_index.get(word_t), word_index.get(head_t), *w_lr]
        tags = [pos_index.get(word_pos), pos_index.get(head_pos), *pos_lr]
        records.append((*words, *tags, *num_features))
    return records, len(words), len(tags), len(num_features)

In [165]:
def process_data(trees, word_index, pos_index, form="form"):
    records = []
    for tree in trees:
        feats, n_w, n_t, n_f = build_features(tree, word_index, pos_index, form)
        records.extend(feats)
    return records, n_w, n_t, n_f

In [144]:
# %%time

# w2v_model = KeyedVectors.load_word2vec_format("ubercorpus.lowercased.tokenized.word2vec.300d", binary=False)

In [166]:
n_train = np.sum([len(tree) for tree in train_trees])

In [193]:
word_index, pos_index, label_index, labels = build_vocabulary(train_trees+test_trees)
# labels = tf.keras.utils.to_categorical(np.asarray(labels))

In [168]:
idx_2_label = {v:k for k,v in label_index.items()}

In [169]:
records, n_words, n_tags, n_feat = process_data(train_trees+test_trees, word_index, pos_index)
X = np.asarray(records)

In [170]:
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = labels[:n_train], labels[n_train:]

In [171]:
word_2_vec, ndim, _ = read_embeddings(word_index=word_index)

In [172]:
X_train.shape, y_train.shape

((92635, 15), (92635, 57))

In [173]:
X_test.shape, y_test.shape

((12340, 15), (12340, 57))

In [174]:
DEFAULT_VEC = np.zeros(ndim, np.float32)

In [175]:
embedding_matrix = np.zeros((len(word_index)+1, ndim))
for word, i in word_index.items():
    embedding_matrix[i] = word_2_vec.get(word, DEFAULT_VEC)

In [176]:
word_embedding_layer = tf.keras.layers.Embedding(len(word_index)+1,
                            ndim,
                            weights=[embedding_matrix],
                            input_length=n_words,
                            trainable=0
                           )

In [177]:
pos_embedding_layer = tf.keras.layers.Embedding(len(pos_index)+1,
                                ndim,
                                input_length=n_tags,
                                trainable=1
                               )

In [178]:
word_sequence_input = tf.keras.layers.Input(shape=(n_words,), dtype='int32')
word_embedded_sequences = word_embedding_layer(word_sequence_input)

In [179]:
pos_sequence_input = tf.keras.layers.Input(shape=(n_tags,), dtype='int32')
pos_embedded_sequences = pos_embedding_layer(pos_sequence_input)

In [180]:
features = tf.keras.layers.Input(shape=(n_feat,))

In [181]:

left = tf.keras.layers.Flatten()(word_embedded_sequences)
right = tf.keras.layers.Flatten()(pos_embedded_sequences)
x = tf.keras.layers.concatenate(inputs=[left, right, features])
x = tf.keras.layers.Dense(192)(x)
x = tf.keras.layers.Dense(96, activation='relu')(x)
preds = tf.keras.layers.Dense(len(label_index), activation='softmax')(x)

In [182]:
model = tf.keras.models.Model(inputs=[word_sequence_input, pos_sequence_input, features], outputs=preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [192]:

model.fit([X_train[:, :n_words], X_train[:, n_words:n_words+n_tags], X_train[:, n_words+n_tags:]], y_train, 
          validation_data=([X_test[:, :n_words], X_test[:, n_words:n_words+n_tags], X_test[:, n_words+n_tags:]], y_test), 
          epochs=6, 
          batch_size=128, 
          verbose=1)

Train on 92635 samples, validate on 12340 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7fe1066676a0>

In [205]:
pred = model.predict([X_test[:, :n_words], X_test[:, n_words:n_words+n_tags], X_test[:, n_words+n_tags:]])

In [206]:
print(classification_report(np.argmax(pred, axis=1), np.argmax(y_test, axis=1)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1135
           1       0.96      0.96      0.96       824
           2       1.00      0.98      0.99      1030
           3       0.96      0.96      0.96      1190
           4       0.78      0.76      0.77       185
           5       1.00      1.00      1.00       656
           6       0.90      0.90      0.90       778
           7       1.00      1.00      1.00      2363
           8       0.54      0.46      0.50       115
           9       0.99      1.00      0.99       438
          10       0.82      0.82      0.82       554
          11       0.98      0.97      0.98       598
          12       0.56      0.71      0.63       101
          13       0.97      0.91      0.94       190
          14       0.96      0.94      0.95       114
          15       0.88      0.86      0.87       591
          16       0.00      0.00      0.00         1
          17       1.00    

  'recall', 'true', average, warn_for)


In [None]:
# callbacks = [
#         tf.keras.callbacks.EarlyStopping(monitor='val_loss',  min_delta=0.001, patience=5),
#         tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001),
#         tf.keras.callbacks.ModelCheckpoint('fnn.h5', monitor='val_acc', verbose=1, save_best_only=True)
# ]

### Calculate the unlabeled attachment score

In [None]:
from tqdm import tqdm

In [None]:
def dep_parse(sentence, oracle, vectorizer=None, t_svd=None, log=False):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue, relations)
            if vectorizer:
                features = vectorizer.transform([features])
            if t_svd:
                features = t_svd.transform(features)
            action = oracle.predict(features)[0]
            if log:
                print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
                print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
                print("Relations:", relations)
                print(action)
                print("========================")
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

In [None]:
total, tp = 0, 0
for tree in tqdm(test_trees):
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lgb_clf, vec, t_svd)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))