In [1]:
!ls ../../../../UD_Ukrainian-IU

CONTRIBUTING.md  LICENSE.txt  stats.xml		   uk_iu-ud-test.conllu
eval.log	 README.md    uk_iu-ud-dev.conllu  uk_iu-ud-train.conllu


In [2]:
PATH = '../../../../UD_Ukrainian-IU/'
train_filename = PATH + 'uk_iu-ud-train.conllu'
dev_filename = PATH + 'uk_iu-ud-dev.conllu'
test_filename = PATH + 'uk_iu-ud-test.conllu'

In [3]:
from collections import OrderedDict
from conllu import parse

with open(train_filename) as f:
    data = f.read()

trees = parse(data)
tree = trees[0]
print(tree[0])

for node in tree:
    head = node["head"]
    print("{} <-- {}".format(node["form"],
                             tree[head - 1]["form"]
                             if head > 0 else "root"))

OrderedDict([('id', 1), ('form', 'У'), ('lemma', 'у'), ('upostag', 'ADP'), ('xpostag', 'Spsl'), ('feats', OrderedDict([('Case', 'Loc')])), ('head', 2), ('deprel', 'case'), ('deps', [('case', 2)]), ('misc', OrderedDict([('Id', '0003'), ('LTranslit', 'u'), ('Translit', 'U')]))])
У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була


In [4]:
from enum import Enum

class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

def oracle(stack, queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    top_queue = queue[0]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def trace_actions(tree):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    actions = []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue if len(queue) > 0 else [None],
                        relations)
        actions.append(action)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return actions, relations

trace_actions(tree)

([<Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.RIGHT: 'right'>,
  <Actions.RIGHT: 'right'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.LEFT: 'left'>,
  <Actions.RIGHT: 'right'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.RIGHT: 'right'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.RIGHT: 'right'>,
  <Actions.RIGHT: 'right'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.RIGHT: 'right'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.RIGHT: 'right'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>],
 [(1, 2),
  (3, 4),
  (4, 2),
  (5, 4),
  (2, 6),
  (6, 0),
  (7, 8),
  (8, 6),
  (9, 10),
  (10, 8),
  (11, 10),
  (12, 13),
  (13, 11),
  (14, 6)])

In [5]:
def extract_features(stack, queue):
    features = {}
    if stack:
        stack_elem = stack[-1]
        features['s0_word'] = stack_elem['form']
        features['s0_lemma'] = stack_elem['lemma']
        features['s0_tag'] = stack_elem['upostag']
    if len(stack) > 1:
        stack_elem = stack[-2]
        features['s1_tag'] = stack_elem['upostag']
    if queue:
        queue_elem = queue[0]
        features['q0_word'] = queue_elem['form']
        features['q0_lemma'] = queue_elem['lemma']
        features['q0_tag'] = queue_elem['upostag']
    if len(queue) > 1:
        queue_elem = queue[1]
        features['q1_word'] = queue_elem['form']
        features['q1_tag'] = queue_elem['upostag']
    if len(queue) > 2:
        queue_elem = queue[2]
        features['q2_tag'] = queue_elem['upostag']
    if len(queue) > 3:
        queue_elem = queue[3]
        features['q3_tag'] = queue_elem['upostag']

    return features

In [6]:
def get_data(tree, feature_extractor, oracle):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    labels = []
    features = []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue if len(queue) > 0 else [None],
                        relations)
        features.append(feature_extractor(stack, queue))
        labels.append(action.name.lower())
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

train_labels = []
train_features = []
for tree in trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t['id']) == int], extract_features, oracle)
    train_features += tree_features
    train_labels += tree_labels

len(train_labels), len(train_features)

(190298, 190298)

In [7]:
with open(dev_filename) as f:
    data = f.read()

test_trees = parse(data)

test_labels = []
test_features = []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t['id']) == int], extract_features, oracle)
    test_features += tree_features
    test_labels += tree_labels

len(test_labels), len(test_features)

(25820, 25820)

In [8]:
from sklearn.feature_extraction import DictVectorizer
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)
print("Total number of features: ", len(vec.get_feature_names()))
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

Total number of features:  111126


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(random_state=1, solver='sag', multi_class="multinomial", max_iter=1000, n_jobs=-1)
lr.fit(train_features_vectorized, train_labels)
y_pred = lr.predict(test_features_vectorized)

print(classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

        left       0.86      0.87      0.86      6371
      reduce       0.85      0.78      0.81      6875
       right       0.75      0.79      0.77      5996
       shift       0.85      0.87      0.86      6578

    accuracy                           0.83     25820
   macro avg       0.83      0.83      0.83     25820
weighted avg       0.83      0.83      0.83     25820



In [10]:
tree = test_trees[0]
tree_features, tree_labels = get_data([t for t in tree if type(t['id']) == int], extract_features, oracle)
tree_features_vectorized = vec.transform(tree_features)
y_pred = lr.predict(tree_features_vectorized)
list(y_pred)

['right',
 'right',
 'shift',
 'shift',
 'shift',
 'left',
 'left',
 'shift',
 'reduce',
 'reduce',
 'left',
 'left',
 'right',
 'shift',
 'left',
 'right',
 'reduce',
 'right',
 'reduce',
 'reduce',
 'reduce']

In [12]:
def build_relations(tree, model, vectorizer):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []

    while (queue or stack):
        if stack and not queue:
            stack.pop()
            continue

        features = extract_features(stack, queue)
        action = model.predict(vectorizer.transform(features))[0]
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action:", action)
    return sorted(relations)

total, tp = 0, 0
for i, tree in enumerate(test_trees):
    tree2 = [t for t in tree if type(t['id']) == int]
    golden = [(node['id'], node['head']) for node in tree2]
    try:
        predicted = build_relations(tree2, lr, vec)
    except:
        print(i, [t['id'] for t in tree2])
        raise
    total += len(tree2)
    tp += len(set(predicted).intersection(set(golden)))

print('Total: {}'.format(total))
print('Correct: {}'.format(tp))

Total: 12574
Correct: 8717


## Let's try to improve features

According to https://books.google.com.ua/books?id=k3iiup7HB9UC&pg=PA31&lpg=PA31&dq=typical+feature+model+of+transition+based+parsing&source=bl&ots=z9QdIRGfz_&sig=ACfU3U1TvLWCgAL0PGu5plpoi09iTHyJ6w&hl=uk&sa=X&ved=2ahUKEwjLr_mM7ZLpAhUIuIsKHYw5Dp0Q6AEwAXoECAoQAQ#v=onepage&q&f=true

In [13]:
def get_left_right_deps(element):
    elem_id = element['id']
    if type(elem_id) == tuple:
        elem_id = elem_id[0] + 0.1 * elem_id[-1]

    left_deps, right_deps = [], []
    for dep in (element['deps'] or []):
        dep_id = dep[1]
        if type(dep_id) == tuple:
            dep_id = dep_id[0] + 0.1 * dep_id[-1]
        if dep_id > elem_id:
            right_deps.append((dep, dep_id))
        else:
            left_deps.append((dep, dep_id))
    leftmost_dep = None
    rightmost_dep = None
    if left_deps:
        leftmost_dep = list(sorted(left_deps, key=lambda x:x[1]))[0]
    if right_deps:
        rightmost_dep = list(sorted(right_deps, key=lambda x:x[1], reverse=True))[0]
    return leftmost_dep, rightmost_dep

def extract_more_features(stack, queue):
    features = {}
    if stack:
        stack_elem = stack[-1]
        features['s0_word'] = stack_elem['form']
        features['s0_lemma'] = stack_elem['lemma']
        features['s0_tag'] = stack_elem['upostag']
        leftmost_dep, rightmost_dep = get_left_right_deps(stack_elem)
        if leftmost_dep:
            features['ldep_s0_deprel'] = leftmost_dep[0][0]
        if rightmost_dep:
            features['rdep_s0_deprel'] = rightmost_dep[0][0]
    if len(stack) > 1:
        stack_elem = stack[-2]
        features['s1_tag'] = stack_elem['upostag']
        leftmost_dep, rightmost_dep = get_left_right_deps(stack_elem)
        if leftmost_dep:
            features['ldep_s1_deprel'] = leftmost_dep[0][0]
        if rightmost_dep:
            features['rdep_s1_deprel'] = rightmost_dep[0][0]
    if queue:
        queue_elem = queue[0]
        features['q0_word'] = queue_elem['form']
        features['q0_lemma'] = queue_elem['lemma']
        features['q0_tag'] = queue_elem['upostag']
        leftmost_dep, rightmost_dep = get_left_right_deps(stack_elem)
        if leftmost_dep:
            features['ldep_q0_deprel'] = leftmost_dep[0][0]
        if rightmost_dep:
            features['rdep_q0_deprel'] = rightmost_dep[0][0]
    if len(queue) > 1:
        queue_elem = queue[1]
        features['q1_word'] = queue_elem['form']
        features['q1_tag'] = queue_elem['upostag']
        leftmost_dep, rightmost_dep = get_left_right_deps(queue_elem)
        if leftmost_dep:
            features['ldep_q1_deprel'] = leftmost_dep[0][0]
        if rightmost_dep:
            features['rdep_q1_deprel'] = rightmost_dep[0][0]
    if len(queue) > 2:
        queue_elem = queue[2]
        features['q2_tag'] = queue_elem['upostag']
    if len(queue) > 3:
        queue_elem = queue[3]
        features['q3_tag'] = queue_elem['upostag']

    return features

In [14]:
train_labels = []
train_features = []
for tree in trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t['id']) == int], extract_more_features, oracle)
    train_features += tree_features
    train_labels += tree_labels

print('Train labels & features:', len(train_labels), len(train_features))

with open(dev_filename) as f:
    data = f.read()

test_trees = parse(data)

test_labels = []
test_features = []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t['id']) == int], extract_more_features, oracle)
    test_features += tree_features
    test_labels += tree_labels

print('Test labels & features:', len(test_labels), len(test_features))

Train labels & features: 190298 190298
Test labels & features: 25820 25820


In [15]:
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)
print("Total number of features: ", len(vec.get_feature_names()))
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

Total number of features:  111523


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lrf = LogisticRegression(random_state=1, solver='sag', multi_class="multinomial", max_iter=1000, n_jobs=-1)
lrf.fit(train_features_vectorized, train_labels)
y_pred = lrf.predict(test_features_vectorized)

print(classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

        left       0.94      0.95      0.94      6371
      reduce       0.90      0.85      0.88      6875
       right       0.80      0.83      0.82      5996
       shift       0.87      0.87      0.87      6578

    accuracy                           0.88     25820
   macro avg       0.88      0.88      0.88     25820
weighted avg       0.88      0.88      0.88     25820



In [18]:
def build_relations(tree, model, vectorizer):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []

    while (queue or stack):
        if stack and not queue:
            stack.pop()
            continue

        features = extract_more_features(stack, queue)
        action = model.predict(vectorizer.transform(features))[0]
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action:", action)
    return sorted(relations)

total, tp = 0, 0
for i, tree in enumerate(test_trees):
    tree2 = [t for t in tree if type(t['id']) == int]
    golden = [(node['id'], node['head']) for node in tree2]
    try:
        predicted = build_relations(tree2, lrf, vec)
    except Exception as e:
        print(i, [t['id'] for t in tree2])
        raise
    total += len(tree2)
    tp += len(set(predicted).intersection(set(golden)))

print('Total: {}'.format(total))
print('Correct: {}'.format(tp))

Total: 12574
Correct: 9735


## Let's try XGBoost (just a basic one w/o meta arguments optimization)

In [19]:
from xgboost import XGBClassifier
model = XGBClassifier(random_state=42, n_estimators=1000, n_jobs=-1)
model.fit(train_features_vectorized, train_labels)
test_pred = model.predict(test_features_vectorized)
print(classification_report(test_labels, test_pred))

              precision    recall  f1-score   support

        left       0.94      0.96      0.95      6371
      reduce       0.93      0.85      0.89      6875
       right       0.82      0.88      0.85      5996
       shift       0.89      0.89      0.89      6578

    accuracy                           0.89     25820
   macro avg       0.89      0.89      0.89     25820
weighted avg       0.90      0.89      0.89     25820



In [None]:
total, tp = 0, 0
for i, tree in enumerate(test_trees):
    tree2 = [t for t in tree if type(t['id']) == int]
    golden = [(node['id'], node['head']) for node in tree2]
    try:
        predicted = build_relations(tree2, model, vec)
    except Exception as e:
        print(i, [t['id'] for t in tree2])
        raise
    total += len(tree2)
    tp += len(set(predicted).intersection(set(golden)))

print('Total: {}'.format(total))
print('Correct: {}'.format(tp))

---------------
works for ages......


## Just Anything Else

In [20]:
from enum import Enum

class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"
    RIGHT2 = "right2"

def adhoc_oracle(stack, queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    top_queue = queue[0]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    elif len(queue) > 1 and queue[1]["head"] == top_stack["id"]:
        return Actions.RIGHT2
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def trace_adhoc_actions(tree):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    actions = []
    while queue or stack:
        action = adhoc_oracle(stack if len(stack) > 0 else None,
                        queue if len(queue) > 0 else [None],
                        relations)
        actions.append(action)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.RIGHT2:
            relations.append((queue[1]["id"], stack[0]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return actions, relations

trace_adhoc_actions(tree)

([<Actions.SHIFT: 'shift'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.LEFT: 'left'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.RIGHT: 'right'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.LEFT: 'left'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.LEFT: 'left'>,
  <Actions.LEFT: 'left'>,
  <Actions.RIGHT: 'right'>,
  <Actions.RIGHT: 'right'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.SHIFT: 'shift'>,
  <Actions.LEFT: 'left'>,
  <Actions.LEFT: 'left'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.RIGHT: 'right'>,
  <Actions.RIGHT: 'right'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.RIGHT: 'right'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>,
  <Actions.REDUCE: 'reduce'>],
 [(2, 3),
  (1, 3),
  (5, 4),
  (4, 6),
  (7, 8),
  (6, 8),
  (3, 8),
  (8, 0),
  (9, 8),
  (11, 12),
  (10, 12),
  (12, 8),
  (13, 12),
  (14, 8)])

In [21]:
def get_data_adhoc(tree, feature_extractor, oracle):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    labels = []
    features = []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue if len(queue) > 0 else [None],
                        relations)
        features.append(feature_extractor(stack, queue))
        labels.append(action.name.lower())
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.RIGHT2:
            relations.append((queue[1]["id"], stack[0]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

train_labels = []
train_features = []
for tree in trees:
    tree_features, tree_labels = get_data_adhoc(
        [t for t in tree if type(t['id']) == int], extract_more_features, adhoc_oracle
    )
    train_features += tree_features
    train_labels += tree_labels

print('Train labels & features:', len(train_labels), len(train_features))

with open(dev_filename) as f:
    data = f.read()

test_trees = parse(data)

test_labels = []
test_features = []
for tree in test_trees:
    tree_features, tree_labels = get_data_adhoc([t for t in tree if type(t['id']) == int], extract_more_features, adhoc_oracle)
    test_features += tree_features
    test_labels += tree_labels

print('Test labels & features:', len(test_labels), len(test_features))

Train labels & features: 190298 190298
Test labels & features: 25820 25820


In [22]:
lrf = LogisticRegression(random_state=1, solver='sag', multi_class="multinomial", max_iter=1000, n_jobs=-1)
lrf.fit(train_features_vectorized, train_labels)
y_pred = lrf.predict(test_features_vectorized)

print(classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

        left       0.93      0.95      0.94      6371
      reduce       0.90      0.86      0.88      6875
       right       0.80      0.84      0.82      5996
      right2       0.81      0.84      0.82      1265
       shift       0.83      0.81      0.82      5313

    accuracy                           0.86     25820
   macro avg       0.85      0.86      0.86     25820
weighted avg       0.87      0.86      0.86     25820



In [23]:
def build_adhoc_relations(tree, model, vectorizer):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []

    while (queue or stack):
        if stack and not queue:
            stack.pop()
            continue

        features = extract_more_features(stack, queue)
        action = model.predict(vectorizer.transform(features))[0]
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.RIGHT2:
            relations.append((queue[1]["id"], stack[0]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action:", action)
    return sorted(relations)


total, tp = 0, 0
for i, tree in enumerate(test_trees):
    tree2 = [t for t in tree if type(t['id']) == int]
    golden = [(node['id'], node['head']) for node in tree2]
    try:
        predicted = build_adhoc_relations(tree2, lrf, vec)
    except Exception as e:
        print(i, [t['id'] for t in tree2])
        raise
    total += len(tree2)
    tp += len(set(predicted).intersection(set(golden)))

print('Total: {}'.format(total))
print('Correct: {}'.format(tp))
print('LAS: {}'.format(tp/total))

Total: 12574
Correct: 9771
LAS: 0.7770796882455862


# Перевірка на нових даних

In [24]:
import pymorphy2
import tokenize_uk

In [25]:
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [26]:
DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

In [60]:
from collections import OrderedDict

sentences = [
    'Не працюй неохоче або без турбот про загальне добро, без належної уважливости або в розсіяності.',
    'Зробивши кілька ковтків, я поглянув на обриси хмар.',
    'На хвилину маємо перервати історію про студентів і лотерею, тому що, коли ми заговорили про адитивність очікуваної цінності, я не можу не розповісти про одне з найкрасивіших з відомих мені доведень.'
]

for sentence in sentences:
    print(sentence)
    parsed_sentence = OrderedDict()
    i = 1
    for word in tokenize_uk.tokenize_words(sentence):
        parsed_word = morph.parse(word)[0]
        parsed_sentence[i] = {
            'id': i,
            'form': word,
            'lemma': parsed_word.normal_form,
            'upostag': normalize_pos(parsed_word),
            'deps': [], # Here our new advanced feature set breaks
        }
        i += 1
    predicted = build_relations(list(parsed_sentence.values()), lrf, vec)
    parsed_sentence[0] = {'id': 0, 'form': 'ROOT', 'lemma': 'ROOT'}
    for pair in predicted:
        from_, to_ = pair
        print('{} <- {}'.format(parsed_sentence[to_]['form'], parsed_sentence[from_]['form']))

Не працюй неохоче або без турбот про загальне добро, без належної уважливости або в розсіяності.
працюй <- Не
ROOT <- працюй
працюй <- неохоче
загальне <- добро
належної <- уважливости
розсіяності <- або
розсіяності <- в
уважливости <- розсіяності
працюй <- .
Зробивши кілька ковтків, я поглянув на обриси хмар.
ROOT <- Зробивши
кілька <- ковтків
Зробивши <- поглянув
обриси <- хмар
Зробивши <- .
На хвилину маємо перервати історію про студентів і лотерею, тому що, коли ми заговорили про адитивність очікуваної цінності, я не можу не розповісти про одне з найкрасивіших з відомих мені доведень.
маємо <- На
ROOT <- маємо
маємо <- перервати
перервати <- історію
історію <- про
про <- студентів
тому <- що
ми <- заговорили
заговорили <- про
про <- адитивність
очікуваної <- цінності
можу <- не
я <- можу
розповісти <- не
можу <- розповісти
розповісти <- про
розповісти <- одне
відомих <- мені
мені <- доведень
можу <- .
