In [1]:
from collections import OrderedDict
from conllu import parse
from enum import Enum

with open("UD_Ukrainian-IU/uk_iu-ud-train.conllu", "r") as f:
    data = f.read()

trees = parse(data)

In [2]:

tree = trees[0]

In [3]:
print(tree[0])

OrderedDict([('id', 1), ('form', 'У'), ('lemma', 'у'), ('upostag', 'ADP'), ('xpostag', 'Spsl'), ('feats', OrderedDict([('Case', 'Loc')])), ('head', 2), ('deprel', 'case'), ('deps', [('case', 2)]), ('misc', OrderedDict([('Id', '0003'), ('LTranslit', 'u'), ('Translit', 'U')]))])


In [4]:
for node in tree:
    head = node["head"]
    print("{} <-- {}: {}".format(node["form"],
                                 tree[head - 1]["form"] if head > 0 else "root",
                                 node["deprel"]))

У <-- домі: case
домі <-- була: obl
римського <-- патриція: amod
патриція <-- домі: nmod
Руфіна <-- патриція: flat:title
була <-- root: root
прегарна <-- фреска: amod
фреска <-- була: nsubj
, <-- зображення: punct
зображення <-- фреска: appos
Венери <-- зображення: nmod
та <-- Адоніса: cc
Адоніса <-- Венери: conj
. <-- була: punct


In [5]:
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"
    
    

def oracle(stack, top_queue, relations):
    top_stack = stack[-1]
    if top_stack and not top_queue:
        return Actions.REDUCE
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    elif top_stack["id"] in [i[0] for i in relations] and (top_queue["head"] < top_stack["id"] or [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    else:
        return Actions.SHIFT



In [6]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [7]:
def extract_features(stack, queue):
    features = dict()
    if len(stack) > 0:
        stack_top = stack[-1]
        features["s0-word"] = stack_top["form"]
        features["s0-lemma"] = stack_top["lemma"]
        features["s0-tag"] = stack_top["upostag"]
        if stack_top["feats"]:
            for k, v in stack_top["feats"].items():
                features["s0-" + k] = v
    if len(stack) > 1:
        features["s1-tag"] = stack_top["upostag"]
    if queue:
        queue_top = queue[0]
        features["q0-word"] = queue_top["form"]
        features["q0-lemma"] = queue_top["lemma"]
        features["q0-tag"] = queue_top["upostag"]
        if queue_top["feats"]:
            for k, v in queue_top["feats"].items():
                features["q0-" + k] = v
    if len(queue) > 1:
        queue_next = queue[1]
        features["q1-word"] = queue_next["form"]
        features["q1-tag"] = queue_next["upostag"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
    if stack and queue:
        features["distance"] = queue[0]["id"] - stack[-1]["id"]
    return features

In [78]:
def get_data(tree):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(extract_features(stack, queue))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels, relations

In [79]:
features, labels, relations = get_data(tree)
print("Number of words:", len(tree))
print("Number of actions:", len(labels))
print("List of actions taken:", labels)
print("Features:", len(features))
print("Relations: ", relations)

Number of words: 14
Number of actions: 29
List of actions taken: ['shift', 'left', 'shift', 'shift', 'left', 'right', 'right', 'reduce', 'reduce', 'left', 'right', 'shift', 'left', 'right', 'shift', 'left', 'right', 'right', 'shift', 'left', 'right', 'reduce', 'reduce', 'reduce', 'reduce', 'right', 'reduce', 'reduce', 'reduce']
Features: 29
Relations:  [(1, 2), (3, 4), (4, 2), (5, 4), (2, 6), (6, 0), (7, 8), (8, 6), (9, 10), (10, 8), (11, 10), (12, 13), (13, 11), (14, 6)]


In [10]:
train_features, train_labels = [], []
for tree in trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels

print(len(train_features), len(train_labels))

190298 190298


In [11]:
with open("UD_Ukrainian-IU/uk_iu-ud-test.conllu", "r") as f:
    data = f.read()
test_trees = parse(data)

test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

print(len(test_features), len(test_labels))

35124 35124


In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
import numpy as np

In [13]:
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))


Total number of features:  111260


In [14]:
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

print(len(train_features_vectorized.toarray()), len(test_features_vectorized.toarray()))

190298 35124


In [15]:
train_features_vectorized[5]

<1x111260 sparse matrix of type '<class 'numpy.float64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [16]:
mnb = MultinomialNB()
mnb_pred = mnb.fit(train_features_vectorized, train_labels)

In [17]:
predicted_mnb = mnb.predict(test_features_vectorized)
print(classification_report(test_labels, predicted_mnb))

              precision    recall  f1-score   support

        left       0.76      0.86      0.81      8658
      reduce       0.72      0.47      0.57      9350
       right       0.63      0.77      0.69      8291
       shift       0.77      0.78      0.78      8825

    accuracy                           0.72     35124
   macro avg       0.72      0.72      0.71     35124
weighted avg       0.72      0.72      0.71     35124



In [18]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(50, 15), random_state=3)
clf.fit(train_features_vectorized, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 15), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=3, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [19]:
predicted_clf = clf.predict(test_features_vectorized)
print(classification_report(test_labels, predicted_clf))

              precision    recall  f1-score   support

        left       0.85      0.88      0.86      8658
      reduce       0.80      0.72      0.76      9350
       right       0.74      0.80      0.77      8291
       shift       0.90      0.90      0.90      8825

    accuracy                           0.82     35124
   macro avg       0.82      0.82      0.82     35124
weighted avg       0.82      0.82      0.82     35124



In [20]:
lrc = LogisticRegression(random_state=50, solver="sag", multi_class="multinomial", max_iter=1000, verbose=1)
lrc.fit(train_features_vectorized, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 209 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=50, solver='sag', tol=0.0001, verbose=1,
                   warm_start=False)

In [21]:
predicted_lrc = lrc.predict(test_features_vectorized)

In [22]:
print(classification_report(test_labels, predicted_lrc))

              precision    recall  f1-score   support

        left       0.85      0.87      0.86      8658
      reduce       0.82      0.72      0.77      9350
       right       0.74      0.80      0.77      8291
       shift       0.88      0.89      0.88      8825

    accuracy                           0.82     35124
   macro avg       0.82      0.82      0.82     35124
weighted avg       0.82      0.82      0.82     35124



In [23]:
len(trees)

5496

In [26]:
len(test_trees)

892

In [59]:
def get_features(sent, child_id, parent_id):
    l = len(sent)
    features = dict()
    features["child_lemma"] = sent[child_id]['lemma']
    features["parent_lemma"] = sent[parent_id]['lemma']
    features["child_pos"] = sent[child_id]['upostag']
    features["parent_pos"] = sent[parent_id]['upostag']
    if child_id>0:
        features["left_child_lemma"] = sent[child_id-1]['lemma']
        features["left_child_pos"] = sent[child_id-1]['upostag']
    if child_id<l-1:
        features["right_child_lemma"] = sent[child_id+1]['lemma']
        features["right_child_pos"] = sent[child_id+1]['upostag']
    if parent_id>0:
        features["left_parent_lemma"] = sent[parent_id-1]['lemma']
        features["left_parent_pos"] = sent[parent_id-1]['upostag']
    if parent_id<l-1:
        features["right_parent_lemma"] = sent[parent_id+1]['lemma']
        features["right_parent_pos"] = sent[parent_id+1]['upostag']
    return features

    
def get_connections_from_sentence(s):
    connections = []
    features_list = []
    for node in s:
        head = node["head"]
        if not head is None and head > 0:
            connection = node["deprel"]
            features = get_features(s, node['id']-1, head-1)
            connections.append(connection)
            features_list.append(features)
    return connections, features_list
            
c, f = get_connections_from_sentence(tree)

In [65]:
def get_labeled_data(t):
    connections = []
    features = []
    for tree in t:
        c, f = get_connections_from_sentence(tree)
        connections.extend(c)
        features.extend(f)
    return connections, features

In [73]:
train_connections, train_labeled_features = get_labeled_data(trees)
test_connections, test_labeled_features = get_labeled_data(test_trees)

In [70]:
len(test_labeled_features)

16224

In [83]:
v2 = DictVectorizer()
vec2 = v2.fit(train_labeled_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))


Total number of features:  68737


In [84]:
train_labeled_features_vectorized = vec2.transform(train_labeled_features)
test_labeled_features_vectorized = vec2.transform(test_labeled_features)

print(len(train_labeled_features_vectorized.toarray()), len(test_labeled_features_vectorized.toarray()))

86905 16224


In [75]:
lrc2 = LogisticRegression(random_state=50, solver="sag", multi_class="multinomial", max_iter=1000, verbose=1)
lrc2.fit(train_labeled_features_vectorized, train_connections)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 282 epochs took 105 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=50, solver='sag', tol=0.0001, verbose=1,
                   warm_start=False)

In [76]:
predicted_lrc2 = lrc2.predict(test_labeled_features_vectorized)

In [80]:
print(classification_report(test_connections, predicted_lrc2))

                     precision    recall  f1-score   support

                acl       0.74      0.59      0.66        66
            acl:adv       1.00      0.14      0.25         7
          acl:relcl       0.78      0.84      0.81       132
              advcl       0.40      0.39      0.39       153
           advcl:sp       0.00      0.00      0.00         5
          advcl:svc       0.00      0.00      0.00         5
             advmod       0.91      0.99      0.95       743
         advmod:det       1.00      0.83      0.91         6
               amod       0.94      0.96      0.95      1648
              appos       0.56      0.52      0.54       128
                aux       1.00      1.00      1.00        27
               case       0.96      0.98      0.97      1580
                 cc       0.96      0.98      0.97       630
              ccomp       0.73      0.51      0.60        88
           compound       0.73      0.77      0.75       100
               conj    

  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
def get_relations(tree):
    relations = [(node["id"], node["head"]) for node in tree]
    return relations

In [104]:
def dep_parse_labels(sentence, model, vectorizer):
    relations = get_relations(sentence)
    labels = []
    for rel in relations:
        child_id, parent_id = rel
        if not parent_id is None and parent_id>0:
            features = get_features(sentence, child_id-1, parent_id-1)
            label = model.predict(vectorizer.transform([features]))
            labels.extend(label)
    return labels

In [105]:
total, tp, full_match = 0, 0, 0
for tree in test_trees:
    golden, _ = get_connections_from_sentence(tree)
    predicted = dep_parse_labels(tree, lrc2, vec2)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

Total: 17148
Correctly defined: 6887
UAS: 0.4
Full match: 0.32


In [None]:


print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(test_trees), 2))
