In [1]:
from collections import OrderedDict
from conllu import parse
from enum import Enum

### 1. Проглянути дані

In [6]:
def read_file_content(file):
    with open(file, "r") as f:
        data = f.read()
        return data

In [7]:
def read_conllu_data(file):
    data = read_file_content(file)
    trees = parse(data)
    return trees

In [8]:
train_trees = read_conllu_data(PATH + "/uk_iu-ud-train.conllu")
dev_trees = read_conllu_data(PATH + "/uk_iu-ud-dev.conllu")
test_trees = read_conllu_data(PATH + "/uk_iu-ud-test.conllu")

In [9]:
for node in train_trees[0]:
    head = node["head"]
    print("{} <-- {}".format(node["form"],
                             tree[head - 1]["form"]
                             if head > 0 else "root"))

У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була


### 2. Побудувати статичного оракула, який визначає послідовність дій для побудови дерева

In [10]:
from enum import Enum
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

In [11]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [12]:
def has_relation(node_from, node_to):
    if node_from['head'] == node_to['id']:
        return True
    
    return False

In [14]:
def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]

    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

In [15]:
def trace_actions(tree):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    features = []
    stack, queue, relations = [ROOT], tree[:], []
    actions = []
    while queue or stack:        
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        actions.append(action)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return actions

In [21]:
trace_actions(train_trees[0])

[<Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.RIGHT: 'right'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.RIGHT: 'right'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.RIGHT: 'right'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>]

### 3. Виділити ознаки 

In [17]:
def create_features(stack, buf):
    fetaures = dict()
    if len(stack) > 0:
        fetaures['s-0-form'] = str(stack[0]['form'])
        fetaures['s-0-lemma'] = str(stack[0]['lemma'])
        fetaures['s-0-postag'] = str(stack[0]['upostag'])
        fetaures['s-0-feats'] = str(stack[0]['feats'])
        
        if len(stack) > 1:
            fetaures['s-1-postag'] = str(stack[1]['upostag'])

    return fetaures

### 4. Дістати тренувальні та тестувальні дані

* Написати функцію, яка дістає з дерева набір переходів та набір ознак для цих переходів.

In [19]:
def get_data(tree):
    
    stack, queue, relations = [ROOT], tree[:], []
    features, actions = [], []
    while queue or stack:        
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(create_features(stack, queue))
        actions.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, actions

In [28]:
def get_data_for_trees(trees):
    features_list, labels_list = [], []
    for i in range(0, len(train_trees)):
        features, labels = get_data([t for t in train_trees[i] if type(t['id']) == int])
        features_list.append(features)
        labels_list.append(labels)
        
    return features_list, labels_list

In [23]:
features, labels = get_data(train_trees[0])

In [79]:
features

[{'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'ADP'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'NOUN'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'NOUN'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'NOUN'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'NOUN'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'NOUN'},
 {'s-0-form': 'ROOT',
  's-0-lemma': 'ROOT',
  's-0-postag': 'ROOT',
  's-0-feats': 'None',
  's-1-postag': 'NOUN

* Пройтися по всіх деревах у тренувальній вибірці та дістати всі переходи з ознаками.

In [160]:
train_features, train_labels = get_data_for_trees(train_trees)

In [183]:
train_features_flatten = [item for sublist in train_features for item in sublist]
train_labels_flatten = [item for sublist in train_labels for item in sublist]

* Пройтися по всіх деревах у тестувальній вибірці та дістати всі переходи з ознаками.

In [161]:
dev_features, dev_labels = get_data_for_trees(dev_trees)

In [184]:
dev_features_flatten = [item for sublist in dev_features for item in sublist]
dev_labels_flatten = [item for sublist in dev_labels for item in sublist]

### 5. Натренувати класифікатор

In [188]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [186]:
clf = Pipeline([
    ('vect', DictVectorizer()),
    ('log_reg', LogisticRegression(multi_class='multinomial', random_state=42, solver='lbfgs'))
])

In [187]:
clf.fit(train_features_flatten, train_labels_flatten)



Pipeline(memory=None,
     steps=[('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('log_reg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [191]:
pred_labels = clf.predict(dev_features_flatten)
print(classification_report(
    dev_labels_flatten, pred_labels
))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

        left       0.34      0.49      0.40     48195
      reduce       0.28      0.59      0.38     49702
       right       0.00      0.00      0.00     42685
       shift       0.37      0.13      0.19     49716

   micro avg       0.31      0.31      0.31    190298
   macro avg       0.25      0.30      0.24    190298
weighted avg       0.26      0.31      0.25    190298



In [39]:
## TODO: improve clasifier 

### 6. Вирахувати unlabeled attachment score (UAS)

* Скільки залежностей у побудованому дереві збіглося з еталонним деревом? <br>
* Порахувати на тестувальній вибірці.


UAS - the percentage of words in an input that are assign the correct head.

In [195]:
def dep_parse(tree, oracle):
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:   
        if stack and not queue:
            stack.pop()
        
        else:
            features = create_features(stack, queue)
#             print(features)
            action = oracle.predict([features])[0]
#             print(action)
#             actions.append(action)
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

In [196]:
def calc_unlabeled_attachment_score(trees, oracle):
    total, tp = 0, 0
    for tree in trees:
        tree = [t for t in tree if type(t['id']) == int]
        etalon = [(node['id'], node['head']) for node in tree]
        pred = dep_parse(tree, oracle)
        total += len(tree)
        tp += len(set(etalon).intersection(set(pred)))
    
    uas = round(tp / total, 2)
    
    return total, tp, uas

In [197]:
total, tp, uas = calc_unlabeled_attachment_score(dev_trees, clf)

In [198]:
print("Total", total)
print("Correct", tp)
print("UAS", uas)

Total 12574
Correct 3240
UAS 0.26
