## Частина, яку ми робили на практичній

In [2]:
from collections import OrderedDict
from conllu import parse

PATH = 'UD_Ukrainian-IU/'

with open(PATH + 'uk_iu-ud-train.conllu', 'r') as f:
    train_trees = parse(f.read())

tree = train_trees[0]
for node in tree:
    head = node['head']
    print('{} <-- {}'.format(node['form'],
                             tree[head - 1]['form']
                             if head > 0 else 'root'))

У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була


In [3]:
from enum import Enum
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [4]:
def initial_state(tree):
    return [ROOT], tree, []

def infer_action(stack, queue, deps):
    if stack and len(queue) == 0:
        return Actions.REDUCE
    elif queue[0]['head'] == stack[0]['id']:
        return Actions.RIGHT
    elif stack[0]['head'] == queue[0]['id']:
        return Actions.LEFT
    elif (stack[0]['id'] in [child for _, child in deps] and
             (queue[0]['head'] < stack[0]['id'] or
                 queue[0]['id'] in [s['head'] for s in stack])):
        return Actions.REDUCE

    return Actions.SHIFT

def apply_action(action, stack, queue, deps):
    if action == Actions.REDUCE:
        return stack[1:], queue, deps
    elif action == Actions.SHIFT:
        return [queue[0]] + stack, queue[1:], deps
    elif action == Actions.LEFT:
        return stack[1:], queue, deps + [(queue[0]['id'], stack[0]['id'])]
    elif action == Actions.RIGHT:
        return [queue[0]] + stack, queue[1:], deps + [(stack[0]['id'], queue[0]['id'])]
    
def oracle(tree):
    stack, queue, deps = initial_state(tree)
    actions = []
    
    while stack or queue:
        action = infer_action(stack, queue, deps)
        actions.append(action)
        stack, queue, deps = apply_action(action, stack, queue, deps)
        
    return actions

oracle(tree)

[<Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.RIGHT: 'right'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.RIGHT: 'right'>,
 <Actions.SHIFT: 'shift'>,
 <Actions.LEFT: 'left'>,
 <Actions.RIGHT: 'right'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.RIGHT: 'right'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>,
 <Actions.REDUCE: 'reduce'>]

In [5]:
def extract_initial_features(tree, stack, queue, deps):
    feats = {}
    
    if stack:
        top_stack = stack[0]
        feats['stack-0-form'] = top_stack['form']
        feats['stack-0-lemma'] = top_stack['lemma']
        feats['stack-0-pos'] = top_stack['upostag']
    if len(stack) > 1:
        feats['stack-1-pos'] = stack[1]['upostag']
        
    if queue:
        top_queue = queue[0]
        feats['queue-0-form'] = top_queue['form']
        feats['queue-0-lemma'] = top_queue['lemma']
        feats['queue-0-pos'] = top_queue['upostag']
    if len(queue) > 1:
        feats['queue-1-form'] = queue[1]['form']
        feats['queue-1-pos'] = queue[1]['upostag']
    
    if len(queue) > 2:
        feats['queue-2-pos'] = queue[2]['upostag']

    if len(queue) > 3:
        feats['queue-3-pos'] = queue[3]['upostag']

    return feats

In [6]:
def combine_extractors(*funcs):
    def combined(*args):
        feats = {}
        for f in funcs:
            feats.update(f(*args))
        return feats
    return combined

def tree_data(tree, extractors):
    actions = oracle(tree)
    state = initial_state(tree)
    features = []
    labels = []
    extractor = combine_extractors(*extractors)
    
    for a in actions:
        stack, queue, deps = state
        action_feats = extractor(tree, stack, queue, deps)
        
        labels.append(a.value)
        features.append(action_feats)
        state = apply_action(a, *state)
    return features, labels

tree_data(tree, [extract_initial_features])[0]

[{'stack-0-form': 'ROOT',
  'stack-0-lemma': 'ROOT',
  'stack-0-pos': 'ROOT',
  'queue-0-form': 'У',
  'queue-0-lemma': 'у',
  'queue-0-pos': 'ADP',
  'queue-1-form': 'домі',
  'queue-1-pos': 'NOUN',
  'queue-2-pos': 'ADJ',
  'queue-3-pos': 'NOUN'},
 {'stack-0-form': 'У',
  'stack-0-lemma': 'у',
  'stack-0-pos': 'ADP',
  'stack-1-pos': 'ROOT',
  'queue-0-form': 'домі',
  'queue-0-lemma': 'дім',
  'queue-0-pos': 'NOUN',
  'queue-1-form': 'римського',
  'queue-1-pos': 'ADJ',
  'queue-2-pos': 'NOUN',
  'queue-3-pos': 'PROPN'},
 {'stack-0-form': 'ROOT',
  'stack-0-lemma': 'ROOT',
  'stack-0-pos': 'ROOT',
  'queue-0-form': 'домі',
  'queue-0-lemma': 'дім',
  'queue-0-pos': 'NOUN',
  'queue-1-form': 'римського',
  'queue-1-pos': 'ADJ',
  'queue-2-pos': 'NOUN',
  'queue-3-pos': 'PROPN'},
 {'stack-0-form': 'домі',
  'stack-0-lemma': 'дім',
  'stack-0-pos': 'NOUN',
  'stack-1-pos': 'ROOT',
  'queue-0-form': 'римського',
  'queue-0-lemma': 'римський',
  'queue-0-pos': 'ADJ',
  'queue-1-form': 'п

In [7]:
def prepare_data(trees, extractors):
    features, labels = [], []
    
    for tree in trees:
        t_feats, t_labels = tree_data([t for t in tree if type(t['id'])==int], extractors)
        features += t_feats
        labels += t_labels
    return features, labels

with open(PATH + 'uk_iu-ud-dev.conllu', 'r') as f:
    test_trees = parse(f.read())

train_feats, train_labels = prepare_data(train_trees, [extract_initial_features])
test_feats, test_labels = prepare_data(test_trees, [extract_initial_features])

In [8]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

def make_lrc():
    clf = Pipeline([('vect', DictVectorizer()),
                    ('log_reg', LogisticRegression(multi_class='multinomial', 
                                                   solver='sag', 
                                                   random_state=42, 
                                                   max_iter=5000))])
    return clf

In [9]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))

              precision    recall  f1-score   support

        left       0.86      0.87      0.86      6371
      reduce       0.85      0.78      0.81      6875
       right       0.75      0.79      0.77      5996
       shift       0.85      0.87      0.86      6578

   micro avg       0.83      0.83      0.83     25820
   macro avg       0.83      0.83      0.83     25820
weighted avg       0.83      0.83      0.83     25820



In [10]:
def dep_parse(tree, clf, extractors):
    extractor = combine_extractors(*extractors)
    stack, queue, deps = initial_state(tree)

    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extractor(tree, stack, queue, deps)
            action = clf.predict([features])[0]
            stack, queue, deps = apply_action(action, stack, queue, deps)

    return sorted(deps)

dep_parse(tree, clf, [extract_initial_features])

[(0, 6),
 (2, 1),
 (2, 4),
 (4, 3),
 (4, 5),
 (6, 2),
 (6, 8),
 (6, 14),
 (8, 7),
 (8, 10),
 (10, 9),
 (10, 11),
 (11, 13),
 (13, 12)]

In [11]:
def uas_report(trees, clf, extractors):
    total, tp, full_match = 0, 0, 0
    for tree in trees:
        tree = [t for t in tree if type(t["id"])==int]
        golden = [(node["head"], node["id"]) for node in tree]
        predicted = dep_parse(tree, clf, extractors)
        total += len(tree)
        tp += len(set(golden).intersection(set(predicted)))
        if set(golden) == set(predicted):
            full_match += 1

    print("Total:", total)
    print("Correctly defined:", tp)
    print("UAS:", round(tp/total, 2))
    print("Full match:", round(full_match/len(test_trees), 2))
    
uas_report([tree], clf, [extract_initial_features])

Total: 14
Correctly defined: 14
UAS: 1.0
Full match: 0.0


In [12]:
uas_report(test_trees, clf, [extract_initial_features])

Total: 12574
Correctly defined: 8717
UAS: 0.69
Full match: 0.09


## Покращення класифікатора. Підбираю нові фічі.

### Ітерація №1

In [13]:
def has_head_(x, deps):
    return x['id'] in [child for _, child in deps]
def children_num_(x, deps):
    return len([head for head, _ in deps if head == x['id']])

def extract_features_1(tree, stack, queue, deps):
    feats = {}
    
    if stack:
        feats['stack-0-has-head'] = has_head_(stack[0], deps)
        feats['stack-0-children-num'] = children_num_(stack[0], deps)

    if queue:
        feats['queue-0-has-head'] = has_head_(queue[0], deps)
        feats['queue-0-children-num'] = children_num_(queue[0], deps)

    return feats

In [14]:
extractors = [extract_initial_features, extract_features_1]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Є покращення

In [15]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))

              precision    recall  f1-score   support

        left       0.93      0.94      0.93      6371
      reduce       0.90      0.84      0.87      6875
       right       0.78      0.81      0.79      5996
       shift       0.85      0.88      0.87      6578

   micro avg       0.87      0.87      0.87     25820
   macro avg       0.86      0.86      0.86     25820
weighted avg       0.87      0.87      0.87     25820



In [16]:
uas_report(test_trees, clf, extractors)

Total: 12574
Correctly defined: 8714
UAS: 0.69
Full match: 0.13


### Ітерація №2

In [17]:
def extract_features_2(tree, stack, queue, deps):
    feats = {}

    if len(stack) > 1:
        feats['stack-1-has-head'] = has_head_(stack[1], deps)
        feats['stack-1-children-num'] = children_num_(stack[1], deps)
    if len(stack) > 2:
        feats['stack-2-has-head'] = has_head_(stack[2], deps)
        feats['stack-2-children-num'] = children_num_(stack[2], deps)
    
    if len(queue) > 1:
        feats['queue-1-has-head'] = has_head_(queue[1], deps)
        feats['queue-1-children-num'] = children_num_(queue[1], deps)
    if len(queue) > 2:
        feats['queue-2-has-head'] = has_head_(queue[2], deps)
        feats['queue-2-children-num'] = children_num_(queue[2], deps)
    
    return feats

In [18]:
extractors = [extract_initial_features, extract_features_1, extract_features_2]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Покращення немає, в подальшому цей набір фіч не буду використовувати

In [19]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))
uas_report(test_trees, clf, extractors)

              precision    recall  f1-score   support

        left       0.93      0.94      0.93      6371
      reduce       0.89      0.84      0.87      6875
       right       0.78      0.80      0.79      5996
       shift       0.86      0.88      0.87      6578

   micro avg       0.87      0.87      0.87     25820
   macro avg       0.86      0.87      0.86     25820
weighted avg       0.87      0.87      0.87     25820

Total: 12574
Correctly defined: 8693
UAS: 0.69
Full match: 0.13


### Ітерація №3

In [20]:
def find_head_(tree, deps, x):
    parents = [parent for parent, child in deps if child == x['id']]
    return tree[parents[0] - 1] if parents else None    

def find_children_(tree, deps, x):
    children = [child for parent, child in deps if parent == x['id']]
    return [tree[c - 1] for c in children]

def extract_features_3(tree, stack, queue, deps):
    feats = {}
    if stack:
        head = find_head_(tree, deps, stack[0])
        if head:
            feats['stack-0-head-form'] = head['form']
            feats['stack-0-head-lemma'] = head['lemma']
            feats['stack-0-head-pos'] = head['upostag']
        children = find_children_(tree, deps, stack[0])
        if children:
            for i, child in zip(range(3), children):
                feats['stack-0-child-{i}-form'] = child['form']
                feats['stack-0-child-{i}-lemma'] = child['lemma']
                feats['stack-0-child-{i}-pos'] = child['upostag']

    if queue:
        head = find_head_(tree, deps, queue[0])
        if head:
            feats['queue-0-head-form'] = head['form']
            feats['queue-0-head-lemma'] = head['lemma']
            feats['queue-0-head-pos'] = head['upostag']
        children = find_children_(tree, deps, queue[0])
        if children:
            for i, child in zip(range(3), children):
                feats['queue-0-child-{i}-form'] = children[i]['form']
                feats['queue-0-child-{i}-lemma'] = children[i]['lemma']
                feats['queue-0-child-{i}-pos'] = children[i]['upostag']
                
    return feats


In [21]:
extractors = [extract_initial_features, extract_features_1, extract_features_3]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Є покращення

In [22]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))
uas_report(test_trees, clf, extractors)

              precision    recall  f1-score   support

        left       0.94      0.95      0.95      6371
      reduce       0.89      0.87      0.88      6875
       right       0.82      0.82      0.82      5996
       shift       0.87      0.89      0.88      6578

   micro avg       0.88      0.88      0.88     25820
   macro avg       0.88      0.88      0.88     25820
weighted avg       0.88      0.88      0.88     25820

Total: 12574
Correctly defined: 8869
UAS: 0.71
Full match: 0.15


### Ітерація №4

In [23]:
def extract_features_4(tree, stack, queue, deps):
    feats = {}
    feats['sent-len'] = len(tree)
    if stack:
        feats['stack-0-idx'] = stack[0]['id']
        feats['stack-0-rev-idx'] = len(tree) - stack[0]['id']
    if queue:
        feats['queue-0-idx'] = queue[0]['id']
        feats['queue-0-rev-idx'] = len(tree) - queue[0]['id']

    return feats

In [24]:
extractors = [extract_initial_features, extract_features_1, extract_features_3, extract_features_4]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Є покращення

In [25]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))
uas_report(test_trees, clf, extractors)

              precision    recall  f1-score   support

        left       0.94      0.95      0.95      6371
      reduce       0.90      0.87      0.89      6875
       right       0.84      0.83      0.83      5996
       shift       0.88      0.90      0.89      6578

   micro avg       0.89      0.89      0.89     25820
   macro avg       0.89      0.89      0.89     25820
weighted avg       0.89      0.89      0.89     25820

Total: 12574
Correctly defined: 8955
UAS: 0.71
Full match: 0.14


### Ітерація №5

In [26]:
def extract_features_5(tree, stack, queue, deps):
    feats = {}
    if len(stack) > 1:
        feats['stack-1-idx'] = stack[1]['id']
        feats['stack-1-rev-idx'] = len(tree) - stack[1]['id']
    if len(stack) > 2:
        feats['stack-2-idx'] = stack[2]['id']
        feats['stack-2-rev-idx'] = len(tree) - stack[2]['id']

    if len(queue) > 1:
        feats['queue-1-idx'] = queue[1]['id']
        feats['queue-1-rev-idx'] = len(tree) - queue[1]['id']
    if len(queue) > 2:
        feats['queue-2-idx'] = queue[2]['id']
        feats['queue-2-rev-idx'] = len(tree) - queue[2]['id']

    return feats

In [27]:
extractors = [extract_initial_features, extract_features_1, extract_features_3, extract_features_4, extract_features_5]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Є покращення в класифікації, але UAS значення погіршились, тому не буду використовувати цей набір.

In [28]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))
uas_report(test_trees, clf, extractors)

              precision    recall  f1-score   support

        left       0.94      0.95      0.94      6371
      reduce       0.90      0.87      0.89      6875
       right       0.84      0.83      0.83      5996
       shift       0.88      0.90      0.89      6578

   micro avg       0.89      0.89      0.89     25820
   macro avg       0.89      0.89      0.89     25820
weighted avg       0.89      0.89      0.89     25820

Total: 12574
Correctly defined: 8902
UAS: 0.71
Full match: 0.14


### Ітерація №6

In [29]:
def extract_feats_(prefix, x):
    feats = x['feats']
    if feats:
        return {prefix + '-' + k: v for k, v in feats.items()}
    else:
        return {}

def extract_features_6(tree, stack, queue, deps):    
    feats = {}
    if stack:
        feats.update(extract_feats_('stack-0', stack[0]))
    if queue:
        feats.update(extract_feats_('queue-0', queue[0]))

    return feats

In [30]:
extractors = [extract_initial_features, extract_features_1, extract_features_3, extract_features_4, extract_features_6]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Є покращення

In [31]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))
uas_report(test_trees, clf, extractors)

              precision    recall  f1-score   support

        left       0.95      0.96      0.95      6371
      reduce       0.91      0.88      0.90      6875
       right       0.85      0.85      0.85      5996
       shift       0.89      0.90      0.89      6578

   micro avg       0.90      0.90      0.90     25820
   macro avg       0.90      0.90      0.90     25820
weighted avg       0.90      0.90      0.90     25820

Total: 12574
Correctly defined: 9161
UAS: 0.73
Full match: 0.16


### Ітерація №7

In [32]:
def extract_features_7(tree, stack, queue, deps):
    feats = {}
    if len(stack) > 1:
        feats.update(extract_feats_('stack-1', stack[1]))
    if len(stack) > 2:
        feats.update(extract_feats_('stack-2', stack[2]))

    if len(queue) > 1:
        feats.update(extract_feats_('queue-1', queue[1]))
    if len(queue) > 2:
        feats.update(extract_feats_('queue-2', queue[2]))

    return feats

In [33]:
extractors = [extract_initial_features, extract_features_1, extract_features_3, extract_features_4, extract_features_6, extract_features_7]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

### Покращень особливо немає

In [34]:
clf = make_lrc()
clf.fit(train_feats, train_labels)
print(classification_report(test_labels, clf.predict(test_feats)))
uas_report(test_trees, clf, extractors)

              precision    recall  f1-score   support

        left       0.94      0.96      0.95      6371
      reduce       0.91      0.89      0.90      6875
       right       0.85      0.85      0.85      5996
       shift       0.89      0.90      0.89      6578

   micro avg       0.90      0.90      0.90     25820
   macro avg       0.90      0.90      0.90     25820
weighted avg       0.90      0.90      0.90     25820

Total: 12574
Correctly defined: 9178
UAS: 0.73
Full match: 0.16


## Пробую використати SVM і підібрати гіперпараметри для нього

In [35]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

def make_svc():
    clf = Pipeline([('vect', DictVectorizer()),
                    ('svc', LinearSVC(random_state=42, max_iter=5000))])
    return clf

In [37]:
extractors = [extract_initial_features, extract_features_1, extract_features_3, extract_features_4, extract_features_6]
train_feats, train_labels = prepare_data(train_trees, extractors)
test_feats, test_labels = prepare_data(test_trees, extractors)

svc_parameter_grid = {'svc__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
                      'svc__dual': [True, False]}

gs_svc = GridSearchCV(make_svc(),
                      svc_parameter_grid,
                      scoring='accuracy',
                      verbose=5,
                      cv=5,
                      n_jobs=-1)

gs_svc.fit(train_feats, train_labels)
gs_svc.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done  66 out of  80 | elapsed: 47.6min remaining: 10.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 59.4min finished


{'svc__C': 0.05, 'svc__dual': False}

In [42]:
print(classification_report(test_labels, gs_svc.predict(test_feats)))

              precision    recall  f1-score   support

        left       0.95      0.96      0.95      6371
      reduce       0.90      0.88      0.89      6875
       right       0.84      0.84      0.84      5996
       shift       0.88      0.89      0.88      6578

   micro avg       0.89      0.89      0.89     25820
   macro avg       0.89      0.89      0.89     25820
weighted avg       0.89      0.89      0.89     25820



### З SVM щось не вдалось. Тому підберу гіперпараметри для лог. регресії.

In [40]:
lr_parameter_grid = {'log_reg__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
                     'log_reg__solver': ['sag', 'saga']}

gs_lrc = GridSearchCV(make_lrc(),
                      lr_parameter_grid,
                      scoring='accuracy',
                      verbose=5,
                      cv=5,
                      n_jobs=-1)

gs_lrc.fit(train_feats, train_labels)
gs_lrc.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done  66 out of  80 | elapsed: 45.3min remaining:  9.6min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 52.0min finished


{'log_reg__C': 5, 'log_reg__solver': 'sag'}

### Покращення мінімальне

In [41]:
print(classification_report(test_labels, gs_lrc.predict(test_feats)))
uas_report(test_trees, gs_lrc, extractors)

              precision    recall  f1-score   support

        left       0.95      0.96      0.95      6371
      reduce       0.91      0.88      0.90      6875
       right       0.85      0.85      0.85      5996
       shift       0.89      0.90      0.89      6578

   micro avg       0.90      0.90      0.90     25820
   macro avg       0.90      0.90      0.90     25820
weighted avg       0.90      0.90      0.90     25820

Total: 12574
Correctly defined: 9183
UAS: 0.73
Full match: 0.16


## Використання парсеру залежностей
### pymorphy2 у мене щось не заводиться :( тому буду використовувати stanza.

In [46]:
import stanza
nlp = stanza.Pipeline(lang='uk', processors='tokenize,mwt,pos,lemma')

2020-05-01 15:02:37 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package |
-----------------------
| tokenize  | iu      |
| mwt       | iu      |
| pos       | iu      |
| lemma     | iu      |

2020-05-01 15:02:37 INFO: Use device: cpu
2020-05-01 15:02:37 INFO: Loading: tokenize
2020-05-01 15:02:37 INFO: Loading: mwt
2020-05-01 15:02:37 INFO: Loading: pos
2020-05-01 15:02:38 INFO: Loading: lemma
2020-05-01 15:02:38 INFO: Done loading processors!


In [47]:
def convert_feats(feats_str):
    pairs = [s.split('=') for s in feats_str.split('|')]
    return {k: v for k, v in pairs}

convert_feats('Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing')

{'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}

In [48]:
def convert_token(tok):
    converted = {}
    converted['id'] = int(tok.id)
    converted['form'] = tok.text
    converted['lemma'] = tok.lemma
    converted['upostag'] = tok.upos
    converted['feats'] = convert_feats(tok.feats) if tok.feats else None
    return converted

In [49]:
def dep_parse_sentence(clf, extractors, sentence):
    model = nlp(sentence)
    tokens = [convert_token(word) for sent in model.sentences for word in sent.words]
    deps = dep_parse(tokens, clf, extractors)
    
    for parent, child in deps:
        print("{} <- {}".format(tokens[parent - 1]['form'] if parent > 0 else 'ROOT', tokens[child - 1]['form']))

dep_parse_sentence(gs_lrc, extractors, 'я люблю котів.')

ROOT <- люблю
люблю <- я
люблю <- котів
люблю <- .


In [52]:
dep_parse_sentence(gs_lrc, extractors, 'Друг міністра внутрішніх справ Арсена Авакова Ігор Котвіцький у карантин відвідує елітний бутик-ресторан та проводить зустрічі у 5-зірковому готелі.')

ROOT <- відвідує
Друг <- міністра
Друг <- Ігор
Друг <- карантин
міністра <- справ
справ <- внутрішніх
справ <- Арсена
Арсена <- Авакова
Ігор <- Котвіцький
карантин <- у
відвідує <- Друг
відвідує <- бутик
відвідує <- проводить
відвідує <- .
бутик <- елітний
бутик <- ресторан
ресторан <- -
проводить <- та
проводить <- зустрічі
зустрічі <- готелі
готелі <- у
готелі <- 5-зірковому


In [53]:
dep_parse_sentence(gs_lrc, extractors, 'Трамп пригрозив Китаю новими митами в якості помсти за коронавірус')

ROOT <- пригрозив
пригрозив <- Трамп
пригрозив <- Китаю
пригрозив <- митами
пригрозив <- якості
пригрозив <- коронавірус
митами <- новими
якості <- в
якості <- помсти
коронавірус <- за
