In [1]:
import os

from IPython.display import display, Markdown, Image

In [2]:
REPO_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
TASK_PATH = os.path.join(REPO_PATH, "tasks", "08-syntactic-parsing.md")
DATA_PATH = '/home/dima/Projects/UD_Ukrainian-IU'

In [3]:
def show_markdown(path):
    with open(path, 'r') as fh:
        content = fh.read()
    display(Markdown(content))

In [4]:
show_markdown(TASK_PATH)

# Синтаксичний аналіз

## I. Покращення парсера залежностей

Візьміть за основу парсер залежностей, побудований на практичному занятті, і зробіть мінімум дві ітерації для покращення якості.

Варіанти покращення парсера:
* підберіть кращий набір ознак;
* зробіть класифікацію типів залежностей та поміряйте LAS (labelled attachment score);
* додайте операцію swap для опрацювання непроективних дерев;
* покращіть статичний оракул або замініть його недетермінованим чи динамічним оракулом;
* спробуйте інший класифікатор та зробіть оптимізацію гіперпараметрів;
* ваші ідеї.

За основу можна використати або свій парсер, або [приклад із заняття](../lectures/08-dep-parser-uk.ipynb).

Корисні посилання:
* [UD-корпус для української](https://github.com/UniversalDependencies/UD_Ukrainian-IU/)
* [Зручна бібліотека для роботи з форматом CoNLL](https://github.com/EmilStenstrom/conllu)
* Стаття з блогу Matthew Honnibal - [Parsing English in 500 Lines of Python](https://explosion.ai/blog/parsing-english-in-python)
* Книга про парсери залежностей - [Dependency Parsing by Kübler, McDonald, and Nivre](https://books.google.com.ua/books?id=k3iiup7HB9UC&pg=PA21&hl=uk&source=gbs_toc_r&cad=4#v=onepage&q&f=false)
* Гарний огляд типів парсера залежностей та оракулів - [Improvements in Transition Based Systems for Dependency Parsing](http://paduaresearch.cab.unipd.it/8004/1/Tesi.pdf)

## II. Використання парсера на нових даних

Виберіть кілька випадкових речень українською мовою на побудуйте дерева залежностей для них, використовуючи свій парсер.

Для токенізації можна використати https://github.com/lang-uk/tokenize-uk.

Для частиномовного аналізу можна використати https://github.com/kmike/pymorphy2. Зважте, що частиномовні теги в UD та в pymorphy2 відрізняються, зокрема pymorphy2 не розрізняє типи сполучників. Нижче подано спосіб вирівняти ці дві нотації:

```python
import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang='uk')

DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)
```

Запишіть ваші спостереження та результати в окремий файл.

### Оцінювання

80% - I. Покращення парсера залежностей  
20% - II. Використання парсера на нових даних

### Крайній термін

02.05.2020


## Part 1

In [5]:
from collections import OrderedDict
from conllu import parse
from enum import Enum

In [6]:
%%time

with open(os.path.join(DATA_PATH, "uk_iu-ud-train.conllu"), "r") as f:
    train_trees = parse(f.read())

with open(os.path.join(DATA_PATH, "uk_iu-ud-dev.conllu"), "r") as f:
    test_trees = parse(f.read())

CPU times: user 2.89 s, sys: 108 ms, total: 3 s
Wall time: 3 s


In [7]:
print(len(train_trees), len(test_trees))

5496 672


In [8]:
def print_tree(tree):
    for node in tree:
        head = node["head"]
        print("{} <-- {}".format(node["form"],
                             tree[head - 1]["form"]
                             if head > 0 else "root"))

def check_tree(tree):
    for n in tree:
        if not isinstance(n["id"], int):
            return True
    return False

In [9]:
tree = train_trees[0]
print_tree(tree)

У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була


In [10]:
print("Bad trees: " )
print("Train:", len(list(filter(check_tree, train_trees))))
print("Test:", len(list(filter(check_tree, test_trees))))

Bad trees: 
Train: 197
Test: 16


In [11]:
clean_train_trees = list(filter(lambda t: not check_tree(t), train_trees))
clean_test_trees = list(filter(lambda t: not check_tree(t), test_trees))

print(len(clean_train_trees), len(clean_test_trees))

5299 656


In [12]:
def intersects(n1, n2):
    s1 = n1['id'] if n1['head'] > n1['id'] else n1['head']
    e1 = n1['head'] if n1['head'] > n1['id'] else n1['id']
    s2 = n2['id'] if n2['head'] > n2['id'] else n2['head']
    e2 = n2['head'] if n2['head'] > n2['id'] else n2['id']
    
    return (s1 < s2 and e1 > s2 and e2 > e1) or (s2 < s1 and e2 > s1 and e1 > e2)

def non_projective(tree):
    for n1 in tree:
        for n2 in tree:
            if n1['id'] < n2['id'] and intersects(n1, n2):
                return True
            
    return False

In [13]:
non_projective_train_trees = list(filter(non_projective, clean_train_trees))
non_projective_test_trees = list(filter(non_projective, clean_test_trees))

print(len(non_projective_train_trees), len(non_projective_test_trees))

414 57


In [14]:
np_tree = non_projective_train_trees[21]
print_tree(np_tree)

Звісно <-- було
не <-- було
було <-- root
жодного <-- способу
способу <-- було
дізнатись <-- способу
чи <-- спостерігають
спостерігають <-- дізнатись
за <-- вами
вами <-- спостерігають
саме <-- цей
у <-- проміжок
цей <-- проміжок
проміжок <-- спостерігають
часу <-- проміжок
. <-- було


In [15]:
projective_train_trees = list(filter(lambda t: not non_projective(t), clean_train_trees))
projective_test_trees = list(filter(lambda t: not non_projective(t), clean_test_trees))

print(len(projective_train_trees), len(projective_test_trees))

4885 599


### Design actions and the oracle

In [16]:
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    SWAP = 'swap'
    RIGHT = "right"
    LEFT = "left"

In [17]:
def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
#     elif stack[-2]['id'] > 0 and stack[-2]['id'] < stack[-1]['id'] and \
#     (intersects2(stack[-2], stack[-1]) or \
#     [n for n in queue if n['index'] < stack[-2]['head_index'] and intersects2(stack[-2], n)] or\
#     [n for n in stack[1:-2] if intersects2(n, stack[-2])]):
#         return Actions.SWAP
    # default option
    else:
        return Actions.SHIFT

In [18]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def trace_actions(tree, log=True):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        if log:
            print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
            print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
            print("Relations:", relations)
            print(action)
            print("========================")
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
#         elif action == Actions.SWAP:
#             queue.insert(0, stack[-2])
#             tmp = stack[-2]['index']
#             queue[0]['index'] = stack[-1]['index']
#             stack[-1]['index'] = tmp
#             for n in stack[1:] + queue:
#                 if queue[0]['id'] == n['head']:
#                     n['head_index'] = queue[0]['index']

#                 if stack[-1]['id'] == n['head']:
#                     n['head_index'] = stack[-1]['index']

#             del stack[-2]
            
        else:
            print("Unknown action.")
    if log:
        print("Gold relations:")
        print([(node["id"], node["head"]) for node in tree])
        print("Retrieved relations:")
        print(sorted(relations))

#### show prohectiva and non-projective trees result

In [19]:
# trace_actions(tree)

In [20]:
# trace_actions(np_tree)

### Feature extraction

In [26]:
import pandas as pd

In [27]:
def extract_features(stack, queue, relations):
    
    features = dict()
    
    if len(stack) > 1:        
        features["s0-word"] = stack[-2]["form"]
        features["s0-lemma"] = stack[-2]["lemma"]
        features["s0-tag"] = stack[-2]["upostag"]
        features["s0-rchildren-num"] = len([r for r in relations if r[1] == stack[-2]['id']])
        features["s0-lchildren-num"] = len([r for r in relations if r[0] == stack[-2]['id']])
        if stack[-2]["feats"]:
            for k, v in stack[-2]["feats"].items():
                features["s0-" + k] = v
    
    if len(stack) > 2:
        features["s1-word"] = stack[-3]["form"]
        features["s1-tag"] = stack[-3]["upostag"]
    
    if len(stack) > 3:
        features["s2-tag"] = stack[-4]["upostag"]
        
    if len(stack) > 4:
        features["s3-tag"] = stack[-5]["upostag"]
    
    if len(stack) > 1:
        queue_top = stack[-1]
        features["q0-word"] = stack[-1]["form"]
        features["q0-lemma"] = stack[-1]["lemma"]
        features["q0-tag"] = stack[-1]["upostag"]
        features["q0-rchildren-num"] = len([r for r in relations if r[1] == stack[-1]['id']])
        features["q0-lchildren-num"] = len([r for r in relations if r[0] == stack[-1]['id']])
        if stack[-1]["feats"]:
            for k, v in stack[-1]["feats"].items():
                features["q0-" + k] = v
    
    if len(queue) > 0:        
        features["q1-word"] = queue[0]["form"]
        features["q1-tag"] = queue[0]["upostag"]
    
    if len(queue) > 1:
        features["q2-tag"] = queue[1]["upostag"]
    
    if len(queue) > 2:
        features["q3-tag"] = queue[2]["upostag"]
       
    if len(stack) > 1:
        features["distance"] = stack[-1]["id"] - stack[-2]["id"]
    
    features['q-empty'] = not bool(queue)    
    
    return features

In [28]:
def extract_data(tree):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(extract_features(stack, queue, relations))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

#### train data

In [29]:
train_features, train_labels = [], []
for tree in train_trees:
    tree_features, tree_labels = extract_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels

In [30]:
train_df = pd.DataFrame(train_features)
train_df['target'] = train_labels

In [31]:
print(train_df.shape)

(190298, 73)


In [32]:
counter = 0
nan_features = []

for column in train_df.columns:
    tmp = train_df.loc[train_df[column].isna()].shape[0]
    nan_ratio = tmp / train_df.shape[0]
    if nan_ratio > 0.954:  # 2 sigma
        print(column, nan_ratio, sep='\t\t\t')
        counter += 1
        nan_features.append(column)

q0-NameType			0.9856120400634794
s0-NameType			0.9891538534298837
q0-PunctType			0.9763371133695572
q0-Degree			0.9673879914660165
s0-Degree			0.9792168073232509
q0-Poss			0.9942301022606649
q0-Reflex			0.995953714700102
q0-Uninflect			0.9781395495486027
q0-NumType			0.9847397240118131
s0-NumType			0.9916919778452743
q0-Voice			0.9882762824622434
q0-Polarity			0.9918286056605955
s0-Polarity			0.9977141115513563
s0-PunctType			0.9873776918307077
s0-Poss			0.9985338784432837
s0-Voice			0.9883498512858779
q0-Foreign			0.9958906557084152
s0-Foreign			0.9967104226003426
q0-PartType			0.9989385069732735
s0-PartType			0.9990541151246991
s0-Reflex			0.9983657211321191
s0-Uninflect			0.9843088209019538
q0-Hyph			0.9989700364691169
s0-Hyph			0.9994482338227412
q0-Abbr			0.9933682960409463
s0-Abbr			0.9937939442348317
q0-Animacy[gram]			0.9995848616380624
s0-Animacy[gram]			0.9996584304616969
q0-Variant			0.9995007829824801
s0-Variant			0.9995796067220886
q0-Orth			0.9992327822678115
s0-Orth			0.

#### test data

In [33]:
test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = extract_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

In [34]:
test_df = pd.DataFrame(test_features)
test_df['target'] = test_labels

In [35]:
print(test_df.shape)

(25820, 72)


### Train clasifier

In [36]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *

#### Logistic regression

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))


Total number of features:  115615


In [39]:
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

In [45]:
lr_clf = LogisticRegression(C=2, solver="sag", multi_class="multinomial", max_iter=1000, verbose=1)

In [46]:
lr_pipe = Pipeline([('vec', vec), ('lr_clf', lr_clf)])

In [47]:
lr_pipe.fit(train_features, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 157 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.6min finished


Pipeline(memory=None,
     steps=[('vec', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('lr_clf', LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=1, warm_start=False))])

In [52]:
predicted = lr_pipe.predict(test_features)
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.92      0.95      0.94      6371
      reduce       0.88      0.85      0.87      6875
       right       0.80      0.79      0.79      5996
       shift       0.86      0.87      0.86      6578

   micro avg       0.87      0.87      0.87     25820
   macro avg       0.87      0.87      0.87     25820
weighted avg       0.87      0.87      0.87     25820



#### Random forest

In [53]:
RANDOM_STATE = 0
N_COMP = 500

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
truncated_svd = TruncatedSVD(n_components=N_COMP)

In [56]:
%%time

t_svd = truncated_svd.fit(train_features_vectorized)

CPU times: user 5min 2s, sys: 1min 16s, total: 6min 19s
Wall time: 1min 1s


In [57]:
train_features_truncated = t_svd.transform(train_features_vectorized)
test_features_truncated = t_svd.transform(test_features_vectorized)

In [58]:
rf_params = {
        'bootstrap': True,
        'criterion': 'gini', # entropy
#         'class_weight': 'balanced'
    
#         'max_depth': None,
#         'max_features': 'auto',
#         'max_leaf_nodes': None,
#         'min_impurity_decrease': 0.0,
#         'min_impurity_split': None,
#         'min_samples_leaf': 1,
#         'min_samples_split': 2,
#         'min_weight_fraction_leaf': 0.0,
        'n_estimators': 1000,
    
        'n_jobs': -1,
        'oob_score': False,
        'random_state': RANDOM_STATE,
        'verbose': 100,
        'warm_start': False
}

rf_clf = RandomForestClassifier()

In [59]:
rf_pipe = Pipeline([('vec', vec), ('t_svd', t_svd), ('rf_clf', rf_clf)])

In [60]:
rf_pipe.fit(train_features, train_labels)



Pipeline(memory=None,
     steps=[('vec', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('t_svd', TruncatedSVD(algorithm='randomized', n_components=500, n_iter=5,
       random_state=None, tol=0.0)), ('rf_clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [61]:
predicted = rf_pipe.predict(test_features)
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.81      0.91      0.86      6371
      reduce       0.75      0.82      0.78      6875
       right       0.76      0.66      0.70      5996
       shift       0.82      0.74      0.77      6578

   micro avg       0.78      0.78      0.78     25820
   macro avg       0.78      0.78      0.78     25820
weighted avg       0.78      0.78      0.78     25820



#### Lightgbm

In [62]:
from lightgbm import LGBMClassifier, plot_importance

In [63]:
def lgb_fscore(y_true, y_pred):
    y_pred = y_pred.reshape(len(np.unique(y_true)), -1)
    y_pred = y_pred.argmax(axis=0)
    res = f1_score(y_true, y_pred, average='macro')
    return 'macro_f1', res, True

In [85]:
params = {
    'num_class': 4,
    'num_rounds': 10000,
    'max_depth': -1, #  8
    'learning_rate': 0.01,  #  0.007
    'num_leaves': 31, # was 127
    'verbose': 500,
    'early_stopping_rounds': 300,
    'min_data_in_leaf': 20,
    'lambda_l2': 0.7,
    'feature_fraction': 0.2, #  0.8
    'metric': 'custom',
    'random_state': RANDOM_STATE
}


lgb_clf = LGBMClassifier(**params)

In [86]:
lgb_clf.fit(
    X=train_features_truncated,
    y=train_labels,
    eval_set=[(test_features_truncated, test_labels)],
    verbose=params['verbose'],
    eval_metric=lgb_fscore,
)

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 300 rounds
[500]	valid_0's macro_f1: 0.824888
[1000]	valid_0's macro_f1: 0.852523
[1500]	valid_0's macro_f1: 0.867515
[2000]	valid_0's macro_f1: 0.876588
[2500]	valid_0's macro_f1: 0.881406
[3000]	valid_0's macro_f1: 0.884606
[3500]	valid_0's macro_f1: 0.887773
[4000]	valid_0's macro_f1: 0.889726
[4500]	valid_0's macro_f1: 0.891186
[5000]	valid_0's macro_f1: 0.892063
Early stopping, best iteration is:
[4741]	valid_0's macro_f1: 0.892362


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        early_stopping_rounds=300, feature_fraction=0.2,
        importance_type='split', lambda_l2=0.7, learning_rate=0.01,
        max_depth=-1, metric='custom', min_child_samples=20,
        min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=4, num_leaves=31,
        num_rounds=10000, objective=None, random_state=0, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0, verbose=500)

In [87]:
predicted = lgb_clf.predict(test_features_truncated)

print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.94      0.96      0.95      6371
      reduce       0.90      0.88      0.89      6875
       right       0.84      0.83      0.83      5996
       shift       0.90      0.90      0.90      6578

   micro avg       0.89      0.89      0.89     25820
   macro avg       0.89      0.89      0.89     25820
weighted avg       0.89      0.89      0.89     25820



### Calculate the unlabeled attachment score

In [88]:
from tqdm import tqdm

In [89]:
def dep_parse(sentence, oracle, vectorizer=None, t_svd=None, log=False):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue, relations)
            if vectorizer:
                features = vectorizer.transform([features])
            if t_svd:
                features = t_svd.transform(features)
            action = oracle.predict(features)[0]
            if log:
                print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
                print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
                print("Relations:", relations)
                print(action)
                print("========================")
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

#### Lightgbm score

In [90]:
total, tp = 0, 0
for tree in tqdm(test_trees):
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lgb_clf, vec, t_svd)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))

100%|██████████| 672/672 [21:48<00:00,  1.95s/it]

Total: 12574
Correctly defined: 9219
UAS: 0.73





#### LR score

In [91]:
total, tp = 0, 0
for tree in tqdm(test_trees):
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lr_pipe)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))

100%|██████████| 672/672 [00:19<00:00, 35.07it/s]

Total: 12574
Correctly defined: 8630
UAS: 0.69





In [104]:
import eli5

In [122]:
eli5.show_weights(lr_pipe.steps[1][1], vec=lr_pipe.steps[0][1])

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.925,q0-tag=ADP,,
+2.535,q0-tag=PUNCT,,
+2.093,s0-word=ROOT,,
+2.093,s0-lemma=ROOT,,
+2.093,s0-tag=ROOT,,
+2.053,q0-tag=PART,,
+1.948,q0-Hyph=Yes,,
+1.887,s0-tag=ADV,,
+1.882,s2-tag=ROOT,,
… 49668 more positive …,… 49668 more positive …,,

Weight?,Feature
+2.925,q0-tag=ADP
+2.535,q0-tag=PUNCT
+2.093,s0-word=ROOT
+2.093,s0-lemma=ROOT
+2.093,s0-tag=ROOT
+2.053,q0-tag=PART
+1.948,q0-Hyph=Yes
+1.887,s0-tag=ADV
+1.882,s2-tag=ROOT
… 49668 more positive …,… 49668 more positive …

Weight?,Feature
+10.692,q-empty
+7.633,q0-lchildren-num
… 40792 more positive …,… 40792 more positive …
… 74804 more negative …,… 74804 more negative …
-1.796,q2-tag=ADV
-1.796,q2-tag=PUNCT
-1.822,q2-tag=PROPN
-1.870,q2-tag=ADP
-1.932,q1-tag=PART
-1.944,q1-tag=DET

Weight?,Feature
+2.613,q1-word=ж
+2.407,q1-tag=PUNCT
+2.365,q1-word=також
+2.356,q1-tag=PROPN
+2.271,q1-tag=X
+2.095,q1-tag=PART
+1.966,q1-tag=AUX
+1.862,q2-tag=PUNCT
+1.860,q1-tag=NUM
+1.806,q1-tag=NOUN

Weight?,Feature
+5.208,q1-tag=ADP
+4.493,q1-tag=CCONJ
+3.159,q1-tag=SCONJ
+2.786,q1-word=(
+2.691,q1-tag=PUNCT
+2.507,q3-tag=VERB
+2.420,q2-tag=SCONJ
+2.412,q3-tag=PRON
+2.378,q3-tag=PART
+2.375,q1-word=«


## Part 2

In [95]:
import pymorphy2

In [96]:
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [97]:
DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

In [158]:
from tokenize_uk import tokenize_uk

def convert_string_to_tree_format(text):
    tokens = tokenize_uk.tokenize_words(text)
    i = 0
    tree = []
    for token in tokens:
        i += 1
        word = morph.parse(token)[0]
        token_ = OrderedDict([('id', i), 
                              ('form', str(token)), 
                              ('lemma', str(morph.parse(token)[0].normal_form)),
                              ('upostag', normalize_pos(word)), 
                              ('xpostag', None), ('feats', None), 
                              ('head', None), ('deprel', None),
                              ('deps', None), ('misc', None)])
        tree.append(token_)
    
    relations = dep_parse(tree, lgb_clf, vec, t_svd)
    for parent, child in relations:
        tree[parent-1]['head'] = child
    
    return tree

In [159]:
sent1 = "Це є звичайне речення, на якому ми тестуємо нашу гіпотезу."

In [160]:
tree1 = convert_string_to_tree_format(sent1)
tree1

[OrderedDict([('id', 1),
              ('form', 'Це'),
              ('lemma', 'це'),
              ('upostag', 'DET'),
              ('xpostag', None),
              ('feats', None),
              ('head', 2),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 2),
              ('form', 'є'),
              ('lemma', 'бути'),
              ('upostag', 'VERB'),
              ('xpostag', None),
              ('feats', None),
              ('head', 0),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 3),
              ('form', 'звичайне'),
              ('lemma', 'звичайний'),
              ('upostag', 'ADJ'),
              ('xpostag', None),
              ('feats', None),
              ('head', 4),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 4),
              ('form', 'речення'),
          

In [161]:
print_tree(tree1)

Це <-- є
є <-- root
звичайне <-- речення
речення <-- є
, <-- тестуємо
на <-- тестуємо
якому <-- тестуємо
ми <-- тестуємо
тестуємо <-- є
нашу <-- гіпотезу
гіпотезу <-- тестуємо
. <-- є


In [162]:
sent2 = 'Ти признайся мені, звідки в тебе ті чари'

In [163]:
tree2 = convert_string_to_tree_format(sent2)
tree2

[OrderedDict([('id', 1),
              ('form', 'Ти'),
              ('lemma', 'ти'),
              ('upostag', 'PRON'),
              ('xpostag', None),
              ('feats', None),
              ('head', 2),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 2),
              ('form', 'признайся'),
              ('lemma', 'признатися'),
              ('upostag', 'VERB'),
              ('xpostag', None),
              ('feats', None),
              ('head', 0),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 3),
              ('form', 'мені'),
              ('lemma', 'мен'),
              ('upostag', 'NOUN'),
              ('xpostag', None),
              ('feats', None),
              ('head', 2),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 4),
              ('form', ','),
          

In [164]:
print_tree(tree2)

Ти <-- признайся
признайся <-- root
мені <-- признайся
, <-- чари
звідки <-- чари
в <-- тебе
тебе <-- чари
ті <-- чари
чари <-- признайся
