In [1]:
import os

from IPython.display import display, Markdown, Image

In [2]:
REPO_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
TASK_PATH = os.path.join(REPO_PATH, "tasks", "08-syntactic-parsing.md")
DATA_PATH = '/home/dima/Projects/UD_Ukrainian-IU'

In [3]:
def show_markdown(path):
    with open(path, 'r') as fh:
        content = fh.read()
    display(Markdown(content))

In [4]:
show_markdown(TASK_PATH)

# Синтаксичний аналіз

## I. Покращення парсера залежностей

Візьміть за основу парсер залежностей, побудований на практичному занятті, і зробіть мінімум дві ітерації для покращення якості.

Варіанти покращення парсера:
* підберіть кращий набір ознак;
* зробіть класифікацію типів залежностей та поміряйте LAS (labelled attachment score);
* додайте операцію swap для опрацювання непроективних дерев;
* покращіть статичний оракул або замініть його недетермінованим чи динамічним оракулом;
* спробуйте інший класифікатор та зробіть оптимізацію гіперпараметрів;
* ваші ідеї.

За основу можна використати або свій парсер, або [приклад із заняття](../lectures/08-dep-parser-uk.ipynb).

Корисні посилання:
* [UD-корпус для української](https://github.com/UniversalDependencies/UD_Ukrainian-IU/)
* [Зручна бібліотека для роботи з форматом CoNLL](https://github.com/EmilStenstrom/conllu)
* Стаття з блогу Matthew Honnibal - [Parsing English in 500 Lines of Python](https://explosion.ai/blog/parsing-english-in-python)
* Книга про парсери залежностей - [Dependency Parsing by Kübler, McDonald, and Nivre](https://books.google.com.ua/books?id=k3iiup7HB9UC&pg=PA21&hl=uk&source=gbs_toc_r&cad=4#v=onepage&q&f=false)
* Гарний огляд типів парсера залежностей та оракулів - [Improvements in Transition Based Systems for Dependency Parsing](http://paduaresearch.cab.unipd.it/8004/1/Tesi.pdf)

## II. Використання парсера на нових даних

Виберіть кілька випадкових речень українською мовою на побудуйте дерева залежностей для них, використовуючи свій парсер.

Для токенізації можна використати https://github.com/lang-uk/tokenize-uk.

Для частиномовного аналізу можна використати https://github.com/kmike/pymorphy2. Зважте, що частиномовні теги в UD та в pymorphy2 відрізняються, зокрема pymorphy2 не розрізняє типи сполучників. Нижче подано спосіб вирівняти ці дві нотації:

```python
import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang='uk')

DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)
```

Запишіть ваші спостереження та результати в окремий файл.

### Оцінювання

80% - I. Покращення парсера залежностей  
20% - II. Використання парсера на нових даних

### Крайній термін

02.05.2020


## Part 1

In [5]:
from collections import OrderedDict
from conllu import parse
from enum import Enum

In [6]:
%%time

with open(os.path.join(DATA_PATH, "uk_iu-ud-train.conllu"), "r") as f:
    train_trees = parse(f.read())

with open(os.path.join(DATA_PATH, "uk_iu-ud-dev.conllu"), "r") as f:
    test_trees = parse(f.read())

CPU times: user 2.85 s, sys: 82.5 ms, total: 2.94 s
Wall time: 2.94 s


In [7]:
print(len(train_trees), len(test_trees))

5496 672


In [8]:
def print_tree(tree):
    for node in tree:
        head = node["head"]
        print("{} <-- {}".format(node["form"],
                             tree[head - 1]["form"]
                             if head > 0 else "root"))

def check_tree(tree):
    for n in tree:
        if not isinstance(n["id"], int):
            return True
    return False

In [9]:
tree = train_trees[0]
print_tree(tree)

У <-- домі
домі <-- була
римського <-- патриція
патриція <-- домі
Руфіна <-- патриція
була <-- root
прегарна <-- фреска
фреска <-- була
, <-- зображення
зображення <-- фреска
Венери <-- зображення
та <-- Адоніса
Адоніса <-- Венери
. <-- була


In [10]:
print("Bad trees: " )
print("Train:", len(list(filter(check_tree, train_trees))))
print("Test:", len(list(filter(check_tree, test_trees))))

Bad trees: 
Train: 197
Test: 16


In [11]:
clean_train_trees = list(filter(lambda t: not check_tree(t), train_trees))
clean_test_trees = list(filter(lambda t: not check_tree(t), test_trees))

print(len(clean_train_trees), len(clean_test_trees))

5299 656


In [12]:
def intersects(n1, n2):
    s1 = n1['id'] if n1['head'] > n1['id'] else n1['head']
    e1 = n1['head'] if n1['head'] > n1['id'] else n1['id']
    s2 = n2['id'] if n2['head'] > n2['id'] else n2['head']
    e2 = n2['head'] if n2['head'] > n2['id'] else n2['id']
    
    return (s1 < s2 and e1 > s2 and e2 > e1) or (s2 < s1 and e2 > s1 and e1 > e2)

def non_projective(tree):
    for n1 in tree:
        for n2 in tree:
            if n1['id'] < n2['id'] and intersects(n1, n2):
                return True
            
    return False

In [13]:
non_projective_trees = list(filter(non_projective, clean_train_trees))

len(non_projective_trees)

414

In [14]:
projective_trees = list(filter(lambda t: not non_projective(t), clean_train_trees))

len(projective_trees)

4885

### Design actions and the oracle

In [15]:
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

In [16]:
def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

In [17]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def trace_actions(tree, log=True):
    """
    Try out the oracle to verify it's returning the right actions.
    """
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        if log:
            print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
            print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
            print("Relations:", relations)
            print(action)
            print("========================")
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    if log:
        print("Gold relations:")
        print([(node["id"], node["head"]) for node in tree])
        print("Retrieved relations:")
        print(sorted(relations))

In [18]:
# trace_actions(tree)

### Feature extraction

In [19]:
# def extract_features(stack, queue):
#     features = dict()
#     if len(stack) > 0:
#         stack_top = stack[-1]
#         features["s0-word"] = stack_top["form"]
#         features["s0-lemma"] = stack_top["lemma"]
#         features["s0-tag"] = stack_top["upostag"]
#         if stack_top["feats"]:
#             for k, v in stack_top["feats"].items():
#                 features["s0-" + k] = v
#     if len(stack) > 1:
#         features["s1-tag"] = stack_top["upostag"]
#     if queue:
#         queue_top = queue[0]
#         features["q0-word"] = queue_top["form"]
#         features["q0-lemma"] = queue_top["lemma"]
#         features["q0-tag"] = queue_top["upostag"]
#         if queue_top["feats"]:
#             for k, v in queue_top["feats"].items():
#                 features["q0-" + k] = v
#     if len(queue) > 1:
#         queue_next = queue[1]
#         features["q1-word"] = queue_next["form"]
#         features["q1-tag"] = queue_next["upostag"]
#     if len(queue) > 2:
#         features["q2-tag"] = queue[2]["upostag"]
#     if len(queue) > 3:
#         features["q3-tag"] = queue[3]["upostag"]
#     if stack and queue:
#         features["distance"] = queue[0]["id"] - stack[-1]["id"]
#     return features

In [20]:
def extract_features(stack, queue, relations=None):
    
    features = dict()
    
    if len(stack) > 1:        
        features["s0-word"] = stack[-2]["form"]
        features["s0-lemma"] = stack[-2]["lemma"]
        features["s0-tag"] = stack[-2]["upostag"]
#         features["s0-rchildren-num"] = len([r for r in relations if r[1] == stack[-2]['id']])
#         features["s0-lchildren-num"] = len([r for r in relations if r[0] == stack[-2]['id']])
        if stack[-2]["feats"]:
            for k, v in stack[-2]["feats"].items():
                features["s0-" + k] = v
    
    if len(stack) > 2:
        features["s1-word"] = stack[-3]["form"]
        features["s1-tag"] = stack[-3]["upostag"]
    
    if len(stack) > 3:
        features["s2-tag"] = stack[-4]["upostag"]
        
    if len(stack) > 4:
        features["s3-tag"] = stack[-5]["upostag"]
    
    if len(stack) > 1:
        queue_top = stack[-1]
        features["q0-word"] = stack[-1]["form"]
        features["q0-lemma"] = stack[-1]["lemma"]
        features["q0-tag"] = stack[-1]["upostag"]
#         features["q0-rchildren-num"] = len([r for r in relations if r[1] == stack[-1]['id']])
#         features["q0-lchildren-num"] = len([r for r in relations if r[0] == stack[-1]['id']])
        if stack[-1]["feats"]:
            for k, v in stack[-1]["feats"].items():
                features["q0-" + k] = v
    
    if len(queue) > 0:        
        features["q1-word"] = queue[0]["form"]
        features["q1-tag"] = queue[0]["upostag"]
    
    if len(queue) > 1:
        features["q2-tag"] = queue[1]["upostag"]
    
    if len(queue) > 2:
        features["q3-tag"] = queue[2]["upostag"]
       
    if len(stack) > 1:
        features["distance"] = stack[-1]["id"] - stack[-2]["id"]
    
    features['q-empty'] = not bool(queue)    
    
    return features

### Prepare train and test data

In [21]:
def get_data(tree):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(extract_features(stack, queue))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

In [22]:
features, labels = get_data(tree)
print("Number of words:", len(tree))
print("Number of actions:", len(labels))
print("List of actions taken:", labels)
# print("Features:")
# for word in features:
#     print(word)

Number of words: 14
Number of actions: 29
List of actions taken: ['shift', 'left', 'shift', 'shift', 'left', 'right', 'right', 'reduce', 'reduce', 'left', 'right', 'shift', 'left', 'right', 'shift', 'left', 'right', 'right', 'shift', 'left', 'right', 'reduce', 'reduce', 'reduce', 'reduce', 'right', 'reduce', 'reduce', 'reduce']


In [23]:
train_features, train_labels = [], []
for tree in train_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels
    
test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

    
print("train: ", len(train_features), len(train_labels))
print("test: ", len(test_features), len(test_labels))

train:  190298 190298
test:  25820 25820


### Train a classifier

In [24]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

In [25]:
RANDOM_STATE = 0
N_COMP = 500

In [26]:
vectorizer = DictVectorizer()
truncated_svd = TruncatedSVD(n_components=N_COMP)

In [27]:
vec = vectorizer.fit(train_features)

train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))


Total number of features:  115611


In [28]:
%%time

t_svd = truncated_svd.fit(train_features_vectorized)

train_features_reduced = t_svd.transform(train_features_vectorized)
test_features_reduced = t_svd.transform(test_features_vectorized)

CPU times: user 4min 54s, sys: 1min 13s, total: 6min 7s
Wall time: 59.2 s


#### Logistic Regression

In [29]:
lrc = LogisticRegression(random_state=RANDOM_STATE,
                         C=1,
                         solver="sag", 
                         multi_class="multinomial", # ovr
                         max_iter=1000, 
                         verbose=100)
lrc.fit(train_features_vectorized, train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
max_iter reached after 151 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.5min finished




LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='sag',
          tol=0.0001, verbose=100, warm_start=False)

In [30]:
predicted = lrc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.88      0.90      0.89      6371
      reduce       0.86      0.81      0.83      6875
       right       0.77      0.80      0.79      5996
       shift       0.85      0.87      0.86      6578

   micro avg       0.84      0.84      0.84     25820
   macro avg       0.84      0.84      0.84     25820
weighted avg       0.84      0.84      0.84     25820



#### Support Vector Classification 

In [55]:
# from sklearn.svm import SVC

In [47]:
# svc = SVC(C=1, 
# #           kernel='rbf',
#           gamma='auto',
#           verbose=10, 
#           max_iter=100, 
# #           decision_function_shape='ovr',
#           random_state=RANDOM_STATE)
# # svc.fit(train_features_reduced, train_labels)
# svc.fit(train_features_vectorized, train_labels)

In [48]:
# predicted = svc.predict(test_features_vectorized)
# print(classification_report(test_labels, predicted))

Doesn't work at all. Had not too much time to play with hypoparams

#### LightGBM classification

In [49]:
import numpy as np
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier

In [50]:
def lgb_fscore(y_true, y_pred):
    y_pred = y_pred.reshape(len(np.unique(y_true)), -1)
    y_pred = y_pred.argmax(axis=0)
    res = f1_score(y_true, y_pred, average='macro')
    return 'macro_f1', res, True

In [53]:
params = {
    'num_class': 4,
    'num_rounds': 5000,
    'max_depth': -1, #  8
    'learning_rate': 0.01,  #  0.007
    'num_leaves': 31, # was 127
    'verbose': 100,
    'early_stopping_rounds': 300,
    'min_data_in_leaf': 20,
    'lambda_l2': 0.7,
    'feature_fraction': 0.2, #  0.8
    'metric': 'custom',
    'random_state': RANDOM_STATE
}


lgb_clf = LGBMClassifier(**params)

In [58]:
lgb_clf.fit(
    X=train_features_reduced,
    y=train_labels,
    eval_set=[(test_features_reduced, test_labels)],
    verbose=params['verbose'],
    eval_metric=lgb_fscore,
)

  'precision', 'predicted', average, warn_for)


Training until validation scores don't improve for 300 rounds
[100]	valid_0's macro_f1: 0.746799
[200]	valid_0's macro_f1: 0.775073
[300]	valid_0's macro_f1: 0.793324
[400]	valid_0's macro_f1: 0.802359
[500]	valid_0's macro_f1: 0.811071
[600]	valid_0's macro_f1: 0.818107
[700]	valid_0's macro_f1: 0.821889
[800]	valid_0's macro_f1: 0.826738
[900]	valid_0's macro_f1: 0.831232
[1000]	valid_0's macro_f1: 0.834389
[1100]	valid_0's macro_f1: 0.837445
[1200]	valid_0's macro_f1: 0.839946
[1300]	valid_0's macro_f1: 0.841971
[1400]	valid_0's macro_f1: 0.844382
[1500]	valid_0's macro_f1: 0.847029
[1600]	valid_0's macro_f1: 0.849394
[1700]	valid_0's macro_f1: 0.851064
[1800]	valid_0's macro_f1: 0.852692
[1900]	valid_0's macro_f1: 0.854158
[2000]	valid_0's macro_f1: 0.854906
[2100]	valid_0's macro_f1: 0.856036
[2200]	valid_0's macro_f1: 0.857217
[2300]	valid_0's macro_f1: 0.858188
[2400]	valid_0's macro_f1: 0.859826
[2500]	valid_0's macro_f1: 0.860991
[2600]	valid_0's macro_f1: 0.8615
[2700]	valid_

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        early_stopping_rounds=300, feature_fraction=0.2,
        importance_type='split', lambda_l2=0.7, learning_rate=0.01,
        max_depth=-1, metric='custom', min_child_samples=20,
        min_child_weight=0.001, min_data_in_leaf=20, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_class=4, num_leaves=31,
        num_rounds=3000, objective=None, random_state=0, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0, verbose=100)

In [60]:
predicted = lgb_clf.predict(test_features_reduced)

print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.89      0.93      0.91      6371
      reduce       0.87      0.82      0.85      6875
       right       0.81      0.82      0.81      5996
       shift       0.88      0.89      0.88      6578

   micro avg       0.87      0.87      0.87     25820
   macro avg       0.86      0.87      0.86     25820
weighted avg       0.87      0.87      0.86     25820



### Calculate the unlabeled attachment score

In [67]:
from tqdm import tqdm

In [68]:
def dep_parse(sentence, oracle, vectorizer, t_svd=None, log=False):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue)
            features = vectorizer.transform([features])
            if t_svd:
                features = t_svd.transform(features)
            action = oracle.predict(features)[0]
            if log:
                print("Stack:", [i["form"]+"_"+str(i["id"]) for i in stack])
                print("Queue:", [i["form"]+"_"+str(i["id"]) for i in queue])
                print("Relations:", relations)
                print(action)
                print("========================")
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

In [70]:
total, tp = 0, 0
for tree in tqdm(test_trees[:10]):
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lgb_clf, vec, t_svd)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))

100%|██████████| 10/10 [00:13<00:00,  1.35s/it]

Total: 134
Correctly defined: 93
UAS: 0.69





## Part 2

In [72]:
import pymorphy2

In [73]:
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [74]:
DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

In [128]:
from tokenize_uk import tokenize_uk

def convert_string_to_tree_format(text):
    """
    перетрорення речення в формат корпусу
    """
#     tokens= [t['form'] for t in trees_test[0]]

    tokens = tokenize_uk.tokenize_words(text)
    i = 0
    tree = []
    for token in tokens:
        i = i + 1
        # pos = str(morph.parse(token)[0].tag.POS)
        word = morph.parse(token)[0]
        token_ = OrderedDict([('id', i), 
                              ('form', str(token)), 
                              ('lemma', str(morph.parse(token)[0].normal_form)),
                              ('upostag', normalize_pos(word)), 
                              ('xpostag', None), ('feats', None), 
                              ('head', None), ('deprel', None),
                              ('deps', None), ('misc', None)])
        tree.append(token_)
    return tree

In [129]:
sent1 = "Це є звичайне речення, на якому ми тестуємо нашу гіпотезу."

In [130]:
tree1 = convert_string_to_tree_format(sent1)
tree1

[OrderedDict([('id', 1),
              ('form', 'Це'),
              ('lemma', 'це'),
              ('upostag', 'DET'),
              ('xpostag', None),
              ('feats', None),
              ('head', None),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 2),
              ('form', 'є'),
              ('lemma', 'бути'),
              ('upostag', 'VERB'),
              ('xpostag', None),
              ('feats', None),
              ('head', None),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 3),
              ('form', 'звичайне'),
              ('lemma', 'звичайний'),
              ('upostag', 'ADJ'),
              ('xpostag', None),
              ('feats', None),
              ('head', None),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 4),
              ('form', 'речення'),
 

In [131]:
tree_pred = dep_parse(convert_string_to_tree_format(sent1), lgb_clf, vec, t_svd)
tree_pred

[(1, 2),
 (2, 0),
 (2, 9),
 (3, 4),
 (4, 2),
 (5, 9),
 (6, 9),
 (7, 9),
 (8, 9),
 (9, 0),
 (10, 11),
 (11, 9),
 (12, 9)]

In [132]:
sent2 = 'Ти признайся мені, звідки в тебе ті чари'

In [133]:
tree2 = convert_string_to_tree_format(sent1)
tree2

[OrderedDict([('id', 1),
              ('form', 'Це'),
              ('lemma', 'це'),
              ('upostag', 'DET'),
              ('xpostag', None),
              ('feats', None),
              ('head', None),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 2),
              ('form', 'є'),
              ('lemma', 'бути'),
              ('upostag', 'VERB'),
              ('xpostag', None),
              ('feats', None),
              ('head', None),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 3),
              ('form', 'звичайне'),
              ('lemma', 'звичайний'),
              ('upostag', 'ADJ'),
              ('xpostag', None),
              ('feats', None),
              ('head', None),
              ('deprel', None),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 4),
              ('form', 'речення'),
 

In [134]:
tree_pred = dep_parse(convert_string_to_tree_format(sent2), lgb_clf, vec, t_svd)
tree_pred

[(1, 2), (2, 0), (3, 2), (4, 9), (5, 9), (6, 7), (8, 9), (9, 2)]