In [13]:
from collections import OrderedDict
from conllu import parse
from enum import Enum
from copy import deepcopy


from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [14]:
# Read the data

PATH = "/home/holdbar/projects/UD_Ukrainian-IU"

with open(PATH + "/uk_iu-ud-train.conllu", "r") as f:
    train_trees = parse(f.read())

with open(PATH + "/uk_iu-ud-dev.conllu", "r") as f:
    test_trees = parse(f.read())

with open(PATH + "/uk_iu-ud-test.conllu", "r") as f:
    final_test_trees = parse(f.read())

In [15]:
# Design actions and the oracle
    
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"


def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

In [16]:
# Feature extraction

def extract_features(stack, queue):
    features = dict()
    if len(stack) > 0:
        stack_top = stack[-1]
        features["s0-word"] = stack_top["form"]
        features["s0-lemma"] = stack_top["lemma"]
        features["s0-tag"] = stack_top["upostag"]
    if len(stack) > 1:
        features["s1-tag"] = stack[-2]["upostag"]
    if queue:
        queue_top = queue[0]
        features["q0-word"] = queue_top["form"]
        features["q0-lemma"] = queue_top["lemma"]
        features["q0-tag"] = queue_top["upostag"]
    if len(queue) > 1:
        queue_next = queue[1]
        features["q1-word"] = queue_next["form"]
        features["q1-tag"] = queue_next["upostag"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
    return features

In [17]:

# Prepare train and test data

ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

def get_data(tree):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(extract_features(stack, queue))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

In [18]:

# A simple hack would be to check the type of the node id

train_features, train_labels = [], []
for tree in train_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels

print(len(train_features), len(train_labels))


# Test data

test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

print(len(test_features), len(test_labels))


190298 190298
25820 25820


In [19]:
# Train a classifier

vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))

train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)

# print(len(train_features_vectorized.toarray()), len(test_features_vectorized.toarray()))

lrc = LogisticRegression(random_state=42, solver="saga", multi_class="multinomial", max_iter=600)
lrc.fit(train_features_vectorized, train_labels)

predicted = lrc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))



Total number of features:  111126
              precision    recall  f1-score   support

        left       0.86      0.87      0.86      6371
      reduce       0.85      0.78      0.81      6875
       right       0.75      0.79      0.77      5996
       shift       0.85      0.87      0.86      6578

    accuracy                           0.83     25820
   macro avg       0.83      0.83      0.83     25820
weighted avg       0.83      0.83      0.83     25820



In [20]:

# Calculate the unlabeled attachment score
# UAS - the percentage of words in an input that are assigned the correct head.

def dep_parse(sentence, oracle, vectorizer, log=True):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue)
            action = oracle.predict(vectorizer.transform([features]))[0]
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

total, tp, full_match = 0, 0, 0
for tree in test_trees:
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lrc, vec, log=False)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(test_trees), 2))

Total: 12574
Correctly defined: 8717
UAS: 0.69
Full match: 0.09


In [21]:
# Find non-projective trees

def is_non_projective(tree):
    relations = [[i['id'], i['head']] for i in tree if type(i["id"])==int]
    for rel in relations:
        for ref_rel in relations:
            a, c = sorted(rel)
            b, d = sorted(ref_rel)
            if a < b and b < c and c < d:
                return True
    return False

total_non_pr = 0
tree_ids = []
for i in range(len(train_trees)):
    total_non_pr += 1
    tree_ids.append(i)

# 8% (e.g., tree no. 28)
print("The number of non-projective trees is {} ({} out of {}).".
      format(round(total_non_pr * 100 / len(train_trees), 2), total_non_pr, len(train_trees)))

print("IDs:", tree_ids[:10])

The number of non-projective trees is 100.0 (5496 out of 5496).
IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [22]:
############## Add more features
print("################## Add more features:")


def get_feats(token, key):
    features = dict()
    feats = token["feats"]
    if isinstance(feats, str):
        feats = eval(feats)
    if isinstance(feats, OrderedDict):
        for k,v in feats.items():
            features[f'{key}-{k}'] = v

    return features
            
def extract_features(stack, queue):
    features = dict()
    if len(stack) > 0:
        stack_top = stack[-1]
        features["s0-word"] = stack_top["form"]
        features["s0-lemma"] = stack_top["lemma"]
        features["s0-tag"] = stack_top["upostag"]
        features.update(get_feats(stack_top,"s0"))
    if len(stack) > 1:
        features["s1-tag"] = stack[-2]["upostag"]
    if queue:
        queue_top = queue[0]
        features["q0-word"] = queue_top["form"]
        features["q0-lemma"] = queue_top["lemma"]
        features["q0-tag"] = queue_top["upostag"]
        features.update(get_feats(queue_top,"q0"))
    if len(queue) > 1:
        queue_next = queue[1]
        features["q1-word"] = queue_next["form"]
        features["q1-tag"] = queue_next["upostag"]
        features.update(get_feats(queue_next,"q1"))
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
    return features

################## Add more features:


In [30]:

train_features, train_labels = [], []
for tree in train_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels

print(len(train_features), len(train_labels))


# Test data

test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

print(len(test_features), len(test_labels))

final_test_features, final_test_labels = [], []
for tree in final_test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    final_test_features += tree_features
    final_test_labels += tree_labels

print(len(final_test_features), len(final_test_labels))

190298 190298
25820 25820
35124 35124


In [31]:


# Train a classifier

vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)

print("\nTotal number of features: ", len(vec.get_feature_names()))

train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)
final_test_features_vectorized = vec.transform(final_test_features)

# print(len(train_features_vectorized.toarray()), len(test_features_vectorized.toarray()))

lrc = LogisticRegression(random_state=42, solver="saga", multi_class="multinomial", max_iter=600)
lrc.fit(train_features_vectorized, train_labels)

predicted = lrc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))


Total number of features:  111327
              precision    recall  f1-score   support

        left       0.86      0.88      0.87      6371
      reduce       0.86      0.80      0.83      6875
       right       0.77      0.81      0.79      5996
       shift       0.86      0.87      0.87      6578

    accuracy                           0.84     25820
   macro avg       0.84      0.84      0.84     25820
weighted avg       0.84      0.84      0.84     25820





In [32]:

total, tp, full_match = 0, 0, 0
for tree in test_trees:
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lrc, vec, log=False)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(test_trees), 2))

Total: 12574
Correctly defined: 8921
UAS: 0.71
Full match: 0.1


In [33]:
# other classifier - decision tree classifier with all features and decomposition

print("#### Decision tree classifier with all features:")

def dep_parse2(sentence, oracle, vectorizer, log=True):
    stack, queue, relations = [ROOT], sentence[:], []
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue)
            action = oracle.predict(vectorizer.transform([features]))[0]
            # actual parsing
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                if not stack:
                    break
                stack.pop()
            elif action == Actions.LEFT:
                if not stack:
                    break
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

#### Decision tree classifier with all features:


In [34]:

dtc = DecisionTreeClassifier()
dtc.fit(train_features_vectorized, train_labels)


predicted = dtc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.89      0.91      0.90      6371
      reduce       0.86      0.83      0.85      6875
       right       0.83      0.83      0.83      5996
       shift       0.90      0.91      0.90      6578

    accuracy                           0.87     25820
   macro avg       0.87      0.87      0.87     25820
weighted avg       0.87      0.87      0.87     25820



In [35]:

total, tp, full_match = 0, 0, 0
for tree in test_trees:
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse2(tree, dtc, vec, log=False)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(test_trees), 2))


Total: 12574
Correctly defined: 9373
UAS: 0.75
Full match: 0.13


In [36]:
### Final test on conllu data with both classifiers
print("Decision tree final test:")
predicted = dtc.predict(final_test_features_vectorized)
print(classification_report(final_test_labels, predicted))


total, tp, full_match = 0, 0, 0
for tree in final_test_trees:
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse2(tree, dtc, vec, log=False)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(final_test_trees), 2))


print("Logistic regression final test:")
predicted = lrc.predict(final_test_features_vectorized)
print(classification_report(final_test_labels, predicted))


total, tp, full_match = 0, 0, 0
for tree in final_test_trees:
    tree = [t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, lrc, vec, log=False)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(final_test_trees), 2))

Decision tree final test:
              precision    recall  f1-score   support

        left       0.89      0.91      0.90      8658
      reduce       0.85      0.81      0.83      9350
       right       0.82      0.83      0.83      8291
       shift       0.90      0.90      0.90      8825

    accuracy                           0.87     35124
   macro avg       0.86      0.87      0.87     35124
weighted avg       0.87      0.87      0.87     35124

Total: 17116
Correctly defined: 12702
UAS: 0.74
Full match: 0.2
Logistic regression final test:
              precision    recall  f1-score   support

        left       0.87      0.88      0.88      8658
      reduce       0.85      0.78      0.81      9350
       right       0.77      0.81      0.79      8291
       shift       0.86      0.87      0.87      8825

    accuracy                           0.84     35124
   macro avg       0.84      0.84      0.84     35124
weighted avg       0.84      0.84      0.84     35124

Total: 1

In [37]:
############ Test on my data                                                                                                                                               

from tokenize_uk import tokenize_uk
import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang='uk')

DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

def get_word_feats(word):
    feats = OrderedDict()
    if word.tag.animacy:
        feats["Animacy"] = word.tag.animacy.capitalize()
    if word.tag.aspect:
        feats["Aspect"] = word.tag.aspect.capitalize()
    if word.tag.case:
        feats["Case"] = word.tag.case.capitalize()
    if word.tag.gender:
        feats["Gender"] = word.tag.gender.capitalize()
    if word.tag.mood:
        feats["Mood"] = word.tag.mood.capitalize()
    if word.tag.number:
        feats["Number"] = word.tag.number.capitalize()
    if word.tag.person:
        feats["Person"] = word.tag.person.capitalize()
    if word.tag.tense:
        feats["Tense"] = word.tag.tense.capitalize()
    if word.tag.voice:
        feats["Voice"] = word.tag.voice.capitalize()

    return feats


def parse_sentence(text):
    tokens = tokenize_uk.tokenize_words(text)
    tree_nodes = []
    for i, token in enumerate(tokens):
        word = morph.parse(token)[0]
        tree_nodes.append(
            OrderedDict([
                ('id', i+1), 
                ('form', str(word.word)), 
                ('lemma', str(word.normal_form)),
                ('upostag', normalize_pos(word)),
                ('feats', get_word_feats(word)),
                ('deprel', None),
                ('head', None),
            ])
        )
        
    return tree_nodes

In [38]:
def set_relation_on_tree(tree_nodes, dep_parse_func,  oracle, vectorizer):
    tree = deepcopy(tree_nodes)
    relations = dep_parse_func(tree, oracle, vectorizer, log=False)
    for child, head in relations:
        tree[child - 1]["head"] = head - 1

    return tree


In [42]:

def print_tree(tree):
    for node in tree:
        head = node["head"]
        # print(node)
        # print(head)
        if head is None:
            head_str = 'none'
        elif head == 0:
            head_str = 'root'
        elif head > 0:
            head_str = tree[head]["form"]
        print("{} <-- {}".format(node["form"], head_str))

In [43]:
sentences = [
    "Долини падають і туляться до ніг, звивають завої, відсахуючись, гори.",
    "Наш пружний крок тверда земля доріг стрічає стогоном покори.",
    "Чи ж не підіб'єм, не зірвемо ми і обрій цей, і хмари ці рожеві?!",
    "І вогкий вітер дужими грудьми співає на моїм мечеві.",
    "Я приготував смачний обід."
]


In [46]:

## decision tree
print("Decision tree classifier:")
for sentence in sentences:
    tree_nodes = parse_sentence(sentence)
    tree = set_relation_on_tree(tree_nodes, dep_parse2, dtc, vec)
    print_tree(tree)


Decision tree classifier:
долини <-- падають
падають <-- падають
і <-- none
туляться <-- падають
до <-- none
ніг <-- падають
, <-- падають
звивають <-- падають
завої <-- падають
, <-- none
відсахуючись <-- падають
, <-- none
гори <-- падають
. <-- падають
наш <-- крок
пружний <-- крок
крок <-- стрічає
тверда <-- none
земля <-- крок
доріг <-- крок
стрічає <-- крок
стогоном <-- стрічає
покори <-- стрічає
. <-- стрічає
чи <-- підіб'єм
ж <-- підіб'єм
не <-- підіб'єм
підіб'єм <-- підіб'єм
, <-- підіб'єм
не <-- зірвемо
зірвемо <-- підіб'єм
ми <-- підіб'єм
і <-- обрій
обрій <-- підіб'єм
цей <-- none
, <-- підіб'єм
і <-- хмари
хмари <-- підіб'єм
ці <-- none
рожеві <-- підіб'єм
?! <-- підіб'єм
і <-- вітер
вогкий <-- вітер
вітер <-- none
дужими <-- none
грудьми <-- none
співає <-- none
на <-- none
моїм <-- none
мечеві <-- none
. <-- none
я <-- приготував
приготував <-- приготував
смачний <-- none
обід <-- приготував
. <-- приготував


In [47]:
## logistic regression
print("Logistic regression classifier:")
for sentence in sentences:
    tree_nodes = parse_sentence(sentence)
    tree = set_relation_on_tree(tree_nodes, dep_parse2, lrc, vec)
    print_tree(tree)

Logistic regression classifier:
долини <-- падають
падають <-- падають
і <-- none
туляться <-- падають
до <-- none
ніг <-- падають
, <-- падають
звивають <-- падають
завої <-- падають
, <-- none
відсахуючись <-- падають
, <-- none
гори <-- падають
. <-- падають
наш <-- крок
пружний <-- крок
крок <-- стрічає
тверда <-- none
земля <-- крок
доріг <-- крок
стрічає <-- крок
стогоном <-- стрічає
покори <-- стрічає
. <-- стрічає
чи <-- підіб'єм
ж <-- підіб'єм
не <-- підіб'єм
підіб'єм <-- підіб'єм
, <-- підіб'єм
не <-- зірвемо
зірвемо <-- підіб'єм
ми <-- підіб'єм
і <-- обрій
обрій <-- підіб'єм
цей <-- none
, <-- підіб'єм
і <-- хмари
хмари <-- підіб'єм
ці <-- none
рожеві <-- підіб'єм
?! <-- підіб'єм
і <-- вітер
вогкий <-- вітер
вітер <-- none
дужими <-- none
грудьми <-- none
співає <-- none
на <-- none
моїм <-- none
мечеві <-- none
. <-- none
я <-- приготував
приготував <-- приготував
смачний <-- none
обід <-- приготував
. <-- приготував
