In [1]:
from collections import OrderedDict
from conllu import parse
from enum import Enum
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import copy
import tokenize_uk

PATH = "UD_Ukrainian-IU"

with open(PATH + "/uk_iu-ud-train.conllu", "r") as f:
    train_trees = parse(f.read())

with open(PATH + "/uk_iu-ud-dev.conllu", "r") as f:
    test_trees = parse(f.read())

### Завдання 1: Покращення парсера залежностей

### Додаю SWAP для обрабки непроективних дерев

Для імплементації SWAP я керувалася статтями "Transition-Based Techniques for NonProjective Dependency Parsing" (https://www.diva-portal.org/smash/get/diva2:661423/FULLTEXT01.pdf) та "Non-Projective Dependency Parsing in Expected Linear Time" (https://www.aclweb.org/anthology/P09-1040.pdf), умову обрання дії SWAP в Оракулі підібрала частково експерементним шляхом, так як найбільш очевидна частина умови описується у статтях (id другого елементу стеку має бути меншим за id топу стеку), але не згадується які токени перевіряти на проективні залежності. Тож я дивилася на непроективні дерева у даних, та під них намагалася підібрати такі умови для Swap в Оракулі, щоб проективні зв'язки проставлятися точніше. 

Так, мені треба було виявити які саме токени стеку та черги порівнювати на непроективні залежності. Одна з ідей була порівняти тільки елементи стеку - топ стеку та другий елемент з кінця, але така конфігурація давала 0,04 покриття + погіршилися покриття та точність інших дій (SHIFT, LEFT і тд). На жаль, я не змогла знайти пояснення помилкам, тому обрала надалі працювати з конфігурацією, де перевіряю на непроективність топ стеку та топ черги, так як це дає кращу точність та покриття. 



In [2]:
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"
    SWAP = "swap"

def oracle(stack, top_queue, relations):
    top_stack = stack[-1]
    if len(stack) > 1:
        prev_stack = stack[-2]
    if top_stack and not top_queue:
        return Actions.REDUCE
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    elif len(stack) > 1 and prev_stack["id"] != 0 and (prev_stack["id"] < top_stack["id"]):
        a,c = sorted([top_queue["head"], top_queue["id"]])
        b,d = sorted([top_stack["head"], top_stack["id"]])
        if a < b and b < c and c < d:
            return Actions.SWAP
        return Actions.SHIFT
    else:
        return Actions.SHIFT


In [3]:
def extract_features(stack, queue, tree):
    features = dict()
    if len(stack) > 0:
        stack_top = stack[-1]
        features["s0-word"] = stack_top["form"]
        features["s0-lemma"] = stack_top["lemma"]
        features["s0-tag"] = stack_top["upostag"]
    if len(stack) > 1:
        features["s1-tag"] = stack[-2]["upostag"]
    if queue:
        queue_top = queue[0]
        features["q0-word"] = queue_top["form"]
        features["q0-lemma"] = queue_top["lemma"]
        features["q0-tag"] = queue_top["upostag"]
    if len(queue) > 1:
        queue_next = queue[1]
        features["q1-word"] = queue_next["form"]
        features["q1-tag"] = queue_next["upostag"]
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
    return features

In [4]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])  

def get_data(tree, extractor):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []

    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                        queue[0] if len(queue) > 0 else None,
                        relations)
        features.append(extractor(stack, queue, tree))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.SWAP:
            queue.insert(0, stack.pop(-2))
        else:
            print("Unknown action.")
#     print(relations)
    return features, labels

#### Залежності без операції SWAP

In [29]:
test_tree = train_trees[5227]
print(test_tree)
result = get_data([t for t in test_tree if type(t["id"])==int], extract_features)

TokenList<Також, у, кліпі, некрасиві, люди, .>
[(2, 3), (5, 4), (6, 4)]


#### Залежності зі SWAP  --> матчиться більше залежностей


In [26]:
test_tree = train_trees[5227]
print(test_tree)
result = get_data([t for t in test_tree if type(t["id"])==int], extract_features)

TokenList<Також, у, кліпі, некрасиві, люди, .>
[(2, 3), (1, 4), (5, 4), (6, 4)]


In [None]:
def get_rels (tree, clf, extractor, vec):   
    stack, queue, relations = [ROOT], tree[:], []
    while queue or stack:
        
        features = extractor(stack, queue, tree)
        feats_vectorized = vec.transform([features])
        action = clf.predict(feats_vectorized)
        
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action == Actions.SWAP:
            queue.insert(0, stack.pop(-2))
        else:
            print("Unknown action.")
    return relations

result = get_rels(parsed_sentence, model2, extract_features, vec2)

In [5]:
def prepare_data (trees, extractor):
    features, labels = [], []
    for tree in trees:
        tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int], extractor)
        features += tree_features
        labels += tree_labels
    return features, labels

train_features, train_labels = prepare_data(train_trees, extract_features)
test_features, test_labels = prepare_data(test_trees, extract_features)

In [65]:
def train_model (train_features, test_features, train_labels):
    vectorizer = DictVectorizer()
    vec = vectorizer.fit(train_features)
    train_features_vectorized = vec.transform(train_features)
    test_features_vectorized = vec.transform(test_features)
    lrc = LogisticRegression(random_state=42, solver="saga",
                             multi_class="multinomial", max_iter=600, 
                             verbose=1)
    model = lrc.fit(train_features_vectorized, train_labels)
    predicted = lrc.predict(test_features_vectorized)
    return predicted, model, vec

In [66]:
predicted, model, vec = train_model(train_features, test_features, train_labels)
print(classification_report(test_labels, predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 580 epochs took 132 seconds
              precision    recall  f1-score   support

        left       0.85      0.87      0.86      6409
      reduce       0.85      0.78      0.81      6837
       right       0.75      0.79      0.77      6022
       shift       0.84      0.87      0.86      6625
        swap       0.60      0.16      0.26        73

    accuracy                           0.82     25966
   macro avg       0.78      0.69      0.71     25966
weighted avg       0.82      0.82      0.82     25966



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min finished


### Додаю нові фічі 1

In [8]:
def merge_features (features1, features2):
    features = copy.deepcopy(features1)
    for i, f in enumerate(features2):
        for k,v in f.items():
            features[i][k] = v
    return features   

In [9]:
def count_children (tree, node_id):
    return len([node['head'] for node in tree if node['head'] == node_id])

def extract_features_1 (stack, queue, tree):
    features = dict()
    if len(stack) > 0:
        features["s0-child-num"] = count_children(tree, stack[-1]["id"])
    if queue:
        features["q0-child-num"] = count_children(tree, queue[0]["id"])
    return features

In [10]:
train_features_1, _ = prepare_data(train_trees, extract_features_1)
test_features_1, _ = prepare_data(test_trees, extract_features_1)

train_features_1 = merge_features(train_features, train_features_1)
test_features_1 = merge_features(test_features, test_features_1)

predicted, model1, vec1 = train_model (train_features_1, test_features_1, train_labels)
print(classification_report(test_labels, predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 142 seconds
              precision    recall  f1-score   support

        left       0.88      0.91      0.90      6409
      reduce       0.86      0.82      0.84      6837
       right       0.82      0.83      0.82      6022
       shift       0.88      0.90      0.89      6625
        swap       0.54      0.18      0.27        73

    accuracy                           0.86     25966
   macro avg       0.80      0.73      0.74     25966
weighted avg       0.86      0.86      0.86     25966



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min finished


### Ще додаю трохи нових фіч - 2

In [11]:
def extract_features_2 (stack, queue, tree):
    features = dict()
    if len(stack) > 1:
        features["s1-child-num"] = count_children(tree, stack[-2]["id"])
    if len(queue) > 1:
        features["q1-child-num"] = count_children(tree, queue[1]["id"])
    if len(queue) > 2:
        features["q2-child-num"] = count_children(tree, queue[2]["id"])
    if len(queue) > 3:
        features["q3-child-num"] = count_children(tree, queue[3]["id"])
    return features

In [12]:
train_features_2, _ = prepare_data(train_trees, extract_features_2)
test_features_2, _ = prepare_data(test_trees, extract_features_2)

train_features_2 = merge_features(train_features_1, train_features_2)
test_features_2 = merge_features(test_features_1, test_features_2)

predicted, model2, vec2 = train_model (train_features_2, test_features_2, train_labels)
print(classification_report(test_labels, predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 146 seconds
              precision    recall  f1-score   support

        left       0.88      0.92      0.90      6409
      reduce       0.87      0.82      0.84      6837
       right       0.82      0.83      0.83      6022
       shift       0.88      0.90      0.89      6625
        swap       0.64      0.19      0.29        73

    accuracy                           0.87     25966
   macro avg       0.82      0.73      0.75     25966
weighted avg       0.86      0.87      0.86     25966



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min finished


### Ще додаю трохи нових фіч - 3

In [13]:
def extract_features_3 (stack, queue, tree):
    features = dict()
    if len(stack) > 0:
        features["s0-id"] = stack[-1]["id"]
    if queue:
        features["q0-id"] = queue[0]["id"]
    if len(stack) > 1:
        features["s1-id"] = stack[-2]["id"]
    if len(queue) > 1:
        features["q1-id"] = queue[1]["id"]
    if len(queue) > 2:
        features["q2-id"] = queue[2]["id"]
    if len(queue) > 3:
        features["q3-id"] = queue[3]["id"]
    return features

In [18]:
train_features_3, _ = prepare_data(train_trees, extract_features_3)
test_features_3, _ = prepare_data(test_trees, extract_features_3)

train_features_3 = merge_features(train_features_2, train_features_3)
test_features_3 = merge_features(test_features_2, test_features_3)

predicted, model3, vec3 = train_model(train_features_3, test_features_3, train_labels)
print(classification_report(test_labels, predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 174 seconds
              precision    recall  f1-score   support

        left       0.86      0.91      0.88      6409
      reduce       0.85      0.78      0.82      6837
       right       0.79      0.79      0.79      6022
       shift       0.85      0.88      0.87      6625
        swap       0.00      0.00      0.00        73

    accuracy                           0.84     25966
   macro avg       0.67      0.67      0.67     25966
weighted avg       0.84      0.84      0.84     25966



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.9min finished


### Пошук гіперпараметрів

Вирішила також спробувати зробити гіперпошук параметів, так як буду його робити також і для курсової роботи

Як видно - у мене не було покращення якості після пошуку гіперпараметрів. 

In [24]:
from sklearn.model_selection import RandomizedSearchCV

vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features_2)
train_features_vectorized = vec.transform(train_features_2)
test_features_vectorized = vec.transform(test_features_2)

logistic = LogisticRegression(random_state=42,
                             multi_class="multinomial", max_iter=600, 
                             verbose=1)
hyperparameters = {
    'penalty': ['l1', 'l2'],  
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'solver':  ['sag', 'saga']
}

clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1,
                         cv=3, verbose=0, n_jobs=-1)
model = clf.fit(train_features_vectorized, train_labels)
predicted = model.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


max_iter reached after 8524 seconds
              precision    recall  f1-score   support

        left       0.89      0.92      0.90      6409
      reduce       0.87      0.83      0.85      6837
       right       0.83      0.84      0.83      6022
       shift       0.89      0.90      0.90      6625
        swap       0.52      0.19      0.28        73

    accuracy                           0.87     25966
   macro avg       0.80      0.74      0.75     25966
weighted avg       0.87      0.87      0.87     25966



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 142.1min finished


In [27]:
# model.best_params_

{'solver': 'saga', 'penalty': 'l1', 'C': 1}

### Завдання 2: Використання парсера на нових даних

In [14]:
import pymorphy2

morph = pymorphy2.MorphAnalyzer(lang='uk')

DET = ['будь-який', 'ваш', 'ввесь', 'весь', 'все', 'всенький', 'всякий',
       'всілякий', 'деякий', 'другий', 'жадний', 'жодний', 'ин.', 'ін.',
       'інакший', 'інш.', 'інший', 'їх', 'їхній', 'її', 'його', 'кожний',
       'кожній', 'котрий', 'котрийсь', 'кілька', 'мій', 'наш', 'небагато',
       'ніякий', 'отакий', 'отой', 'оцей', 'сам', 'самий', 'свій', 'сей',
       'скільки', 'такий', 'тамтой', 'твій', 'те', 'той', 'увесь', 'усякий',
       'усілякий', 'це', 'цей', 'чий', 'чийсь', 'який', 'якийсь']

PREP = ["до", "на"]

mapping = {"ADJF": "ADJ", "ADJS": "ADJ", "COMP": "ADJ", "PRTF": "ADJ",
           "PRTS": "ADJ", "GRND": "VERB", "NUMR": "NUM", "ADVB": "ADV",
           "NPRO": "PRON", "PRED": "ADV", "PREP": "ADP", "PRCL": "PART"}

def normalize_pos(word):
    if word.tag.POS == "CONJ":
        if "coord" in word.tag:
            return "CCONJ"
        else:
            return "SCONJ"
    elif "PNCT" in word.tag:
        return "PUNCT"
    elif word.normal_form in PREP:
        return "PREP"
    elif word.normal_form in DET:
        return "DET"
    else:
        return mapping.get(word.tag.POS, word.tag.POS)

In [70]:
from collections import OrderedDict

# приводжу речення до формату Conllu

def parse_sent (sent):
    i = 1
    parsed_sentence = []
    for token in tokenize_uk.tokenize_words(sent):
        token = morph.parse(token)[0]
        parsed_token =OrderedDict([("id", i),("form", token.word),("lemma", token.normal_form),
                                   ("upostag",  normalize_pos(token))])
        i += 1
        parsed_sentence.append(parsed_token)
        
    return parsed_sentence

def get_rels (sent, clf, extractor, vec): 
    tree = parse_sent(sent)
    stack, queue, relations,features = [ROOT], tree[:], [], []
    i = 0
    while queue or stack:
        
        features.append(extractor(stack, queue, tree))
        feats_vectorized = vec.transform(features)

        action = clf.predict(feats_vectorized)
        if action[i] == 'shift':
            stack.append(queue.pop(0))
        elif action[i] == 'reduce':
            stack.pop()
        elif action[i] == 'left':
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action[i] == 'right':
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        elif action[i] == 'swap':
            queue.insert(0, stack.pop(-2))
        else:
            print("Unknown action.")
            
        i += 1
    return relations

In [72]:
get_rels("У парку Кіото (Київ) зацвіла одна з найдовших у світі алея сакур.", model2, extract_features, vec2)

[(1, 2),
 (3, 2),
 (4, 5),
 (5, 3),
 (6, 2),
 (7, 8),
 (9, 10),
 (11, 12),
 (10, 12),
 (12, 8),
 (14, 13),
 (15, 8)]

In [75]:
get_rels("Я ніколи не думала, що зможу написати парсер залежностей!", model2, extract_features, vec2)

[(3, 4), (2, 4), (1, 4), (9, 8), (10, 9)]