In [1]:
from collections import OrderedDict
from conllu import parse 
from enum import Enum
import os

In [2]:
script_path = os.path.abspath('__file__') 
path_list = script_path.split(os.sep)
script_directory = path_list[0:len(path_list)-5]
rel_path = "UD_Ukrainian-IU"
PATH = "/".join(script_directory[:4]) + "/" + rel_path

In [3]:
with open(PATH + "/uk_iu-ud-train.conllu", "r") as f:
    train_trees = parse(f.read())

with open(PATH + "/uk_iu-ud-dev.conllu", "r") as f:
    test_trees = parse(f.read())

In [4]:
class Actions(str, Enum):
    SHIFT = "shift"
    REDUCE = "reduce"
    RIGHT = "right"
    LEFT = "left"

In [5]:
def oracle(stack, top_queue, relations):
    """
    Make a decision on the right action to do.
    """
    top_stack = stack[-1]
    # check if both stack and queue are non-empty
    if top_stack and not top_queue:
        return Actions.REDUCE
    # check if there are any clear dependencies
    elif top_queue["head"] == top_stack["id"]:
        return Actions.RIGHT
    elif top_stack["head"] == top_queue["id"]:
        return Actions.LEFT
    # check if we can reduce the top of the stack
    elif top_stack["id"] in [i[0] for i in relations] and \
         (top_queue["head"] < top_stack["id"] or \
          [s for s in stack if s["head"] == top_queue["id"]]):
        return Actions.REDUCE
    # default option
    else:
        return Actions.SHIFT

In [6]:
ROOT = OrderedDict([('id', 0), ('form', 'ROOT'), ('lemma', 'ROOT'), ('upostag', 'ROOT'),
                    ('xpostag', None), ('feats', None), ('head', None), ('deprel', None),
                    ('deps', None), ('misc', None)])

In [7]:
def extract_features(stack, queue):
    features = dict()

    if len(stack) > 0:
        stack_top = stack[-1]
        features["s0-word"] = stack_top["form"]
        features["s0-lemma"] = stack_top["lemma"]
        features["s0-tag"] = stack_top["upostag"]
         # фічі для слова в стеці
        if stack_top["feats"] != None:
            features["s0-feats"] = "_".join([f for f in stack_top["feats"].values()])
        else:
            features["s0-feats"] = False
    if len(stack) > 1:
        features["s1-tag"] = stack[-2]["upostag"]
        if stack[-2]["feats"] != None:
            features["s1-feats"] = "_".join([f for f in stack[-2]["feats"].values()])
        else:
            features["s1-feats"] = False
    if queue:
        queue_top = queue[0]
        features["q0-word"] = queue_top["form"]
        features["q0-lemma"] = queue_top["lemma"]
        features["q0-tag"] = queue_top["upostag"]
        # фічі для 1-го слова в черзі
        if queue_top["feats"] != None:
            features["q0-feats"] = "_".join([f for f in queue_top["feats"].values()])
        else:
            features["q0-feats"] = False
    if len(queue) > 1:
        queue_next = queue[1]
        features["q1-word"] = queue_next["form"]
        features["q1-tag"] = queue_next["upostag"]
        # фічі для 2-го слова в черзі
        if queue_next["feats"] != None:
            features["q1-feats"] = "_".join([f for f in queue_next["feats"].values()])
        else:
            features["q1-feats"] = False
    if len(queue) > 2:
        features["q2-tag"] = queue[2]["upostag"]
    if len(queue) > 3:
        features["q3-tag"] = queue[3]["upostag"]
    return features

In [8]:
def get_data(tree):
    features, labels = [], []
    stack, queue, relations = [ROOT], tree[:], []
    
    while queue or stack:
        action = oracle(stack if len(stack) > 0 else None,
                       queue[0] if len(queue) > 0 else None,
                       relations)
        features.append(extract_features(stack, queue))
        labels.append(action.value)
        if action == Actions.SHIFT:
            stack.append(queue.pop(0))
        elif action == Actions.REDUCE:
            stack.pop()
        elif action == Actions.LEFT:
            relations.append((stack[-1]["id"], queue[0]["id"]))
            stack.pop()
        elif action == Actions.RIGHT:
            relations.append((queue[0]["id"], stack[-1]["id"]))
            stack.append(queue.pop(0))
        else:
            print("Unknown action.")
    return features, labels

In [9]:
train_features, train_labels = [], []
for tree in train_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    train_features += tree_features
    train_labels += tree_labels

print(len(train_features), len(train_labels))

190298 190298


In [10]:
test_features, test_labels = [], []
for tree in test_trees:
    tree_features, tree_labels = get_data([t for t in tree if type(t["id"])==int])
    test_features += tree_features
    test_labels += tree_labels

print(len(test_features), len(test_labels))

25820 25820


In [11]:
from sklearn.feature_extraction import DictVectorizer 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

In [12]:
vectorizer = DictVectorizer()
vec = vectorizer.fit(train_features)
print("\nTotal number of features:", len(vec.get_feature_names()))


Total number of features: 115303


In [13]:
train_features_vectorized = vec.transform(train_features)
test_features_vectorized = vec.transform(test_features)
print(len(train_features_vectorized.toarray()), len(test_features_vectorized.toarray()))

190298 25820


In [None]:
lrc = LogisticRegression(random_state=42, solver="saga", multi_class="multinomial", max_iter=1000, verbose=1)
lrc.fit(train_features_vectorized, train_labels)

In [None]:
predicted = lrc.predict(test_features_vectorized)
print(classification_report(test_labels, predicted))

In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [None]:
logistic = LogisticRegression()
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1)

In [None]:
# Fit grid search
best_model = clf.fit(train_features_vectorized, train_labels)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
pred_labels = best_model.predict(test_features_vectorized)
print(classification_report(test_labels, pred_labels))

In [14]:
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

In [15]:
mod = GradientBoostingClassifier()
params = {'max_depth'    : [4, 6],
          'n_estimators' : [100, 500]  
         }

grid = GridSearchCV(estimator=mod, param_grid = params, cv = 2, n_jobs=-1)
grid.fit(train_features_vectorized, train_labels)

# Results from Grid Search
print("\n========================================================")
print(" Results from Grid Search " )
print("========================================================")
print("\n The best estimator across ALL searched params:\n",
      grid.best_estimator_)
print("\n The best score across ALL searched params:\n",
      grid.best_score_)
print("\n The best parameters across ALL searched params:\n",
      grid.best_params_)
print("\n ========================================================")


 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=6,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

 The best score across ALL searched params:
 0.8711126759083123

 The best parameters across ALL searched params:
 {'max_depth': 6, 'n_estimators': 500}



In [16]:
predicted = grid.predict(test_features_vectorized)

In [17]:
print(classification_report(test_labels, predicted))

              precision    recall  f1-score   support

        left       0.89      0.94      0.91      6371
      reduce       0.88      0.83      0.85      6875
       right       0.84      0.83      0.83      5996
       shift       0.90      0.92      0.91      6578

    accuracy                           0.88     25820
   macro avg       0.88      0.88      0.88     25820
weighted avg       0.88      0.88      0.88     25820



### Unlabeled attachment score

In [18]:
def dep_parse(sent, oracle, vectorizer):
    stack, queue, relations = [ROOT], sent[:], []
    while queue or stack:
        if stack and not queue:
            stack.pop()
        else:
            features = extract_features(stack, queue)
            action = oracle.predict(vectorizer.transform([features]))[0]
            if action == Actions.SHIFT:
                stack.append(queue.pop(0))
            elif action == Actions.REDUCE:
                stack.pop()
            elif action == Actions.LEFT:
                relations.append((stack[-1]["id"], queue[0]["id"]))
                stack.pop()
            elif action == Actions.RIGHT:
                relations.append((queue[0]["id"], stack[-1]["id"]))
                stack.append(queue.pop(0))
            else:
                print("Unknown action.")
    return sorted(relations)

In [19]:
total, tp, full_match = 0, 0, 0
for tree in test_trees:
    tree = [ t for t in tree if type(t["id"])==int]
    golden = [(node["id"], node["head"]) for node in tree]
    predicted = dep_parse(tree, grid, vec)
    total += len(tree)
    tp += len(set(golden).intersection(set(predicted)))
    if set(golden) == set(predicted):
        full_match += 1

print("Total:", total)
print("Correctly defined:", tp)
print("UAS:", round(tp/total, 2))
print("Full match:", round(full_match/len(test_trees), 2))

Total: 12574
Correctly defined: 9792
UAS: 0.78
Full match: 0.17
