In [1]:
# register the pre-trained embedding model to spacy, please repalce the directories with your own
# also you need to input the abbreviation of langauge: en for english, sv for swedish

In [None]:
!python -m spacy init vectors sv D:/embeddings/sv_talbanken.txt D:/models/sv --name model

In [3]:
import spacy,easygui,os
root = os.getcwd() # define default root

## import test set from corpus

In [4]:
import conllu

# select corpus file
file = easygui.fileopenbox(default = root, title = "Select test data") 

In [5]:
# read annotated data from corpus
annotations = open(file, encoding = "utf-8-sig").read()
sentences = conllu.parse(annotations)
# extract sentences for parsing
sent_list = [sent.metadata["text"]  for sent in sentences]

# extract annotated dependency data for comparing
dep_annotated = [] # overall dependency with each sentence as a sublist                     
for sent in sentences:
    dependency = [] # dependency with a single sentence
    for token in sent:
        if token["head"] == 0:
            # convert the token id to its index in list if the head is root
            head = token["form"]
        elif type(token["head"]) == int: # when head is labelled, we can get the "head" as int
            # get index of the head
            head_index = token["head"] - 1 # in the corpus the token id strats from 1, while index of list strats from 0
            # retrieve the head 
            head = sent[head_index]["form"]
                    
        else: # when no head is labelled, the type of this parameter should be NoneType. we ignore such situation
            pass
        # insert token, dependency and head
        dependency.append((token["form"], token["deprel"], head))
        
    # append the parsed dependency to the overall list
    dep_annotated.append(dependency)

## import the training set from corpus

In [6]:
# select data for training
file_train = easygui.fileopenbox(default = root, title = "Select train data")
sents_train = conllu.parse(open(file_train, encoding = "utf-8-sig").read())

# arrange the data in format [("text", {"heads": heads, "deps": deps }), ...]
train_data = []
for sent in sents_train:
    # extract head index
    heads = []
    deps = []
    for token in sent:
            # when the token is the root, the value of head is 0. we manually set it to its only index
            if token["head"] == 0:
                # convert the token id to its index in list
                heads.append(token["id"] - 1)
                deps.append(token["deprel"])
            # otherwise, when the id is int other than 0, the token has another one as its head 
            elif type(token["id"]) == int:
                heads.append(token["head"] - 1)
                deps.append(token["deprel"])
            else:
            # in some case the head is not assigned in corpus and the type of id would be Nonetype. So we pop out such elements
                pass
    # combine deps and heads
    dep_dict = {"heads": heads,"deps": deps}
    # insert raw text
    train_data.append((sent.metadata["text"], dep_dict))

## prepare the data for training

In [7]:
# if if the result of the tokenization by spacy is different from the annotated data
# the training will be blocked
# so we need to remove such data
save_model = easygui.diropenbox(default = root, title = "Directory of saved model")
# load trained model with embedding
nlp = spacy.load(save_model)

len_original = len(train_data) # length of the original train data
count_removed = 0

from spacy.training import Example

examples = []
for item in train_data:
    doc = nlp(item[0])
    # get the token list
    token_spacy = [token.text for token in doc]

    # if the result of the tokenization by spacy is different from the annotated data
    if len(item[1]["deps"]) != len(token_spacy) or len(item[1]["heads"]) != len(token_spacy):
        # the item will be popped out
        train_data.remove(item)
        count_removed += 1
    else:
        # create examples
        examples.append(Example.from_dict(doc, item[1]))

# the loss of train data
print(count_removed, " sentences of ", len_original, "are removed.")

168  sentences of  4303 are removed.


## train the parser

In [20]:
# add parser
nlp.add_pipe("parser")

# gather the labels from annotated corpus
labels = []
for sent in sents_train:
    for token in sent:
        if token["deprel"] not in labels:
            labels.append(token["deprel"])

# add label to the default parser
parser = nlp.get_pipe("parser")
for label in labels:
    if label not in parser.labels:
        parser.add_label(label)
        
# training

import random

# get number of iterations
training_iterations = easygui.integerbox(msg="How many times of iteration:", lowerbound = 1)

# initialize the model
optimizer = nlp.initialize(lambda: examples )
for i in range(training_iterations):
    random.shuffle(examples)
    # update the model
    nlp.update(examples, sgd = optimizer) 

nlp.to_disk(save_model)

easygui.msgbox(msg = "Finished training!", title = None, ok_button = "Continue.")

'Continue.'

## parsing with spacy

In [21]:
# reload the trained model
nlp2 = spacy.load(save_model)

In [22]:
# extract dependency data from the result
dep_spacy = [] # overall dependency as result of spacy   
for sent in sent_list:
    dependency = [] # dependency withi each sentence
    for token in nlp2(sent):
        # insert token, dependency and head
        dependency.append((token.text, token.dep_.lower(), token.head.text))
    # append to the overall list
    dep_spacy.append(dependency)    

## comparison

In [23]:
token_mismatch = []
token_count = 0
mismatch_count = 0

for n in range(0, len(dep_spacy)):
    # we assume that the sentences are tokenized in the same way both in spacy and in corpus, so mismatch of tokenization is ignored
    if len(dep_annotated[n]) == len(dep_spacy[n]):
            # renew number of token
            token_count += len(dep_spacy[n])
            for a in range(0,len(dep_spacy[n])): # iterate through all the tokens in a sentence 
                 if dep_spacy[n][a] != dep_annotated[n][a]: # check if the dependency relations are identical
                        mismatch_count += 1
                        token_mismatch.append((dep_annotated[n][a],dep_spacy[n][a]))
# calculate accuracy                        
precision =  1 - (mismatch_count / token_count)         

In [None]:
print("The precision of spacy dependency parser is %d." % precision)

## save as csv

In [25]:
import pandas as pd

# to data frame
df = pd.DataFrame.from_records(token_mismatch, columns = ["Corpus", "Spacy"])

In [26]:
# input directory
directory = easygui.diropenbox(default = root, msg = "Save the csv")
# name the file
filename = easygui.enterbox(msg = "Name your file")
#to csv
df.to_csv((directory + "/" + filename + ".csv"))