https://gist.github.com/syllog1sm/10343947  
https://gist.github.com/Samurais/0dc04f265799731caebf80d9ebde6395  
https://explosion.ai/blog/parsing-english-in-python

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import os
import time

from os import path
from sys import intern

from tqdm import tqdm_notebook as tqdm

from syntactic_parser import Parser

class DefaultList(list):
    """A list that hands back a fallback value for out-of-range indices.

    Indexing past either end returns ``default`` instead of raising
    ``IndexError``; only construction and item lookup are overridden, every
    other list operation is inherited as-is.
    """

    def __init__(self, default=None):
        """Create an empty list that answers bad indices with ``default``."""
        self.default = default
        super().__init__()

    def __getitem__(self, index):
        """Return the item at ``index``, or ``self.default`` if out of range."""
        try:
            item = super().__getitem__(index)
        except IndexError:
            item = self.default
        return item

In [3]:
def read_conll(loc, root_head='-1', head_offset=1):
    """Lazily read dependency-annotated sentences from a 10-column CoNLL file.

    Sentences are separated by a blank line. Every token line holds ten
    whitespace-separated fields, of which the 2nd (word form), 4th (POS tag),
    7th (head) and 8th (dependency label) are used.

    Args:
        loc: Path of the UTF-8 encoded file to read.
        root_head: Value of the head field that marks a token attached to the
            root. Defaults to ``'-1'``.
        head_offset: Integer added to every other head value to turn it into
            an index into the padded ``words`` list. Defaults to ``1``.

    Yields:
        ``(words, tags, heads, labels)`` for each sentence. ``words`` and
        ``tags`` are ``DefaultList('')`` objects padded by ``pad_tokens``;
        ``heads`` and ``labels`` are plain lists whose element 0 is ``None``
        so that token ``i`` lines up with ``words[i]``. A root-attached token
        gets the head ``len(tokens) + 1``, i.e. the index of the trailing pad.

    Raises:
        ValueError: If a token line does not have exactly ten fields or a
            head field is not an integer.
    """
    # NOTE(review): the defaults assume 0-based head indices with '-1' for the
    # root. Standard CoNLL-U files use 1-based heads with '0' for the root;
    # for such data call read_conll(loc, root_head='0', head_offset=0).
    # TODO confirm which convention the data files on disk follow.

    # Read everything inside a ``with`` block so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(loc, mode='r', encoding='utf-8') as conll_file:
        content = conll_file.read()

    for sent_str in content.strip().split('\n\n'):
        # Drop empty lines so an empty file or an extra blank separator does
        # not produce a zero-field "token" that fails to unpack below.
        lines = [line.split() for line in sent_str.split('\n') if line]
        if not lines:
            continue
        words = DefaultList('')
        tags = DefaultList('')
        heads = [None]
        labels = [None]
        root_index = len(lines) + 1
        for _, word, _, pos, _, _, head, label, _, _ in lines:
            words.append(intern(word))
            tags.append(intern(pos))
            heads.append(root_index if head == root_head else int(head) + head_offset)
            labels.append(label)
        pad_tokens(words)
        pad_tokens(tags)
        yield words, tags, heads, labels
        
def pad_tokens(tokens):
    """Wrap ``tokens`` in place with boundary markers.

    ``'<start>'`` becomes the first element and ``'ROOT'`` the last; nothing
    is returned.
    """
    tokens[:0] = ['<start>']
    tokens.append('ROOT')

In [4]:
def train(parser, sentences, nr_iter=100, threshold=0.0001):
    """Train ``parser`` for up to ``nr_iter`` epochs, then average its weights.

    Every epoch visits all sentences in a fresh random order and calls
    ``parser.train_one`` on each one. Training stops early as soon as the
    accuracy changes by less than ``threshold`` between two consecutive
    epochs (the first epoch is compared against 0).

    Args:
        parser: Object exposing ``train_one(itn, words, gold_tags, gold_heads)``
            and ``model.average_weight()``. ``train_one`` must return a
            3-tuple: the first two items are summed and printed as
            ``skipped: x/y``, the third is added to the correct count.
        sentences: Iterable of ``(words, gold_tags, gold_heads, gold_labels)``
            tuples. It is copied once, so the caller's sequence keeps its order.
        nr_iter: Maximum number of epochs.
        threshold: Minimum absolute accuracy change required to keep training.
    """
    # Work on a private copy: shuffling must not reorder the caller's list,
    # and this also lets a generator be passed in directly.
    sentences = list(sentences)
    prev_acc = 0
    for itn in range(nr_iter):
        corr = 0
        total = 0
        gx = 0
        gy = 0
        random.shuffle(sentences)

        # The context manager closes each epoch's progress bar, even when
        # training raises part-way through.
        with tqdm(total=len(sentences)) as progress:
            for words, gold_tags, gold_parse, _ in sentences:
                x, y, c = parser.train_one(itn, words, gold_tags, gold_parse)
                corr += c
                gx += x
                gy += y
                total += len(words)
                progress.update(1)

        # An empty training set has no tokens; report 0 instead of dividing by zero.
        acc = float(corr) / float(total) if total else 0.0
        print('Iter: {}, skipped: {}/{}, accuracy: {:.4f}'.format(itn, gx, gy, acc))

        if abs(acc - prev_acc) < threshold:
            print('Δacc < {}, stopping'.format(threshold))
            break
        prev_acc = acc
    print('Averaging weights')
    parser.model.average_weight()

In [5]:
def main(model_dir, train_loc, heldout_gold, nr_iter=100, threshold=0.0001):
    """Train a parser, save it, and report head accuracy on held-out data.

    Args:
        model_dir: Location handed to ``parser.save``.
        train_loc: Path of the training file, read with ``read_conll``.
        heldout_gold: Path of the gold-annotated evaluation file, read with
            ``read_conll``.
        nr_iter: Maximum number of training epochs, forwarded to ``train``.
        threshold: Early-stopping threshold, forwarded to ``train``.

    Prints the parsing time in milliseconds, followed by the number of
    correctly predicted heads, the number of scored tokens and their ratio.
    Tokens whose gold label is ``'P'`` or ``'punct'`` are not scored.
    """
    parser = Parser()
    sentences = list(read_conll(train_loc))
    train(parser, sentences, nr_iter, threshold)
    parser.save(model_dir)

    gold_sentences = list(read_conll(heldout_gold))
    correct = 0
    scored = 0
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments.
    started = time.perf_counter()

    for words, _, gold_heads, gold_labels in gold_sentences:
        _, heads = parser.parse(words)
        # The first and last positions are skipped; in sentences produced by
        # read_conll they hold the padding markers, not real tokens.
        for i in range(1, len(words) - 1):
            if gold_labels[i] in ('P', 'punct'):
                continue
            if heads[i] == gold_heads[i]:
                correct += 1
            scored += 1

    elapsed = time.perf_counter() - started
    print('Parsing took %0.3f ms' % (elapsed * 1000.0))
    # With nothing to score (e.g. an empty held-out file) report 0.0 rather
    # than raising ZeroDivisionError after a long training run.
    accuracy = float(correct) / scored if scored else 0.0
    print(correct, scored, accuracy)

In [6]:
# Output location of the trained model.
model_dir = path.join("tmp", "dep_parser.pkl")

# Training and held-out splits sit side by side in the same dataset folder.
train_loc, heldout_gold = (
    path.join("datasets", "UD_English-EWT", file_name)
    for file_name in ("en_ewt-ud-train.conll", "en_ewt-ud-test.conll")
)

# Run at most 15 epochs; stop early once accuracy moves by less than 0.0001.
main(
    model_dir,
    train_loc,
    heldout_gold,
    nr_iter=15,
    threshold=0.0001,
)

A Jupyter Widget

Iter: 0, skipped: 37362/396627, accuracy: 0.1075


A Jupyter Widget

Iter: 1, skipped: 38742/396627, accuracy: 0.1020


A Jupyter Widget

Iter: 2, skipped: 39055/396627, accuracy: 0.1004


A Jupyter Widget

Iter: 3, skipped: 39133/396627, accuracy: 0.1008


A Jupyter Widget

Iter: 4, skipped: 39325/396627, accuracy: 0.1002


A Jupyter Widget

Iter: 5, skipped: 39442/396627, accuracy: 0.0995


A Jupyter Widget

Iter: 6, skipped: 39545/396627, accuracy: 0.0998


A Jupyter Widget

Iter: 7, skipped: 39401/396627, accuracy: 0.0993


A Jupyter Widget

Iter: 8, skipped: 39498/396627, accuracy: 0.0993
Δacc < 0.0001, stopping
Averaging weights
Parsing took 29801.260 ms
1407 22028 0.06387325222444162
