In [72]:
import time
import os
import logging
import numpy as np
from collections import Counter
from utils.general_utils import logged_loop, get_minibatches
from q2_parser_transitions import PartialParse, minibatch_parse

P_PREFIX = '<p>:'
L_PREFIX = '<l>:'
UNK = '<UNK>'
NULL = '<NULL>'
ROOT = '<ROOT>'

class Config(object):
    language = 'english'
    with_punct = True
    unlabeled = True
    lowercase = True
    use_pos = True
    use_dep = True
    use_dep = use_dep and (not unlabeled)
    data_path = './data'
    train_file = 'train.conll'
    dev_file = 'dev.conll'
    test_file = 'test.conll'
    embedding_file = './data/en-cw.txt'

def read_conll(in_file, lowercase=False, max_example=None):
    examples = []
    with open(in_file) as f:
        word, pos, head, label = [], [], [], []
        for line in f.readlines():
            sp=line.strip().split('\t')
            if len(sp) == 10:
                if '-' not in sp[0]:
                    word.append(sp[1].lower() if lowercase else sp[1])
                    pos.append(sp[4])
                    head.append(int(sp[6]))
                    label.append(sp[7])
            elif len(word) > 0:
                examples.append({'word':word, 'pos': pos, 'head':head, 'label':label})
                word, pos, head, label = [], [], [], []
                if(max_example is not None) and (len(examples) == max_example):
                    break
        if len(word) > 0:
            examples.append({'word':word, 'pos': pos, 'head':head, 'label':label})
    return examples

def build_dict(keys, n_max=None, offset=0):
    print offset
    count = Counter()
    for key in keys:
        count[key] += 1
    ls = count.most_common() if n_max is None else count.most_common(n_max)
    return {w[0]: index+offset for (index, w) in enumerate(ls)}
    

class Parser(object):
    """Contains everything needed for transition-based dependency parsing except for the model"""
    
    def __init__(self,dataset):
        root_labels = list([l for ex in dataset for (h,l) in zip(ex['head'], ex['label']) if h==0])
        counter = Counter(root_labels)
        if len(counter) > 1:
            logging.info('Warning: more than one root label')
            logging.info(counter)
        self.root_label = counter.most_common()[0][0]

        deprel = [self.root_label] + list(set([w for ex in dataset for w in ex['label'] if w !=self.root_label]))
        tok2id = {L_PREFIX+l:i for (i,l) in enumerate(deprel)}
        tok2id[L_PREFIX + NULL] = self.L_NULL = len(tok2id)
        
        config = Config()
        self.unlabeled = config.unlabeled
        self.with_punct = config.with_punct
        self.use_pos = config.use_pos
        self.use_dep = config.use_dep
        self.language = config.language
        
        if self.unlabeled:
            trans = ['L', 'R', 'S']
            self.n_deprel = 1
        else:
            trans = ['L-'+l for l in deprel] + ['R-'+l for l in deprel] + ['S']
            self.n_deprel = len(deprel)
            
        self.n_trans = len(trans)
        self.tran2id = {t:i for (i,t) in enumerate(trans)}
        self.id2tran = {i:t for (i,t) in enumerate(trans)}
        tok2id.update(build_dict([P_PREFIX+w for ex in dataset for w in ex['pos']],offset=len(tok2id)))
        tok2id[P_PREFIX + UNK] = self.P_UNK = len(tok2id)
        tok2id[P_PREFIX + NULL] = self.P_NULL = len(tok2id)
        tok2id[P_PREFIX + ROOT] = self.P_ROOT = len(tok2id)
        
        tok2id.update(build_dict([w for ex in dataset for w in ex['word']],offset=len(tok2id)))
        tok2id[UNK] = self.UNK = len(tok2id)
        tok2id[NULL] = self.NULL = len(tok2id)
        tok2id[ROOT] = self.ROOT = len(tok2id)
        
        self.tok2id = tok2id
        self.id2tok = {v:k for (k,v) in tok2id.items()}
        self.n_features = 18 + (18 if config.use_pos else 0) + (12 if config.use_dep else 0)
        print "number of features: ", self.n_features
        self.n_tokens = len(tok2id)
    
    def vectorize(self, examples):
        vec_examples = []
        for ex in examples:
            word = [self.ROOT] + [self.tok2id[w] if w in self.tok2id else self.UNK for w in ex['word'] ]
            pos = [self.P_ROOT] + [self.tok2id[P_PREFIX+w] if P_PREFIX+w in self.tok2id else self.P_UNK for w in ex['pos']]
            head = [-1] + ex['head']
            label= [-1] +[self.tok2id[L_PREFIX + w] if L_PREFIX + w in self.tok2id else -1 for w in ex['label']]
            vec_examples.append({'word': word, 'pos': pos,'head': head, 'label': label})
        
        return vec_examples
    
    def get_oracle(self, stack, buf, ex):
        if len(stack) < 2:
            return self.n_trans - 1
        
        i0 = stack[-1]
        i1 = stack[-2]
        h0 = ex['head'][i0]
        h1 = ex['head'][i1]
        l0 = ex['label'][i0]
        l1 = ex['label'][i1]
        
        if self.unlabeled:
            if (i1 > 0) and (h1 == i0):
                return 0
            elif (i1 >= 0) and (h0 == i1) and(not any([x for x in buf if ex['head'][x] == i0])):
                return 1
            else:
                return None if len(buf) == 0 else 2
        else:
            if (i1 > 0) and (h1 == i0):
                return l1 if (l1 >= 0) and (l1 < self.n_deprel) else None
            elif (i1 >= 0) and (h0 == i1) and (not any([x for x in buf if ex['head'][x] == i0])):
                return l0 + self.n_deprel if (l0 >= 0) and (l0 < self.n_deprel) else None
            else:
                return None if len(buf) == 0 else self.n_trans - 1
            
    def legal_labels(self, stack, buf):
        labels = ([1] if len(stack) > 2 else [0]) * self.n_deprel
        labels += ([1] if len(stack)>=2 else [0]) * self.n_deprel
        labels += [1] if len(buf) > 0 else [0]
        return labels
    
    def extract_features(self, stack, buf, arcs, ex):
        if stack[0] == "ROOT":
            stack[0] = 0
        
        def get_lc(k):
            return sorted([arc[1] for arc in arcs if arc[0]==k and arc[1] < k])
        def get_rc(k):
            return sorted([arc[1] for arc in arcs if arc[0] == k and arc[1] > k], reverse=True)
        
        p_features = []
        l_features = []
        
        features = [self.NULL] * (3 - len(stack)) + [ex['word'][x] for x in stack[-3:]]
        print "feature step1 --->",
        print features
        features += [ex['word'][x] for x in buf[:3]] + [self.NULL] * (3 - len(buf))
        print "feature step2 --->",
        print features
        
        if self.use_pos:
            p_features = [self.P_NULL] * (3 - len(stack)) + [ex['pos'][x] for x in stack[-3:]]
            print "p_feature step1 --->",
            print p_features
            p_features += [ex['pos'][x] for x in buf[:3]] + [self.P_NULL] * (3 - len(buf))
            print "p_feature step2 --->",
            print p_features
        
        for i in xrange(2):
            if i < len(stack):
                print "case --- 1"
                k = stack[-i-1]
                lc = get_lc(k)
                print "lc, step1 ---->",
                print lc
                rc = get_rc(k)
                print "rc, step1 ---->",
                print rc
                llc = get_lc(lc[0]) if len(lc) > 0 else []
                print "llc, step1 ---->",
                print llc
                rrc = get_rc(rc[0]) if len(rc) > 0 else []
                print "rrc, step1 ---->",
                print rrc
                
                features.append(ex['word'][lc[0]] if len(lc) > 0 else self.NULL)
                features.append(ex['word'][rc[0]] if len(rc) > 0 else self.NULL)
                features.append(ex['word'][lc[1]] if len(lc) > 1 else self.NULL)
                features.append(ex['word'][rc[1]] if len(rc) > 1 else self.NULL)
                features.append(ex['word'][llc[0]] if len(llc) > 0 else self.NULL)
                features.append(ex['word'][rrc[0]] if len(rrc) > 0 else self.NULL)
                
                print "features step3 --->",
                print features
                
                if self.use_pos:
                    p_features.append(ex['pos'][lc[0]] if len(lc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][rc[0]] if len(rc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][lc[1]] if len(lc) > 1 else self.P_NULL)
                    p_features.append(ex['pos'][rc[1]] if len(rc) > 1 else self.P_NULL)
                    p_features.append(ex['pos'][llc[0]] if len(llc) > 0 else self.P_NULL)
                    p_features.append(ex['pos'][rrc[0]] if len(rrc) > 0 else self.P_NULL)                   
                
                print "p_features step3 --->",
                print p_features
                
                if self.use_dep:
                    l_features.append(ex['label'][lc[0]] if len(lc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][rc[0]] if len(rc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][lc[1]] if len(lc) > 1 else self.L_NULL)
                    l_features.append(ex['label'][rc[1]] if len(rc) > 1 else self.L_NULL)
                    l_features.append(ex['label'][llc[0]] if len(llc) > 0 else self.L_NULL)
                    l_features.append(ex['label'][rrc[0]] if len(rrc) > 0 else self.L_NULL)
                print "l_features step1 --->"
                print l_features
            else:
                print "case ---- 2"
                features += [self.NULL] * 6
                if self.use_pos:
                    p_features += [self.P_NULL] * 6
                if self.use_dep:
                    l_features += [self.L_NULL] * 6
        features += p_features + l_features
        assert len(features) == self.n_features
        print "final feature", features
        return features
                    
        
    def create_instances(self,examples):
        all_instances = []
        succ = 0
        for id,ex in enumerate(logged_loop(examples)):
            n_words = len(ex['word']) -1
            print "number of words: %d" % n_words
            #arcs = {(h,t,label)}
            stack = [0]
            buf = [i+1 for i in xrange(n_words)]
            arcs = []
            instances = []
            for i in xrange(n_words * 2):
                gold_t = self.get_oracle(stack, buf, ex)
                print "gold_t ---> %d" % gold_t
                if gold_t is None:
                    break
                legal_labels = self.legal_labels(stack, buf)
                print "legal_labels --->",
                print legal_labels
                assert legal_labels[gold_t] == 1
                instances.append((self.extract_features(stack, buf, arcs, ex),legal_labels, gold_t))
                if gold_t == self.n_trans -1:
                    stack.append(buf[0])
                    buf = buf[1:]
                elif gold_t < self.n_deprel:
                    arcs.append((stack[-1], stack[-2], gold_t))
                    stack = stack[:-2] + [stack[-1]]
                else:
                    arcs.append((stack[-2], stack[-1], gold_t - self.n_deprel))
                    stack = stack[:-1]
            else:
                succ +=1
                all_instances += instances
    
        return all_instances
                          
print 5

 5


In [73]:
reduced = True
config=Config()
start = time.time()
train_set = read_conll(os.path.join(config.data_path, config.train_file),lowercase=config.lowercase)
#dev_set=read_conll(os.path.join(config.data_path, config.dev_file), lowercase=config.lowercase)
#test_set=read_conll(os.path.join(config.data_path, config.test_file), lowercase=config.lowercase)

if reduced:
    train_set=train_set[1:2]
#    dev_set=dev_set[:500]
#    test_set=test_set[:500]
'''
example 
the first example in train_set:
{'head': [5, 5, 5, 5, 45, 9, 9, 9, 5, 9, 15, 15, 12, 15, 9, 20, 20, 19, 20, 5, 22, 20, 25, 25, 20, 20, 20, 20, 28, 28, 20, 45, 34, 45, 36, 34, 34, 34, 41, 41, 38, 34, 45, 45, 0, 48, 48, 45, 45], 
'word': ['in', 'an', 'oct.', '19', 'review', 'of', '``', 'the', 'misanthrope', "''", 'at', 'chicago', "'s", 'goodman', 'theatre', '-lrb-', '``', 'revitalized', 'classics', 'take', 'the', 'stage', 'in', 'windy', 'city', ',', "''", 'leisure', '&', 'arts', '-rrb-', ',', 'the', 'role', 'of', 'celimene', ',', 'played', 'by', 'kim', 'cattrall', ',', 'was', 'mistakenly', 'attributed', 'to', 'christina', 'haag', '.'], 
'pos': ['IN', 'DT', 'NNP', 'CD', 'NN', 'IN', '``', 'DT', 'NN', "''", 'IN', 'NNP', 'POS', 'NNP', 'NNP', '-LRB-', '``', 'VBN', 'NNS', 'VB', 'DT', 'NN', 'IN', 'NNP', 'NNP', ',', "''", 'NN', 'CC', 'NNS', '-RRB-', ',', 'DT', 'NN', 'IN', 'NNP', ',', 'VBN', 'IN', 'NNP', 'NNP', ',', 'VBD', 'RB', 'VBN', 'TO', 'NNP', 'NNP', '.'], 
'label': ['case', 'det', 'compound', 'nummod', 'nmod', 'case', 'punct', 'det', 'nmod', 'punct', 'case', 'nmod:poss', 'case', 'compound', 'nmod', 'punct', 'punct', 'amod', 'nsubj', 'dep', 'det', 'dobj', 'case', 'compound', 'nmod', 'punct', 'punct', 'dep', 'cc', 'conj', 'punct', 'punct', 'det', 'nsubjpass', 'case', 'nmod', 'punct', 'acl', 'case', 'compound', 'nmod', 'punct', 'auxpass', 'advmod', 'root', 'case', 'compound', 'nmod', 'punct']}
'''
print "took {:.2f} seconds".format(time.time()-start)

print "building parser...",
start=time.time()
parser=Parser(train_set)
print "took {:.2f} seconds".format(time.time()-start)

print "Loading pretrained embedding...",
start = time.time()
word_vectors = {}
for line in open(config.embedding_file).readlines():
    sp = line.strip().split()
    word_vectors[sp[0]] = [float(x) for x in sp[1:]]
embeddings_matrix = np.asarray(np.random.normal(0,0.9,(parser.n_tokens,50)),dtype='float32')

for token in parser.tok2id:
    i = parser.tok2id[token]
    if token in word_vectors:
        embeddings_matrix[i] = word_vectors[token]
    elif token.lower() in word_vectors:
        embeddings_matrix[i] = word_vectors[token.lower()]

print "took {:.2f} seconds".format(time.time() - start)   

print "Vectorizing data...",
start = time.time()
train_set = parser.vectorize(train_set)
#dev_set = parser.vectorize(dev_set)
#test_set = parser.vectorize(test_set)
print "took {:.2f} seconds".format(time.time() - start)

'''
the first example become
{'head': [-1, 5, 5, 5, 5, 45, 9, 9, 9, 5, 9, 15, 15, 12, 15, 9, 20, 20, 19, 20, 5, 22, 20, 25, 25, 20, 20, 20, 20, 28, 28, 20, 45, 34, 45, 36, 34, 34, 34, 41, 41, 38, 34, 45, 45, 0, 48, 48, 45, 45], 
'word': [5156, 91, 113, 948, 600, 708, 88, 96, 85, 3417, 97, 109, 1285, 93, 3592, 3245, 145, 96, 4873, 4311, 375, 85, 5042, 91, 4401, 1625, 86, 97, 2553, 201, 3382, 144, 86, 85, 846, 88, 3152, 86, 836, 105, 2690, 3396, 86, 103, 1793, 1673, 89, 3510, 1729, 87], 
'pos': [84, 40, 41, 42, 49, 39, 40, 61, 41, 39, 62, 40, 42, 60, 42, 42, 71, 61, 53, 44, 50, 41, 39, 40, 42, 42, 45, 62, 39, 51, 44, 72, 45, 41, 39, 40, 42, 45, 53, 40, 42, 42, 45, 48, 47, 53, 52, 42, 42, 46], 
'label': [-1, 27, 31, 24, 21, 32, 27, 23, 31, 32, 23, 27, 26, 27, 24, 32, 23, 23, 33, 19, 29, 31, 9, 27, 24, 32, 23, 23, 29, 1, 8, 23, 23, 31, 6, 27, 32, 23, 37, 27, 24, 32, 23, 13, 22, 0, 27, 24, 32, 23]}
'''
print "Preprocessing training data..."
train_examples = parser.create_instances(train_set)



took 3.64 seconds
building parser... 6
12
number of features:  36
took 0.00 seconds
Loading pretrained embedding... took 3.87 seconds
Vectorizing data... took 0.00 seconds
Preprocessing training data...
number of words: 5
gold_t ---> 2
legal_labels ---> [0, 0, 1]
feature step1 ---> [18, 18, 19]
feature step2 ---> [18, 18, 19, 12, 15, 14]
p_feature step1 ---> [10, 10, 11]
p_feature step2 ---> [10, 10, 11, 6, 6, 7]
case --- 1
lc, step1 ----> []
rc, step1 ----> []
llc, step1 ----> []
rrc, step1 ----> []
features step3 ---> [18, 18, 19, 12, 15, 14, 18, 18, 18, 18, 18, 18]
p_features step3 ---> [10, 10, 11, 6, 6, 7, 10, 10, 10, 10, 10, 10]
l_features step1 --->
[]
case ---- 2
final feature [18, 18, 19, 12, 15, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 10, 10, 11, 6, 6, 7, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
gold_t ---> 2
legal_labels ---> [0, 1, 1]
feature step1 ---> [18, 19, 12]
feature step2 ---> [18, 19, 12, 15, 14, 16]
p_feature step1 ---> [10, 11, 6]
p_feature ste