In [2]:
import nltk
from collections import deque
# nltk.download('dependency_treebank')
from nltk.corpus import dependency_treebank
import numpy as np
# Stand-in head node for tokens attached to the sentence root: the functions
# below substitute it whenever a token's 'head' entry is falsy (0 or None), so
# root arcs always carry the word/tag 'ROOT' and the address 0.
ROOT_NODE = {'word': 'ROOT', 'tag':'ROOT','address':0}



In [3]:
def get_data(train_fraction=0.9):
    """Split the parsed sentences of NLTK's dependency treebank into train/test.

    Args:
        train_fraction: share of the corpus (taken from the front) that goes
            into the training set; the remainder becomes the test set.
            Defaults to 0.9, the split this notebook has always used.

    Returns:
        A ``(train_set, test_set)`` pair of slices of
        ``dependency_treebank.parsed_sents()``.

    Raises:
        ValueError: if ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1, got %r" % (train_fraction,))
    sents = dependency_treebank.parsed_sents()
    # Compute the boundary once so both slices are guaranteed to agree on it.
    split = int(train_fraction * len(sents))
    return sents[:split], sents[split:]

In [8]:
from collections import defaultdict


# def create_hot_vec(sent):
#     word_vec_dict,tag_vec_dict = defaultdict(int), defaultdict(int)
#     for val1 in vals:
#         for val2 in vals:
#             vec_dict[(val1,val2)] = 0
#     return vec_dict

def fill_hot_vec(sent):
    """Count the arc features of the dependency tree annotated on ``sent``.

    For every token, two features are counted: the (head word, token word)
    pair and the (head tag, token tag) pair.

    Args:
        sent: object whose ``nodes`` mapping is indexed by the integer
            addresses ``0 .. len(nodes) - 1`` and whose entries provide
            'word', 'tag' and 'head' (presumably an NLTK DependencyGraph --
            verify against the caller).

    Returns:
        defaultdict(int) mapping each feature pair to its number of
        occurrences in the sentence.
    """
    hot_vec = defaultdict(int)
    # Start at address 1: address 0 is the artificial root, which is never a
    # dependent. Including it used to add spurious ('ROOT', <root word>) and
    # ('ROOT', <root tag>) features to every sentence.
    for i in range(1, len(sent.nodes)):
        dest_node = sent.nodes[i]
        head = dest_node['head']
        # A falsy head (0 or None) means the token hangs off the root.
        source_node = sent.nodes[head] if head else ROOT_NODE
        hot_vec[(source_node['word'], dest_node['word'])] += 1
        hot_vec[(source_node['tag'], dest_node['tag'])] += 1
    return hot_vec

    # return np.concatenate((vocab_hot_vec,tags_hot_vec),axis=0)

def get_feature_func(sent):
    """Return the feature counts of the dependency tree annotated on ``sent``.

    This is a thin wrapper around ``fill_hot_vec``; the result is whatever
    that function returns for ``sent`` (a mapping from (head, dependent)
    word/tag pairs to counts).

    Args:
        sent: parsed sentence exposing a ``nodes`` mapping.
    """
    return fill_hot_vec(sent)


In [9]:
from collections import namedtuple
from Chu_Liu_Edmonds_algorithm import min_spanning_arborescence_nx
#
# Arc = namedtuple('Arc', ['head', 'tail'])
# Directed arc from node address `head` to node address `tail` carrying `weight`.
WeightedArc = namedtuple('WeightedArc', 'head tail weight')


def get_weighted_arcs(sent, sent_feature_func, weights):
    """Build the weighted arcs of ``sent`` under the current model weights.

    Each arc's score is the dot product of its two features (word pair and
    tag pair) with ``weights``. The score is negated before being stored,
    presumably because the arcs are handed to a *minimum* spanning
    arborescence solver.

    NOTE(review): only the arc from each token's annotated head is emitted,
    so the resulting graph contains exactly the annotated tree. Confirm
    whether arcs for all head/dependent pairs were intended.

    Args:
        sent: object whose ``nodes`` mapping is indexed by integer addresses
            and whose entries provide 'word', 'tag', 'head' and 'address'.
        sent_feature_func: mapping from feature pair to its value for ``sent``.
        weights: mapping from feature pair to its learned weight.

    Returns:
        List of ``WeightedArc``, one per token at addresses 1 .. len(nodes) - 1.
    """
    out_arcs = []
    for i in range(1, len(sent.nodes)):
        dest_node = sent.nodes[i]
        head = dest_node['head']
        # A falsy head (0 or None) means the token hangs off the root.
        source_node = sent.nodes[head] if head else ROOT_NODE
        word_arc = (source_node['word'], dest_node['word'])
        tag_arc = (source_node['tag'], dest_node['tag'])
        # Each feature is paired with its own weight; the tag feature used to
        # be multiplied by the word-pair weight by mistake.
        score = (sent_feature_func[word_arc] * weights[word_arc]
                 + sent_feature_func[tag_arc] * weights[tag_arc])
        out_arcs.append(WeightedArc(head=source_node['address'],
                                    tail=dest_node['address'],
                                    weight=-score))
    return out_arcs

def get_mst(sents,sent_feature_func, weights):
    """Run the minimum-spanning-arborescence solver on one sentence's arcs.

    Args:
        sents: despite the plural name, a single sentence; it is forwarded
            unchanged to ``get_weighted_arcs``.
        sent_feature_func: feature mapping forwarded to ``get_weighted_arcs``.
        weights: weight mapping forwarded to ``get_weighted_arcs``.

    Returns:
        Whatever ``min_spanning_arborescence_nx`` returns for those arcs.
        The meaning of its second argument (``None`` here) is not visible in
        this file.
    """
    return min_spanning_arborescence_nx(
        get_weighted_arcs(sents, sent_feature_func, weights),
        None,
    )


def get_mst_feature_func(mst,sent):
    """Count the arc features of a predicted tree.

    For every arc, two features are counted: the (head tag, dependent tag)
    pair and the (head word, dependent word) pair. Arcs whose head address
    is falsy (0) use 'ROOT' as both the head word and the head tag.

    Args:
        mst: mapping whose values expose ``head`` and ``tail`` node addresses.
        sent: object whose ``nodes`` mapping provides 'word' and 'tag' for
            each address.

    Returns:
        defaultdict(int) mapping each feature pair to its number of
        occurrences in the tree.
    """
    hot_vec = defaultdict(int)
    for arc in mst.values():
        dest_node = sent.nodes[arc.tail]
        if arc.head:
            source_node = sent.nodes[arc.head]
            source_word, source_tag = source_node['word'], source_node['tag']
        else:
            # Head address 0 is the artificial root.
            source_word = source_tag = 'ROOT'
        # Accumulate counts rather than setting 1: a feature can fire on
        # several arcs of the same tree, and a perceptron update compares
        # these values against counted gold features.
        hot_vec[(source_tag, dest_node['tag'])] += 1
        hot_vec[(source_word, dest_node['word'])] += 1
    return hot_vec



def perceptron(sents, iterations, learning_rate = 0.5):
    """Train arc-feature weights with an averaged structured perceptron.

    One step is performed per sentence per pass. Each step predicts a tree
    with the current weights and moves every feature's weight by
    ``learning_rate * (gold value - predicted value)``. The returned vector
    is the average of the weight vector over all ``len(sents) * iterations``
    steps.

    Args:
        sents: sequence of parsed sentences; it must support ``len`` and be
            iterable once per pass.
        iterations: number of passes over ``sents``.
        learning_rate: step size of each update.

    Returns:
        defaultdict(int) mapping each feature pair to its averaged weight.
    """
    W = defaultdict(int)
    # Lazy averaging: weight_sums[f] is the sum of W[f] over steps
    # 1 .. last_step[f]. Steps on which f is untouched are settled in bulk the
    # next time f is updated (or at the very end), so a step costs only as
    # much as the number of features it touches.
    weight_sums = defaultdict(int)
    last_step = defaultdict(int)
    N = len(sents)*iterations
    step = 0
    for _ in range(iterations):
        for sent in sents:
            step += 1
            gold_feature_func = get_feature_func(sent)
            mst = get_mst(sent,gold_feature_func,W)
            mst_feature_func = get_mst_feature_func(mst, sent)
            # Visit features of BOTH trees: features that appear only in the
            # prediction must be pushed down, not just gold ones pushed up.
            for arc in set(gold_feature_func) | set(mst_feature_func):
                # W[arc] was constant on the steps skipped since its last update.
                weight_sums[arc] += (step - 1 - last_step[arc]) * W[arc]
                W[arc] += (gold_feature_func[arc]-mst_feature_func[arc]) * learning_rate
                weight_sums[arc] += W[arc]
                last_step[arc] = step
    averaged = defaultdict(int)
    for arc, weight in W.items():
        # Settle the steps after the feature's last update, then average.
        # The loop body is unreachable when N == 0 because W is then empty.
        averaged[arc] = (weight_sums[arc] + (N - last_step[arc]) * weight) / N
    return averaged



In [10]:
# Load the train/test split, then train on the training portion for two
# passes with a learning rate of 1 and show the resulting weights.
train_set, test_set = get_data()
w = perceptron(train_set, iterations=2, learning_rate=1)
print(w)


