## Testing ConllDataset

In [1]:
from src.dataset import ConllDataset

# Build a ConllDataset from the `.pred` dev-set file; later cells index into it and write it back out.
original_file = './data/wsj_dev.conll06.pred'
conll_dataset = ConllDataset(original_file)

Test writing the dataset to a new file

In [2]:
# Write the loaded dataset to ./test.conll06 via ConllDataset.write.
conll_dataset.write(filepath='./test.conll06')

## Access sentence

In [3]:
from src.dataset import ConllDataset

# Same path and constructor call as the first cell of the notebook; repeated here so this
# section can be run without the previous one.
original_file = './data/wsj_dev.conll06.pred'
conll_dataset = ConllDataset(original_file)

In [4]:
# Show the `sentence` attribute of the first item in the dataset (rendered as a table below).
conll_dataset[0].sentence

Unnamed: 0,ID,FORM,LEMMA,POS,XPOS,MORPH.,HEAD,REL,DEPS,MISC
0,1,The,the,DT,_,_,2,NMOD,_,_
1,2,economy,economy,NN,_,_,4,NMOD,_,_
2,3,'s,'s,POS,_,_,2,NMOD,_,_
3,4,temperature,temperature,NN,_,_,5,SBJ,_,_
4,5,will,will,MD,_,_,0,ROOT,_,_
5,6,be,be,VB,_,_,5,VC,_,_
6,7,taken,take,VBN,_,_,6,VC,_,_
7,8,from,from,IN,_,_,7,ADV,_,_
8,9,several,several,JJ,_,_,11,NMOD,_,_
9,10,vantage,vantage,NN,_,_,11,NMOD,_,_


## Evaluation

In [None]:
from src.dataset import ConllDataset

# Paths of the two files that get compared in the evaluation cells below.
gold_file = './data/wsj_dev.conll06.gold'
prediction_file = './data/wsj_dev.conll06.pred'

# Load both files with the same ConllDataset reader.
conll_gold = ConllDataset(gold_file)
conll_prediction = ConllDataset(prediction_file)

In [None]:
from src.evaluation import uas, las

In [None]:
# Score the predicted dataset against the gold dataset with `uas` from src.evaluation
# (presumably the unlabeled attachment score -- confirm in src.evaluation).
uas(conll_gold, conll_prediction)

0.8974243278811807

In [None]:
# Score the predicted dataset against the gold dataset with `las` from src.evaluation
# (presumably the labeled attachment score -- confirm in src.evaluation).
las(conll_gold, conll_prediction)

0.8811430720060162

In [18]:
import numpy as np

In [19]:
# 4x4 toy score matrix; -inf marks absent arcs (all of column 0 and the diagonal).
# `np.inf` is used instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
scores = np.array([
    [-np.inf, 10, 15, 5],
    [-np.inf, -np.inf, 30, 10],
    [-np.inf, 20, -np.inf, 5],
    [-np.inf, 5, 10, -np.inf]
])
scores

array([[-inf,  10.,  15.,   5.],
       [-inf, -inf,  30.,  10.],
       [-inf,  20., -inf,   5.],
       [-inf,   5.,  10., -inf]])

In [20]:
# 4x4 toy score matrix; -inf marks absent arcs (all of column 0 and the diagonal).
# `np.inf` is used instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
scores = np.array([
    [-np.inf, 10, 5, 15],
    [-np.inf, -np.inf, 20, 15],
    [-np.inf, 25, -np.inf, 25],
    [-np.inf, 30, 10, -np.inf]
])
scores

array([[-inf,  10.,   5.,  15.],
       [-inf, -inf,  20.,  15.],
       [-inf,  25., -inf,  25.],
       [-inf,  30.,  10., -inf]])

In [21]:
# Turn the score matrix into nested dictionaries {row node: {column node: score}}.
# Only strictly positive scores are kept, which drops the -inf entries.
graph = {
    source: {target: score for target, score in enumerate(row_scores) if score > 0}
    for source, row_scores in enumerate(scores)
}

graph

{0: {1: 10.0, 2: 5.0, 3: 15.0},
 1: {2: 20.0, 3: 15.0},
 2: {1: 25.0, 3: 25.0},
 3: {1: 30.0, 2: 10.0}}

In [22]:
# For every column, argmax along axis 0 gives the ROW index that holds the largest score.
max_column_index = scores.argmax(axis=0)
# Pair each column with its best row as [row, column].
max_index_pairs = [[best_row, column] for column, best_row in enumerate(max_column_index)]
# Skip the pair belonging to column 0.
index_pairs = max_index_pairs[1:]
print("Indexes:", index_pairs)
print("Values:", [scores[best_row, column] for best_row, column in index_pairs])

Indexes: [[3, 1], [1, 2], [2, 3]]
Values: [30.0, 20.0, 25.0]


In [23]:
# For every node except node 0, pick the entry of `graph` that offers the largest score
# for an arc into that node. Result: {node: [source node, score]}.
max_graph = {}

# Node 0 (the root) is skipped, so the range starts at 1.
for target in range(1, len(graph)):
    best = None
    for source, arcs in graph.items():
        candidate = arcs.get(target, 0)
        if best is None:
            # First source seen: take it as the provisional winner (score 0 if no arc).
            best = [source, candidate]
        elif best[1] < candidate:
            # Strictly better score: ties keep the earlier source.
            best = [source, arcs[target]]
    max_graph[target] = best

max_graph

{1: [3, 30.0], 2: [1, 20.0], 3: [2, 25.0]}

In [24]:
# Find cycle
def find_cycle(graph):
    """Return the first cycle found by following each node's first entry.

    NOTE: a later cell in this notebook redefines `find_cycle` for graphs whose
    values are plain node ids instead of `[node, score]` pairs.

    Args:
        graph: dict mapping a node to a sequence whose element ``[0]`` is the
            next node to follow (in this notebook ``{node: [head, score]}``).

    Returns:
        A dict ``{node: graph[node]}`` containing exactly the nodes that lie on
        the cycle, in traversal order, or ``None`` when no cycle exists.
    """
    for node in graph:
        visited = []
        visiting = node
        # Stop as soon as the walk leaves the graph (e.g. reaches the root, which
        # has no entry); the original guard tested `node` and raised KeyError here.
        while visiting in graph:
            if visiting in visited:
                # Drop the lead-in path: only nodes from the first occurrence of
                # `visiting` onwards are actually part of the cycle.
                start = visited.index(visiting)
                return {v: graph[v] for v in visited[start:]}
            visited.append(visiting)
            # Only follow the node ([0]), not the score ([1]).
            visiting = graph[visiting][0]

    return None

# Look for a cycle among the best incoming arcs; `max_graph` maps each node to
# [head, score], and find_cycle only follows the head, so the scores are not needed here.
cycle = find_cycle(max_graph)
cycle

{1: [3, 30.0], 3: [2, 25.0], 2: [1, 20.0]}

In [25]:
# Copy every inner arc dictionary as well: dict.copy() is shallow, so popping from
# new_graph[node] would also delete the arc from `graph` and make this cell fail
# (KeyError) when it is run a second time.
new_graph = {source: dict(arcs) for source, arcs in graph.items()}
for node, end in cycle.items():
    # `end` is [other node, score]; remove the entry keyed by that other node.
    new_graph[node].pop(end[0])
    # Drop nodes that have no arcs left.
    if not new_graph[node]:
        del new_graph[node]

new_graph

{0: {1: 10.0, 2: 5.0, 3: 15.0}, 1: {2: 20.0}, 2: {3: 25.0}, 3: {1: 30.0}}

In [26]:
# Find cycle (to be added): placeholder that reuses the [row, column] argmax pairs
# computed earlier as the cycle instead of running a real cycle search.
cycle = index_pairs

In [27]:
# 4x4 toy score matrix; -inf marks absent arcs (all of column 0 and the diagonal).
# `np.inf` is used instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
scores = np.array([
    [-np.inf, 10, 5, 15],
    [-np.inf, -np.inf, 20, 15],
    [-np.inf, 25, -np.inf, 25],
    [-np.inf, 30, 10, -np.inf]
])
scores

array([[-inf,  10.,   5.,  15.],
       [-inf, -inf,  20.,  15.],
       [-inf,  25., -inf,  25.],
       [-inf,  30.,  10., -inf]])

In [11]:
# Two-node cycle example: rows 1 and 2 hold each other's highest column score (30 and 20).
# `np.inf` is used instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
scores = np.array([
    [-np.inf, 10, 15, 5],
    [-np.inf, -np.inf, 30, 10],
    [-np.inf, 20, -np.inf, 5],
    [-np.inf, 5, 10, -np.inf]
])
scores

array([[-inf,  10.,  15.,   5.],
       [-inf, -inf,  30.,  10.],
       [-inf,  20., -inf,   5.],
       [-inf,   5.,  10., -inf]])

In [12]:
# Additional test matrix; -inf marks absent arcs (all of column 0 and the diagonal).
# `np.inf` is used instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
scores = np.array([
    [-np.inf, 10, 4, 9],
    [-np.inf, -np.inf, 10, 3],
    [-np.inf, 12, -np.inf, 7],
    [-np.inf, 2, 6, -np.inf]
])
scores

array([[-inf,  10.,   4.,   9.],
       [-inf, -inf,  10.,   3.],
       [-inf,  12., -inf,   7.],
       [-inf,   2.,   6., -inf]])

In [29]:
# Score matrix used by the CLE cells below; -inf marks absent arcs (column 0 and the diagonal).
# `np.inf` is used instead of `np.Inf`: the capitalised alias was removed in NumPy 2.0.
scores = np.array([
    [-np.inf, 9, 10, 9],
    [-np.inf, -np.inf, 20, 3],
    [-np.inf, 30, -np.inf, 30],
    [-np.inf, 11, 0, -np.inf]
])
scores

array([[-inf,   9.,  10.,   9.],
       [-inf, -inf,  20.,   3.],
       [-inf,  30., -inf,  30.],
       [-inf,  11.,   0., -inf]])

In [69]:
# Find cycle
def find_cycle(graph):
    """Return the first cycle found by following ``graph[node]`` from each node.

    Args:
        graph: dict mapping a node to the single node it points to
            (in this notebook ``{column: best row}``).

    Returns:
        A NumPy array of ``[node, graph[node]]`` rows, one per node on the
        cycle and in traversal order, or ``None`` when no cycle exists.
    """
    for node in graph:
        visited = []
        visiting = node
        # The walk ends when it reaches a node without an entry in `graph`.
        while visiting in graph:
            if visiting in visited:
                # Keep only the nodes from the first occurrence of `visiting`
                # onwards; anything before it is a lead-in path, not the cycle.
                start = visited.index(visiting)
                return np.array([[v, graph[v]] for v in visited[start:]])
            visited.append(visiting)
            visiting = graph[visiting]

    return None

def contract(scores, cycle):
    """Return a copy of `scores` with the arcs of `cycle` collapsed.

    The input matrix is not modified. The steps are:

    1. Set every entry whose row AND column both belong to the cycle to -inf.
    2. For every row outside the cycle, add ``scores[col, cycle[col]]`` to each
       non-negative entry in a cycle column ``col``.
    3. In every row, keep only the largest entry among the cycle columns and
       set the other cycle columns to -inf.
    4. In every column, keep only the largest entry among the cycle rows and
       set the other cycle rows to -inf.

    Args:
        scores: 2-D float array of scores. The same node ids index rows and
            columns, so the matrix is assumed to be square.
        cycle: integer array of ``[node, successor]`` rows, as returned by
            `find_cycle`.

    Returns:
        A new array with the same shape as `scores`.
    """
    new_matrix = np.copy(scores)

    # Split the node ids into those on the cycle and those outside it.
    all_nodes = set(range(new_matrix.shape[0]))
    removed_nodes = set(cycle.flatten())
    nodes = all_nodes.difference(removed_nodes)

    # Remove the arcs that run between two cycle nodes.
    # (`np.inf`, not `np.Inf`: the capitalised alias was removed in NumPy 2.0.)
    removed_nodes = np.array(list(removed_nodes))
    new_matrix[np.ix_(removed_nodes, removed_nodes)] = -np.inf

    # Re-score the arcs from outside rows into cycle columns. Entries below zero
    # (including -inf) are left untouched.
    cycle_as_dict = {v[0]: v[1] for v in cycle}
    for node in nodes:
        for col in range(1, new_matrix.shape[1]):
            if col in cycle_as_dict and new_matrix[node, col] >= 0:
                new_matrix[node, col] += scores[col, cycle_as_dict[col]]

    # Per row: keep the best cycle column, blank the remaining cycle columns.
    for r in range(new_matrix.shape[0]):
        remove_nodes = list(removed_nodes)
        best = np.argmax(new_matrix[r, remove_nodes], axis=0)
        # Cycle node ids are unique, so popping by position removes the winner.
        remove_nodes.pop(best)
        new_matrix[r, remove_nodes] = -np.inf

    # Per column: keep the best cycle row, blank the remaining cycle rows.
    for c in range(new_matrix.shape[1]):
        remove_nodes = list(removed_nodes)
        best = np.argmax(new_matrix[remove_nodes, c], axis=0)
        remove_nodes.pop(best)
        new_matrix[remove_nodes, c] = -np.inf

    return new_matrix


def CLE(scores):
    """Recursive driver: pick best arcs, contract a cycle if one exists, repeat.

    Args:
        scores: 2-D score matrix as used by `contract`.

    Returns:
        When no cycle is found: a dict built from the argmax result, with the
        argmax of row 0 additionally mapped to 0 (the dict is also printed).
        When a cycle is found: a tuple ``(result of the recursive call, cycle)``.

    NOTE(review): the argmax below runs along axis=1, while the comment that
    accompanied it and the other cells of this notebook describe/use axis=0 --
    confirm which one is intended.
    NOTE(review): the two branches return different types (dict vs. tuple).
    """
    best_index = np.argmax(scores, axis=1)

    # Keep only pairs where neither the position nor the argmax is 0.
    max_graph = {}
    for position, best in enumerate(best_index):
        if position > 0 and best > 0:
            max_graph[position] = best

    cycle = find_cycle(max_graph)
    if cycle is not None:
        # Collapse the cycle and solve the smaller problem.
        contracted = contract(scores, cycle)
        return CLE(contracted), cycle

    # No cycle left: add the root score.
    max_graph[np.argmax(scores[0])] = 0
    print(max_graph)
    return max_graph



In [70]:
# Display the score matrix that is passed to CLE in the next cell.
scores

array([[-inf,   9.,  10.,   9.],
       [-inf, -inf,  20.,   3.],
       [-inf,  30., -inf,  30.],
       [-inf,  11.,   0., -inf]])

In [72]:
# Unpack the (result, cycle) tuple that CLE returns when it found a cycle.
# NOTE(review): without a cycle CLE returns a bare dict, so this unpacking depends on the input.
y, cycle = CLE(scores)

{2: 0, 3: 1}


In [73]:
# Display the first element returned by CLE.
y

{2: 0, 3: 1}

In [65]:
# NOTE(review): `results` is not assigned in any visible cell of this notebook; the saved
# output looks like the tuple returned by CLE(scores) -- confirm and assign it explicitly.
results

({2: 0, 3: 1},
 array([[1, 2],
        [2, 1]]))

In [14]:
# Two cycle
scores = np.array([
    [-np.Inf, 10, 15, 5],
    [-np.Inf, -np.Inf, 30, 10],
    [-np.Inf, 20, -np.Inf, 5],
    [-np.Inf, 5, 10, -np.Inf]
])
scores

array([[-inf,  10.,  15.,   5.],
       [-inf, -inf,  30.,  10.],
       [-inf,  20., -inf,   5.],
       [-inf,   5.,  10., -inf]])

In [17]:
# Run CLE on the two-cycle matrix defined in the previous cell.
# NOTE(review): the saved output contains printed matrices that the CLE definition shown in
# this notebook does not print -- presumably from an earlier version; re-run to refresh.
CLE(scores)

[[-inf  10.  15.   5.]
 [-inf -inf  30.  10.]
 [-inf  20. -inf   5.]
 [-inf   5.  10. -inf]]
[[-inf  10.  15.   5.]
 [-inf -inf  30.  10.]
 [-inf  20. -inf   5.]
 [-inf   5.  10. -inf]]


{2: 0, 1: 3}

In [16]:

# argmax along axis 0 gives, for every column, the ROW index holding the largest score.
max_column_index = scores.argmax(axis=0)
# Build {column: best row}, leaving out column 0 and any column whose best row is 0.
max_graph = {}
for column, best_row in enumerate(max_column_index):
    if column > 0 and best_row > 0:
        max_graph[column] = best_row
max_graph


{1: 2, 2: 1, 3: 1}

In [243]:
# One manual round of the procedure: find a cycle, contract it, then look for a new cycle.
cycle = find_cycle(max_graph)
print(cycle)
if cycle is not None:
    new_graph = contract(scores, cycle)
    print(new_graph)

    # For every column of the contracted matrix, the row index of its largest score.
    max_column_index = new_graph.argmax(axis=0)
    # {column: best row}, leaving out column 0 and columns whose best row is 0.
    max_graph = {}
    for column, best_row in enumerate(max_column_index):
        if column > 0 and best_row > 0:
            max_graph[column] = best_row

    print(max_graph)
    cycle = find_cycle(max_graph)
    print(cycle)

[[2 1]
 [1 2]]
[[-inf  40. -inf   5.]
 [-inf -inf -inf  10.]
 [-inf -inf -inf -inf]
 [-inf  35. -inf -inf]]
{3: 1}
None


In [219]:
# Display the current score matrix.
scores

array([[-inf,  10.,   5.,  15.],
       [-inf, -inf,  20.,  15.],
       [-inf,  25., -inf,  25.],
       [-inf,  30.,  10., -inf]])

In [237]:
def resolve_cycle(graph, cycle):
    """Unfinished stub: builds a lookup from the cycle pairs and returns None.

    Args:
        graph: currently unused.
        cycle: iterable of two-element pairs; each pair's element [1] becomes a
            key mapped to its element [0].
    """
    cycle_dict = dict((pair[1], pair[0]) for pair in cycle)
    return None

In [262]:
# Hand-written example: `y` holds the arcs chosen on the contracted graph,
# `cycle` holds the arcs of the contracted cycle.
y = [[1, 3], [0, 1]]
cycle = [[2, 1], [1, 2]]

# Lookup from the second element of each cycle pair to its first element.
cycle_dict = dict((pair[1], pair[0]) for pair in cycle)

for node in y:
    head = node[0]
    print(head)
    if head not in cycle_dict:
        continue
    # First arc of `y` whose head is in the lookup: drop the matching cycle pair and
    # merge the remaining cycle pairs with `y`, then stop.
    arc = [head, cycle_dict.pop(head)]
    cycle = [val for val in cycle + y if val != arc]
    print(cycle)
    break

1
[[2, 1], [1, 3], [0, 1]]


In [256]:
# List concatenation of `cycle` and `y`.
# NOTE(review): the saved output does not match the `y` assigned in the cell above
# (it contains [0, 3]) -- it appears to come from an earlier kernel state; re-run to refresh.
cycle + y

[[2, 1], [1, 2], [0, 3], [0, 1], [0, 3], [0, 1]]