In [9]:
import os
from os import listdir
from os.path import isfile, join
# Directory that is scanned for input files.
mypath = './hievents_v2/'
# Keep only directory entries that are regular files (sub-directories are skipped).
onlyfiles = list(filter(lambda name: isfile(join(mypath, name)), listdir(mypath)))
# Report how many files were found.
print(len(onlyfiles))

100


In [10]:
from collections import defaultdict 
#Class to represent a graph 
class Graph:
    """All-pairs shortest-path helper for a dense ``V x V`` cost matrix."""

    def __init__(self, vertices):
        """Remember the number of vertices, i.e. the matrix dimension.

        Args:
            vertices: number of rows/columns ``transitiveClosure`` iterates over.
        """
        self.V = vertices

    # Floyd Warshall algorithm
    def transitiveClosure(self, graph):
        """Return the min-plus closure (shortest-path costs) of ``graph``.

        Despite the name, this is not a boolean reachability closure: each
        cell of the result holds the smallest sum of edge costs over any path
        from ``i`` to ``j``, or the original "no edge" sentinel when every
        path is at least that expensive.

        Args:
            graph: square matrix (list of lists or 2-D NumPy array) with at
                least ``self.V`` rows and columns; ``graph[i][j]`` is the
                cost of the direct edge ``i -> j``.

        Returns:
            A list of rows with the same contents layout as ``graph``,
            addressable as ``result[i][j]``. The input matrix is never
            modified.
        """
        # Copy every row explicitly. Slicing (`row[:]`) only copies Python
        # lists; for NumPy rows it yields a view, which would let the updates
        # below silently overwrite the caller's matrix.
        reach = [row.copy() if hasattr(row, "copy") else list(row) for row in graph]

        size = self.V
        # Invariant: before iteration k, reach[i][j] is the cheapest path from
        # i to j using only vertices {0, .., k-1} as intermediates; after it,
        # vertex k is allowed as well.
        for k in range(size):
            via_row = reach[k]
            # Pick all vertices as source one by one
            for i in range(size):
                src_row = reach[i]
                # Pick all vertices as destination for the picked source
                for j in range(size):
                    # Going through k is taken whenever it is cheaper than
                    # the best route known so far.
                    src_row[j] = min(src_row[j], src_row[k] + via_row[j])
        return reach

In [11]:
import xml.etree.ElementTree as ET
import tqdm
import numpy as np

# Running totals of the labels handed out by rel(), accumulated over all files:
#   num_PC -> "SuperSub", num_CP -> "SubSuper", num_CO -> "Coref", num_NO -> "NoRel"
num_PC = num_CP = num_CO = num_NO = 0

def rel(i, j, graph, num_CO, num_PC, num_CP, num_NO):
    """Label the pair ``(i, j)`` from its path costs and bump one counter.

    The checks are applied in this order; the first that matches wins:

    * cost below 100 in either direction    -> ``"Coref"``    (``num_CO`` + 1)
    * ``graph[i][j]`` below 10000           -> ``"SuperSub"`` (``num_PC`` + 1)
    * ``graph[j][i]`` below 10000           -> ``"SubSuper"`` (``num_CP`` + 1)
    * otherwise                             -> ``"NoRel"``    (``num_NO`` + 1)

    Args:
        i, j: row/column indices into ``graph``.
        graph: matrix indexable as ``graph[a][b]`` holding comparable costs.
        num_CO, num_PC, num_CP, num_NO: current counter values.

    Returns:
        ``(label, num_CO, num_PC, num_CP, num_NO)`` with exactly one of the
        counters increased by one.
    """
    forward_cost = graph[i][j]

    if forward_cost < 100 or graph[j][i] < 100:
        return "Coref", num_CO + 1, num_PC, num_CP, num_NO

    if forward_cost < 10000:
        return "SuperSub", num_CO, num_PC + 1, num_CP, num_NO

    if graph[j][i] < 10000:
        return "SubSuper", num_CO, num_PC, num_CP + 1, num_NO

    return "NoRel", num_CO, num_PC, num_CP, num_NO + 1
    
# Convert every file listed in `onlyfiles` into a tab-separated `.tsvx` file
# under `<mypath>/processed/`, tallying the emitted relation labels on the way.
out_dir = join(mypath, 'processed')
# Create the output directory up front; without it the first open() below
# raises FileNotFoundError on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

for fname in tqdm.tqdm(onlyfiles):
    tree = ET.parse(join(mypath, fname))
    root = tree.getroot()
    out_path = join(out_dir, fname.replace('.xml', '.tsvx'))
    with open(out_path, 'w', encoding='utf8') as fp:
        # Pass 1: copy the text and the events, remembering each event by id.
        event_dict = {}
        for child in root:
            if child.tag == 'Text':
                fp.write('Text\t' + child.text + '\n')
            elif child.tag == 'Events':
                for event in child:
                    # Children are read by position; the dict keys below show
                    # how positions 1..3 are interpreted, position 0 is the key.
                    event_id, anchor, event_type, position = (event[k].text for k in range(4))
                    event_dict[event_id] = {"AnchorText": anchor, "Type": event_type, "Position": position}
                    fp.write('\t'.join(('Event', event_id, anchor, event_type, position)) + '\n')

        num_event = len(event_dict)
        # One extra row/column: the pair loop below addresses events 1..num_event,
        # so index 0 is unused padding.
        g = Graph(num_event + 1)
        # 10000 marks "no edge"; rel() treats any cost >= 10000 as unrelated.
        graph = np.full((num_event + 1, num_event + 1), 10000.0)

        # Pass 2: annotated relations become edge costs.
        for child in root:
            if child.tag == "Relations":
                for relation_info in child:
                    label = relation_info[2].text
                    if label == "Coref":
                        # Coreference is symmetric and cheap (cost 1 both ways).
                        src, dst = int(relation_info[0].text), int(relation_info[1].text)
                        graph[src][dst] = 1
                        graph[dst][src] = 1
                    elif label == "SuperSub":
                        # Parent -> child is directed and costs 100.
                        src, dst = int(relation_info[0].text), int(relation_info[1].text)
                        graph[src][dst] = 100
                    # Any other relation type is ignored.

        # Propagate the relations along paths (shortest-path costs).
        graph = g.transitiveClosure(graph)

        # Emit one line per unordered event pair (i < j by construction).
        for i in range(1, num_event):
            for j in range(i + 1, num_event + 1):
                relation, num_CO, num_PC, num_CP, num_NO = rel(i, j, graph, num_CO, num_PC, num_CP, num_NO)
                fp.write('\t'.join((
                    'Relation',
                    str(i),
                    str(j),
                    relation,
                    'true',
                    event_dict[str(i)]["AnchorText"],
                    event_dict[str(j)]["AnchorText"],
                )) + '\n')

print("num_PC:", num_PC)
print("num_CP:", num_CP)
print("num_CO:", num_CO)
print("num_NO:", num_NO)

100%|██████████| 100/100 [00:05<00:00, 17.19it/s]

num_PC: 1802
num_CP: 1846
num_CO: 758
num_NO: 63755



