In [37]:
import pandas as pd
import csv
import numpy as np
import xml.etree.ElementTree as ET

In [3]:
class UFDS:
    """Union-find (disjoint-set) structure over arbitrary hashable ids.

    Ids are registered lazily: every public method first makes sure the ids
    it receives exist as singleton sets, so callers never need to declare
    them up front.
    """

    def __init__(self):
        # Every id seen so far; exposed because callers iterate over it.
        self.nodes = set()
        # Maps each id to its parent id; a representative maps to itself.
        self.parent = {}

    def init_id(self, *nodes):
        """Register each given id as its own singleton set if it is new."""
        for node in nodes:
            if node not in self.nodes:
                self.nodes.add(node)
                self.parent[node] = node

    def root(self, x):
        """Return the representative of the set containing ``x``.

        The lookup is iterative so that a long parent chain cannot exhaust
        the interpreter's recursion limit. Every id visited on the way up
        is re-pointed directly at the representative (path compression).
        """
        self.init_id(x)

        # First pass: climb to the representative.
        top = x
        while self.parent[top] != top:
            top = self.parent[top]

        # Second pass: compress the path that was just walked.
        while x != top:
            next_node = self.parent[x]
            self.parent[x] = top
            x = next_node

        return top

    def gabung(self, x, y):
        """Merge the set containing ``x`` into the set containing ``y``.

        After the call the representative of ``y``'s set is the
        representative of the merged set.
        """
        self.init_id(x, y)

        root_y = self.root(y)
        self.parent[self.root(x)] = root_y

    def is_same(self, x, y):
        """Return True when ``x`` and ``y`` belong to the same set."""
        self.init_id(x, y)

        return self.root(x) == self.root(y)

In [4]:
# Parse the XML file from the working directory and keep its top-level
# element; later cells iterate over `root` to reach the phrase elements.
data = ET.parse('./data.xml')
root = data.getroot()

In [6]:
# Shared state filled in by the corpus walk in the next cell.
ufds = UFDS()              # union-find linking ids joined by a 'coref' attribute
nodes = {}                 # integer id -> phrase element carrying that id
phrases = []               # every phrase element, in traversal order
phrase_id_by_node_id = {}  # integer id -> index of that element in `phrases`

In [7]:
# Walk every phrase of every sentence, recording it in traversal order and
# indexing the ones that carry an id; coref links are fed to the union-find.
for sentence in root:
    for phrase in sentence:
        attrs = phrase.attrib
        phrase_index = len(phrases)  # position this phrase is about to occupy
        phrases.append(phrase)

        if 'id' in attrs:
            node_id = int(attrs['id'])
            ufds.init_id(node_id)
            nodes[node_id] = phrase
            phrase_id_by_node_id[node_id] = phrase_index

        if 'coref' in attrs:
            # A coref attribute without an id raises KeyError here.
            ufds.gabung(int(attrs['id']), int(attrs['coref']))

In [8]:
# Bucket every known id under the representative of its set:
# representative id -> list of member ids.
kelompok = {}

for node in ufds.nodes:
    kelompok.setdefault(ufds.root(node), []).append(node)

def is_singleton(node):
    """Return True when ``node`` is the only member of its group in ``kelompok``."""
    return len(kelompok[ufds.root(node)]) == 1

In [9]:
def get_pos_tag(word):
    """Return the part of ``word`` after its last backslash.

    Tokens look like ``pesta\\NN``; a token with no backslash is returned
    unchanged.
    """
    _, _, tag = word.rpartition('\\')
    return tag

In [32]:
def get_previous_words(phrase_id, n, phrase_list=None):
    """Return up to ``n`` words ending at phrase ``phrase_id``, in reading order.

    Words are gathered from phrase ``phrase_id`` backwards through earlier
    phrases until ``n`` words are collected or the start of the list is
    reached.

    Parameters:
        phrase_id: index of the last phrase to draw words from; a negative
            index yields an empty list.
        n: maximum number of words to return; zero or less yields an empty
            list.
        phrase_list: sequence of objects with a ``text`` attribute. Defaults
            to the notebook-level ``phrases`` list.

    Returns:
        A list of whitespace-separated tokens, oldest first.
    """
    if phrase_list is None:
        phrase_list = phrases

    collected = []
    remaining = n
    index = phrase_id

    while index >= 0 and remaining > 0:
        # An element without text has ``text`` set to None; treat it as empty.
        tokens = (phrase_list[index].text or '').split()

        if len(tokens) > remaining:
            # Only the tail of this phrase is needed to fill the window.
            collected = tokens[-remaining:] + collected
            break

        collected = tokens + collected
        remaining -= len(tokens)
        index -= 1

    return collected

def get_next_words(phrase_id, n, phrase_list=None):
    """Return up to ``n`` words starting at phrase ``phrase_id``, in reading order.

    Words are gathered from phrase ``phrase_id`` forwards through later
    phrases until ``n`` words are collected or the end of the list is
    reached.

    Parameters:
        phrase_id: index of the first phrase to draw words from; an index at
            or past the end of the list yields an empty list.
        n: maximum number of words to return; zero or less yields an empty
            list.
        phrase_list: sequence of objects with a ``text`` attribute. Defaults
            to the notebook-level ``phrases`` list.

    Returns:
        A list of whitespace-separated tokens, nearest first.
    """
    if phrase_list is None:
        phrase_list = phrases

    collected = []
    remaining = n
    index = phrase_id

    while index < len(phrase_list) and remaining > 0:
        # An element without text has ``text`` set to None; treat it as empty.
        tokens = (phrase_list[index].text or '').split()

        # Take at most what is still missing; overshooting ends the loop.
        collected.extend(tokens[:remaining])
        remaining -= len(tokens)
        index += 1

    return collected

In [33]:
def _markable_features(node):
    """Build the feature record for one id-carrying phrase element.

    Features not implemented yet: nominal, num_modifiers, is_pronoun.
    """
    node_id = int(node.attrib['id'])
    tokens = node.text.split()

    features = {
        'id': node_id,
        'text': node.text,
        'num_words': len(tokens),
        'first_pos_tag': get_pos_tag(tokens[0]),
        'entity': node.attrib['ne'],
        'is_singleton': is_singleton(node_id),
    }

    # Context windows of up to 10 words on either side, excluding the phrase itself.
    position = phrase_id_by_node_id[node_id]
    features['previous_words'] = get_previous_words(position - 1, 10)
    features['next_words'] = get_next_words(position + 1, 10)
    return features


markables = [_markable_features(node) for node in nodes.values()]

In [34]:
# Spot-check a small slice of the extracted records rather than the full list.
markables[1:10]

[{'id': 2,
  'text': 'pesta\\NN olahraga\\NN',
  'num_words': 2,
  'first_pos_tag': 'NN',
  'entity': 'OTHER',
  'is_singleton': True,
  'previous_words': ['Kera\\NN', 'untuk\\SC', 'amankan\\VB'],
  'next_words': ['Pemerintah\\NNP',
   'kota\\NNP',
   'Delhi\\NNP',
   'mengerahkan\\VB',
   'monyet\\NN',
   'untuk\\SC',
   'mengusir\\VB',
   'monyet-monyet\\NN',
   'lain\\JJ',
   'yang\\SC']},
 {'id': 3,
  'text': 'Pemerintah\\NNP kota\\NNP Delhi\\NNP',
  'num_words': 3,
  'first_pos_tag': 'NNP',
  'entity': 'TITLE|LOCATION',
  'is_singleton': False,
  'previous_words': ['Kera\\NN',
   'untuk\\SC',
   'amankan\\VB',
   'pesta\\NN',
   'olahraga\\NN'],
  'next_words': ['mengerahkan\\VB',
   'monyet\\NN',
   'untuk\\SC',
   'mengusir\\VB',
   'monyet-monyet\\NN',
   'lain\\JJ',
   'yang\\SC',
   'berbadan\\VB',
   'lebih\\RB',
   'kecil\\JJ']},
 {'id': 4,
  'text': 'monyet\\NN',
  'num_words': 1,
  'first_pos_tag': 'NN',
  'entity': 'OTHER',
  'is_singleton': True,
  'previous_words': ['K

In [41]:
# Text of every member, grouped by the sets recorded in `kelompok`.
zzz = [
    [nodes[member].text for member in group]
    for group in kelompok.values()
]

In [38]:
# Export one CSV row per markable, with the header taken from the first record.
# The csv module expects files opened with newline='' so that it alone controls
# line endings; without it, platforms that translate '\n' write a blank row
# after every record.
# Note: list-valued fields (previous_words, next_words) are written as their
# str() representation.
with open('markables.csv', 'w', newline='') as f:
    csvfile = csv.DictWriter(f, fieldnames=markables[0].keys())
    csvfile.writeheader()
    csvfile.writerows(markables)