In [1]:
import pandas as pd
import csv
import numpy as np
import xml.etree.ElementTree as ET

In [2]:
class UFDS:
    """Union-find (disjoint-set) structure over hashable items.

    Items are registered lazily: every public method first registers the
    items it is given, so callers never have to declare them up front.
    """

    def __init__(self):
        # Every item registered so far.
        self.nodes = set()
        # Maps each item to its parent; a representative is its own parent.
        self.parent = {}

    def init_id(self, *nodes):
        """Register each given item as its own one-element set if it is new."""
        for node in nodes:
            if node not in self.nodes:
                self.nodes.add(node)
                self.parent[node] = node

    def root(self, x):
        """Return the representative of the set containing ``x``.

        The lookup is iterative so that a long parent chain cannot exhaust
        the interpreter's recursion limit. Every item visited on the way up
        is re-pointed directly at the representative (path compression).
        """
        self.init_id(x)

        # First pass: walk up to the representative.
        top = x
        while self.parent[top] != top:
            top = self.parent[top]

        # Second pass: compress the path that was just walked.
        while x != top:
            following = self.parent[x]
            self.parent[x] = top
            x = following

        return top

    def gabung(self, x, y):
        """Merge the set containing ``x`` into the set containing ``y``.

        After the call, the representative of ``y``'s set is the
        representative of the combined set.
        """
        self.init_id(x, y)

        root_y = self.root(y)
        root_x = self.root(x)
        self.parent[root_x] = root_y

    def is_same(self, x, y):
        """Return True when ``x`` and ``y`` belong to the same set."""
        self.init_id(x, y)

        return self.root(x) == self.root(y)

In [4]:
# Parse the corpus XML from disk and keep a handle on its root element;
# the cells below iterate over the children of `root`.
data = ET.parse('./data.xml')
root = data.getroot()

In [5]:
# Disjoint-set structure that groups markable ids; it is filled in by the
# registration loop further down.
ufds = UFDS()
# Maps an integer markable id to the XML element that carries that id.
nodes = {}

In [6]:
# Count the markables and, in the same pass, merge each one with the
# markable named by its 'coref' attribute.

for sentence in root:
    for phrase in sentence:
        attrs = phrase.attrib

        if 'id' in attrs:
            markable_id = int(attrs['id'])
            ufds.init_id(markable_id)
            nodes[markable_id] = phrase

        if 'coref' in attrs:
            # Link this phrase into the set of the markable it refers to.
            ufds.gabung(int(attrs['id']), int(attrs['coref']))

print('banyak markable: %d' % len(ufds.nodes))

banyak markable: 7120


In [8]:
# Count singleton and non-singleton markables, and count the markables
# whose text begins with "nya".

kelompok = {}                    # representative id -> list of member ids
kelompok_non_singleton = set()   # representatives that have at least one other member
self_par = 0                     # markables that are their own representative
not_self_par = 0                 # markables represented by some other markable
nya = 0

for node in ufds.nodes:
    group_root = ufds.root(node)
    kelompok.setdefault(group_root, []).append(node)

    if group_root != node:
        not_self_par += 1
        kelompok_non_singleton.add(group_root)
    else:
        self_par += 1

    if nodes[node].text[:3] == 'nya':
        nya += 1

# Every group has exactly one representative, so subtracting the
# representatives of multi-member groups leaves the singleton count.
non_singleton_groups = len(kelompok_non_singleton)
print('banyak kelompok non-singleton: %d' % non_singleton_groups)
print('banyak singleton: %d' % (self_par - non_singleton_groups))
print('banyak non-singleton: %d' % (not_self_par + non_singleton_groups))
print('banyak "nya": %d' % nya)

banyak kelompok non-singleton: 307
banyak singleton: 6148
banyak non-singleton: 972
banyak "nya": 457


In [9]:
# Size of the largest coreference group (0 when there are no groups).
maks = max((len(members) for members in kelompok.values()), default=0)

print('Kelompok terbesar: %d' % maks)

Kelompok terbesar: 18


In [10]:
def is_singleton(node):
    """Return True when ``node`` is the only member of its group in ``kelompok``."""
    members = kelompok[ufds.root(node)]
    return len(members) == 1

In [11]:
# Count training pairs. Each pair of consecutive members of a group is one
# positive example; every integer id strictly between the two members adds
# to the negative counts.
positive = 0
negative_with_singleton = 0
negative_without_singleton = 0

for par in kelompok:
    # The member lists were filled while iterating a set, whose order is not
    # guaranteed. Sort a local copy so that "consecutive members" and the id
    # range between them are well defined.
    members = sorted(kelompok[par])

    for i in range(len(members) - 1):
        positive += 1

        # NOTE(review): this assumes every integer id between two members is
        # an existing markable; confirm that the ids in the corpus have no gaps.
        for j in range(members[i] + 1, members[i + 1]):
            # Each intervening id counts twice, presumably one pair with each
            # of the two surrounding members; confirm the intended weighting.
            negative_with_singleton += 2
            if not is_singleton(j):
                negative_without_singleton += 2

print('data latih positif: %d' % positive)
print('data latih negatif, singleton termasuk: %d' % negative_with_singleton)
print('data latih negatif, singleton tidak termasuk: %d' % negative_without_singleton)

data latih positif: 665
data latih negatif, singleton termasuk: 6548
data latih negatif, singleton tidak termasuk: 1112
