# Unsupervised Lexical-Sample Word Sense Disambiguation (WSD) Dataset Builder from Indonesian Wikipedia

In [1]:
import pandas as pd
import sys
import string
from preprocessor import clean_word, stemmer, pipe
from functools import reduce

In [2]:
def is_real_sentence(s):
    """Return True when *s* looks like running prose rather than markup.

    A candidate is accepted only if it is longer than 10 characters, starts
    with an ASCII letter, and contains none of ``|``, ``=``, ``:`` or the
    substring ``html``.
    """
    # The length test comes first so that s[0] is never evaluated on an
    # empty string.
    if len(s) <= 10 or s[0] not in string.ascii_letters:
        return False
    return not any(marker in s for marker in ('|', '=', 'html', ':'))

# Whether stemmer.stem belongs in the pipe below depends on the circumstances
# (it is currently left out).
def is_containing_exactly_one_chosen_word(sentence, chosen):
    """Return True when exactly one normalized token of *sentence* equals *chosen*.

    The sentence is split on whitespace and every token is passed through
    ``pipe(clean_word)`` before being compared with *chosen*.
    """
    # NOTE(review): pipe() is presumably a function composer from the
    # project's preprocessor module - confirm.
    normalize = pipe(clean_word)
    occurrences = 0
    for token in sentence.split():
        if normalize(token) == chosen:
            occurrences += 1
    return occurrences == 1

def get_discourse_id(s):
    """Extract the text following a single ``<id>`` tag in *s*.

    Returns None unless the stripped string contains exactly one ``<id>``
    occurrence. Otherwise returns what follows that tag with ``</id>`` and
    the ``'\\n.'`` sequence removed.
    """
    parts = s.strip().split('<id>')
    if len(parts) != 2:
        return None
    candidate = parts[1]
    # Drop the closing tag, then the newline-plus-period tail that the
    # caller's sentence splitting appends.
    for junk in ('</id>', '\n.'):
        candidate = candidate.replace(junk, '')
    return candidate

# Build Dataset

In [3]:
# Load the whole dump into memory as a list of lines. The context manager
# closes the file handle as soon as the read finishes instead of leaking it.
# NOTE(review): the encoding is pinned to UTF-8 on the assumption that this is
# a standard MediaWiki XML dump - confirm. Without it, decoding depends on
# the platform's locale.
with open('../idwiki-latest-pages-articles-full.xml', 'r', encoding='utf-8') as dump_file:
    raw = dump_file.readlines()

In [4]:
def _strip_wiki_markup(sentence):
    """Remove link brackets, quote-run emphasis markers, newlines and ``&quot;``."""
    # The three-quote marker must be removed before the two-quote marker:
    # otherwise the two-quote pass eats part of every three-quote run and
    # leaves a stray apostrophe behind.
    for token in ('[[', ']]', '\n', "'''", "''", '&quot;'):
        sentence = sentence.replace(token, '')
    return sentence


def execute():
    """Collect sentences containing the chosen word and write them to CSV.

    Reads three module-level names: ``raw`` (the dump as a list of lines),
    ``chosen_word`` (the target word) and ``TARGET_SIZE`` (collection stops
    once at least this many sentences have been gathered; the check happens
    once per input line, so the final count may slightly exceed it).

    Each line is split on ``'.'`` and a period is appended to every piece.
    A piece is kept when it passes ``is_real_sentence`` and contains the
    chosen word exactly once. Kept sentences are stripped of wiki markup and
    stored with the most recently seen discourse id.

    The result is written to ``../unsupervised dataset/<chosen_word>.csv``
    with columns ``kata``, ``discourse_id`` and ``kalimat``. Progress is
    reported on stdout. Returns None.
    """
    current_discourse = None
    kalimat = []
    discourse = []
    kata = []
    valid_discourse_id = False
    WIKI_LEN = len(raw)
    # Inputs shorter than 500 lines would make WIKI_LEN // 500 zero and the
    # modulo below raise ZeroDivisionError, so clamp the step to at least 1.
    progress_step = max(1, WIKI_LEN // 500)

    for i, line in enumerate(raw, start=1):
        for piece in line.split('.'):
            s = piece + '.'
            if '<title>' in s:
                # A new title arms the flag so that the next <id> is accepted.
                valid_discourse_id = True
                continue
            next_discourse = get_discourse_id(s)
            if next_discourse and valid_discourse_id:
                # Only the first <id> after a title is taken; later ones are
                # ignored until the next title.
                # NOTE(review): presumably this selects the page id over
                # other ids in the dump - confirm against the dump format.
                current_discourse = next_discourse
                valid_discourse_id = False
                continue
            if not is_real_sentence(s):
                continue
            if is_containing_exactly_one_chosen_word(s, chosen_word):
                kalimat.append(_strip_wiki_markup(s))
                kata.append(chosen_word)
                discourse.append(current_discourse)
        if len(kalimat) >= TARGET_SIZE:
            print('\ntarget achieved')
            break
        if i % progress_step == 0:
            sys.stdout.write("\rRaw read: {0:.1f} % | Instance collected: {1}".format(i/WIKI_LEN*100, len(kalimat)))
            sys.stdout.flush()

    dataset = pd.DataFrame({
        'kata': kata,
        'discourse_id': discourse,
        'kalimat': kalimat,
    })
    dataset.to_csv('../unsupervised dataset/{}.csv'.format(chosen_word), index=False)


In [5]:
# Whitespace-separated list of the target words; line breaks and trailing
# spaces are irrelevant because the text is split on any whitespace below.
_ambiguous_words_text = '''cabang cerah coklat 
dalam dasar dunia halaman harapan 
jalan jam jaringan kabur kaki 
kali kepala ketat kulit kunci 
layar lebat lingkungan mata membawa 
memecahkan menangkap mendorong menerima mengandung 
mengejar mengeluarkan mengikat mengisi menjaga 
menurunkan menyusun nilai panas pembagian 
rapat sarung tengah tinggi '''
ambiguous_words = _ambiguous_words_text.split()

In [None]:
# Upper bound on sentences collected per word; execute() reads this global.
TARGET_SIZE = 10000
for w in ambiguous_words:
    # execute() takes no arguments and reads the module-level chosen_word,
    # so the target has to be published there before every call.
    chosen_word = w
    # One blank line followed by the word, as a header for the progress output.
    print('\n' + w)
    execute()


cabang
Raw read: 100.0 % | Instance collected: 1015
cerah
Raw read: 100.0 % | Instance collected: 130
coklat
Raw read: 100.0 % | Instance collected: 268
dalam
Raw read: 4.4 % | Instance collected: 9967
target achieved

dasar
Raw read: 100.0 % | Instance collected: 3643
dunia
Raw read: 91.6 % | Instance collected: 9988
target achieved

halaman
Raw read: 1.0 % | Instance collected: 53

In [12]:
# Report how many rows of the dataset belong to each obtained word.
for k in obtained_words:
    rows_for_word = dataset.query('kata == "{}"'.format(k))
    print(k, len(rows_for_word))

asing 400
cabang 400
membawa 400
harapan 400
kabur 131
jam 400
tinggi 400
menjaga 373
halaman 400
memecahkan 91
mengejar 193
kunci 317
layar 400
mendorong 329
menyusun 219
rapat 212
buah 400
lebat 72
mata 400
kaki 400
panas 400
cerah 64
pembagian 400
jaringan 400
kali 400
jalan 400
baru 400
ketat 130
tengah 400
mengandung 400
bintang 400
menangkap 301
bulan 400
dasar 400
kepala 400
berat 400
kulit 400
dalam 400
sarung 54
coklat 159
lingkungan 400
menurunkan 198
mengisi 310
bidang 400
mengeluarkan 400
badan 400
bisa 400
dunia 400
bunga 400
atas 400
besar 400
mengikat 117
nilai 400
menerima 400


In [16]:
# Persist the dataset to a single CSV file.
# NOTE(review): unlike the per-word CSVs written by execute(), this call does
# not pass index=False, so the index is written as an extra first column -
# confirm that this is intended.
dataset.to_csv('../wikipedia_no_annotator.csv')