In [2]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
# Load the single-, double- and triple-annotator files in one pass; the
# unpacking order matches the order of the paths below.
single_agree, double_agree, triple_agree = map(
    pd.read_csv,
    (
        '../single_annotator.csv',
        '../double_annotator_agree.csv',
        '../triple_annotator_agree.csv',
    ),
)

In [3]:
# Load the files holding sentences on which two / three annotators disagreed.
double_disagree, triple_disagree = map(
    pd.read_csv,
    (
        '../double_annotator_disagree.csv',
        '../triple_annotator_disagree.csv',
    ),
)

In [4]:
# Preview the first rows of the single-annotator frame to check its columns.
single_agree.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Summarize

In [5]:
# Number of distinct target words ('kata') across all five annotation files.
len(set().union(
    single_agree.kata,
    double_agree.kata,
    triple_agree.kata,
    double_disagree.kata,
    triple_disagree.kata,
))

54

In [6]:
# Number of distinct sentence ids across all five annotation files.
len(
    set(single_agree.kalimat_id)
    | set(double_agree.kalimat_id)
    | set(triple_agree.kalimat_id)
    | set(double_disagree.kalimat_id)
    | set(triple_disagree.kalimat_id)
)

10314

In [7]:
# Number of distinct sentence ids in the three agreement files only.
len({
    kalimat_id
    for frame in (single_agree, double_agree, triple_agree)
    for kalimat_id in frame.kalimat_id
})

8998

However, there are only 8992 annotated sentences in the dataset without disagreement between annotators.

# Merge the datasets without annotator conflicts

In [43]:
# Concatenate each column of the three agreement frames. Every list uses the
# same frame order (single, double, triple) so the four lists stay row-aligned.
dataset_kalimat_id = (
    list(single_agree.kalimat_id) + list(double_agree.kalimat_id) + list(triple_agree.kalimat_id)
)
dataset_kata = (
    list(single_agree.kata) + list(double_agree.kata) + list(triple_agree.kata)
)
dataset_sense = (
    list(single_agree.sense) + list(double_agree.sense) + list(triple_agree.sense)
)
dataset_kalimat = (
    list(single_agree.kalimat) + list(double_agree.kalimat) + list(triple_agree.kalimat)
)

# Test Dataset

In [3]:
# Load the test split, then rename its 'word' column to 'kata', the column
# name the rest of this notebook reads.
dataset = pd.read_csv('../testing_data.csv')
dataset = dataset.rename(columns={'word': 'kata'})

In [4]:
# Preview the first rows of the freshly loaded test split.
dataset.head()

Unnamed: 0,id,kata,kalimat
0,13,asing,"Para pecinta film indonesia atau tv, pasti tak..."
1,19,asing,Pasti telinga kita merasa asing dan aneh mende...
2,41,asing,Warga negara asing atau warga negara Persemakm...
3,44,asing,"Selama lima belas tahun memerintah, Sultan Mah..."
4,121,asing,Yang kemudian diikuti dengan donat-donat waral...


# Only WSD Tasks

In [44]:
# Keep only rows whose sense label does not end in one of the characters
# a, b, c, d or x.
# NOTE(review): those suffixes presumably mark non-WSD annotations — confirm
# against the annotation guideline.
kept_rows = [
    (kata, sense, kalimat)
    for sense, kata, kalimat in zip(dataset_sense, dataset_kata, dataset_kalimat)
    if str(sense)[-1] not in 'abcdx'
]

wsd_dataset_kata = [row[0] for row in kept_rows]
wsd_dataset_sense = [row[1] for row in kept_rows]
wsd_dataset_kalimat = [row[2] for row in kept_rows]

# Rebuild `dataset` from the filtered columns (this replaces any earlier frame
# bound to the same name).
dataset = pd.DataFrame({
    'kata': wsd_dataset_kata,
    'sense': wsd_dataset_sense,
    'kalimat': wsd_dataset_kalimat
})

dataset.head()

Unnamed: 0,kata,sense,kalimat
0,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


In [45]:
# Number of rows currently in `dataset`.
len(dataset)

8407

# POS Tagging

In [5]:
import nltk
from preprocessor import pipe

In [6]:
# CRF-based POS tagger; its model is read from the file named below.
ct = nltk.tag.CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

# Distinct target words present in the dataset.
annotated_words = set(dataset['kata'])

In [7]:
# Per-sentence tagging pipeline: tokenize, wrap the token list in a
# one-element batch for `tag_sents`, then unwrap the single tagged sentence.
tag_sentence = pipe(
    nltk.word_tokenize,
    lambda tokens: [tokens],
    ct.tag_sents,
    lambda pos_tags_sentence: pos_tags_sentence[0]
)

dataset['pos_tags'] = [tag_sentence(kalimat) for kalimat in dataset.kalimat]

In [8]:
# Inspect the (token, tag) pairs produced for the first sentence.
dataset.iloc[0].pos_tags

[('Para', 'DT'),
 ('pecinta', 'NN'),
 ('film', 'NN'),
 ('indonesia', 'NN'),
 ('atau', 'CC'),
 ('tv', 'NNP'),
 (',', 'Z'),
 ('pasti', 'RB'),
 ('tak', 'NEG'),
 ('asing', 'JJ'),
 ('mendengar', 'VB'),
 ('namanya', 'RB'),
 ('.', 'Z')]

# Begin data cleaning

In [9]:
# Placeholder column for the cleaned sentence. A scalar is broadcast to every
# row, so the result does not depend on the frame's index labels (an
# intermediate Series would be aligned on the index and could yield NaN).
dataset['clean'] = ''

In [10]:
# Preview the frame to confirm the new 'clean' column is present.
dataset.head()

Unnamed: 0,id,kata,kalimat,pos_tags,clean
0,13,asing,"Para pecinta film indonesia atau tv, pasti tak...","[(Para, DT), (pecinta, NN), (film, NN), (indon...",
1,19,asing,Pasti telinga kita merasa asing dan aneh mende...,"[(Pasti, NN), (telinga, NN), (kita, PRP), (mer...",
2,41,asing,Warga negara asing atau warga negara Persemakm...,"[(Warga, NN), (negara, NN), (asing, JJ), (atau...",
3,44,asing,"Selama lima belas tahun memerintah, Sultan Mah...","[(Selama, IN), (lima, CD), (belas, NN), (tahun...",
4,121,asing,Yang kemudian diikuti dengan donat-donat waral...,"[(Yang, DT), (kemudian, CC), (diikuti, VB), (d...",


## Execute cleaning

In [11]:
import time
from preprocessor import (
    normalize_money, normalize_number, remove_punctuation,
    create_stemmer, create_stop_words_remover, stemmer,
    clean_word, normalize_weekday, normalize_month, remove_normalized,
    normalize_personal_pronoun, normalize_demonstrative_pronoun, normalize_coordinating_conjunction,
    normalize_determiner, normalize_preposition, create_obvious_verb_normalizer
)

In [12]:
# Exceptions that must not be stemmed because the stemmer mishandles them:
# a few hand-picked word forms plus the stemmed form of every annotated word.
exception_words = (
    {'senilai', 'bernilai', 'menilainya', 'dinilainya'}
    | {stemmer.stem(word) for word in annotated_words}
)

In [13]:
# Normalise and stem every sentence, timing the whole pass.
# NOTE: the result overwrites the 'kalimat' column, so re-running this cell
# feeds already-processed text back through the pipeline.
begin = time.perf_counter()
stem_sentence = pipe(
    remove_punctuation,
    normalize_money,
    normalize_number,
    normalize_weekday,
    normalize_month,
    create_stemmer(annotated_words, exception_words),
)
dataset['kalimat'] = pd.Series([stem_sentence(kalimat) for kalimat in dataset.kalimat])
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 31.10038994899878


In [14]:
# Spot-check one processed sentence.
# NOTE(review): row 4244 looks like an arbitrary pick — confirm why it was chosen.
dataset.iloc[4244].kalimat

'bagaimana jaringan nordsud hilang diri dari saing dengan dekorasi stasiun yang kualitas tinggi'

In [15]:
# Derive the 'clean' column from the processed 'kalimat' text by running the
# normalizers, `remove_normalized` and the stop-word remover in this order,
# timing the whole pass.
begin = time.perf_counter()
clean_sentence = pipe(
    create_obvious_verb_normalizer(annotated_words, exception_words),
    normalize_personal_pronoun,
    normalize_demonstrative_pronoun,
    normalize_coordinating_conjunction,
    normalize_determiner,
    normalize_preposition,
    remove_normalized,
    create_stop_words_remover(annotated_words, exception_words),
)
dataset['clean'] = pd.Series([clean_sentence(kalimat) for kalimat in dataset.kalimat])
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 3.3653268730013224


In [16]:
# Preview the frame now that 'kalimat' and 'clean' have been filled in.
dataset.head()

Unnamed: 0,id,kata,kalimat,pos_tags,clean
0,13,asing,para cinta film indonesia atau tv pasti tak as...,"[(Para, DT), (pecinta, NN), (film, NN), (indon...",cinta film indonesia tv asing dengar nama
1,19,asing,pasti telinga kita rasa asing dan aneh dengar ...,"[(Pasti, NN), (telinga, NN), (kita, PRP), (mer...",telinga asing aneh dengar menu masakan soto ke...
2,41,asing,warga negara asing atau warga negara makmur ya...,"[(Warga, NN), (negara, NN), (asing, JJ), (atau...",warga negara asing warga negara makmur kepala ...
3,44,asing,lama somenumber tahun perintah sultan mahmud j...,"[(Selama, IN), (lima, CD), (belas, NN), (tahun...",perintah sultan mahmud jalin kerja asing belan...
4,121,asing,yang kemudian ikut dengan donatdonat waralaba ...,"[(Yang, DT), (kemudian, CC), (diikuti, VB), (d...",donatdonat waralaba asing master ring master d...


# Record the position of the target word

In [17]:
# Position of the target word in each cleaned sentence: the index of the first
# token that contains the target as a substring or shares its stem.

# Stem every distinct target once up front instead of once per token comparison.
target_stems = {word: stemmer.stem(word) for word in set(dataset['kata'])}
targetpos = []

for i, (target, clean, kalimat) in enumerate(
        zip(dataset['kata'], dataset['clean'], dataset['kalimat'])):
    tokens = clean.split()
    target_stem = target_stems[target]
    position = next(
        (
            j for j, token in enumerate(tokens)
            if target in token or stemmer.stem(token) == target_stem
        ),
        -1,
    )
    if position == -1:
        # Fail loudly, carrying everything needed to debug the offending row.
        raise ValueError(
            f'target word {target!r} not found in cleaned tokens of row {i}: '
            f'tokens={tokens!r}, kalimat={kalimat!r}'
        )
    targetpos.append(position)

# Assign the plain list so values are matched by row position, not index label.
dataset['targetpos_clean'] = targetpos

In [18]:
# Position of the target word among the whitespace tokens of 'kalimat': the
# index of the first token that, after `clean_word`, contains the target as a
# substring or shares its stem.

# Stem every distinct target once up front instead of once per token comparison.
target_stems = {word: stemmer.stem(word) for word in set(dataset['kata'])}
targetpos_ori = []

for i, (target, kalimat) in enumerate(zip(dataset['kata'], dataset['kalimat'])):
    tokens = kalimat.split()
    target_stem = target_stems[target]
    position = -1
    for j, raw_token in enumerate(tokens):
        token = clean_word(raw_token)
        if target in token or stemmer.stem(token) == target_stem:
            position = j
            break
    if position == -1:
        # Fail loudly, carrying everything needed to debug the offending row.
        raise ValueError(
            f'target word {target!r} not found in sentence tokens of row {i}: '
            f'tokens={tokens!r}, kalimat={kalimat!r}'
        )
    targetpos_ori.append(position)

# Assign the plain list so values are matched by row position, not index label.
dataset['targetpos_ori'] = targetpos_ori

# Record the ambiguous word's POS-tag position and its surrounding verbs and nouns

In [19]:
# Per-row accumulators filled by the loop below: target index within the
# tagged tokens (-1 = not found yet), the tag-only string, and the collected
# verb / noun stems.
row_count = len(dataset)
targetpos_pos_tag = [-1] * row_count
tags_only = [''] * row_count
immediate_verbs, immediate_nouns = [], []

def get_immediate_word_tag(tokens, targetpos, tag):
    """Collect the stems of every token tagged ``tag`` around the target.

    Tokens after ``targetpos`` are visited first, left to right; tokens
    before it follow, nearest first. The token at ``targetpos`` is skipped.

    Args:
        tokens: sequence of ``(word, pos_tag)`` pairs.
        targetpos: index of the target word within ``tokens``.
        tag: POS tag to select.

    Returns:
        The stemmed matching words joined by single spaces ('' if none match).
    """
    following = range(targetpos + 1, len(tokens))
    preceding = range(targetpos - 1, -1, -1)
    visit_order = [*following, *preceding]
    return ' '.join(
        stemmer.stem(tokens[idx][0])
        for idx in visit_order
        if tokens[idx][1] == tag
    )

# For each row, find the target word among the POS-tagged tokens (first token
# that, after `clean_word`, contains the target or shares its stem), then
# record its index, the verbs/nouns around it and the tag-only string.

# Stem every distinct target once up front instead of once per token comparison.
target_stems = {word: stemmer.stem(word) for word in set(dataset['kata'])}

for i, (target, tokens, kalimat) in enumerate(
        zip(dataset['kata'], dataset['pos_tags'], dataset['kalimat'])):
    target_stem = target_stems[target]
    for j, pair in enumerate(tokens):
        token = clean_word(pair[0])
        if target in token or stemmer.stem(token) == target_stem:
            targetpos_pos_tag[i] = j
            immediate_verbs.append(get_immediate_word_tag(tokens, j, 'VB'))
            immediate_nouns.append(get_immediate_word_tag(tokens, j, 'NN'))
            break
    else:
        # No token matched: fail loudly with the data needed to debug the row.
        raise ValueError(
            f'target word {target!r} not found in tagged tokens of row {i}: '
            f'tokens={tokens!r}, kalimat={kalimat!r}'
        )
    tags_only[i] = ' '.join(pair[1] for pair in tokens)

# Assign plain lists so values are matched by row position, not index label.
dataset['targetpos_pos_tag'] = targetpos_pos_tag
# NOTE: 'pos_tags' is replaced by the space-joined tag string, so this cell
# cannot be re-run without first regenerating the (token, tag) pairs.
dataset['pos_tags'] = tags_only
dataset['verbs'] = immediate_verbs
dataset['nouns'] = immediate_nouns

In [21]:
# 'pos_tags' now holds a space-joined tag string instead of (token, tag) pairs.
dataset.iloc[1].pos_tags

'NN NN PRP VB JJ CC JJ VB NN VB NN NN Z SC SC JJ VB VB NNP NNP CC NNP NNP Z'

In [22]:
# Inspect every row whose target word is "dalam".
dataset.query('kata == "dalam"')

Unnamed: 0,id,kata,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag,verbs,nouns
2853,339858,dalam,haji piobang adalah orang ulama dan tokoh pent...,NNP NNP VB NND NN CC NN JJ IN NN NNP IN NNP Z,haji piobang orang ulama tokoh dalam gera padr...,5,8,8,adalah,gera tokoh ulama
2854,342460,dalam,minta inggris atas bantu militer dalam hadap k...,NN NNP IN NN NN IN VB NNP NNP IN NN SC VB IN N...,inggris atas bantu militer dalam hadap krisis ...,4,5,5,hadap tolak,tahun militer bantu minta
2855,345318,dalam,partaipartai politik besar di sarawak terbagi ...,NN NN JJ IN NNP VB IN CD NN Z NN JJ NN Z NN JJ...,partaipartai politik besar sarawak terbagi dal...,5,6,6,liput bagi,kategori duduk non-muslim duduk muslim non-pen...
2856,345861,dalam,naruto rasa salah karena luka sakura dalam wuj...,NN VB VB SC VB NNP IN NNP NNP VB CD Z,naruto salah luka sakura dalam wujud kyuubi ekor,4,6,6,ekor luka salah rasa,naruto
2857,346598,dalam,latih tiger atau operasi harimau adalah nama k...,NNP NNP Z CC NNP NNP Z VB NN NN SC CD IN NN NN...,latih tiger operasi harimau nama kode dalam ra...,6,10,12,langsung adalah,rangkai skala latih invasi latih pantai kode nama
2858,347739,dalam,mereka umum dukung semua klub dalam fk partiza...,PRP RB VB CD NN IN NNP NNP Z CC VB NN JJ CC JJ...,dukung klub dalam fk partizan pakai simbol hit...,2,5,5,pakai rupa dukung,simbol warna klub klub
2859,349312,dalam,agak mirip dengan herbarium yang murni ering b...,RB JJ IN NN SC VB NN NN CC NN JJ CC NN NN Z IN...,herbarium murni ering bunga tumbuh kering warn...,8,14,15,hasil warna murni,terap oshibana bunga coklat warna tumbuh bunga...
2860,350071,dalam,ayub somenumber disingkat ayb somenumber adala...,NNP CD Z NN NNP CD Z VB NN IN NNP NNP IN NNP N...,ayub ayb kitab ayub alkitab ibrani janji dalam...,7,16,18,adalah,bagi singkat
2861,350396,dalam,bijak sebut lalu timbul friksi dalam tubuh ira...,NN PR CC VB NN IN NN NNP SC VB NN JJ NNP IN NN...,bijak timbul friksi dalam tubuh ira perang sip...,3,5,5,jadi pro-traktat anti-traktat satu timbul,tubuh perang kelompok kelompok hendak friksi b...
2862,350529,dalam,sementara sebab wasir yang sungguh belum tahu ...,SC NN NN SC RB NEG VB Z CD NN SC VB NN NN Z NN...,wasir sungguh faktor tingkat tekan intraabdome...,10,18,21,punya tingkat tahu,kembang kondisi percaya konstipasi khusus intr...


# Training

In [63]:
# Write the current `dataset` frame to the training CSV.
# NOTE(review): `dataset` is rebound by several cells above (training merge vs.
# test load); confirm the training flow was the one executed before this cell.
dataset.to_csv('train_data.csv')

# Testing

In [23]:
# Write the current `dataset` frame to the cleaned test CSV.
# NOTE(review): `dataset` is rebound by several cells above (training merge vs.
# test load); confirm the test flow was the one executed before this cell.
dataset.to_csv('testing_data_clean.csv')

In [26]:
# Show only the first rows; displaying the whole frame bloats the notebook.
dataset.head(10)

Unnamed: 0,kata,sense,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag
0,cerah,4801,cuaca cerah adalah lazim panjang tahun,NN NN VB NN NN NN Z,cuaca cerah lazim,1,1,1
1,cerah,4801,gambar yang hasil oleh layarnya cukup cerah da...,NNP SC VB IN NN RB JJ CC VB NN SC JJ VB NN SC ...,gambar hasil layarnya cerah milik speaker hasi...,3,6,6
2,cerah,4803,masa depan yang cerah bagi pemuda umur somenum...,NN NN SC VB IN NN NN CD IN NNP NNP CD Z,cerah bagi pemuda umur prancis abad,0,3,3
3,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,NNP NNP Z NNP NNP NNP Z Z Z NN RB VB NNP NNP N...,cor caroli alpha canum venaticorum nama lengka...,12,16,21
4,cerah,4801,sanders lebih suka cat air untuk lilo dengan m...,NN RB VB NN NN SC NNP IN NN VB NN NN NN NN NN Z,sanders suka cat air lilo maksud tampil warna ...,8,11,11
5,cerah,4801,ulleungdo milik iklim subtropis basah klasifik...,NNP VB NN JJ VB Z NN NN NNP NNP Z Z SC IN NN N...,ulleungdo milik iklim subtropis basah klasifik...,14,18,21
6,cerah,4801,ikan hias mungkin besar sulit tahan hidup di a...,VB NN NN JJ JJ VB NN IN NN JJ NN NN SC NN CC R...,ikan hias besar sulit tahan hidup alam liar ak...,10,13,13
7,cerah,4801,sebuah entremet tanda akhir saji suatu set men...,NND NN VB NN NN CD NN NN CC MD VB WH RB VB IN ...,entremet tanda saji set menu upa frumenty sede...,11,22,24
8,cerah,4801,alangkah lega hati ketika ia mulai lihat binat...,NN NN NN SC PRP VB VB NN NN NN IN NN NN NN Z V...,alangkah lega hati lihat binatangbinatang tern...,13,19,20
9,coklat,4703,sisi atas tubuh warna coklat tembaga emas,NN IN NN NN NN NN NN Z,sisi atas tubuh warna coklat tembaga emas,4,4,4
