In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
# Load the three annotation exports whose file names mark them as
# single-annotator / agreeing double- and triple-annotator data.
single_agree, double_agree, triple_agree = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../single_annotator.csv',
        '../double_annotator_agree.csv',
        '../triple_annotator_agree.csv',
    )
)

In [3]:
# Load the exports whose file names mark them as containing annotator
# disagreement (two- and three-annotator cases).
double_disagree, triple_disagree = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../double_annotator_disagree.csv',
        '../triple_annotator_disagree.csv',
    )
)

In [4]:
# Preview the first five rows to check the column layout.
single_agree.head(5)

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Summarize

In [5]:
# Number of distinct target words (`kata`) across all five annotation frames.
annotation_frames = (single_agree, double_agree, triple_agree,
                     double_disagree, triple_disagree)
distinct_words = set()
for frame in annotation_frames:
    distinct_words |= set(frame.kata)
len(distinct_words)

54

In [6]:
# Number of distinct sentence ids (`kalimat_id`) across all five frames,
# agreeing and disagreeing alike.
annotation_frames = (single_agree, double_agree, triple_agree,
                     double_disagree, triple_disagree)
distinct_sentence_ids = set()
for frame in annotation_frames:
    distinct_sentence_ids |= set(frame.kalimat_id)
len(distinct_sentence_ids)

10314

In [7]:
# Number of distinct sentence ids (`kalimat_id`) in the three agree frames only.
agree_frames = (single_agree, double_agree, triple_agree)
agreed_sentence_ids = set()
for frame in agree_frames:
    agreed_sentence_ids |= set(frame.kalimat_id)
len(agreed_sentence_ids)

8998

However, there are only 8992 annotated sentences in the dataset without disagreement between annotators.

# Merge no-conflict-between-annotators-dataset

In [8]:
# Concatenate each column of the three agree frames into one flat list,
# always in the order single -> double -> triple so the four lists stay
# row-aligned with each other.
agree_frames = (single_agree, double_agree, triple_agree)

dataset_kalimat_id = [value for frame in agree_frames for value in frame.kalimat_id]
dataset_kata = [value for frame in agree_frames for value in frame.kata]
dataset_sense = [value for frame in agree_frames for value in frame.sense]
dataset_kalimat = [value for frame in agree_frames for value in frame.kalimat]

# Test Dataset

In [2]:
# Load the test split and rename its `word` column to `kata` so it matches
# the column name used by the rest of the notebook.
# NOTE(review): a later cell ("Only WSD Tasks") rebinds `dataset`; when the
# notebook is run top to bottom this frame is replaced before it is processed.
testing_raw = pd.read_csv('../testing_data.csv')
dataset = testing_raw.rename(columns={'word': 'kata'})

In [3]:
# Preview the first five rows of the freshly loaded frame.
dataset.head(5)

Unnamed: 0,id,kata,kalimat
0,13,asing,"Para pecinta film indonesia atau tv, pasti tak..."
1,19,asing,Pasti telinga kita merasa asing dan aneh mende...
2,41,asing,Warga negara asing atau warga negara Persemakm...
3,44,asing,"Selama lima belas tahun memerintah, Sultan Mah..."
4,121,asing,Yang kemudian diikuti dengan donat-donat waral...


# Only WSD Tasks

In [9]:
# Keep only the rows whose sense label does not end in one of the characters
# a, b, c, d or x; labels with those suffixes are excluded from the WSD task
# (what each suffix stands for is not documented in this notebook).
wsd_rows = [
    (kata, sense, kalimat)
    for sense, kata, kalimat in zip(dataset_sense, dataset_kata, dataset_kalimat)
    if str(sense)[-1] not in 'abcdx'
]

wsd_dataset_kata = [row[0] for row in wsd_rows]
wsd_dataset_sense = [row[1] for row in wsd_rows]
wsd_dataset_kalimat = [row[2] for row in wsd_rows]

# Rebinds `dataset` to the filtered training frame (columns: kata, sense, kalimat).
dataset = pd.DataFrame({
    'kata': wsd_dataset_kata,
    'sense': wsd_dataset_sense,
    'kalimat': wsd_dataset_kalimat,
})

dataset.head(5)

Unnamed: 0,kata,sense,kalimat
0,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# POS Tagging

In [4]:
import nltk
from preprocessor import pipe

In [5]:
# Distinct target words present in the current dataset.
annotated_words = {*dataset.kata}

# CRF part-of-speech tagger; the model file is resolved relative to the
# working directory.
ct = nltk.tag.CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

In [7]:
# Per-sentence tagging function: tokenize, wrap the tokens in a one-element
# batch because `tag_sents` works on a list of sentences, tag, then unwrap
# the single result.
tag_sentence = pipe(
    nltk.word_tokenize,
    lambda sentence_tokens: [sentence_tokens],
    ct.tag_sents,
    lambda tagged_batch: tagged_batch[0],
)

# One list of (token, tag) pairs per row, in row order.
dataset['pos_tags'] = [tag_sentence(sentence) for sentence in dataset.kalimat]

In [8]:
# Spot-check the tagger output for the first row.
dataset['pos_tags'].iloc[0]

[('Para', 'DT'),
 ('pecinta', 'NN'),
 ('film', 'NN'),
 ('indonesia', 'NN'),
 ('atau', 'CC'),
 ('tv', 'NNP'),
 (',', 'Z'),
 ('pasti', 'RB'),
 ('tak', 'NEG'),
 ('asing', 'JJ'),
 ('mendengar', 'VB'),
 ('namanya', 'RB'),
 ('.', 'Z')]

# Begin data cleaning

In [9]:
# Start every row with an empty `clean` string. A scalar is broadcast to all
# rows whatever the frame's index is, whereas assigning a freshly built
# Series aligns on index labels and would leave NaN wherever they differ.
dataset['clean'] = ''

In [10]:
# Confirm the new, still empty, `clean` column is present.
dataset.head(5)

Unnamed: 0,id,kata,kalimat,pos_tags,clean
0,13,asing,"Para pecinta film indonesia atau tv, pasti tak...","[(Para, DT), (pecinta, NN), (film, NN), (indon...",
1,19,asing,Pasti telinga kita merasa asing dan aneh mende...,"[(Pasti, NN), (telinga, NN), (kita, PRP), (mer...",
2,41,asing,Warga negara asing atau warga negara Persemakm...,"[(Warga, NN), (negara, NN), (asing, JJ), (atau...",
3,44,asing,"Selama lima belas tahun memerintah, Sultan Mah...","[(Selama, IN), (lima, CD), (belas, NN), (tahun...",
4,121,asing,Yang kemudian diikuti dengan donat-donat waral...,"[(Yang, DT), (kemudian, CC), (diikuti, VB), (d...",


## Execute cleaning

In [18]:
import time
from preprocessor import (
    normalize_money, normalize_number, remove_punctuation,
    create_stemmer, create_stop_words_remover, stemmer,
    clean_word, normalize_weekday, normalize_month, remove_normalized,
    normalize_personal_pronoun, normalize_demonstrative_pronoun, normalize_coordinating_conjunction,
    normalize_determiner, normalize_preposition, create_obvious_verb_normalizer
)

In [25]:
# exceptions, do not stem, because pysastrawi is stupid
exception_words = {'senilai', 'bernilai', 'menilainya', 'dinilainya', *set(map(stemmer.stem, annotated_words))}

In [26]:
# First cleaning pass: run every sentence through the preprocessor steps
# below (punctuation, money, number, weekday, month, stemming).
# NOTE: this overwrites `kalimat` with its normalised form, so re-running the
# cell feeds already-cleaned text back through the pipeline; reload `dataset`
# before running it again.
begin = time.perf_counter()
normalize_sentence = pipe(
    remove_punctuation,
    normalize_money,
    normalize_number,
    normalize_weekday,
    normalize_month,
    create_stemmer(annotated_words, exception_words),
)
# Assign a plain list (as the POS-tagging cell does) so values are placed by
# position. Wrapping them in a new pd.Series would align on index labels and
# silently produce NaN if `dataset` ever carries a non-default index.
dataset['kalimat'] = [normalize_sentence(sentence) for sentence in dataset['kalimat']]
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 23.804717339000035


In [27]:
# Spot-check one normalised sentence (row 4244, an arbitrary sample).
dataset['kalimat'].iloc[4244]

'bagaimana jaringan nordsud hilang diri dari saing dengan dekorasi stasiun yang kualitas tinggi'

In [28]:
# Second cleaning pass: derive `clean` from the already-normalised `kalimat`
# by running the verb / pronoun / conjunction / determiner / preposition
# normalisers, dropping the normalised placeholders and removing stop words.
begin = time.perf_counter()
reduce_sentence = pipe(
    create_obvious_verb_normalizer(annotated_words, exception_words),
    normalize_personal_pronoun,
    normalize_demonstrative_pronoun,
    normalize_coordinating_conjunction,
    normalize_determiner,
    normalize_preposition,
    remove_normalized,
    create_stop_words_remover(annotated_words, exception_words),
)
# Assign a plain list (as the POS-tagging cell does) so values are placed by
# position. Wrapping them in a new pd.Series would align on index labels and
# silently produce NaN if `dataset` ever carries a non-default index.
dataset['clean'] = [reduce_sentence(sentence) for sentence in dataset['kalimat']]
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 3.4251222549996783


In [29]:
# Compare the normalised `kalimat` with the reduced `clean` text.
dataset.head(5)

Unnamed: 0,id,kata,kalimat,pos_tags,clean
0,13,asing,para cinta film indonesia atau tv pasti tak as...,"[(Para, DT), (pecinta, NN), (film, NN), (indon...",cinta film indonesia tv asing dengar nama
1,19,asing,pasti telinga kita rasa asing dan aneh dengar ...,"[(Pasti, NN), (telinga, NN), (kita, PRP), (mer...",telinga asing aneh dengar menu masakan soto ke...
2,41,asing,warga negara asing atau warga negara makmur ya...,"[(Warga, NN), (negara, NN), (asing, JJ), (atau...",warga negara asing warga negara makmur kepala ...
3,44,asing,lama somenumber tahun perintah sultan mahmud j...,"[(Selama, IN), (lima, CD), (belas, NN), (tahun...",perintah sultan mahmud jalin kerja asing belan...
4,121,asing,yang kemudian ikut dengan donatdonat waralaba ...,"[(Yang, DT), (kemudian, CC), (diikuti, VB), (d...",donatdonat waralaba asing master ring master d...


# Write down the position of target word

In [30]:
# Index of the target word within the whitespace-split `clean` text.
# A token matches when it contains the target as a substring or when its stem
# equals the target's stem; the first matching token wins.
targetpos = []

for i, (target, clean_text, kalimat) in enumerate(
        zip(dataset['kata'], dataset['clean'], dataset['kalimat'])):
    tokens = clean_text.split()
    # The target's stem is the same for every token of the row: compute it once.
    target_stem = stemmer.stem(target)
    position = next(
        (j for j, token in enumerate(tokens)
         if target in token or stemmer.stem(token) == target_stem),
        -1,
    )
    if position == -1:
        # Every row must contain its target; stop with the offending row's
        # details in the error instead of a bare, message-less Exception.
        raise ValueError(
            'target word {!r} not found in row {}: tokens={!r}, kalimat={!r}'
            .format(target, i, tokens, kalimat)
        )
    targetpos.append(position)

# A list is assigned by position, so the result does not depend on the
# frame's index labels (a new pd.Series would be aligned on them).
dataset['targetpos_clean'] = targetpos

In [31]:
# Index of the target word within the whitespace-split `kalimat` text.
# Each token is passed through `clean_word` first; it then matches when it
# contains the target as a substring or when its stem equals the target's
# stem. The first matching token wins.
targetpos_ori = []

for i, (target, kalimat) in enumerate(zip(dataset['kata'], dataset['kalimat'])):
    tokens = kalimat.split()
    # The target's stem is the same for every token of the row: compute it once.
    target_stem = stemmer.stem(target)
    position = -1
    for j, raw_token in enumerate(tokens):
        token = clean_word(raw_token)
        if target in token or stemmer.stem(token) == target_stem:
            position = j
            break
    if position == -1:
        # Every row must contain its target; stop with the offending row's
        # details in the error instead of a bare, message-less Exception.
        raise ValueError(
            'target word {!r} not found in row {}: tokens={!r}, kalimat={!r}'
            .format(target, i, tokens, kalimat)
        )
    targetpos_ori.append(position)

# A list is assigned by position, so the result does not depend on the
# frame's index labels (a new pd.Series would be aligned on them).
dataset['targetpos_ori'] = targetpos_ori

In [32]:
# Index of the target word within the POS-tagged token list, plus a
# space-joined string of the tags alone for every row.
# Each tagged token's text is passed through `clean_word`; it matches when it
# contains the target as a substring or when its stem equals the target's
# stem. The first matching token wins.
# NOTE: the final assignment replaces the (token, tag) lists in `pos_tags`
# with plain tag strings, so this cell cannot be re-run without re-running
# the POS-tagging cell first.
targetpos_pos_tag = []
tags_only = []

for i, (target, tokens, kalimat) in enumerate(
        zip(dataset['kata'], dataset['pos_tags'], dataset['kalimat'])):
    # The target's stem is the same for every token of the row: compute it once.
    target_stem = stemmer.stem(target)
    position = -1
    for j, tagged_token in enumerate(tokens):
        token = clean_word(tagged_token[0])
        if target in token or stemmer.stem(token) == target_stem:
            position = j
            break
    if position == -1:
        # Every row must contain its target; stop with the offending row's
        # details in the error instead of a bare, message-less Exception.
        raise ValueError(
            'target word {!r} not found in row {}: tokens={!r}, kalimat={!r}'
            .format(target, i, tokens, kalimat)
        )
    targetpos_pos_tag.append(position)
    tags_only.append(' '.join(tagged_token[1] for tagged_token in tokens))

# Lists are assigned by position, so the result does not depend on the
# frame's index labels (a new pd.Series would be aligned on them).
dataset['targetpos_pos_tag'] = targetpos_pos_tag
dataset['pos_tags'] = tags_only

# Training

In [25]:
# Persist the processed frame as the training set.
# NOTE(review): this writes whatever `dataset` currently holds; make sure the
# training frame (not the test frame) went through the cells above.
dataset.to_csv(path_or_buf='train_data.csv')

# Testing

In [33]:
# Persist the processed frame as the cleaned test set.
# NOTE(review): this writes whatever `dataset` currently holds; make sure the
# test frame (not the training frame) went through the cells above.
dataset.to_csv(path_or_buf='testing_data_clean.csv')

In [26]:
# Show a bounded sample of the final frame instead of dumping all rows into
# the notebook output.
dataset.head(10)

Unnamed: 0,kata,sense,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag
0,cerah,4801,cuaca cerah adalah lazim panjang tahun,NN NN VB NN NN NN Z,cuaca cerah lazim,1,1,1
1,cerah,4801,gambar yang hasil oleh layarnya cukup cerah da...,NNP SC VB IN NN RB JJ CC VB NN SC JJ VB NN SC ...,gambar hasil layarnya cerah milik speaker hasi...,3,6,6
2,cerah,4803,masa depan yang cerah bagi pemuda umur somenum...,NN NN SC VB IN NN NN CD IN NNP NNP CD Z,cerah bagi pemuda umur prancis abad,0,3,3
3,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,NNP NNP Z NNP NNP NNP Z Z Z NN RB VB NNP NNP N...,cor caroli alpha canum venaticorum nama lengka...,12,16,21
4,cerah,4801,sanders lebih suka cat air untuk lilo dengan m...,NN RB VB NN NN SC NNP IN NN VB NN NN NN NN NN Z,sanders suka cat air lilo maksud tampil warna ...,8,11,11
5,cerah,4801,ulleungdo milik iklim subtropis basah klasifik...,NNP VB NN JJ VB Z NN NN NNP NNP Z Z SC IN NN N...,ulleungdo milik iklim subtropis basah klasifik...,14,18,21
6,cerah,4801,ikan hias mungkin besar sulit tahan hidup di a...,VB NN NN JJ JJ VB NN IN NN JJ NN NN SC NN CC R...,ikan hias besar sulit tahan hidup alam liar ak...,10,13,13
7,cerah,4801,sebuah entremet tanda akhir saji suatu set men...,NND NN VB NN NN CD NN NN CC MD VB WH RB VB IN ...,entremet tanda saji set menu upa frumenty sede...,11,22,24
8,cerah,4801,alangkah lega hati ketika ia mulai lihat binat...,NN NN NN SC PRP VB VB NN NN NN IN NN NN NN Z V...,alangkah lega hati lihat binatangbinatang tern...,13,19,20
9,coklat,4703,sisi atas tubuh warna coklat tembaga emas,NN IN NN NN NN NN NN Z,sisi atas tubuh warna coklat tembaga emas,4,4,4
