In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
# Load the three annotation files in which annotators did not disagree
# (single-, double- and triple-annotator files, going by the file names).
single_agree, double_agree, triple_agree = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../single_annotator.csv',
        '../double_annotator_agree.csv',
        '../triple_annotator_agree.csv',
    )
)

In [3]:
# Load the two files holding sentences on which annotators disagreed
# (going by the file names).
double_disagree, triple_disagree = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../double_annotator_disagree.csv',
        '../triple_annotator_disagree.csv',
    )
)

In [60]:
# Preview the first five rows of the single-annotator file.
single_agree.head(n=5)

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Summarize

In [4]:
# Number of distinct target words (`kata`) across all five annotation files.
len(set().union(
    single_agree.kata,
    double_agree.kata,
    triple_agree.kata,
    double_disagree.kata,
    triple_disagree.kata,
))

54

In [5]:
# Number of distinct sentence ids (`kalimat_id`) across all five annotation files.
len(set().union(
    single_agree.kalimat_id,
    double_agree.kalimat_id,
    triple_agree.kalimat_id,
    double_disagree.kalimat_id,
    triple_disagree.kalimat_id,
))

10314

In [6]:
# Number of distinct sentence ids (`kalimat_id`) in the three agreement files only.
len(set().union(
    single_agree.kalimat_id,
    double_agree.kalimat_id,
    triple_agree.kalimat_id,
))

8998

However, only 8998 annotated sentences in the dataset have no disagreement between annotators.

# Merge the datasets with no conflict between annotators

In [7]:
# Concatenate each column of the three agreement files into one flat list,
# always in the order single, double, triple so the four lists stay aligned.
agree_frames = (single_agree, double_agree, triple_agree)

dataset_kalimat_id = [value for frame in agree_frames for value in frame.kalimat_id]
dataset_kata = [value for frame in agree_frames for value in frame.kata]
dataset_sense = [value for frame in agree_frames for value in frame.sense]
dataset_kalimat = [value for frame in agree_frames for value in frame.kalimat]

# Only NER Tasks

In [8]:
# Keep only the rows whose sense label ends in 'a', 'b', 'c' or 'd',
# skipping sentence 852692, which is malformed.
selected_rows = [
    (kata, sense, kalimat)
    for kalimat_id, sense, kata, kalimat
    in zip(dataset_kalimat_id, dataset_sense, dataset_kata, dataset_kalimat)
    if str(sense)[-1] in 'abcd' and kalimat_id != 852692
]

wsd_dataset_kata = [row[0] for row in selected_rows]
wsd_dataset_sense = [row[1] for row in selected_rows]
wsd_dataset_kalimat = [row[2] for row in selected_rows]

# 'raw' and 'kalimat' start out identical; later cells overwrite 'kalimat'.
dataset = pd.DataFrame({
    'kata': wsd_dataset_kata,
    'sense': wsd_dataset_sense,
    'raw': wsd_dataset_kalimat,
    'kalimat': wsd_dataset_kalimat
})

display(dataset.head())
print(len(dataset))

Unnamed: 0,kata,sense,raw,kalimat
0,coklat,470d,Dari hasil pengujian menunjukkan bahwa ada beb...,Dari hasil pengujian menunjukkan bahwa ada beb...
1,jalan,190b,Perbatasan dengan Paroki Santo Petrus Lubukbaj...,Perbatasan dengan Paroki Santo Petrus Lubukbaj...
2,jalan,190b,Pulo Geulis dapat dicapai dengan berjalan kaki...,Pulo Geulis dapat dicapai dengan berjalan kaki...
3,sarung,120d,"Pemain bertelanjang dada, namun mengenakan pak...","Pemain bertelanjang dada, namun mengenakan pak..."
4,dasar,450d,SDN Jati 07 Pagi merupakan sebuah Sekolah Dasa...,SDN Jati 07 Pagi merupakan sebuah Sekolah Dasa...


437


# POS Tagging

In [9]:
import nltk
from preprocessor import pipe

In [10]:
# Distinct target words that occur in the filtered dataset.
annotated_words = {*dataset['kata']}

# CRF part-of-speech tagger, loaded from the model file named below.
ct = nltk.tag.CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

In [11]:
# Tokenize every sentence, then tag the whole batch with a single `tag_sents`
# call. The previous version wrapped each sentence in a one-element list,
# called `tag_sents` once per row and unwrapped the result again.
# Each entry of 'pos_tags' is a list of (token, tag) pairs for one sentence.
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in dataset.kalimat]
dataset['pos_tags'] = ct.tag_sents(tokenized_sentences)

In [12]:
# Inspect the (token, tag) pairs produced for the first sentence.
dataset['pos_tags'].iloc[0]

[('Dari', 'IN'),
 ('hasil', 'NN'),
 ('pengujian', 'NN'),
 ('menunjukkan', 'VB'),
 ('bahwa', 'SC'),
 ('ada', 'VB'),
 ('beberapa', 'CD'),
 ('varietas', 'NN'),
 ('yang', 'SC'),
 ('tahan', 'NN'),
 ('terhadap', 'IN'),
 ('wereng', 'NN'),
 ('coklat', 'NN'),
 ('biotipe', 'NN'),
 ('2', 'CD'),
 ('dan', 'CC'),
 ('3', 'CD'),
 (',', 'Z'),
 ('namun', 'CC'),
 ('menjadi', 'VB'),
 ('rentan', 'NN'),
 ('terhadap', 'IN'),
 ('wereng', 'NN'),
 ('punggung', 'NN'),
 ('putih', 'JJ'),
 ('.', 'Z')]

# Begin data cleaning

In [13]:
# Add an empty-string placeholder column; it is filled by a later cleaning cell.
dataset['clean'] = pd.Series([''] * len(dataset))

In [14]:
# Preview the first five rows, including the new columns.
dataset.head(n=5)

Unnamed: 0,kata,sense,raw,kalimat,pos_tags,clean
0,coklat,470d,Dari hasil pengujian menunjukkan bahwa ada beb...,Dari hasil pengujian menunjukkan bahwa ada beb...,"[(Dari, IN), (hasil, NN), (pengujian, NN), (me...",
1,jalan,190b,Perbatasan dengan Paroki Santo Petrus Lubukbaj...,Perbatasan dengan Paroki Santo Petrus Lubukbaj...,"[(Perbatasan, NN), (dengan, IN), (Paroki, NNP)...",
2,jalan,190b,Pulo Geulis dapat dicapai dengan berjalan kaki...,Pulo Geulis dapat dicapai dengan berjalan kaki...,"[(Pulo, NNP), (Geulis, NNP), (dapat, MD), (dic...",
3,sarung,120d,"Pemain bertelanjang dada, namun mengenakan pak...","Pemain bertelanjang dada, namun mengenakan pak...","[(Pemain, IN), (bertelanjang, NN), (dada, NN),...",
4,dasar,450d,SDN Jati 07 Pagi merupakan sebuah Sekolah Dasa...,SDN Jati 07 Pagi merupakan sebuah Sekolah Dasa...,"[(SDN, NNP), (Jati, NNP), (07, NNP), (Pagi, NN...",


## Execute cleaning

In [15]:
import time
from preprocessor import (
    normalize_money, normalize_number, remove_punctuation,
    create_stemmer, create_stop_words_remover, stemmer,
    clean_word, normalize_weekday, normalize_month, remove_normalized,
    normalize_personal_pronoun, normalize_demonstrative_pronoun, normalize_coordinating_conjunction,
    normalize_determiner, normalize_preposition, create_obvious_verb_normalizer
)

In [16]:
# exceptions, do not stem, because pysastrawi is stupid
exception_words = {'senilai', 'bernilai', 'menilainya', 'dinilainya', *set(map(stemmer.stem, annotated_words))}

In [17]:
# First cleaning pass over 'kalimat'. The steps are listed in the order they
# are handed to `pipe` (assumed to apply them left to right — confirm in
# `preprocessor`). NOTE: this overwrites 'kalimat' in place, so re-running the
# cell cleans already-cleaned text; the untouched sentences remain in 'raw'.
begin = time.perf_counter()
normalize_and_stem = pipe(
    remove_punctuation,
    normalize_money,
    normalize_number,
    normalize_weekday,
    normalize_month,
    create_stemmer(annotated_words, exception_words),
)
dataset['kalimat'] = pd.Series(
    [normalize_and_stem(sentence) for sentence in dataset.kalimat]
)
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 1.736391067999648


In [18]:
# Inspect the second sentence after the first cleaning pass.
dataset['kalimat'].iloc[1]

'perbatasan dengan paroki santo petrus lubukbaja adalah jalan yos sudarso'

In [19]:
# Second cleaning pass: derive the 'clean' column from the already-normalized
# 'kalimat' column. The steps are listed in the order they are handed to
# `pipe` (assumed to apply them left to right — confirm in `preprocessor`).
begin = time.perf_counter()
normalize_and_filter = pipe(
    create_obvious_verb_normalizer(annotated_words, exception_words),
    normalize_personal_pronoun,
    normalize_demonstrative_pronoun,
    normalize_coordinating_conjunction,
    normalize_determiner,
    normalize_preposition,
    remove_normalized,
    create_stop_words_remover(annotated_words, exception_words),
)
dataset['clean'] = pd.Series(
    [normalize_and_filter(sentence) for sentence in dataset.kalimat]
)
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 0.2541015390015673


In [20]:
# Preview the first five rows after both cleaning passes.
dataset.head(n=5)

Unnamed: 0,kata,sense,raw,kalimat,pos_tags,clean
0,coklat,470d,Dari hasil pengujian menunjukkan bahwa ada beb...,dari hasil uji tunjuk bahwa ada beberapa varie...,"[(Dari, IN), (hasil, NN), (pengujian, NN), (me...",hasil uji varietas tahan hadap wereng coklat b...
1,jalan,190b,Perbatasan dengan Paroki Santo Petrus Lubukbaj...,perbatasan dengan paroki santo petrus lubukbaj...,"[(Perbatasan, NN), (dengan, IN), (Paroki, NNP)...",perbatasan paroki santo petrus lubukbaja jalan...
2,jalan,190b,Pulo Geulis dapat dicapai dengan berjalan kaki...,pulo geulis dapat capai dengan berjalan kaki l...,"[(Pulo, NNP), (Geulis, NNP), (dapat, MD), (dic...",pulo geulis capai berjalan kaki jembatanjembat...
3,sarung,120d,"Pemain bertelanjang dada, namun mengenakan pak...",main telanjang dada namun kena pakai perang li...,"[(Pemain, IN), (bertelanjang, NN), (dada, NN),...",main telanjang dada kena pakai perang lindung ...
4,dasar,450d,SDN Jati 07 Pagi merupakan sebuah Sekolah Dasa...,sdn jati somenumber pagi rupa sebuah sekolah d...,"[(SDN, NNP), (Jati, NNP), (07, NNP), (Pagi, NN...",sdn jati pagi rupa sekolah dasar negeri letak ...


# Record the position of the target word

In [21]:
# For every row, record the index of the first token of the 'clean' sentence
# that matches the target word. A token matches when it contains the target
# as a substring or when token and target share the same stem.
# Raises ValueError (with the offending row) if no token matches.
targetpos = []

for i, (target, clean_sentence, kalimat) in enumerate(
        zip(dataset.kata, dataset.clean, dataset.kalimat)):
    tokens = clean_sentence.split()
    # Stem the target once per row instead of once per token.
    target_stem = stemmer.stem(target)
    position = next(
        (j for j, token in enumerate(tokens)
         if target in token or stemmer.stem(token) == target_stem),
        -1,
    )
    if position == -1:
        raise ValueError(
            f'target word {target!r} not found in row {i}: '
            f'tokens={tokens!r}, kalimat={kalimat!r}'
        )
    targetpos.append(position)

dataset['targetpos_clean'] = pd.Series(targetpos)

In [22]:
# For every row, record the index of the first token of the 'kalimat' sentence
# that matches the target word. Each token is passed through `clean_word`
# first; it matches when it contains the target as a substring or when token
# and target share the same stem.
# Raises ValueError (with the offending row) if no token matches.
targetpos_ori = []

for i, (target, kalimat) in enumerate(zip(dataset.kata, dataset.kalimat)):
    tokens = kalimat.split()
    # Stem the target once per row instead of once per token.
    target_stem = stemmer.stem(target)
    position = -1
    for j, raw_token in enumerate(tokens):
        token = clean_word(raw_token)
        if target in token or stemmer.stem(token) == target_stem:
            position = j
            break
    if position == -1:
        raise ValueError(
            f'target word {target!r} not found in row {i}: '
            f'tokens={tokens!r}, kalimat={kalimat!r}'
        )
    targetpos_ori.append(position)

dataset['targetpos_ori'] = pd.Series(targetpos_ori)

In [23]:
# For every row, record the index of the first (token, tag) pair in 'pos_tags'
# whose token matches the target word, and collapse the pairs into a
# space-separated string of tags only. Each token is passed through
# `clean_word` first; it matches when it contains the target as a substring
# or when token and target share the same stem.
# Raises ValueError (with the offending row) if no token matches.
# NOTE: 'pos_tags' is overwritten with the tag strings at the end, so this
# cell expects the list-of-pairs form and cannot simply be re-run.
targetpos_pos_tag = []
tags_only = []

for i, (target, tagged_tokens, kalimat) in enumerate(
        zip(dataset.kata, dataset.pos_tags, dataset.kalimat)):
    # Stem the target once per row instead of once per token.
    target_stem = stemmer.stem(target)
    position = -1
    for j, (word, _tag) in enumerate(tagged_tokens):
        token = clean_word(word)
        if target in token or stemmer.stem(token) == target_stem:
            position = j
            break
    if position == -1:
        raise ValueError(
            f'target word {target!r} not found in row {i}: '
            f'tokens={tagged_tokens!r}, kalimat={kalimat!r}'
        )
    targetpos_pos_tag.append(position)
    tags_only.append(' '.join(tag for _word, tag in tagged_tokens))

dataset['targetpos_pos_tag'] = pd.Series(targetpos_pos_tag)
dataset['pos_tags'] = pd.Series(tags_only)

# Training

In [24]:
# Persist the prepared dataset; the row index is written as the first column.
dataset.to_csv('train_data_ner.csv', index=True)

In [83]:
# Display the final dataset as the cell output.
dataset

Unnamed: 0,kata,sense,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag
0,coklat,470d,dari hasil uji tunjuk bahwa ada beberapa varie...,IN NN NN VB SC VB CD NN SC NN IN NN NN NN CD C...,hasil uji varietas tahan hadap wereng coklat b...,6,12,12
1,jalan,190b,Perbatasan dengan paroki santo petrus lubukbaj...,NN IN NNP NNP NNP NNP VB NNP NNP NNP Z,Perbatasan paroki santo petrus lubukbaja jalan...,5,7,7
2,jalan,190b,pulo geulis dapat capai dengan berjalan kaki l...,NNP NNP MD VB SC VB NN VB NN SC RB IN NN NNP N...,pulo geulis capai berjalan kaki jembatanjembat...,3,5,5
3,sarung,120d,main telanjang dada namun kena pakai perang li...,IN NN NN Z CC VB NN NN NN NN CC JJ VB NN JJ NN...,main telanjang dada kena pakai perang lindung ...,13,17,18
4,dasar,450d,sdn jati somenumber pagi rupa sebuah sekolah d...,NNP NNP NNP NNP VB NND NN NNP NNP SC VB IN NNP...,sdn jati pagi rupa sekolah dasar negeri letak ...,5,7,7
5,dasar,450d,sd kartika x somenumber pagi rupa sebuah sekol...,NNP NNP NNP Z CD NNP VB NND NN NNP NNP SC VB I...,sd kartika x pagi rupa sekolah dasar swasta le...,6,8,9
6,dasar,450d,sdn jatinegara somenumber adalah sekolah dasar...,NNP NNP CD VB NNP NNP NNP SC VB IN NNP NNP NNP...,sdn jatinegara sekolah dasar negeri letak buar...,3,5,5
7,dasar,450d,pada somenumber ia deklarasi asosiasi ekonomi ...,IN CD PRP VB NNP NNP NNP NNP Z NNP Z Z NND NN ...,deklarasi asosiasi ekonomi politik indonesia a...,13,20,23
8,harapan,350b,kampung dengan seks rasio terbesar adalah kamp...,NN IN NN NN JJ VB NNP NNP NNP IN NN NN CD CC S...,kampung seks rasio terbesar kampung harap jaya...,5,7,7
9,besar,540a,pada tahun somenumber mufti besar arab saudi a...,IN NN CD Z NNP NNP NNP NNP NNP NNP NNP NNP NNP...,mufti besar arab saudi abdul aziz bin abdullah...,1,4,5
