In [1]:
import pandas as pd
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import matplotlib.pyplot as plt
import string

# Load Data

In [2]:
# Annotation files in which the annotators (one, two or three of them) agree on the sense.
single_agree, double_agree, triple_agree = map(pd.read_csv, (
    '../single_annotator.csv',
    '../double_annotator_agree.csv',
    '../triple_annotator_agree.csv',
))

In [3]:
# Annotation files in which two or three annotators disagree on the sense.
double_disagree, triple_disagree = map(pd.read_csv, (
    '../double_annotator_disagree.csv',
    '../triple_annotator_disagree.csv',
))

In [4]:
single_agree.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Summarize

In [5]:
# Number of distinct target words (`kata`) across all five annotation files.
len(set().union(
    single_agree.kata, double_agree.kata, triple_agree.kata,
    double_disagree.kata, triple_disagree.kata,
))

54

There are 54 annotated ambiguous words in the combined dataset.

In [6]:
# Number of distinct sentence ids (`kalimat_id`) across all five annotation files.
len(set().union(
    single_agree.kalimat_id, double_agree.kalimat_id, triple_agree.kalimat_id,
    double_disagree.kalimat_id, triple_disagree.kalimat_id,
))

10297

There are 10297 annotated sentences in the combined dataset.

In [7]:
# Number of distinct sentence ids in the three files without annotator disagreement.
len(set().union(
    single_agree.kalimat_id, double_agree.kalimat_id, triple_agree.kalimat_id,
))

8981

However, only 8981 annotated sentences are in the dataset without disagreement between annotators.

# Merge no-conflict-between-annotators-dataset

In [8]:
# Stack each column of the three agreement files in one fixed order
# (single, double, triple) so the four resulting lists stay row-aligned.
agree_frames = (single_agree, double_agree, triple_agree)
dataset_kalimat_id = [value for frame in agree_frames for value in frame.kalimat_id]
dataset_kata = [value for frame in agree_frames for value in frame.kata]
dataset_sense = [value for frame in agree_frames for value in frame.sense]
dataset_kalimat = [value for frame in agree_frames for value in frame.kalimat]

In [9]:
# Assemble the merged, disagreement-free data: one column per concatenated list.
dataset = pd.DataFrame(dict(
    kalimat_id=dataset_kalimat_id,
    kata=dataset_kata,
    sense=dataset_sense,
    kalimat=dataset_kalimat,
))

In [10]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Begin data cleaning
Based on Edi Faisal et al. (2018)

In [11]:
# Placeholder column of empty strings; the cleaning step fills it in later.
dataset['clean'] = pd.Series([''] * len(dataset))

In [12]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,clean
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.,
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...,
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...,
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le...",
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...,


In [12]:
# Distinct target words; stem() and the stop-word filter leave these untouched.
annotated_words = set(dataset['kata'])

In [14]:
# Extra words that are exempt from stemming and from stop-word removal:
# stem() only lower-cases them and strips punctuation, and they are filtered
# out of the stop-word list.
# NOTE(review): presumably kept intact because they are inflected forms of an
# annotated target word - confirm.
exception_words = {'senilai', 'bernilai'}

In [31]:
# Build the Sastrawi stemmer once; the cleaning and position-finding cells reuse it.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()
# Built once: deletes every character in string.punctuation (ASCII punctuation).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_word(w):
    """Return ``w`` lower-cased with all ASCII punctuation characters removed.

    Parameters
    ----------
    w : str
        A single token (any string is accepted).

    Returns
    -------
    str
        The cleaned token; may be empty if ``w`` consisted only of punctuation.
    """
    return w.translate(_PUNCTUATION_TABLE).lower()

## Money normalizer

In [57]:
def normalize_money(s):
    """Replace rupiah amounts in ``s`` with the placeholder token ``somemoney``.

    ``'Rp '`` is first glued to the token that follows it, so ``'Rp 5.000'``
    and ``'Rp5.000'`` are treated alike. Any whitespace-separated token that
    starts with ``'Rp'`` is then replaced. Leading punctuation is ignored for
    that check, so amounts such as ``'(Rp 5.000)'`` or ``'"Rp100'`` are
    recognised as well.

    Parameters
    ----------
    s : str
        Raw sentence.

    Returns
    -------
    str
        The sentence with tokens re-joined by single spaces.
    """
    s = s.replace('Rp ', 'Rp')
    tokens = []
    for t in s.split():
        # Strip opening brackets/quotes before testing for the currency prefix.
        if t.lstrip(string.punctuation)[:2] == 'Rp':
            tokens.append('somemoney')
        else:
            tokens.append(t)
    return ' '.join(tokens)

## Number normalizer

In [71]:
# Indonesian number words that are treated as numbers by normalize_number().
TERBILANG = [
    'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh', 'delapan', 'sembilan', 'nol',
    'sepuluh', 'sebelas', 'belas', 'puluh', 'seratus', 'ratus', 'seribu', 'ribu'
]

def normalize_number(s):
    """Replace numbers in ``s`` with the placeholder token ``somenumber``.

    A whitespace-separated token counts as a number when, after lower-casing
    and punctuation removal (``clean_word``), it either parses as an integer
    (e.g. ``'1.000'``, ``'20,'``) or is one of the number words in
    ``TERBILANG`` (e.g. ``'Dua'``, ``'ratus,'``). Runs of consecutive
    placeholders are collapsed into a single ``somenumber``.

    Parameters
    ----------
    s : str
        Sentence, typically the output of ``normalize_money``.

    Returns
    -------
    str
        The sentence with tokens re-joined by single spaces.
    """
    tokens = []
    for t in s.split():
        word = clean_word(t)
        try:
            int(word)
            is_number = True
        except ValueError:
            # Not digits: fall back to spelled-out numbers, matched on the
            # cleaned form so case and attached punctuation do not matter.
            is_number = word in TERBILANG
        tokens.append('somenumber' if is_number else t)
    result = ' '.join(tokens)
    # "dua puluh lima" -> one placeholder instead of three.
    while 'somenumber somenumber' in result:
        result = result.replace('somenumber somenumber', 'somenumber')
    return result

## Stemming, lowercasing, punctuation removal, repeated-words removal

In [37]:
def stem(s):
    """Stem every token of ``s`` except protected words.

    A token whose cleaned form (``clean_word``) is an annotated target word or
    is listed in ``exception_words`` is kept in that cleaned form. Every other
    token is replaced by ``stemmer.stem(token)``.

    Parameters
    ----------
    s : str
        Sentence to process.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    # Built once per call instead of once per token.
    protected = {*annotated_words, *exception_words}
    tokens = []
    for t in s.split():
        cleaned = clean_word(t)
        tokens.append(cleaned if cleaned in protected else stemmer.stem(t))
    return ' '.join(tokens)

## Stop-words removal

In [15]:
# One stop word per line; surrounding whitespace/newlines are stripped.
with open('../stop_words.txt', 'r') as f:
    stop_words = [line.strip() for line in f]

# Annotated target words and exception words must survive cleaning, so they
# are taken out of the stop-word list. The protected set is built once.
protected_words = {*annotated_words, *exception_words}
stop_words = [w for w in stop_words if w not in protected_words]

# Set copy for constant-time membership tests; `stop_words` stays a list.
stop_word_set = frozenset(stop_words)


def remove_stop_words(s):
    """Return ``s`` without its stop-word tokens.

    Tokens are compared verbatim (no lower-casing or punctuation removal
    here) against the filtered stop-word list; the remaining tokens are joined
    by single spaces.
    """
    return ' '.join(word for word in s.split() if word not in stop_word_set)

In [16]:
'paling' in stop_words

True

## Execute cleaning

In [38]:
from functools import reduce


def pipe(*args):
    """Compose the functions in ``args`` left to right.

    ``pipe(f, g, h)(x)`` evaluates ``h(g(f(x)))``. With no functions the
    returned callable hands its argument back unchanged.
    """
    def piped(x):
        # Feed the running value through each function in turn.
        return reduce(lambda value, step: step(value), args, x)
    return piped

In [72]:
# Compose the four cleaning stages once (money -> numbers -> stemming ->
# stop words), then run every raw sentence through them in that order.
clean_pipeline = pipe(normalize_money, normalize_number, stem, remove_stop_words)
dataset['clean'] = pd.Series([clean_pipeline(kalimat) for kalimat in dataset.kalimat])

In [73]:
dataset.head()

Unnamed: 0,kalimat_id,kata,sense,kalimat,clean,targetpos_clean,targetpos_ori
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.,cuaca cerah lazim panjang tahun,1,1
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...,gambar hasil layar cukup cerah milik speaker h...,4,6
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...,masa cerah pemuda umur somenumber prancis abad...,1,3
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le...",cor caroli alpha canum venaticorum nama lengka...,12,16
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...,sanders suka cat air untuk lilo maksud tampil ...,9,11


# Write down the position of target word

In [74]:
# One slot per row; -1 marks "target word not located yet".
targetpos = [-1] * len(dataset)

In [75]:
# For every row, record the index of the first token of the cleaned sentence
# that matches the target word: either the target is a substring of the token,
# or both have the same stem.
for i, (target, cleaned, kalimat) in enumerate(
        zip(dataset.kata, dataset.clean, dataset.kalimat)):
    target_stem = stemmer.stem(target)  # invariant for the whole row
    tokens = cleaned.split()
    for j, token in enumerate(tokens):
        if target in token or stemmer.stem(token) == target_stem:
            targetpos[i] = j
            break
    else:
        # No token matched: stop and report everything needed to debug the row.
        raise ValueError(
            'target word {!r} not found in cleaned sentence of row {}: '
            'tokens={!r}, kalimat={!r}'.format(target, i, tokens, kalimat)
        )

In [76]:
# Token index of the target word within the `clean` sentence, one value per row.
targetpos_clean_column = pd.Series(targetpos)
dataset['targetpos_clean'] = targetpos_clean_column

In [77]:
# One slot per row; -1 marks "target word not located yet".
targetpos_ori = [-1] * len(dataset)

In [78]:
# For every row, record the index of the first whitespace token of the
# original sentence that matches the target word. Each token is lower-cased
# and stripped of punctuation first; it matches when the target is a substring
# of it or when both have the same stem.
for i, (target, kalimat) in enumerate(zip(dataset.kata, dataset.kalimat)):
    target_stem = stemmer.stem(target)  # invariant for the whole row
    tokens = kalimat.split()
    for j, raw_token in enumerate(tokens):
        token = clean_word(raw_token)
        if target in token or stemmer.stem(token) == target_stem:
            targetpos_ori[i] = j
            break
    else:
        # No token matched: stop and report everything needed to debug the row.
        raise ValueError(
            'target word {!r} not found in original sentence of row {}: '
            'tokens={!r}, kalimat={!r}'.format(target, i, tokens, kalimat)
        )

In [79]:
# Token index of the target word within the original `kalimat`, one value per row.
targetpos_ori_column = pd.Series(targetpos_ori)
dataset['targetpos_ori'] = targetpos_ori_column

In [80]:
dataset

Unnamed: 0,kalimat_id,kata,sense,kalimat,clean,targetpos_clean,targetpos_ori
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.,cuaca cerah lazim panjang tahun,1,1
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...,gambar hasil layar cukup cerah milik speaker h...,4,6
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...,masa cerah pemuda umur somenumber prancis abad...,1,3
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le...",cor caroli alpha canum venaticorum nama lengka...,12,16
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...,sanders suka cat air untuk lilo maksud tampil ...,9,11
5,336406,cerah,4801,Ulleungdo memiliki iklim subtropis basah (Klas...,ulleungdo milik iklim subtropis basah klasifik...,14,18
6,336324,cerah,4801,Ikan hias kemungkinan besar sulit bertahan hid...,ikan hias besar sulit tahan hidup alam liar ak...,10,13
7,336401,cerah,4801,Sebuah entremet menandai berakhirnya sajian su...,buah entremet tanda akhir saji set menu upa mu...,14,22
8,336426,cerah,4801,Alangkah lega hatinya ketika ia mulai melihat ...,alangkah lega hati mulai lihat binatang ternak...,14,19
9,337760,coklat,4703,Sisi atas tubuh berwarna coklat tembaga keemasan.,sisi atas tubuh warna coklat tembaga emas,4,4


In [81]:
dataset.iloc[100].clean

'misal orang tenaga ajar honorer punya beban ajar somenumber jam dalam minggu honor besar somemoney jam dalam masa somenumber minggu bulan honor somemoney'

In [82]:
# Persist the cleaned, position-annotated frame as train_data.csv, relative to
# the notebook's working directory. No `index` argument is passed, so pandas'
# default applies and the row index is written as the first column.
dataset.to_csv('train_data.csv')