In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
# Read the single-annotator file and the two "agree" files (as named by their paths),
# one DataFrame per CSV, in the same order as the names on the left.
single_agree, double_agree, triple_agree = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../single_annotator.csv',
        '../double_annotator_agree.csv',
        '../triple_annotator_agree.csv',
    )
)

In [3]:
# Read the two "disagree" files (as named by their paths), one DataFrame per CSV.
double_disagree, triple_disagree = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../double_annotator_disagree.csv',
        '../triple_annotator_disagree.csv',
    )
)

In [4]:
# Preview the first five rows of the single-annotator frame.
single_agree.head(5)

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Summarize

In [5]:
# Number of distinct target words (`kata`) across all five annotation frames.
len(set().union(
    single_agree.kata,
    double_agree.kata,
    triple_agree.kata,
    double_disagree.kata,
    triple_disagree.kata,
))

54

There are 54 annotated ambiguous words in the combined dataset.

In [6]:
# Number of distinct sentence ids (`kalimat_id`) across all five annotation frames.
len(set().union(
    single_agree.kalimat_id,
    double_agree.kalimat_id,
    triple_agree.kalimat_id,
    double_disagree.kalimat_id,
    triple_disagree.kalimat_id,
))

10297

There are 10297 annotated sentences in the combined dataset.

In [7]:
# Number of distinct sentence ids (`kalimat_id`) in the three frames without disagreement.
len(set().union(
    single_agree.kalimat_id,
    double_agree.kalimat_id,
    triple_agree.kalimat_id,
))

8981

However, there are only 8981 annotated sentences in the datasets without disagreement between annotators.

# Merge no-conflict-between-annotators-dataset

In [3]:
# Concatenate each column of the three conflict-free frames. The frames are always
# visited in the order single -> double -> triple, so the four lists stay row-aligned.
conflict_free_frames = (single_agree, double_agree, triple_agree)
dataset_kalimat_id = [value for frame in conflict_free_frames for value in frame.kalimat_id]
dataset_kata = [value for frame in conflict_free_frames for value in frame.kata]
dataset_sense = [value for frame in conflict_free_frames for value in frame.sense]
dataset_kalimat = [value for frame in conflict_free_frames for value in frame.kalimat]

In [4]:
# Assemble the four parallel lists into one frame; keyword order fixes the column order.
dataset = pd.DataFrame(dict(
    kalimat_id=dataset_kalimat_id,
    kata=dataset_kata,
    sense=dataset_sense,
    kalimat=dataset_kalimat,
))

In [5]:
# Preview the first five rows of the merged frame.
dataset.head(5)

Unnamed: 0,kalimat_id,kata,sense,kalimat
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le..."
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...


# Begin data cleaning

In [6]:
# Add a `clean` column holding an empty-string placeholder for every row.
dataset['clean'] = pd.Series([''] * len(dataset))

In [7]:
# Preview the first five rows, now including the empty `clean` column.
dataset.head(5)

Unnamed: 0,kalimat_id,kata,sense,kalimat,clean
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.,
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...,
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...,
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le...",
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...,


In [8]:
# Distinct target words of the merged dataset; handed to the preprocessing factories.
annotated_words = {*dataset['kata']}

In [9]:
# Exception words that must not be stemmed; passed to the preprocessing factories
# alongside the annotated words.
exception_words = {'bernilai', 'senilai'}

## Execute cleaning

In [18]:
from preprocessor import pipe, normalize_money, normalize_number, create_stemmer, create_stop_words_remover, stemmer, clean_word

In [11]:
# Combine the preprocessing steps with `pipe` (presumably applied in the listed
# order -- confirm against the preprocessor module), then run the combined function
# over every original sentence to fill the `clean` column.
clean_sentence = pipe(
    normalize_money,
    normalize_number,
    create_stemmer(annotated_words, exception_words),
    create_stop_words_remover(annotated_words, exception_words),
)
dataset['clean'] = pd.Series([clean_sentence(sentence) for sentence in dataset.kalimat])

In [12]:
# Preview the first five rows with the populated `clean` column.
dataset.head(5)

Unnamed: 0,kalimat_id,kata,sense,kalimat,clean
0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.,cuaca cerah lazim
1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...,gambar hasil layarnya cerah milik speaker hasi...
2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...,cerah pemuda umur somenumber prancis abad some...
3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le...",cor caroli alpha canum venaticorum nama lengka...
4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...,sanders suka cat air lilo maksud tampil warna ...


# Write down the position of the target word

In [13]:
# One slot per row, pre-filled with -1 meaning "target word not located yet".
targetpos = [-1] * len(dataset)

In [14]:
# For every row, record the index of the first token of the cleaned sentence that
# matches the target word `kata`. A token matches when it contains the target as a
# substring or when its stem equals the target's stem. Results go into the
# pre-allocated `targetpos` list; a row without any match aborts the cell.
row_columns = zip(dataset['kata'], dataset['clean'], dataset['kalimat'])
for i, (target, clean_sentence, original_sentence) in enumerate(row_columns):
    tokens = clean_sentence.split()
    # The target's stem is identical for every token of the row: compute it once.
    target_stem = stemmer.stem(target)
    targetpos[i] = next(
        (
            j
            for j, token in enumerate(tokens)
            if target in token or stemmer.stem(token) == target_stem
        ),
        -1,
    )
    if targetpos[i] == -1:
        # Fail loudly and carry the diagnostics in the exception itself
        # (ValueError is still an Exception, so existing handlers keep working).
        raise ValueError(
            'target word {!r} not found in cleaned sentence of row {}: '
            'tokens={!r}, kalimat={!r}'.format(target, i, tokens, original_sentence)
        )

In [15]:
# Store the positions found in the cleaned sentences as the `targetpos_clean` column.
dataset['targetpos_clean'] = pd.Series(data=targetpos)

In [16]:
# One slot per row, pre-filled with -1 meaning "target word not located yet".
targetpos_ori = [-1] * len(dataset)

In [19]:
# For every row, record the index of the first whitespace token of the ORIGINAL
# sentence that matches the target word `kata`. Each token is passed through
# `clean_word` first; it matches when the result contains the target as a substring
# or when its stem equals the target's stem. Results go into the pre-allocated
# `targetpos_ori` list; a row without any match aborts the cell.
row_columns = zip(dataset['kata'], dataset['kalimat'])
for i, (target, original_sentence) in enumerate(row_columns):
    tokens = original_sentence.split()
    # The target's stem is identical for every token of the row: compute it once.
    target_stem = stemmer.stem(target)
    targetpos_ori[i] = next(
        (
            j
            # map() is lazy, so tokens after the first match are never cleaned.
            for j, token in enumerate(map(clean_word, tokens))
            if target in token or stemmer.stem(token) == target_stem
        ),
        -1,
    )
    if targetpos_ori[i] == -1:
        # Fail loudly and carry the diagnostics in the exception itself
        # (ValueError is still an Exception, so existing handlers keep working).
        raise ValueError(
            'target word {!r} not found in original sentence of row {}: '
            'tokens={!r}, kalimat={!r}'.format(target, i, tokens, original_sentence)
        )

In [20]:
# Store the positions found in the original sentences as the `targetpos_ori` column.
dataset['targetpos_ori'] = pd.Series(data=targetpos_ori)

# Clean Words Bigram

In [29]:
# Build, for every cleaned sentence, a space-separated string of its adjacent-token
# bigrams written as `left_right`. A one-token sentence keeps that token unchanged,
# and an empty sentence yields an empty string instead of raising IndexError.
bigrams = []

for clean_sentence in dataset['clean']:
    tokens = clean_sentence.split()
    # zip pairs each token with its successor; it is empty for fewer than two tokens.
    bigram = ['{}_{}'.format(left, right) for left, right in zip(tokens, tokens[1:])]
    if not bigram:
        # Fewer than two tokens: fall back to the lone token, or to nothing at all.
        bigram = tokens[:1]
    bigrams.append(' '.join(bigram))

In [30]:
# Attach the bigram strings as the `clean_bigram` column; `bigrams` is expected to
# hold one entry per row of `dataset`, in row order.
dataset['clean_bigram'] = bigrams

In [22]:
# Export only the `clean` column (plus the row index) to its own CSV file.
dataset.to_csv('clean_sentence.csv', columns=['clean'])

In [31]:
# Export the full frame; with pandas' default settings the row index is written as
# the first, unnamed column.
dataset.to_csv(path_or_buf='train_data.csv')