In [1]:
import pandas as pd
import numpy as np

# Load Data

In [103]:
# Read the annotated training sentences from the CSV file into a DataFrame.
dataset = pd.read_csv('train_data.csv')
# Display the first rows as a quick check of the loaded columns.
dataset.head()

Unnamed: 0.1,Unnamed: 0,kalimat_id,kata,sense,kalimat,clean,targetpos_clean,targetpos_ori
0,0,336691,cerah,4801,Cuaca cerah adalah lazim sepanjang tahun.,cuaca cerah lazim panjang tahun,1,1
1,1,336270,cerah,4801,Gambar yang dihasilkan oleh layarnya cukup cer...,gambar hasil layar cukup cerah milik speaker h...,4,6
2,2,336555,cerah,4803,Masa depan yang cerah bagi pemuda berumur 20 d...,masa cerah pemuda umur somenumber prancis abad...,1,3
3,3,336618,cerah,4801,"Cor Caroli (Alpha Canum Venaticorum), (nama le...",cor caroli alpha canum venaticorum nama lengka...,12,16
4,4,336613,cerah,4801,Sanders lebih menyukai cat air untuk Lilo deng...,sanders suka cat air untuk lilo maksud tampil ...,9,11


# Drop rare senses from the training set

In [104]:
# Senses with at most this many training rows are treated as rare.
RARE_LIMIT = 5
# Distinct sense labels that occur in the loaded data.
sense_set = set(dataset.sense)

In [105]:
# Count the rows of every sense in a single pass and keep the senses that have
# at most RARE_LIMIT examples. This replaces one string-formatted
# DataFrame.query per sense, which rescanned the whole frame for each sense and
# only counted correctly when the sense column was stored as text.
sense_counts = dataset.sense.value_counts()
rare_sense = set(sense_counts[sense_counts <= RARE_LIMIT].index)
len(rare_sense)

119

In [106]:
# Keep only the rows whose sense is not rare, restricted to the columns used by
# the rest of the notebook. The index is reset so that row labels are again
# 0..n-1 and line up with the row positions of the feature matrix built later.
# NOTE(review): this cell overwrites `dataset`; re-running it without reloading
# the CSV filters the already filtered frame.
dataset = (
    dataset.loc[
        ~dataset.sense.isin(rare_sense),
        ['kata', 'sense', 'kalimat', 'clean', 'targetpos_clean', 'targetpos_ori'],
    ]
    .reset_index(drop=True)
)

In [107]:
# Number of training rows currently held in `dataset`.
len(dataset)

8720

# Get Context Words

In [108]:
# Number of tokens taken on each side of the target word when building its context.
CONTEXT_WINDOW = 5

In [109]:
def _context_window(tokens, pos, window):
    """Join up to `window` tokens on each side of position `pos`.

    The token at `pos` itself (the target word) is left out. The left side is
    clipped at the start of the sentence and the right side at its end.
    """
    left = [tokens[j] for j in range(max(0, pos - window), pos)]
    right = [tokens[j] for j in range(pos + 1, min(len(tokens), pos + window + 1))]
    return ' '.join(left + right)


# One context string per training row, in row order. Iterating the two columns
# directly avoids building a full row Series with `iloc` twice per row.
context_words = [
    _context_window(clean.split(), pos, CONTEXT_WINDOW)
    for clean, pos in zip(dataset.clean, dataset.targetpos_clean)
]

In [110]:
# Spot check: cleaned sentence of row 100.
dataset.iloc[100].clean

'cikelet belah selatan pusat kota garut sebut garut selatan garsel garsela jarak tempuh dalam waktu somenumber jam jalan somenumber km garut kota'

In [111]:
# Spot check: target word of row 100.
dataset.iloc[100].kata

'jam'

In [112]:
# Spot check: context string built for row 100 (the target word itself is excluded).
context_words[100]

'jarak tempuh dalam waktu somenumber jalan somenumber km garut kota'

# Bag of Words

In [113]:
from sklearn.feature_extraction.text import CountVectorizer

In [114]:
# Vectorizer with default settings; fitted on the context strings only.
cv = CountVectorizer()
# Learn the vocabulary from context_words and build the per-row token count matrix.
bag_of_words = cv.fit_transform(context_words)

# Form Training Set

### Bag of Words only

In [115]:
# The feature matrix is the bag-of-words counts only; no other features are added.
X_train = bag_of_words

## Labels

In [116]:
# Distinct target words (column `kata`) present in the training data.
annotated_words = set(dataset.kata)

In [117]:
# For every target word, map each of its senses to a small integer label:
#   mappers[word] == [(sense, label), ...] with labels 0..k-1.
# The senses are sorted before numbering so that the labels are the same on
# every run; numbering them in set-iteration order is not stable between
# interpreter sessions. A single groupby replaces one query per word.
mappers = dict()
for w, senses in dataset.groupby('kata')['sense']:
    ordered_senses = sorted(set(senses), key=str)
    mappers[w] = [(sense, i) for i, sense in enumerate(ordered_senses)]

In [118]:
# Flatten `mappers` into a direct (word, sense) -> label lookup so that each row
# costs one dictionary access instead of a linear scan over the word's senses.
sense_to_label = {
    (kata, sense): label
    for kata, pairs in mappers.items()
    for sense, label in pairs
}
# Integer class label of every training row, in row order.
y_train = np.array([
    sense_to_label[(kata, sense)]
    for sense, kata in zip(dataset.sense, dataset.kata)
])

# Training

Baseline (dummy classifier): always predict the most frequent sense of each word.

In [119]:
from sklearn.model_selection import GridSearchCV

In [120]:
# One slot per annotated word; train_all stores the fitted estimator for each word here.
classifier = {w: None for w in annotated_words}

In [121]:
def train(X, y, clf, possible_param, fold=5, n_jobs=7):
    '''
    Select best parameter using k-fold cross validation

    Parameters:
        X: feature rows of one word.
        y: non-negative integer labels for the rows of X (required by np.bincount).
        clf: unfitted estimator to tune.
        possible_param: parameter grid handed to GridSearchCV.
        fold: number of cross-validation folds.
        n_jobs: number of parallel jobs used by the grid search.

    Returns:
        (best_estimator, best_cv_score, dummy_score), where dummy_score is the
        share of the most frequent label in y.
    '''
    import inspect

    search_kwargs = {'cv': fold, 'n_jobs': n_jobs}
    # `iid` was removed from GridSearchCV in newer scikit-learn releases and
    # passing it there raises TypeError. Keep the original iid=False behaviour
    # on releases that still accept the argument.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    search = GridSearchCV(clf, possible_param, **search_kwargs)
    search.fit(X, y)

    # Majority-class baseline: frequency of the most common label.
    label_counts = np.bincount(y)
    most_freq_label = np.argmax(label_counts)
    print('Cross validation accuracy:', search.best_score_)
    dummy_score = label_counts[most_freq_label] / len(y)
    print('Dummy baseline: ', dummy_score)
    return (search.best_estimator_, search.best_score_, dummy_score)

In [122]:
def train_all(clf, possible_param, fold=5, algorithm_name=''):
    """Tune and fit one classifier per annotated word and print a summary.

    For every word in the global `classifier` dict, the rows of that word are
    selected from `X_train` / `y_train`, `train` is run on them, and the best
    estimator is stored back into `classifier[word]`. The macro averages of
    the cross-validation scores and of the dummy baselines are printed last.

    Parameters:
        clf: unfitted estimator used as the template for every word.
        possible_param: parameter grid forwarded to `train`.
        fold: number of cross-validation folds.
        algorithm_name: label printed at the top of the report.
    """
    print(algorithm_name)
    scores = []
    dummy_scores = []
    kata_values = dataset.kata.values
    for w in classifier.keys():
        print('==================================')
        print(w)
        # Row positions (not index labels) of this word, so the selection stays
        # aligned with X_train / y_train whatever index `dataset` carries.
        indexes = np.flatnonzero(kata_values == w)
        best_clf, best_score, dummy_score = train(X_train[indexes], y_train[indexes], clf, possible_param, fold)
        scores.append(best_score)
        dummy_scores.append(dummy_score)
        classifier[w] = best_clf
        print('----------------------------------')
    print('Cross validation macro average accuracy:', sum(scores)/len(scores))
    print('Dummy classifier macro average accuracy:', sum(dummy_scores)/len(dummy_scores))

In [124]:
# Spot check: integer labels assigned to the rows of the word 'panas'.
y_train[list(dataset.query('kata == "{}"'.format('panas')).index)]

array([2, 1, 2, 1, 2, 2, 0, 1, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 2, 0, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 2, 1, 0, 0, 2, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 0, 2,
       0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

## Logistic Regression

In [125]:
from sklearn.linear_model import LogisticRegression

In [126]:
# Grid-search logistic regression separately for every annotated word over
# solver, iteration limit and multi-class scheme.
train_all(
    LogisticRegression(),
    {'solver':['newton-cg', 'lbfgs'], 'max_iter':[100, 1000], 'multi_class': ['ovr', 'multinomial']},
    algorithm_name='Logistic Regression'
)

Logistic Regression
dunia
Cross validation accuracy: 0.44195879019015843
Dummy baseline:  0.4697986577181208
----------------------------------
dalam
Cross validation accuracy: 0.30565173425017694
Dummy baseline:  0.3269230769230769
----------------------------------
bisa
Cross validation accuracy: 0.7766798418972332
Dummy baseline:  0.7767857142857143
----------------------------------
harapan
Cross validation accuracy: 0.6568775995246583
Dummy baseline:  0.6153846153846154
----------------------------------
panas
Cross validation accuracy: 0.8661751152073732
Dummy baseline:  0.527027027027027
----------------------------------
mata
Cross validation accuracy: 0.6307636402725718
Dummy baseline:  0.3116883116883117
----------------------------------
nilai
Cross validation accuracy: 0.4875061094819159
Dummy baseline:  0.31875
----------------------------------
kali
Cross validation accuracy: 0.7341911764705882
Dummy baseline:  0.5301204819277109
----------------------------------
tinggi




Cross validation accuracy: 0.460569588539228
Dummy baseline:  0.2909090909090909
----------------------------------
lingkungan
Cross validation accuracy: 0.5403675634710118
Dummy baseline:  0.4755244755244755
----------------------------------
jaringan
Cross validation accuracy: 0.7122332859174965
Dummy baseline:  0.43478260869565216
----------------------------------
lebat
Cross validation accuracy: 0.962012012012012
Dummy baseline:  0.644808743169399
----------------------------------
kunci
Cross validation accuracy: 0.4996313364055299
Dummy baseline:  0.38271604938271603
----------------------------------
memecahkan




Cross validation accuracy: 0.8478583847004899
Dummy baseline:  0.4782608695652174
----------------------------------
pembagian
Cross validation accuracy: 0.5172322568874292
Dummy baseline:  0.5251798561151079
----------------------------------
tengah
Cross validation accuracy: 0.5683565033410234
Dummy baseline:  0.4044943820224719
----------------------------------
rapat




Cross validation accuracy: 0.7308794902833213
Dummy baseline:  0.7290322580645161
----------------------------------
kaki
Cross validation accuracy: 0.7902027796764639
Dummy baseline:  0.5288461538461539
----------------------------------
bidang
Cross validation accuracy: 0.9438058748403575
Dummy baseline:  0.9432624113475178
----------------------------------
menurunkan
Cross validation accuracy: 0.639462280116929
Dummy baseline:  0.6121212121212121
----------------------------------
kepala
Cross validation accuracy: 0.8605828445747801
Dummy baseline:  0.8417721518987342
----------------------------------
berat
Cross validation accuracy: 0.3945288121489494
Dummy baseline:  0.3431372549019608
----------------------------------
mengisi
Cross validation accuracy: 0.5810609243697479
Dummy baseline:  0.42011834319526625
----------------------------------
membawa
Cross validation accuracy: 0.29793730680827457
Dummy baseline:  0.23529411764705882
----------------------------------
mengeluark



Cross validation accuracy: 0.6084123179052185
Dummy baseline:  0.4591194968553459
----------------------------------
kabur
Cross validation accuracy: 0.9143024227234754
Dummy baseline:  0.9135135135135135
----------------------------------
bintang
Cross validation accuracy: 0.6674831619992909
Dummy baseline:  0.4657534246575342
----------------------------------
cerah
Cross validation accuracy: 0.9212606599925843
Dummy baseline:  0.9078947368421053
----------------------------------
menerima
Cross validation accuracy: 0.46444252774423556
Dummy baseline:  0.3950617283950617
----------------------------------
jam
Cross validation accuracy: 0.6846176046176046
Dummy baseline:  0.5588235294117647
----------------------------------
layar
Cross validation accuracy: 0.8245493358633776
Dummy baseline:  0.48427672955974843
----------------------------------
mendorong
Cross validation accuracy: 0.8772087877351037
Dummy baseline:  0.8770053475935828
----------------------------------
halaman
Cross

## Linear SVM

In [127]:
from sklearn.svm import LinearSVC

In [128]:
# Grid-search a linear SVM separately for every annotated word over the
# iteration limit and the regularisation strength C.
train_all(
    LinearSVC(),
    {'max_iter': [1000, 2000, 4000, 8000], 'C':[0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0]},
    algorithm_name='Linear SVC'
)

Linear SVC
dunia
Cross validation accuracy: 0.4143249112770803
Dummy baseline:  0.4697986577181208
----------------------------------
dalam
Cross validation accuracy: 0.32895521238124126
Dummy baseline:  0.3269230769230769
----------------------------------
bisa
Cross validation accuracy: 0.750197628458498
Dummy baseline:  0.7767857142857143
----------------------------------
harapan
Cross validation accuracy: 0.6808266488413547
Dummy baseline:  0.6153846153846154
----------------------------------
panas
Cross validation accuracy: 0.8595084485407065
Dummy baseline:  0.527027027027027
----------------------------------
mata
Cross validation accuracy: 0.6109191702264889
Dummy baseline:  0.3116883116883117
----------------------------------
nilai
Cross validation accuracy: 0.5238330889540567
Dummy baseline:  0.31875
----------------------------------
kali
Cross validation accuracy: 0.7404411764705883
Dummy baseline:  0.5301204819277109
----------------------------------
tinggi
Cross valid



Cross validation accuracy: 0.4419663290536156
Dummy baseline:  0.2909090909090909
----------------------------------
lingkungan
Cross validation accuracy: 0.5257787040545662
Dummy baseline:  0.4755244755244755
----------------------------------
jaringan
Cross validation accuracy: 0.7234866445392761
Dummy baseline:  0.43478260869565216
----------------------------------
lebat
Cross validation accuracy: 0.9564564564564565
Dummy baseline:  0.644808743169399
----------------------------------
kunci
Cross validation accuracy: 0.5147567844342038
Dummy baseline:  0.38271604938271603
----------------------------------
memecahkan




Cross validation accuracy: 0.8588114430219693
Dummy baseline:  0.4782608695652174
----------------------------------
pembagian
Cross validation accuracy: 0.5161901955005403
Dummy baseline:  0.5251798561151079
----------------------------------
tengah
Cross validation accuracy: 0.5894466691370717
Dummy baseline:  0.4044943820224719
----------------------------------
rapat




Cross validation accuracy: 0.7248728162010076
Dummy baseline:  0.7290322580645161
----------------------------------
kaki
Cross validation accuracy: 0.8092503987240829
Dummy baseline:  0.5288461538461539
----------------------------------
bidang
Cross validation accuracy: 0.9438058748403575
Dummy baseline:  0.9432624113475178
----------------------------------
menurunkan
Cross validation accuracy: 0.638985332581158
Dummy baseline:  0.6121212121212121
----------------------------------
kepala
Cross validation accuracy: 0.8982966764418379
Dummy baseline:  0.8417721518987342
----------------------------------
berat
Cross validation accuracy: 0.3934886623673809
Dummy baseline:  0.3431372549019608
----------------------------------
mengisi
Cross validation accuracy: 0.5750105042016807
Dummy baseline:  0.42011834319526625
----------------------------------
membawa
Cross validation accuracy: 0.31369993527251594
Dummy baseline:  0.23529411764705882
----------------------------------
mengeluark



Cross validation accuracy: 0.6206048312742025
Dummy baseline:  0.4591194968553459
----------------------------------
kabur
Cross validation accuracy: 0.9195655806182123
Dummy baseline:  0.9135135135135135
----------------------------------
bintang
Cross validation accuracy: 0.7080786954980504
Dummy baseline:  0.4657534246575342
----------------------------------
cerah
Cross validation accuracy: 0.9610752688172044
Dummy baseline:  0.9078947368421053
----------------------------------
menerima
Cross validation accuracy: 0.43193030878040367
Dummy baseline:  0.3950617283950617
----------------------------------
jam
Cross validation accuracy: 0.7017604617604618
Dummy baseline:  0.5588235294117647
----------------------------------
layar
Cross validation accuracy: 0.8374525616698293
Dummy baseline:  0.48427672955974843
----------------------------------
mendorong
Cross validation accuracy: 0.8772087877351037
Dummy baseline:  0.8770053475935828
----------------------------------
halaman
Cross

## RBF SVM

In [129]:
from sklearn.svm import SVC

In [130]:
# Grid-search an RBF-kernel SVM separately for every annotated word over the
# iteration limit and C; gamma is fixed to 'auto' and is not tuned.
# NOTE(review): the recorded scores are almost identical to the dummy baseline
# for nearly every word, so the model seems to predict the majority sense.
# The fixed gamma='auto' is a likely cause -- consider adding gamma to the grid
# and confirm.
train_all(
    SVC(gamma='auto', kernel='rbf'),
    {'max_iter': [1000, 2000, 4000, 8000], 'C':[0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0]},
    algorithm_name='RBF SVM'
)

RBF SVM
dunia
Cross validation accuracy: 0.4705302187615869
Dummy baseline:  0.4697986577181208
----------------------------------
dalam
Cross validation accuracy: 0.32837342116541224
Dummy baseline:  0.3269230769230769
----------------------------------
bisa
Cross validation accuracy: 0.7766798418972332
Dummy baseline:  0.7767857142857143
----------------------------------
harapan
Cross validation accuracy: 0.6159982174688057
Dummy baseline:  0.6153846153846154
----------------------------------
panas
Cross validation accuracy: 0.5274039938556067
Dummy baseline:  0.527027027027027
----------------------------------
mata
Cross validation accuracy: 0.3121943382469784
Dummy baseline:  0.3116883116883117
----------------------------------
nilai
Cross validation accuracy: 0.3186827956989247
Dummy baseline:  0.31875
----------------------------------
kali
Cross validation accuracy: 0.5301470588235294
Dummy baseline:  0.5301204819277109
----------------------------------
tinggi
Cross validat



Cross validation accuracy: 0.2908869632897147
Dummy baseline:  0.2909090909090909
----------------------------------
lingkungan
Cross validation accuracy: 0.47607553366174055
Dummy baseline:  0.4755244755244755
----------------------------------
jaringan
Cross validation accuracy: 0.4349612770665402
Dummy baseline:  0.43478260869565216
----------------------------------
lebat
Cross validation accuracy: 0.6447447447447447
Dummy baseline:  0.644808743169399
----------------------------------
kunci
Cross validation accuracy: 0.38392729134664616
Dummy baseline:  0.38271604938271603
----------------------------------
memecahkan




Cross validation accuracy: 0.48918128654970766
Dummy baseline:  0.4782608695652174
----------------------------------
pembagian
Cross validation accuracy: 0.5254354202630065
Dummy baseline:  0.5251798561151079
----------------------------------
tengah
Cross validation accuracy: 0.4047343318550748
Dummy baseline:  0.4044943820224719
----------------------------------
rapat




Cross validation accuracy: 0.7308794902833213
Dummy baseline:  0.7290322580645161
----------------------------------
kaki
Cross validation accuracy: 0.5300751879699248
Dummy baseline:  0.5288461538461539
----------------------------------
bidang
Cross validation accuracy: 0.9438058748403575
Dummy baseline:  0.9432624113475178
----------------------------------
menurunkan
Cross validation accuracy: 0.6142250884660752
Dummy baseline:  0.6121212121212121
----------------------------------
kepala
Cross validation accuracy: 0.8422116324535679
Dummy baseline:  0.8417721518987342
----------------------------------
berat
Cross validation accuracy: 0.34555856043270233
Dummy baseline:  0.3431372549019608
----------------------------------
mengisi
Cross validation accuracy: 0.42027310924369743
Dummy baseline:  0.42011834319526625
----------------------------------
membawa
Cross validation accuracy: 0.23658423819714142
Dummy baseline:  0.23529411764705882
----------------------------------
mengelu



Cross validation accuracy: 0.45993853340709323
Dummy baseline:  0.4591194968553459
----------------------------------
kabur
Cross validation accuracy: 0.9143024227234754
Dummy baseline:  0.9135135135135135
----------------------------------
bintang
Cross validation accuracy: 0.46683563748079876
Dummy baseline:  0.4657534246575342
----------------------------------
cerah
Cross validation accuracy: 0.9081423804226919
Dummy baseline:  0.9078947368421053
----------------------------------
menerima
Cross validation accuracy: 0.39560002300040253
Dummy baseline:  0.3950617283950617
----------------------------------
jam
Cross validation accuracy: 0.5595815295815296
Dummy baseline:  0.5588235294117647
----------------------------------
layar
Cross validation accuracy: 0.5406902277039849
Dummy baseline:  0.48427672955974843
----------------------------------
mendorong
Cross validation accuracy: 0.8772087877351037
Dummy baseline:  0.8770053475935828
----------------------------------
halaman
Cro

## Random Forest

In [131]:
from sklearn.ensemble import RandomForestClassifier

In [132]:
# Cross-validate a 400-tree random forest separately for every annotated word.
# The grid holds a single configuration, so the search only produces the
# cross-validation score.
# NOTE(review): no random_state is set, so the scores will presumably differ
# slightly between runs -- consider fixing a seed.
train_all(
    RandomForestClassifier(),
    {'n_estimators': [400]},
    algorithm_name='Random Forest'
)

Random Forest
dunia
Cross validation accuracy: 0.4633873616187299
Dummy baseline:  0.4697986577181208
----------------------------------
dalam
Cross validation accuracy: 0.3336411789915683
Dummy baseline:  0.3269230769230769
----------------------------------
bisa
Cross validation accuracy: 0.7411067193675889
Dummy baseline:  0.7767857142857143
----------------------------------
harapan
Cross validation accuracy: 0.6218805704099821
Dummy baseline:  0.6153846153846154
----------------------------------
panas
Cross validation accuracy: 0.8242703533026112
Dummy baseline:  0.527027027027027
----------------------------------
mata
Cross validation accuracy: 0.5784944289172844
Dummy baseline:  0.3116883116883117
----------------------------------
nilai
Cross validation accuracy: 0.5375183284457478
Dummy baseline:  0.31875
----------------------------------
kali
Cross validation accuracy: 0.6926470588235294
Dummy baseline:  0.5301204819277109
----------------------------------
tinggi
Cross va



Cross validation accuracy: 0.490370327016437
Dummy baseline:  0.2909090909090909
----------------------------------
lingkungan
Cross validation accuracy: 0.5052696728558798
Dummy baseline:  0.4755244755244755
----------------------------------
jaringan
Cross validation accuracy: 0.6040698593330172
Dummy baseline:  0.43478260869565216
----------------------------------
lebat
Cross validation accuracy: 0.962012012012012
Dummy baseline:  0.644808743169399
----------------------------------
kunci
Cross validation accuracy: 0.5314234511008704
Dummy baseline:  0.38271604938271603
----------------------------------
memecahkan




Cross validation accuracy: 0.8424529792950846
Dummy baseline:  0.4782608695652174
----------------------------------
pembagian
Cross validation accuracy: 0.5400242796794521
Dummy baseline:  0.5251798561151079
----------------------------------
tengah
Cross validation accuracy: 0.5943375927896052
Dummy baseline:  0.4044943820224719
----------------------------------
rapat




Cross validation accuracy: 0.7308794902833213
Dummy baseline:  0.7290322580645161
----------------------------------
kaki
Cross validation accuracy: 0.761494645705172
Dummy baseline:  0.5288461538461539
----------------------------------
bidang
Cross validation accuracy: 0.9438058748403575
Dummy baseline:  0.9432624113475178
----------------------------------
menurunkan
Cross validation accuracy: 0.6142250884660752
Dummy baseline:  0.6121212121212121
----------------------------------
kepala
Cross validation accuracy: 0.8484616324535679
Dummy baseline:  0.8417721518987342
----------------------------------
berat
Cross validation accuracy: 0.3916579987518203
Dummy baseline:  0.3431372549019608
----------------------------------
mengisi
Cross validation accuracy: 0.4969747899159664
Dummy baseline:  0.42011834319526625
----------------------------------
membawa
Cross validation accuracy: 0.24258226334839236
Dummy baseline:  0.23529411764705882
----------------------------------
mengeluark



Cross validation accuracy: 0.5562591431556949
Dummy baseline:  0.4591194968553459
----------------------------------
kabur
Cross validation accuracy: 0.9143024227234754
Dummy baseline:  0.9135135135135135
----------------------------------
bintang
Cross validation accuracy: 0.5773130095710741
Dummy baseline:  0.4657534246575342
----------------------------------
cerah
Cross validation accuracy: 0.9081423804226919
Dummy baseline:  0.9078947368421053
----------------------------------
menerima
Cross validation accuracy: 0.42034385601748026
Dummy baseline:  0.3950617283950617
----------------------------------
jam
Cross validation accuracy: 0.5954401154401154
Dummy baseline:  0.5588235294117647
----------------------------------
layar
Cross validation accuracy: 0.8118121442125238
Dummy baseline:  0.48427672955974843
----------------------------------
mendorong
Cross validation accuracy: 0.8772087877351037
Dummy baseline:  0.8770053475935828
----------------------------------
halaman
Cross