# Evaluation

In [41]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import gensim
from scipy.sparse.csr import csr_matrix
import time
from functools import reduce
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from functools import reduce
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from preprocessor import normalize_money, normalize_number, stemmer, pipe
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import sys

In [2]:
class MajorLabelClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that always predicts the most frequent training label.

    Fitted attributes:
        possible_y_        -- set of distinct labels seen in ``y``
        total_example_     -- number of training examples
        major_label_       -- the most frequent label
        major_label_count_ -- how many times ``major_label_`` occurs
        fitted_            -- True once ``fit`` has completed
    """

    def fit(self, X, y=None):
        """Record the majority label of ``y``; ``X`` is ignored.

        Ties are resolved in favour of the label that appears first in ``y``,
        so the result no longer depends on set iteration order.

        Raises:
            ValueError: if ``y`` is empty (there is no majority label).
        """
        y = list(y)
        if not y:
            raise ValueError('Cannot determine the majority label of an empty y')

        # Single pass over the labels instead of one y.count() call per label.
        counts = {}
        for label in y:
            counts[label] = counts.get(label, 0) + 1

        self.possible_y_ = set(counts)
        self.total_example_ = len(y)
        # max() returns the first maximal item; dicts keep insertion order.
        self.major_label_, self.major_label_count_ = max(
            counts.items(), key=lambda item: item[1]
        )
        self.fitted_ = True
        return self

    def predict(self, X):
        """Return the majority label once for every element of ``X``."""
        return np.array([self.major_label_] * len(X))


# Load Training Data

In [3]:
# Read the labelled training sentences; the path is relative to the working directory.
train_raw = pd.read_csv('train_data.csv')
# Preview the first rows to check the columns that were loaded.
train_raw.head()

Unnamed: 0.1,Unnamed: 0,kata,sense,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag
0,0,cerah,4801,cuaca cerah adalah lazim panjang tahun,NN NN VB NN NN NN Z,cuaca cerah lazim,1,1,1
1,1,cerah,4801,gambar yang hasil oleh layarnya cukup cerah da...,NNP SC VB IN NN RB JJ CC VB NN SC JJ VB NN SC ...,gambar hasil layarnya cerah milik speaker hasi...,3,6,6
2,2,cerah,4803,masa depan yang cerah bagi pemuda umur somenum...,NN NN SC VB IN NN NN CD IN NNP NNP CD Z,cerah bagi pemuda umur prancis abad,0,3,3
3,3,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,NNP NNP Z NNP NNP NNP Z Z Z NN RB VB NNP NNP N...,cor caroli alpha canum venaticorum nama lengka...,12,16,21
4,4,cerah,4801,sanders lebih suka cat air untuk lilo dengan m...,NN RB VB NN NN SC NNP IN NN VB NN NN NN NN NN Z,sanders suka cat air lilo maksud tampil warna ...,8,11,11


In [4]:
# Distinct values of the 'kata' column: the target words, one classifier is kept per word later on.
ambiguous_word = set(train_raw.kata)

In [5]:
# A sense with at most this many training rows is treated as rare.
RARE_LIMIT = 5
# Every distinct sense id present in the training data.
sense_set = set(train_raw.sense)

In [6]:
# Count the rows of every sense in one pass instead of running one
# string-built DataFrame.query per sense.
# NOTE(review): value_counts() skips missing senses; confirm 'sense' has no NaN.
sense_counts = train_raw.sense.value_counts()
rare_sense = set(sense_counts[sense_counts <= RARE_LIMIT].index)
len(rare_sense)

37

In [7]:
# Keep only the rows whose sense is not rare, restricted to the columns used
# downstream. reset_index(drop=True) yields a fresh 0..n-1 index, which the
# training cell relies on when it uses the index values to slice X_train.
train_raw = (
    train_raw
    .loc[~train_raw.sense.isin(rare_sense), [
        'kata', 'sense', 'kalimat', 'clean',
        'targetpos_clean', 'targetpos_ori', 'pos_tags', 'targetpos_pos_tag',
    ]]
    .reset_index(drop=True)
)

In [8]:
# Spot check: the distinct sense ids that remain for the word 'panas'.
set(train_raw.query('kata == "{}"'.format('panas')).sense)

{'4901', '4903', '4904'}

# Load Test Data

In [9]:
# Read the test sentences; the path is relative to the working directory.
test_raw = pd.read_csv('testing_data_clean.csv')
# Preview the first rows to check the columns that were loaded.
test_raw.head()

Unnamed: 0.1,Unnamed: 0,id,kata,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag
0,0,13,asing,para cinta film indonesia atau tv pasti tak as...,DT NN NN NN CC NNP Z RB NEG JJ VB RB Z,cinta film indonesia tv asing dengar nama,4,8,9
1,1,19,asing,pasti telinga kita rasa asing dan aneh dengar ...,NN NN PRP VB JJ CC JJ VB NN VB NN NN Z SC SC J...,telinga asing aneh dengar menu masakan soto ke...,1,4,4
2,2,41,asing,warga negara asing atau warga negara makmur ya...,NN NN JJ CC NN NN NN SC NN RB NEG CD MD VB NNP...,warga negara asing warga negara makmur kepala ...,2,2,2
3,3,44,asing,lama somenumber tahun perintah sultan mahmud j...,IN CD NN NN NN Z NNP NNP VB NN JJ IN NN JJ Z N...,perintah sultan mahmud jalin kerja asing belan...,5,11,13
4,4,121,asing,yang kemudian ikut dengan donatdonat waralaba ...,DT CC VB IN NN NN JJ RB IN NNP NNP Z NNP NNP Z...,donatdonat waralaba asing master ring master d...,2,6,6


In [10]:
# Split the test rows by target word: one sub-frame per ambiguous word.
test = {
    word: test_raw.query('kata == "{}"'.format(word))
    for word in ambiguous_word
}

test['baru'].head()

Unnamed: 0.1,Unnamed: 0,id,kata,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag
631,631,76787,baru,namun teliti hadap peran dna di dalam sel baru...,CC Z NN IN NN NNP IN NN NN JJ VB IN NN NN CD Z...,teliti hadap peran dna dalam sel baru abad tem...,6,8,9
632,632,77113,baru,tim atkinson milik main baru seperti jesper ol...,NNP NNP VB NN JJ IN NNP NNP Z NNP NNP CC NNP N...,tim atkinson milik main baru jesper olsen paul...,4,4,4
633,633,77460,baru,dalam usia baru injak somenumber tahun malik t...,IN NN JJ VB CD NN Z NNP MD VB VB IN NN NNP Z,dalam usia baru injak malik niat pergi pulau jawa,2,2,2
634,634,77587,baru,somenumber tindak bersamasama mungkin lahir ba...,RB VB VB VB NN JJ IN NNP Z,tindak bersamasama lahir baru dalam kristus,3,5,5
635,635,77813,baru,film pertama dari seri ini tayang perdana di e...,NN OD IN NN PR VB NN IN NNP NNP IN NNP Z NNP N...,film seri tayang perdana embassy theatre welli...,8,13,14


# Dummy

In [11]:
# dummy_clf = {w: MajorLabelClassifier().fit(train[w], train[w].sense) for w in ambiguous_word}

In [12]:
# res_file = open('dummy_baseline_classification.csv', 'w')

# for i in range(len(test_raw)):
#     row = test_raw.iloc[i]
#     res_file.write('{},{},{}'.format(row.id, row.word, dummy_clf[row.word].predict([None])[0]))
    
# res_file.close()

# Iacobacci et al. (2016)
Embeddings for Word Sense Disambiguation: An Evaluation Study

In [13]:
# Number of POS tags taken on each side of the target word.
POS_TAGS_WINDOW = 2

# Indices selecting which vectorizer encodes a collocation.
UNIGRAM, BIGRAM = 0, 1

# (left offset, right offset) of every collocation window, relative to the target.
collocation_pos = [
    (-2, -2),
    (-1, -1),
    (1, 1),
    (2, 2),
    (-2, -1),
    (-1, 1),
    (1, 2),
]

# A window whose two offsets coincide covers one token (unigram); the others cover two.
collocation_type = [
    UNIGRAM if left == right else BIGRAM
    for left, right in collocation_pos
]

In [17]:
# POS tags recognised by the encoder; '-' is the padding tag.
TAGSET = [
    '-', 'CC', 'CD', 'DT', 'FW', 'IN', 'JJ', 'MD', 'NEG', 'NN',
    'NND', 'NNP', 'OD', 'PR', 'PRP', 'RB', 'RP', 'SC', 'SYM', 'VB', 'WH', 'X', 'Z',
]

# One-hot encoding of every tag, in TAGSET order.
TAG_LABEL = {
    tag: [int(tag == other) for other in TAGSET]
    for tag in TAGSET
}

class POSTagTransformer(BaseEstimator, TransformerMixin):
    """Encode fixed-length POS-tag windows as concatenated one-hot vectors."""

    def fit(self, X, y=None):
        """Nothing to learn; provided so ``fit_transform`` and pipelines work."""
        return self

    def transform(self, X, y=None):
        """Return a boolean CSR matrix with one row per tag window in ``X``.

        Each tag is replaced by its ``TAG_LABEL`` one-hot vector and the
        vectors of a window are concatenated in order.

        Raises:
            KeyError: if a tag is not a key of ``TAG_LABEL``.
        """
        rows = []
        for sentence in X:
            encoded = []
            for tag in sentence:
                # extend() appends in place instead of rebuilding the list per tag.
                encoded.extend(TAG_LABEL[tag])
            rows.append(encoded)
        # The np.bool alias was removed from NumPy; the builtin bool is the same dtype.
        return csr_matrix(np.array(rows, dtype=bool))


def get_collocation(sentence, targetpos, L, R):
    """Return the context tokens at offsets ``L..R`` around the target word.

    Args:
        sentence: whitespace-separated tokens.
        targetpos: index of the target token in ``sentence.split()``.
        L, R: inclusive window offsets relative to the target (``L <= R``).

    Returns:
        A space-joined string with one slot per offset in ``L..R`` except
        offset 0 (the target itself is never included). A slot that falls
        outside the sentence is filled with ``'-'``.

    A window that starts or ends exactly on the target (``L == 0`` or
    ``R == 0``) used to get one spurious trailing ``'-'`` slot; the target
    offset is now skipped uniformly.
    """
    tokens = sentence.split()
    window = []
    for offset in range(L, R + 1):
        if offset == 0:
            continue  # the target word is not part of its own context
        position = targetpos + offset
        if 0 <= position < len(tokens):
            window.append(tokens[position])
        else:
            window.append('-')
    return ' '.join(window)

def get_collocation_vectors_vectorizer(dataset):
    """Fit the unigram and bigram vectorizers on the +/-2 token context windows.

    Reads the ``kalimat`` and ``targetpos_ori`` columns of ``dataset`` and
    returns ``(unigram_vectorizer, bigram_vectorizer)``.
    """
    context_window = [
        get_collocation(sentence, target, -2, 2)
        for sentence, target in zip(dataset.kalimat, dataset.targetpos_ori)
    ]

    unigram_vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=.0002)
    unigram_vectorizer.fit(context_window)

    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=.0002)
    bigram_vectorizer.fit(context_window)

    return unigram_vectorizer, bigram_vectorizer

def get_collocation_vectors(vectorizers, dataset):
    """Encode every collocation window of every row as boolean presence features.

    Args:
        vectorizers: ``(unigram_vectorizer, bigram_vectorizer)``, already fitted.
        dataset: frame with ``kalimat`` and ``targetpos_ori`` columns.

    Returns:
        A boolean CSR matrix with one row per instance. Columns are the
        vectorizer outputs of the windows in ``collocation_pos`` order,
        concatenated left to right.
    """
    # Local import: only this function needs hstack.
    from scipy.sparse import hstack

    unigram_vectorizer, bigram_vectorizer = vectorizers
    vectorizer = [None, None]
    vectorizer[UNIGRAM] = unigram_vectorizer
    vectorizer[BIGRAM] = bigram_vectorizer

    sentences = list(dataset.kalimat)
    targets = list(dataset.targetpos_ori)

    # One batched transform per window (7 calls) instead of one per row and
    # window; the result stays sparse rather than passing through a dense array.
    blocks = []
    for (l, r), kind in zip(collocation_pos, collocation_type):
        windows = [
            get_collocation(sentence, target, l, r)
            for sentence, target in zip(sentences, targets)
        ]
        # Counts become presence flags (the np.bool alias was removed from NumPy).
        blocks.append(vectorizer[kind].transform(windows).astype(bool))

    return csr_matrix(hstack(blocks, format='csr', dtype=bool))
    


def get_pos_tags(dataset):
    """Build the POS-tag window around each target and one-hot encode it.

    For every row, the window holds ``POS_TAGS_WINDOW`` tags on each side of
    the target tag. Slots that cannot be filled keep the padding tag ``'-'``.
    Walking left stops *before* a ``'Z'`` tag; walking right includes the
    ``'Z'`` tag and stops after it.
    """
    width = 2 * POS_TAGS_WINDOW + 1
    windows = []

    for tag_string, position in zip(dataset.pos_tags, dataset.targetpos_pos_tag):
        tags = tag_string.split()
        window = ['-'] * width
        window[POS_TAGS_WINDOW] = tags[position]

        # Left context: nearest tag first, never crossing a 'Z'.
        for step in range(1, POS_TAGS_WINDOW + 1):
            source = position - step
            if source < 0 or tags[source] == 'Z':
                break
            window[POS_TAGS_WINDOW - step] = tags[source]

        # Right context: a 'Z' is recorded, then the walk ends.
        for step in range(1, POS_TAGS_WINDOW + 1):
            source = position + step
            if source >= len(tags):
                break
            window[POS_TAGS_WINDOW + step] = tags[source]
            if tags[source] == 'Z':
                break

        windows.append(window)

    return POSTagTransformer().transform(windows)
    
def get_surrounding_words_vectorizer(dataset):
    """Fit a CountVectorizer on the de-duplicated tokens of each ``clean`` sentence."""
    documents = [
        ' '.join(set(sentence.split()))
        for sentence in dataset.clean
    ]
    cv = CountVectorizer(min_df=.0002)
    cv.fit(documents)
    return cv

def get_surrounding_words(vectorizer, dataset):
    """Vectorize the de-duplicated tokens of each ``clean`` sentence with ``vectorizer``."""
    documents = [
        ' '.join(set(sentence.split()))
        for sentence in dataset.clean
    ]
    return vectorizer.transform(documents)

def get_svd_transformer(X, size, random_state=None):
    """Fit and return a TruncatedSVD + L2-normalisation pipeline on ``X``.

    Args:
        X: feature matrix to fit on.
        size: number of SVD components to keep.
        random_state: optional seed forwarded to ``TruncatedSVD`` so the
            projection can be reproduced; ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        The fitted pipeline.
    """
    transformer = make_pipeline(
        TruncatedSVD(n_components=size, random_state=random_state),
        Normalizer(copy=False),
    )
    # fit() is enough: the transformed training matrix was being discarded.
    transformer.fit(X)
    return transformer
    
class ItMakesSenseFeatures(BaseEstimator, TransformerMixin):
    """Feature extractor combining POS tags, collocations and surrounding words.

    ``fit`` learns the vectorizers and one SVD projection per feature family;
    ``transform`` returns the three projected families concatenated per row.
    The input frame must provide the columns read by the helper functions
    (``kalimat``, ``clean``, ``pos_tags``, ``targetpos_ori``,
    ``targetpos_pos_tag``).
    """

    def fit(self, X, y=None):
        """Fit the vectorizers and SVD transformers on ``X`` and return ``self``."""
        print('fitting pos tags.. ', end='')
        pos_tags = get_pos_tags(X)
        print('Done! | ', end='')

        print('fitting collocation vectors.. ', end='')
        (self.collocation_unigram_vectorizer_,
         self.collocation_bigram_vectorizer_) = get_collocation_vectors_vectorizer(X)
        # Backward-compatible alias for the previously misspelled attribute name.
        self.collocation_bigram_vectoirzer = self.collocation_bigram_vectorizer_
        collocation_vectors = get_collocation_vectors(
            (self.collocation_unigram_vectorizer_, self.collocation_bigram_vectorizer_),
            X
        )
        print('Done! | ', end='')

        # This step fits the surrounding-words vectorizer; the message used to
        # repeat the collocation text.
        print('fitting surrounding words.. ', end='')
        self.surrounding_words_vectorizer_ = get_surrounding_words_vectorizer(X)
        surrounding_words = get_surrounding_words(
            self.surrounding_words_vectorizer_,
            X
        )
        print('Done! | ', end='')

        print('fitting SVDs ', end='')
        self.possvd_transformer_ = get_svd_transformer(pos_tags, 80)
        self.imscvsvd_transformer_ = get_svd_transformer(collocation_vectors, 1000)
        self.swsvd_transformer_ = get_svd_transformer(surrounding_words, 1000)
        print('Done!')
        return self

    def transform(self, X, y=None):
        """Return a float32 array: [POS SVD | collocation SVD | surrounding-words SVD]."""
        pos_tags = get_pos_tags(X)

        collocation_vectors = get_collocation_vectors(
            (self.collocation_unigram_vectorizer_, self.collocation_bigram_vectorizer_),
            X
        )

        surrounding_words = get_surrounding_words(
            self.surrounding_words_vectorizer_,
            X
        )

        possvd = self.possvd_transformer_.transform(pos_tags)
        imscvsvd = self.imscvsvd_transformer_.transform(collocation_vectors)
        swsvd = self.swsvd_transformer_.transform(surrounding_words)

        # Column-wise concatenation in one call instead of one Python list per row.
        return np.hstack([possvd, imscvsvd, swsvd]).astype(np.float32)
    

class IacobacciFeatures(ItMakesSenseFeatures):
    """IMS features extended with an exponentially decayed context embedding.

    Args:
        embedding_src: path of a word2vec-format embedding file (required).

    Raises:
        ValueError: if ``embedding_src`` is ``None``.
    """

    def __init__(self, embedding_src=None):
        if embedding_src is None:
            raise ValueError('Embeddings is required')
        self.word_vectors_ = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(embedding_src)
        # Read the dimensionality from the model instead of probing a specific
        # word, which failed for any embedding that lacks that word.
        self.embedding_size_ = self.word_vectors_.vector_size
        self.embedding_src = embedding_src

    def transform(self, X, y=None):
        """Return the IMS features followed by the decayed context embedding.

        For each row, the vectors of the words within ``W`` positions of the
        target (target excluded) are summed, each weighted by
        ``(1 - alpha) ** (distance - 1)``. Words missing from the embedding
        vocabulary are skipped.
        """
        ims = super().transform(X)

        print('IMS feature extraction finished, now working on word embeddings.. ', end='')

        W = 5  # context radius, in tokens, on each side of the target
        alpha = 1 - (np.power(0.1, np.power(W-1.0, -1)))

        embedding = np.zeros((len(X), self.embedding_size_))
        for p, (sentence, I) in enumerate(zip(X.kalimat, X.targetpos_ori)):
            words = sentence.split()
            for j in range(max(0, I-W), min(len(words), I+W+1)):
                if j == I:
                    continue
                try:
                    # One lookup per context word, not one per embedding dimension.
                    vector = self.word_vectors_.get_vector(words[j])
                except KeyError:
                    # Out-of-vocabulary word: contributes nothing.
                    continue
                # Accumulate in float64, as the per-component sum did.
                embedding[p] += (
                    np.asarray(vector, dtype=np.float64)
                    * np.power(1 - alpha, abs(I-j) - 1)
                )

        print('Done!')

        return np.hstack([ims, embedding])
        

class WordSenseDisambiguator(LinearSVC):
    """Linear SVM used as the per-word sense classifier; adds nothing to ``LinearSVC``."""
    pass


# Label Transformer
Note: by the time this cell runs, rows with rare senses may already have been dropped from `train_raw`; double-check that this is intended.

In [15]:
# One LabelEncoder per ambiguous word, fitted on that word's sense ids.
label_transformer = {
    word: LabelEncoder().fit(
        train_raw.query('kata == "{}"'.format(word)).sense
    )
    for word in ambiguous_word
}

# One Classifier per Ambiguous Word

In [16]:
# Placeholder for the per-word classifiers; every entry starts as None.
classifier = dict.fromkeys(ambiguous_word)

# Transform the Raw Training Data

### It Makes Sense (Zhong & Ng, 2010)

In [100]:
# Fit the IMS feature extractor on the training frame and report the wall time.
begin = time.perf_counter()
imsTransformer = ItMakesSenseFeatures().fit(train_raw)
print('elapsed time:', time.perf_counter() - begin)

fitting pos tags.. Done! | fitting collocation vectors.. Done! | fitting collocation vectors.. Done! | fitting SVDs Done!
elapsed time: 49.61393998599988


In [101]:
# Build the IMS training features. X_train is also assigned by the Iacobacci
# transform cell, so it holds whichever of the two ran last.
begin = time.perf_counter()
X_train = imsTransformer.transform(train_raw)
print('elapsed time:', time.perf_counter() - begin)

elapsed time: 34.681436462999955


### Iacobacci et al. (2016)

In [18]:
# Load the embeddings, fit the Iacobacci feature extractor and report the wall time.
begin = time.perf_counter()
iaco = IacobacciFeatures(
    embedding_src='../wikipedia_indonesia_embedding50.model'
).fit(train_raw)
print('elapsed time:', time.perf_counter() - begin)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


fitting pos tags.. Done! | fitting collocation vectors.. Done! | fitting collocation vectors.. Done! | fitting SVDs Done!
elapsed time: 45.23202269499998


In [19]:
# Build the training features (IMS features followed by the context embedding).
# This overwrites any X_train produced by the IMS-only transform cell.
begin = time.perf_counter()
X_train = iaco.transform(train_raw)
print('elapsed time:', time.perf_counter() - begin)

IMS feature extraction finished, now working on word embeddings.. Done!
elapsed time: 42.024855675000026


In [20]:
# (number of training rows, number of features)
X_train.shape

(8311, 2130)

In [21]:
# Columns from 2080 onward of the first row: 2080 = 80 + 1000 + 1000, the three
# SVD sizes fitted by ItMakesSenseFeatures, so this is the context-embedding part.
X_train[0][2080:]

array([ 0.85252614, -0.58905887,  0.17187213,  0.032917  ,  0.17269847,
        2.13160081, -0.13912448, -0.78382006,  1.14674167, -1.49959393,
        0.87349293,  0.81714298, -1.03026495,  0.31008111, -0.12096191,
        0.00334453,  0.84030503, -0.95367855, -0.52984713, -0.43365839,
        0.26815864,  2.16419343, -0.6888158 , -0.42631189,  0.02961208,
        0.72068175,  0.66082762, -0.39731232,  0.13838263, -1.40844508,
        0.16267357, -0.56762642,  0.52092731, -0.34003051, -0.41646776,
       -0.56478803,  0.43807962, -1.02195942, -0.41822475, -1.02481925,
        1.27909185, -0.02604193, -0.39847989, -1.24388813,  0.15393232,
       -0.84586307,  1.14630138, -0.91808733,  0.03596264, -1.38456686])

### Labels 

In [22]:
# Encode every training sense with the LabelEncoder of its own word.
y_train = np.array([
    label_transformer[word].transform([sense])[0]
    for word, sense in zip(train_raw.kata, train_raw.sense)
])

# Replicate "Supervised WSD Learning" to check the correctness of implementation

In [23]:
'''
Select best parameter using k-fold cross validation
'''

def train_f1(X, y, clf, possible_param, fold=5, n_jobs=7):
    """Grid-search ``clf`` with k-fold CV on macro F1 and report the scores.

    Args:
        X, y: training features and integer-encoded labels.
        clf: estimator to tune.
        possible_param: parameter grid passed to ``GridSearchCV``.
        fold: number of cross-validation folds.
        n_jobs: parallel workers for the search (previously hard-coded to 7).

    Returns:
        ``(best_estimator, best_cv_score, dummy_score)`` where ``dummy_score``
        is the macro F1 of always predicting the most frequent label.
    """
    # Local import: only needed for the scikit-learn compatibility check below.
    import inspect

    search_kwargs = dict(cv=fold, n_jobs=n_jobs, scoring='f1_macro')
    # 'iid' was removed from GridSearchCV in newer scikit-learn releases, where
    # passing it raises TypeError. Pass it only where the parameter still exists.
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        search_kwargs['iid'] = False

    search = GridSearchCV(clf, possible_param, **search_kwargs)
    search.fit(X, y)

    # Majority-label baseline; np.bincount requires non-negative integer labels.
    label_counts = np.bincount(y)
    most_freq_label = np.argmax(label_counts)

    print()
    print('Training f1-score:', classification_report(y, search.predict(X), output_dict=True)['macro avg']['f1-score'])
    print('Cross validation f1-score:', search.best_score_)
    dummy_score = classification_report(y, [most_freq_label for i in y], output_dict=True)['macro avg']['f1-score']
    print('Dummy classifier f1-score: ', dummy_score)
    print_param(search.best_params_)
    return (search.best_estimator_, search.best_score_, dummy_score)

def print_param(param):
    """Print a 'Best parameters:' header, then one ``name : value`` line per entry."""
    print('Best parameters:')
    for name, value in param.items():
        print(name, ':', value)

def train_all_f1(clf, possible_param, fold=5, algorithm_name=''):
    """Tune one classifier per ambiguous word and print the averaged scores.

    Reads the notebook globals ``train_raw``, ``X_train`` and ``y_train`` and
    stores each tuned estimator in the global ``classifier`` dict. The index
    values of ``train_raw`` are used to slice ``X_train`` / ``y_train``.
    """
    print(algorithm_name)
    cv_scores, baseline_scores = [], []

    for word in classifier:
        print('==================================')
        print(word)
        row_ids = train_raw.query('kata == "{}"'.format(word)).index.tolist()
        fitted, cv_score, baseline = train_f1(
            X_train[row_ids], y_train[row_ids], clf, possible_param, fold
        )
        cv_scores.append(cv_score)
        baseline_scores.append(baseline)
        classifier[word] = fitted
        print('----------------------------------')

    print('Cross validation macro average f1-score:', sum(cv_scores)/len(cv_scores))
    print('Dummy classifier macro average f1-score:', sum(baseline_scores)/len(baseline_scores))
    
# Grid-search a linear SVM for every ambiguous word and time the whole sweep.
begin = time.perf_counter()
train_all_f1(
    WordSenseDisambiguator(),
    # Candidate values tried by the grid search for each hyper-parameter.
    {'max_iter': [10, 20, 40], 'C':[0.25, 0.5, 1.0, 2.0, 4.0, 8.0]},
    algorithm_name='Linear SVM'
)
print('elapsed time:', time.perf_counter() - begin)

Linear SVM
tinggi


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9730847574139373
Cross validation f1-score: 0.5562251755812129
Dummy classifier f1-score:  0.10232558139534884
Best parameters:
C : 0.25
max_iter : 20
----------------------------------
layar


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.8889704289036882
Dummy classifier f1-score:  0.3347826086956522
Best parameters:
C : 2.0
max_iter : 20
----------------------------------
kunci


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8929881213239533
Cross validation f1-score: 0.477037742514416
Dummy classifier f1-score:  0.14832535885167464
Best parameters:
C : 2.0
max_iter : 10
----------------------------------
kali


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9162003622306923
Cross validation f1-score: 0.7553689280462736
Dummy classifier f1-score:  0.2309711286089239
Best parameters:
C : 1.0
max_iter : 20
----------------------------------
bintang


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8285664023615831
Cross validation f1-score: 0.7061195902862569
Dummy classifier f1-score:  0.24113475177304963
Best parameters:
C : 4.0
max_iter : 10
----------------------------------
buah


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9956482123672364
Cross validation f1-score: 0.8562935907848189
Dummy classifier f1-score:  0.2397003745318352
Best parameters:
C : 1.0
max_iter : 20
----------------------------------
dunia


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9390248282005978
Cross validation f1-score: 0.3802316896066896
Dummy classifier f1-score:  0.1794871794871795
Best parameters:
C : 1.0
max_iter : 20
----------------------------------
bidang


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.748885077186964
Dummy classifier f1-score:  0.4854014598540146
Best parameters:
C : 2.0
max_iter : 10
----------------------------------
ketat


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9833181270046715
Cross validation f1-score: 0.7023812932169569
Dummy classifier f1-score:  0.1534090909090909
Best parameters:
C : 0.5
max_iter : 20
----------------------------------
baru


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9569731081926204
Cross validation f1-score: 0.6849244723530438
Dummy classifier f1-score:  0.284037558685446
Best parameters:
C : 0.25
max_iter : 10
----------------------------------
bunga


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.8916200326284359
Dummy classifier f1-score:  0.4703703703703704
Best parameters:
C : 2.0
max_iter : 10
----------------------------------
cabang


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9732731980288983
Cross validation f1-score: 0.6881347946841834
Dummy classifier f1-score:  0.3177570093457944
Best parameters:
C : 8.0
max_iter : 10
----------------------------------
berat


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.975488689714042
Cross validation f1-score: 0.43359307359307364
Dummy classifier f1-score:  0.10769230769230768
Best parameters:
C : 1.0
max_iter : 20
----------------------------------
bulan


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.990195561646724
Cross validation f1-score: 0.8772890360676863
Dummy classifier f1-score:  0.45614035087719296
Best parameters:
C : 0.5
max_iter : 10
----------------------------------
nilai


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9809125520395177
Cross validation f1-score: 0.6068359676099613
Dummy classifier f1-score:  0.12085308056872036
Best parameters:
C : 4.0
max_iter : 40
----------------------------------
menangkap


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9335112207636644
Cross validation f1-score: 0.6460624767502551
Dummy classifier f1-score:  0.29508196721311475
Best parameters:
C : 0.25
max_iter : 10
----------------------------------
sarung


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9945528431852898
Cross validation f1-score: 0.9290965830501181
Dummy classifier f1-score:  0.37
Best parameters:
C : 0.5
max_iter : 20
----------------------------------
tengah


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.6228455988455989
Dummy classifier f1-score:  0.1624203821656051
Best parameters:
C : 0.5
max_iter : 40
----------------------------------
besar


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9823934930317909
Cross validation f1-score: 0.49992959635961665
Dummy classifier f1-score:  0.15732758620689655
Best parameters:
C : 1.0
max_iter : 20
----------------------------------
mengeluarkan


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9407570207570206
Cross validation f1-score: 0.46480226213666526
Dummy classifier f1-score:  0.13970588235294118
Best parameters:
C : 8.0
max_iter : 20
----------------------------------
memecahkan


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.8137133928326756
Dummy classifier f1-score:  0.2164821648216482
Best parameters:
C : 0.25
max_iter : 20
----------------------------------
panas


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8184863047248369
Cross validation f1-score: 0.6866133943242495
Dummy classifier f1-score:  0.23008849557522124
Best parameters:
C : 4.0
max_iter : 10
----------------------------------
badan


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.7739941744011081
Dummy classifier f1-score:  0.27255985267034993
Best parameters:
C : 0.5
max_iter : 10
----------------------------------
cerah


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.8179616923956546
Dummy classifier f1-score:  0.47586206896551725
Best parameters:
C : 4.0
max_iter : 10
----------------------------------
mengikat


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.5744248227664785
Dummy classifier f1-score:  0.13793103448275862
Best parameters:
C : 4.0
max_iter : 40
----------------------------------
pembagian


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9788723181580324
Cross validation f1-score: 0.43043680856180855
Dummy classifier f1-score:  0.1721698113207547
Best parameters:
C : 0.5
max_iter : 40
----------------------------------
kabur


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.4893955596088353
Dummy classifier f1-score:  0.3182674199623352
Best parameters:
C : 0.25
max_iter : 10
----------------------------------
mendorong


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9307407407407409
Cross validation f1-score: 0.7037744448374328
Dummy classifier f1-score:  0.4672364672364672
Best parameters:
C : 8.0
max_iter : 10
----------------------------------
mengandung


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.7934011934011933
Dummy classifier f1-score:  0.4895833333333333
Best parameters:
C : 0.25
max_iter : 20
----------------------------------
kaki


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9911490437806227
Cross validation f1-score: 0.9017498926194578
Dummy classifier f1-score:  0.24444444444444446
Best parameters:
C : 0.5
max_iter : 10
----------------------------------
coklat


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9914311524067623
Cross validation f1-score: 0.6892647842111764
Dummy classifier f1-score:  0.3037974683544304
Best parameters:
C : 1.0
max_iter : 10
----------------------------------
harapan


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.5730835486649439
Dummy classifier f1-score:  0.2818428184281843
Best parameters:
C : 8.0
max_iter : 40
----------------------------------
membawa


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9627820069680534
Cross validation f1-score: 0.3061915987103957
Dummy classifier f1-score:  0.05442176870748299
Best parameters:
C : 1.0
max_iter : 40
----------------------------------
menjaga


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8493725682378449
Cross validation f1-score: 0.35705142521318994
Dummy classifier f1-score:  0.1634980988593156
Best parameters:
C : 2.0
max_iter : 20
----------------------------------
halaman


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9002057613168724
Cross validation f1-score: 0.6939938556067589
Dummy classifier f1-score:  0.24637681159420288
Best parameters:
C : 0.5
max_iter : 10
----------------------------------
atas


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9950170785613823
Cross validation f1-score: 0.4838505938505938
Dummy classifier f1-score:  0.05970149253731344
Best parameters:
C : 4.0
max_iter : 40
----------------------------------
asing


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.7192344497607654
Dummy classifier f1-score:  0.4825174825174825
Best parameters:
C : 0.5
max_iter : 20
----------------------------------
jam


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.993789138692126
Cross validation f1-score: 0.7174182193501835
Dummy classifier f1-score:  0.17924528301886794
Best parameters:
C : 0.25
max_iter : 40
----------------------------------
lingkungan


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9665097692481059
Cross validation f1-score: 0.537354603452607
Dummy classifier f1-score:  0.2222222222222222
Best parameters:
C : 0.25
max_iter : 20
----------------------------------
bisa


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8868686868686868
Cross validation f1-score: 0.5705478642978643
Dummy classifier f1-score:  0.43718592964824116
Best parameters:
C : 2.0
max_iter : 10
----------------------------------
mata


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.7764941383305228
Dummy classifier f1-score:  0.14371257485029942
Best parameters:
C : 8.0
max_iter : 40
----------------------------------
mengisi


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8604085182460081
Cross validation f1-score: 0.4691328151054236
Dummy classifier f1-score:  0.14791666666666667
Best parameters:
C : 8.0
max_iter : 10
----------------------------------
kulit


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9727526521496369
Cross validation f1-score: 0.71497994802231
Dummy classifier f1-score:  0.26506024096385544
Best parameters:
C : 0.25
max_iter : 40
----------------------------------
mengejar


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9266756635177688
Cross validation f1-score: 0.7725471025164053
Dummy classifier f1-score:  0.38257575757575757
Best parameters:
C : 0.25
max_iter : 10
----------------------------------
jaringan


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9907828282828284
Cross validation f1-score: 0.7106927713810742
Dummy classifier f1-score:  0.20202020202020202
Best parameters:
C : 0.5
max_iter : 40
----------------------------------
lebat


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.9821666395972064
Cross validation f1-score: 0.9157203374441225
Dummy classifier f1-score:  0.3920265780730897
Best parameters:
C : 0.25
max_iter : 10
----------------------------------
rapat


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.8662601673411885
Dummy classifier f1-score:  0.45964912280701753
Best parameters:
C : 0.5
max_iter : 10
----------------------------------
dalam


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8981164973599075
Cross validation f1-score: 0.2865048821460952
Dummy classifier f1-score:  0.07039337474120083
Best parameters:
C : 2.0
max_iter : 40
----------------------------------
menyusun


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8987850204250192
Cross validation f1-score: 0.5864585794312791
Dummy classifier f1-score:  0.24242424242424243
Best parameters:
C : 0.25
max_iter : 20
----------------------------------
kepala


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.6356492145171391
Dummy classifier f1-score:  0.3046964490263459
Best parameters:
C : 0.5
max_iter : 20
----------------------------------
menerima


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.7970733893557422
Cross validation f1-score: 0.27839514860691333
Dummy classifier f1-score:  0.11327433628318584
Best parameters:
C : 4.0
max_iter : 10
----------------------------------
menurunkan


  'precision', 'predicted', average, warn_for)



Training f1-score: 0.8393232941499343
Cross validation f1-score: 0.35922782938082815
Dummy classifier f1-score:  0.12656641604010024
Best parameters:
C : 8.0
max_iter : 10
----------------------------------
jalan


  'precision', 'predicted', average, warn_for)



Training f1-score: 1.0
Cross validation f1-score: 0.43575024334565227
Dummy classifier f1-score:  0.1651376146788991
Best parameters:
C : 0.25
max_iter : 40
----------------------------------
dasar

Training f1-score: 0.9779991697799917
Cross validation f1-score: 0.5020301025139735
Dummy classifier f1-score:  0.176056338028169
Best parameters:
C : 0.25
max_iter : 40
----------------------------------
Cross validation macro average f1-score: 0.6362953149843741
Dummy classifier macro average f1-score: 0.2526642298604586
elapsed time: 104.01280576100004


  'precision', 'predicted', average, warn_for)


In [21]:
# Remove the training feature matrix from the kernel namespace. Cells that read
# X_train (e.g. train_all_f1) cannot be re-run afterwards without recomputing it.
del X_train

# Actual Test

In [24]:
# Extract test features with the extractor that was fitted on the training data.
begin = time.perf_counter()
X_test = iaco.transform(test_raw)
print('elapsed time:', time.perf_counter() - begin)

IMS feature extraction finished, now working on word embeddings.. Done!
elapsed time: 44.57902134


In [43]:
# Predict a sense for every test row with the classifier of its word and write
# one "id,kata,sense" line per row.
begin = time.perf_counter()
total_rows = len(test_raw)

# The context manager closes the file even if a prediction raises.
with open('../supervised_wsd_no_ner_no_mwe_best_of_5fold_cv.csv', 'w') as res_file:
    for i in range(total_rows):
        row = test_raw.iloc[i]
        prediction = classifier[row.kata].predict([X_test[i]])
        # Map the encoded label back to the original sense id.
        prediction = label_transformer[row.kata].inverse_transform(prediction)
        prediction = prediction[0]
        res_file.write('{},{},{}\n'.format(row.id, row.kata, prediction))
        # Progress in percent: rows done / total * 100. It was multiplied by 10
        # and counted from zero, so it never got past 10 %.
        sys.stdout.write("\rRaw read: {0:.2f} % | Time elapsed: {1}".format(
            (i + 1) / total_rows * 100, time.perf_counter() - begin
        ))
        sys.stdout.flush()

Raw read: 10.00 % | Time elapsed: 21.56081454099998458

In [34]:
# Pick the third test row for a manual spot check.
r = test_raw.iloc[2]

In [35]:
# Display the selected row.
r

Unnamed: 0                                                           2
id                                                                  41
kata                                                             asing
kalimat              warga negara asing atau warga negara makmur ya...
pos_tags             NN NN JJ CC NN NN NN SC NN RB NEG CD MD VB NNP...
clean                warga negara asing warga negara makmur kepala ...
targetpos_clean                                                      2
targetpos_ori                                                        2
targetpos_pos_tag                                                    2
Name: 2, dtype: object

In [36]:
# Encoded label predicted for test row 2 by the 'asing' classifier.
classifier['asing'].predict([X_test[2]])

array([1])

In [37]:
# Decode encoded label 0 of 'asing' back to its sense id.
# NOTE(review): the prediction cell for row 2 returned label 1, so this does not
# decode that prediction. Confirm whether [1] was intended here.
label_transformer['asing'].inverse_transform([0])

array(['5301'], dtype=object)