# Evaluation

In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
import gensim
from scipy.sparse.csr import csr_matrix
import time
from functools import reduce
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from functools import reduce
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from preprocessor import normalize_money, normalize_number, stemmer, pipe
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
import sys

In [3]:
class MajorLabelClassifier(BaseEstimator, ClassifierMixin):
    """Baseline classifier that always predicts the most frequent training label.

    Attributes set by ``fit``:
        possible_y_: set of distinct labels seen during fitting.
        total_example_: number of training labels.
        major_label_: most frequent label (ties go to the label seen first).
        major_label_count_: number of occurrences of ``major_label_``.
        fitted_: ``True`` once ``fit`` has completed.
    """

    def fit(self, X, y=None):
        """Record the majority label of ``y``; ``X`` is ignored.

        Raises:
            ValueError: if ``y`` is missing or empty, because no majority
                label can be determined.
        """
        if y is None:
            raise ValueError('MajorLabelClassifier.fit requires the labels y')
        y = list(y)
        if not y:
            raise ValueError('MajorLabelClassifier.fit requires at least one label')

        # Single pass instead of one list.count() call per distinct label.
        label_counts = {}
        for label in y:
            label_counts[label] = label_counts.get(label, 0) + 1

        self.possible_y_ = set(label_counts)
        self.total_example_ = len(y)
        # max() returns the first maximal entry and dicts keep insertion
        # order, so ties are resolved deterministically.
        self.major_label_, self.major_label_count_ = max(
            label_counts.items(), key=lambda item: item[1]
        )
        self.fitted_ = True
        return self

    def predict(self, X):
        """Return the majority label once for every row of ``X``."""
        # len() is ambiguous for scipy sparse matrices, so prefer the shape.
        n_rows = X.shape[0] if hasattr(X, 'shape') else len(X)
        return np.array([self.major_label_] * n_rows)


# Load Training Data

In [4]:
# Read the labelled training sentences; missing cells become empty strings.
train_raw = pd.read_csv('train_data.csv').fillna('')
train_raw.head()

Unnamed: 0.1,Unnamed: 0,kata,sense,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag,verbs,nouns
0,0,cerah,4801,cuaca cerah adalah lazim panjang tahun,NN NN VB NN NN NN Z,cuaca cerah lazim,1,1,1,adalah,lazim panjang tahun cuaca
1,1,cerah,4801,gambar yang hasil oleh layarnya cukup cerah da...,NNP SC VB IN NN RB JJ CC VB NN SC JJ VB NN SC ...,gambar hasil layarnya cerah milik speaker hasi...,3,6,6,milik hasil hasil,speaker suara jernih layar
2,2,cerah,4803,masa depan yang cerah bagi pemuda umur somenum...,NN NN SC VB IN NN NN CD IN NNP NNP CD Z,cerah bagi pemuda umur prancis abad,0,3,3,,pemuda umur depan masa
3,3,cerah,4801,cor caroli alpha canum venaticorum nama lengka...,NNP NNP Z NNP NNP NNP Z Z Z NN RB VB NNP NNP N...,cor caroli alpha canum venaticorum nama lengka...,12,16,21,adalah adalah,rasi bintang nama
4,4,cerah,4801,sanders lebih suka cat air untuk lilo dengan m...,NN RB VB NN NN SC NNP IN NN VB NN NN NN NN NN Z,sanders suka cat air lilo maksud tampil warna ...,8,11,11,tampil suka,buku cerita gambar warna maksud air cat sanders


In [5]:
# Distinct target words (the `kata` column) present in the training data.
ambiguous_word = set(train_raw.kata)

In [6]:
# A sense with at most this many training rows is considered too rare to learn.
RARE_LIMIT = 2

# Columns carried over into the filtered training frame, in output order.
KEPT_COLUMNS = [
    'kata', 'sense', 'kalimat', 'clean', 'targetpos_clean', 'targetpos_ori',
    'pos_tags', 'targetpos_pos_tag', 'verbs', 'nouns',
]

sense_set = set(train_raw.sense)

# Count every sense once instead of running one DataFrame.query per sense.
sense_counts = train_raw.sense.value_counts()
rare_sense = set(sense_counts[sense_counts <= RARE_LIMIT].index)

# Drop the rare-sense rows in one vectorised step. The index is reset so that
# row labels are 0..n-1; later cells use `train_raw` index values as row
# positions into X_train / y_train.
train_raw = (
    train_raw
    .loc[~train_raw.sense.isin(rare_sense), KEPT_COLUMNS]
    .reset_index(drop=True)
)

In [7]:
# Number of sense labels flagged as rare (at most RARE_LIMIT training rows).
len(rare_sense)

21

In [8]:
# Training rows left after the rare senses were removed.
len(train_raw)

8379

In [9]:
# Sense labels that remain for the word "asing".
set(train_raw.query('kata == "asing"').sense)

{'5301', '5302'}

# Load Test Data

In [10]:
# Read the test sentences; missing cells become empty strings.
test_raw = pd.read_csv('testing_data_clean.csv').fillna('')
test_raw.head()

Unnamed: 0.1,Unnamed: 0,id,kata,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag,verbs,nouns
0,0,13,asing,para cinta film indonesia atau tv pasti tak as...,DT NN NN NN CC NNP Z RB NEG JJ VB RB Z,cinta film indonesia tv asing dengar nama,4,8,9,dengar,indonesia film cinta
1,1,19,asing,pasti telinga kita rasa asing dan aneh dengar ...,NN NN PRP VB JJ CC JJ VB NN VB NN NN Z SC SC J...,telinga asing aneh dengar menu masakan soto ke...,1,4,4,dengar masakan jaja adalah rasa,menu soto kerbau telinga pasti
2,2,41,asing,warga negara asing atau warga negara makmur ya...,NN NN JJ CC NN NN NN SC NN RB NEG CD MD VB NNP...,warga negara asing warga negara makmur kepala ...,2,2,2,jadi,warga negara makmur kepala negara warga
3,3,44,asing,lama somenumber tahun perintah sultan mahmud j...,IN CD NN NN NN Z NNP NNP VB NN JJ IN NN JJ Z N...,perintah sultan mahmud jalin kerja asing belan...,5,11,13,jalin,dll pihak kerja perintah tahun belas
4,4,121,asing,yang kemudian ikut dengan donatdonat waralaba ...,DT CC VB IN NN NN JJ RB IN NNP NNP Z NNP NNP Z...,donatdonat waralaba asing master ring master d...,2,6,6,ikut,waralaba donat


In [34]:
# One slice of the test set per ambiguous word, keyed by the word itself.
test = {
    w: test_raw.query('kata == "{}"'.format(w))
    for w in ambiguous_word
}

test['baru'].head()

Unnamed: 0.1,Unnamed: 0,id,kata,kalimat,pos_tags,clean,targetpos_clean,targetpos_ori,targetpos_pos_tag,verbs,nouns
631,631,76787,baru,namun teliti hadap peran dna di dalam sel baru...,CC Z NN IN NN NNP IN NN NN JJ VB IN NN NN CD Z...,teliti hadap peran dna dalam sel baru abad tem...,6,8,9,mulai sama,awal abad temu postulat sel dalam peran teliti
632,632,77113,baru,tim atkinson milik main baru seperti jesper ol...,NNP NNP VB NN JJ IN NNP NNP Z NNP NNP CC NNP N...,tim atkinson milik main baru jesper olsen paul...,4,4,4,main milik,main
633,633,77460,baru,dalam usia baru injak somenumber tahun malik t...,IN NN JJ VB CD NN Z NNP MD VB VB IN NN NNP Z,dalam usia baru injak malik niat pergi pulau jawa,2,2,2,injak niat pergi,tahun pulau usia
634,634,77587,baru,dua tindak bersamasama mungkin lahir baru dala...,RB VB VB VB NN JJ IN NNP Z,tindak bersamasama lahir baru dalam kristus,3,5,5,mungkin sama tindak,lahir
635,635,77813,baru,film pertama dari seri ini tayang perdana di e...,NN OD IN NN PR VB NN IN NNP NNP IN NNP Z NNP N...,film seri tayang perdana embassy theatre welli...,8,13,14,tayang,tanggal perdana seri film


# Dummy

In [10]:
# dummy_clf = {w: MajorLabelClassifier().fit(train[w], train[w].sense) for w in ambiguous_word}

In [11]:
# res_file = open('dummy_baseline_classification.csv', 'w')

# for i in range(len(test_raw)):
#     row = test_raw.iloc[i]
#     res_file.write('{},{},{}'.format(row.id, row.word, dummy_clf[row.word].predict([None])[0]))
    
# res_file.close()

# Iacobacci et al. (2016)
Embeddings for Word Sense Disambiguation: An Evaluation Study

In [11]:
# Number of POS tags taken on each side of the target word (see get_pos_tags).
POS_TAGS_WINDOW = 2

# Keys selecting the unigram or the bigram vectorizer in get_collocation_vectors.
UNIGRAM = 0
BIGRAM = 1

# (L, R) token offsets relative to the target word; each pair is one
# collocation window passed to get_collocation.
collocation_pos = [
    (-2, -2), (-1, -1), (1, 1), (2, 2), (-2, -1), (-1, 1), (1, 2),
]

# Vectorizer kind used for the window at the same index of collocation_pos.
collocation_type = [
    UNIGRAM, UNIGRAM, UNIGRAM, UNIGRAM, BIGRAM, BIGRAM, BIGRAM
]

In [349]:
# POS tags known to the one-hot encoder; '-' is the padding placeholder.
TAGSET = [
    '-', 'CC', 'CD', 'DT', 'FW', 'IN', 'JJ', 'MD', 'NEG', 'NN', 'UH',
    'NND', 'NNP', 'OD', 'PR', 'PRP', 'RB', 'RP', 'SC', 'SYM', 'VB', 'WH', 'X', 'Z',
]

# Maps each tag to its one-hot row: a list of len(TAGSET) ints with a single 1
# at the tag's own position in TAGSET.
TAG_LABEL = {
    tag: [int(tag == other) for other in TAGSET]
    for tag in TAGSET
}

class POSTagTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode fixed-length sequences of POS tags.

    Every tag is replaced by its ``TAG_LABEL`` row and the rows of one sequence
    are concatenated, so a sequence of k tags becomes k * len(TAGSET) columns.
    """

    def fit(self, X, y=None):
        """Nothing to learn; present so ``fit_transform`` and pipelines work."""
        return self

    def transform(self, X, y=None):
        """Encode each tag sequence in ``X`` and return a boolean CSR matrix.

        Raises:
            KeyError: if a tag is not a key of ``TAG_LABEL``.
        """
        rows = []
        for sentence in X:
            encoded = []
            for tag in sentence:
                # extend() avoids rebuilding the whole list for every tag.
                encoded.extend(TAG_LABEL[tag])
            rows.append(encoded)
        # The builtin bool replaces the np.bool alias, which newer NumPy
        # releases no longer provide.
        return csr_matrix(np.array(rows, dtype=bool))


def get_collocation(sentence, targetpos, L, R):
    """Return the tokens in the window [targetpos+L, targetpos+R] as one string.

    The target token itself is never part of the result. Positions that fall
    before the start or past the end of the sentence are filled with '-'.

    Args:
        sentence: whitespace-separated tokens.
        targetpos: index of the target token in ``sentence.split()``.
        L: offset of the first window position relative to ``targetpos``.
        R: offset of the last window position relative to ``targetpos``.

    Returns:
        The window tokens joined by single spaces.
    """
    tokens = sentence.split()
    window = []
    for position in range(targetpos + L, targetpos + R + 1):
        if position == targetpos:
            # The target never gets a slot, including when the window starts
            # or ends on it (L == 0 or R == 0), so no stray '-' is left over.
            continue
        in_sentence = 0 <= position < len(tokens)
        window.append(tokens[position] if in_sentence else '-')
    return ' '.join(window)

def get_collocation_vectors_vectorizer(dataset):
    """Fit the unigram and bigram vectorizers on the +/-2 token context windows.

    Returns:
        ``(unigram_vectorizer, bigram_vectorizer)``, both fitted
        ``CountVectorizer`` instances.
    """
    windows = [
        get_collocation(sentence, target, -2, 2)
        for sentence, target in zip(dataset.kalimat, dataset.targetpos_ori)
    ]

    unigrams = CountVectorizer(ngram_range=(1, 1), min_df=.0002)
    bigrams = CountVectorizer(ngram_range=(2, 2), min_df=.0002)
    unigrams.fit(windows)
    bigrams.fit(windows)

    return unigrams, bigrams

def get_collocation_vectors(vectorizers, dataset):
    """Encode every collocation window of every row as boolean n-gram indicators.

    Args:
        vectorizers: ``(unigram_vectorizer, bigram_vectorizer)`` as returned by
            ``get_collocation_vectors_vectorizer``.
        dataset: frame with ``kalimat`` and ``targetpos_ori`` columns.

    Returns:
        Boolean CSR matrix with one row per dataset row. The columns are the
        per-window encodings concatenated in ``collocation_pos`` order.
    """
    # Local import: the file's import cell only brings in csr_matrix.
    from scipy.sparse import hstack

    unigram_vectorizer, bigram_vectorizer = vectorizers
    vectorizer = {UNIGRAM: unigram_vectorizer, BIGRAM: bigram_vectorizer}

    sentences = list(dataset.kalimat)
    targets = list(dataset.targetpos_ori)

    # Transform each window position for all rows at once and keep the result
    # sparse, instead of one dense transform call per row and window.
    blocks = []
    for (l, r), kind in zip(collocation_pos, collocation_type):
        windows = [
            get_collocation(sentence, target, l, r)
            for sentence, target in zip(sentences, targets)
        ]
        blocks.append(vectorizer[kind].transform(windows))

    # Counts become presence flags. The builtin bool replaces the np.bool
    # alias, which newer NumPy releases no longer provide.
    return csr_matrix(hstack(blocks, format='csr').astype(bool))
    


def get_pos_tags(dataset):
    """One-hot encode the POS tags around each target word.

    For every row a window of ``2 * POS_TAGS_WINDOW + 1`` tags is built with
    the target's tag in the middle and '-' as padding. Walking left stops
    before a 'Z' tag (the 'Z' is not included); walking right includes a 'Z'
    tag and then stops.
    """
    width = 2 * POS_TAGS_WINDOW + 1
    windows = []

    for tag_string, position in zip(dataset.pos_tags, dataset.targetpos_pos_tag):
        tags = tag_string.split()
        window = ['-'] * width
        window[POS_TAGS_WINDOW] = tags[position]

        # Left context, nearest tag first.
        for step in range(1, POS_TAGS_WINDOW + 1):
            source = position - step
            if source < 0 or tags[source] == 'Z':
                break  # do not even include
            window[POS_TAGS_WINDOW - step] = tags[source]

        # Right context, nearest tag first.
        for step in range(1, POS_TAGS_WINDOW + 1):
            source = position + step
            if source >= len(tags):
                break
            window[POS_TAGS_WINDOW + step] = tags[source]
            if tags[source] == 'Z':
                break  # include, then break

        windows.append(window)

    return POSTagTransformer().transform(windows)
    
def get_surrounding_words_vectorizer(dataset):
    """Fit a CountVectorizer on the de-duplicated tokens of each ``clean`` sentence."""
    documents = [' '.join(set(text.split())) for text in dataset.clean]
    return CountVectorizer(min_df=.0002).fit(documents)

def get_surrounding_words(vectorizer, dataset):
    """Transform the de-duplicated tokens of each ``clean`` sentence with ``vectorizer``."""
    documents = [' '.join(set(text.split())) for text in dataset.clean]
    return vectorizer.transform(documents)

def get_svd_transformer(X, size):
    """Fit and return a TruncatedSVD(size) -> Normalizer pipeline on ``X``."""
    reducer = TruncatedSVD(size)
    pipeline = make_pipeline(reducer, Normalizer(copy=False))
    # Only the fitted pipeline is needed; the transformed X is not used.
    pipeline.fit(X)
    return pipeline
    
class ItMakesSenseFeatures(BaseEstimator, TransformerMixin):
    """IMS-style features: POS tags, local collocations and surrounding words.

    Each of the three sparse feature groups is reduced by its own
    TruncatedSVD + Normalizer pipeline; ``transform`` concatenates the reduced
    groups column-wise in that order.
    """

    # Output width of each reduced feature group.
    POS_SVD_SIZE = 80
    COLLOCATION_SVD_SIZE = 1000
    SURROUNDING_SVD_SIZE = 1000

    def fit(self, X, y=None):
        """Fit the vectorizers and the three SVD pipelines on the frame ``X``."""
        print('fitting pos tags.. ', end='')
        pos_tags = get_pos_tags(X)
        print('Done! | ', end='')
        print('fitting collocation vectors.. ', end='')
        # NOTE: the second attribute name carries a historical typo
        # ("vectoirzer"); it is kept so previously saved objects stay usable.
        self.collocation_unigram_vectorizer_, self.collocation_bigram_vectoirzer = (
            get_collocation_vectors_vectorizer(X)
        )
        collocation_vectors = get_collocation_vectors(
            (self.collocation_unigram_vectorizer_, self.collocation_bigram_vectoirzer),
            X
        )
        print('Done! | ', end='')
        # This step used to print the collocation message a second time.
        print('fitting surrounding words.. ', end='')
        self.surrounding_words_vectorizer_ = get_surrounding_words_vectorizer(X)
        surrounding_words = get_surrounding_words(
            self.surrounding_words_vectorizer_,
            X
        )
        print('Done! | ', end='')
        print('fitting SVDs ', end='')
        self.possvd_transformer_ = get_svd_transformer(pos_tags, self.POS_SVD_SIZE)
        self.imscvsvd_transformer_ = get_svd_transformer(
            collocation_vectors, self.COLLOCATION_SVD_SIZE
        )
        self.swsvd_transformer_ = get_svd_transformer(
            surrounding_words, self.SURROUNDING_SVD_SIZE
        )
        print('Done!')
        return self

    def transform(self, X, y=None):
        """Return the float32 matrix [POS SVD | collocation SVD | surrounding-words SVD]."""
        pos_tags = get_pos_tags(X)
        collocation_vectors = get_collocation_vectors(
            (self.collocation_unigram_vectorizer_, self.collocation_bigram_vectoirzer),
            X
        )
        surrounding_words = get_surrounding_words(
            self.surrounding_words_vectorizer_,
            X
        )

        possvd = self.possvd_transformer_.transform(pos_tags)
        imscvsvd = self.imscvsvd_transformer_.transform(collocation_vectors)
        swsvd = self.swsvd_transformer_.transform(surrounding_words)

        # Column-wise concatenation without going through Python lists.
        return np.hstack([possvd, imscvsvd, swsvd]).astype(np.float32)
    

class IacobacciFeatures(ItMakesSenseFeatures):
    """IMS features followed by a distance-weighted sum of context word embeddings.

    Args:
        embedding_src: path of a word2vec-format embedding file (required).
        ims: when False, the IMS features are skipped and only the embedding
            columns are produced.
    """

    # Number of tokens considered on each side of the target word.
    CONTEXT_WINDOW = 5

    def __init__(self, embedding_src=None, ims=True):
        if embedding_src is None:
            raise ValueError('Embeddings is required')
        self.word_vectors_ = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(embedding_src)
        # Read the dimensionality from the model itself rather than by looking
        # up a specific word, which fails when that word is not in the vocabulary.
        self.embedding_size_ = self.word_vectors_.vector_size
        self.embedding_src = embedding_src
        self.ims = ims

    def fit(self, X, y=None):
        """Fit the IMS part when enabled; the embeddings need no fitting."""
        return super().fit(X, y) if self.ims else self

    @staticmethod
    def _context_embedding(word_vectors, words, target, window):
        """Weighted sum of the vectors of the words within ``window`` of ``target``.

        The weight of a word at distance d is ``(1 - alpha) ** (d - 1)`` with
        ``alpha = 1 - 0.1 ** (1 / (window - 1))``. Words missing from
        ``word_vectors`` contribute nothing.
        """
        alpha = 1 - (np.power(0.1, np.power(window - 1.0, -1)))
        # Accumulate in float64 and round once at the end.
        total = np.zeros(word_vectors.vector_size, dtype=np.float64)
        for j in range(max(0, target - window), min(len(words), target + window + 1)):
            if j == target:
                continue
            try:
                vector = word_vectors.get_vector(words[j])
            except KeyError:
                # Out-of-vocabulary word.
                continue
            total += vector * np.power(1 - alpha, abs(target - j) - 1)
        return total.astype(np.float32)

    def _context_embeddings(self, word_vectors, X):
        """Stack ``_context_embedding`` for every row of ``X`` into an (n, dim) array."""
        rows = [
            self._context_embedding(word_vectors, sentence.split(), target, self.CONTEXT_WINDOW)
            for sentence, target in zip(X.kalimat, X.targetpos_ori)
        ]
        # reshape keeps the result two-dimensional when X has no rows.
        return np.array(rows, dtype=np.float32).reshape(len(X), word_vectors.vector_size)

    def transform(self, X, y=None):
        """Return the float32 matrix [IMS features | context embedding]."""
        if self.ims:
            ims = super().transform(X)
        else:
            ims = np.empty((len(X), 0), dtype=np.float32)

        print('IMS feature extraction finished, now working on word embeddings.. ', end='')
        embedding = self._context_embeddings(self.word_vectors_, X)
        print('Done!')

        return np.hstack([ims, embedding]).astype(np.float32)


class IacobacciExtendedFeatures(IacobacciFeatures):
    """Iacobacci features plus two more context embeddings and verb/noun embeddings.

    Output columns, in order: base features, context embedding from the
    ``embedding200_src`` model, context embedding from the ``embedding50_src``
    model, embedding of the first listed verb, embedding of the first listed
    noun. The verb and noun embeddings come from the main ``embedding_src`` model.

    Args:
        embedding_src: path of the main word2vec-format embedding file.
        ims: forwarded to ``IacobacciFeatures``.
        embedding200_src: path of the second embedding file.
        embedding50_src: path of the third embedding file.
    """

    def __init__(
        self,
        embedding_src=None,
        ims=True,
        embedding200_src='../wikipedia_indonesia_embedding200_more.model',
        embedding50_src='../wikipedia_indonesia_embedding50_more.model',
    ):
        super().__init__(embedding_src, ims)
        self.embedding200_src = embedding200_src
        self.embedding50_src = embedding50_src
        self.word_vectors_200_ = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
            embedding200_src
        )
        self.word_vectors_50_ = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(
            embedding50_src
        )

    def _first_word_embeddings(self, column):
        """Embed the first word of each cell in ``column`` with the main model.

        A zero vector is used when the cell is empty or not a string, when the
        first word is a single character, or when it is out of vocabulary.
        """
        rows = []
        for cell in column:
            words = cell.split() if isinstance(cell, str) else []
            vector = np.zeros(self.embedding_size_, dtype=np.float32)
            if words and len(words[0]) > 1:
                try:
                    vector = np.array(self.word_vectors_.get_vector(words[0]), dtype=np.float32)
                except KeyError:
                    # Out-of-vocabulary word: keep the zero vector.
                    pass
            rows.append(vector)
        return np.array(rows, dtype=np.float32).reshape(len(rows), self.embedding_size_)

    def transform(self, X, y=None):
        """Return the float32 matrix described in the class docstring."""
        base_features = super().transform(X, y)
        print('now working on extended features..', end='')

        verb_embeddings = self._first_word_embeddings(X.verbs)
        noun_embeddings = self._first_word_embeddings(X.nouns)
        embedding200 = self._context_embeddings(self.word_vectors_200_, X)
        embedding50 = self._context_embeddings(self.word_vectors_50_, X)
        print('Done!')

        return np.hstack(
            [base_features, embedding200, embedding50, verb_embeddings, noun_embeddings]
        ).astype(np.float32)
    
        
class WordExpertWSD(BaseEstimator, ClassifierMixin):
    """LinearSVC trained on a configurable subset of the feature columns.

    The input matrix is expected to follow the column layout in
    ``FEATURE_COLUMNS``. The ``use_*`` flags switch feature groups on or off,
    and ``embedding_size`` (400, 200 or 50) picks one embedding group; any
    other value selects no embedding columns.
    """

    # Column ranges of each feature group in the full feature matrix.
    FEATURE_COLUMNS = {
        'postag': np.arange(0, 80),
        'collocations': np.arange(80, 1080),
        'surrounding_words': np.arange(1080, 2080),
        'embedding400': np.arange(2080, 2480),
        'embedding200': np.arange(2480, 2680),
        'embedding50': np.arange(2680, 2730),
        'surrounding_verbs': np.arange(2730, 3130),
        'surrounding_nouns': np.arange(3130, 3530)
    }

    def __init__(
        self,
        use_collocations=True,
        use_pos_tags=True,
        use_surrounding_words=True,
        embedding_size=400,
        use_immediate_verbs=True,
        use_immediate_nouns=True,
        svm_c=1.0,
        svm_max_iter=10
    ):
        self.svm_c = svm_c
        self.svm_max_iter = svm_max_iter
        self.use_collocations = use_collocations
        self.use_pos_tags = use_pos_tags
        self.use_surrounding_words = use_surrounding_words
        self.embedding_size = embedding_size
        self.use_immediate_verbs = use_immediate_verbs
        self.use_immediate_nouns = use_immediate_nouns

    def fit(self, X, y=None):
        """Train the inner LinearSVC on the selected columns and return ``self``."""
        self.estimator_ = LinearSVC(C=self.svm_c, max_iter=self.svm_max_iter)
        self.estimator_.fit(self._selected(X), y)
        self.classes_ = self.estimator_.classes_
        # Return the wrapper, not the inner LinearSVC: otherwise
        # `clf = WordExpertWSD().fit(X, y)` would lose the column selection on
        # every later predict call.
        return self

    def predict(self, X):
        """Predict labels from the selected columns of ``X``."""
        return self.estimator_.predict(self._selected(X))

    def decision_function(self, X):
        """Return the inner SVM's decision values for the selected columns of ``X``."""
        return self.estimator_.decision_function(self._selected(X))

    def _selected(self, X):
        """Return ``X`` restricted to the currently enabled feature columns."""
        return np.asarray(X)[:, self._select_features()]

    def _select_features(self):
        """Return the enabled column indices as a list, in ``FEATURE_COLUMNS`` order."""
        enabled_groups = [
            ('postag', self.use_pos_tags),
            ('collocations', self.use_collocations),
            ('surrounding_words', self.use_surrounding_words),
            ('embedding400', self.embedding_size == 400),
            ('embedding200', self.embedding_size == 200),
            ('embedding50', self.embedding_size == 50),
            ('surrounding_verbs', self.use_immediate_verbs),
            ('surrounding_nouns', self.use_immediate_nouns),
        ]
        selectedFeatures = []
        for group, enabled in enabled_groups:
            if enabled:
                selectedFeatures.extend(WordExpertWSD.FEATURE_COLUMNS[group])
        return selectedFeatures

class WordSenseDisambiguator(LinearSVC):
    """LinearSVC under a task-specific name; it adds no behaviour of its own."""
    pass


# Label Transformer
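Note: by the time this cell runs, the rare sense labels should already have been dropped from `train_raw`, so each encoder only knows the senses that remain. Double-check this if the cell order changes.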

In [350]:
# One LabelEncoder per ambiguous word, fitted on that word's sense labels.
label_transformer = {}
for w in ambiguous_word:
    word_senses = train_raw.query('kata == "{}"'.format(w)).sense
    label_transformer[w] = LabelEncoder().fit(word_senses)

# One Classifier per Ambiguous Word

In [351]:
# Placeholder per ambiguous word; filled with a fitted model during training.
classifier = dict.fromkeys(ambiguous_word)

# Transform the Raw Training Data

### It Makes Sense (Zhong & Ng, 2010)

In [18]:
# begin = time.perf_counter()
# imsTransformer = ItMakesSenseFeatures()
# imsTransformer.fit(train_raw)
# print('elapsed time:', time.perf_counter() - begin)

In [17]:
# begin = time.perf_counter()
# X_train = imsTransformer.transform(train_raw)
# print('elapsed time:', time.perf_counter() - begin)

### Iacobacci et al. (2016)

In [352]:
# Load the embedding file and fit the IMS vectorizers/SVDs on the training
# data, timing the whole step.
begin = time.perf_counter()
iaco = IacobacciFeatures(embedding_src='../wikipedia_indonesia_embedding400_more.model', ims=True)
iaco.fit(train_raw)
print('elapsed time:', time.perf_counter() - begin)

fitting pos tags.. Done! | fitting collocation vectors.. Done! | fitting collocation vectors.. Done! | fitting SVDs Done!
elapsed time: 71.63781664999988


In [353]:
# Turn the training rows into the feature matrix, timing the step.
begin = time.perf_counter()
X_train = iaco.transform(train_raw)
print('elapsed time:', time.perf_counter() - begin)

IMS feature extraction finished, now working on word embeddings.. Done!
elapsed time: 197.32473079199917


### Extended Iacobacci

In [18]:
# Same steps with the extended extractor. NOTE: this rebinds `iaco`, so later
# cells use whichever of the two extractor cells ran last.
begin = time.perf_counter()
iaco = IacobacciExtendedFeatures(embedding_src='../wikipedia_indonesia_embedding400_more.model', ims=True)
iaco.fit(train_raw)
print('elapsed time:', time.perf_counter() - begin)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


fitting pos tags.. Done! | fitting collocation vectors.. Done! | fitting collocation vectors.. Done! | fitting SVDs Done!
elapsed time: 66.22435734800001


In [19]:
# Recompute X_train with the extractor currently bound to `iaco`.
begin = time.perf_counter()
X_train = iaco.transform(train_raw)
print('elapsed time:', time.perf_counter() - begin)

IMS feature extraction finished, now working on word embeddings.. Done!
now working on extended features..Done!
elapsed time: 305.97306678099994


In [354]:
# (rows, feature columns) of the training matrix.
X_train.shape

(8379, 2480)

In [21]:
# Width of everything after column 2080, i.e. after the 80 + 1000 + 1000
# SVD-reduced IMS columns.
X_train[0][2080:].shape

(1450,)

### Labels 

In [355]:
# Encode each row's sense with the LabelEncoder of that row's own word.
y_train = np.array([
    label_transformer[word].transform([sense])[0]
    for word, sense in zip(train_raw.kata, train_raw.sense)
])

# Train the Model

In [18]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

In [19]:
import warnings
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')

In [356]:
'''
Select best parameter using k-fold cross validation
'''

def train_f1(X, y, clf, possible_param, fold=5):
    """Grid-search ``clf`` with macro-F1 scoring and report the scores.

    Args:
        X: feature matrix.
        y: integer-encoded labels (non-negative, as required by ``np.bincount``).
        clf: estimator to tune.
        possible_param: parameter grid passed to ``GridSearchCV``.
        fold: number of cross-validation folds.

    Returns:
        ``(best_estimator, best_cv_score, train_score, dummy_score)`` where the
        three scores are macro-averaged F1 and ``dummy_score`` is that of
        always predicting the most frequent label.
    """
    import inspect

    search_kwargs = dict(cv=fold, n_jobs=-1, scoring='f1_macro')
    # Newer scikit-learn releases no longer accept `iid`, and passing it raises
    # TypeError there. Pass iid=False only where the parameter still exists.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False
    clf = GridSearchCV(clf, possible_param, **search_kwargs)
    clf.fit(X, y)

    label_counts = np.bincount(y)
    most_freq_label = np.argmax(label_counts)
    print()
    train_score = classification_report(y, clf.predict(X), output_dict=True)['macro avg']['f1-score']
    print('Training f1-score:', train_score)
    print('Cross validation f1-score:', clf.best_score_)
    dummy_score = classification_report(y, [most_freq_label for i in y], output_dict=True)['macro avg']['f1-score']
    print('Dummy classifier f1-score: ', dummy_score)
    print_param(clf.best_params_)
    return (clf.best_estimator_, clf.best_score_, train_score, dummy_score)

def print_param(param):
    """Print a 'Best parameters:' header followed by one 'name : value' line per entry."""
    print('Best parameters:')
    for name, value in param.items():
        print(name, ':', value)

def train_all_f1(clf, possible_param, fold=5, algorithm_name=''):
    """Tune and train one model per ambiguous word and collect the scores.

    Reads the globals ``train_raw``, ``X_train`` and ``y_train`` and stores
    each word's best model in the global ``classifier`` dict. Words with a
    single label get a ``MajorLabelClassifier`` instead of a grid search.

    Returns:
        DataFrame with one row per word. The ``train_acc`` and ``dummy_acc``
        columns hold macro-F1 scores despite their names.
    """
    print(algorithm_name)
    records = []
    for w in sorted(classifier.keys()):
        print('==================================')
        print(w)
        started = time.perf_counter()
        # Index labels of train_raw are used as row positions into X_train/y_train.
        rows = list(train_raw.query('kata == "{}"'.format(w)).index)
        X_word = X_train[rows]
        y_word = y_train[rows]
        if len(set(y_word)) > 1:
            best_clf, cv_score, train_score, dummy_score = train_f1(
                X_word, y_word, clf, possible_param, fold
            )
        else:
            print('only one label detected')
            best_clf = MajorLabelClassifier()
            best_clf.fit(X_word, y_word)
            report = classification_report(y_word, best_clf.predict(X_word), output_dict=True)
            cv_score = report['macro avg']['f1-score']
            train_score = cv_score
            dummy_score = train_score
            print('Dummy classifier f1-score: ', dummy_score)
        records.append((w, train_score, cv_score, dummy_score))
        classifier[w] = best_clf
        print('elapsed time:', time.perf_counter() - started)
        print('----------------------------------')

    words = [record[0] for record in records]
    train_scores = [record[1] for record in records]
    scores = [record[2] for record in records]
    dummy_scores = [record[3] for record in records]
    print('Cross validation macro average f1-score:', sum(scores)/len(scores))
    print('Dummy classifier macro average f1-score:', sum(dummy_scores)/len(dummy_scores))
    return pd.DataFrame({
        'word': words,
        'train_acc': train_scores,
        '{}-fold_validation_macro_f1'.format(fold): scores,
        'dummy_acc': dummy_scores
    })
    
# Train one linear SVM per ambiguous word, grid-searching max_iter and C,
# and time the whole run.
begin = time.perf_counter()
train_result = train_all_f1(
    WordSenseDisambiguator(),
    {'max_iter': [10, 20, 40], 'C':[0.25, 0.5, 1.0, 2.0, 4.0, 8.0]},
    algorithm_name='Linear SVM'
)
print('elapsed time:', time.perf_counter() - begin)

Linear SVM
asing

Training f1-score: 1.0
Cross validation f1-score: 0.7623089241113075
Dummy classifier f1-score:  0.4825174825174825
Best parameters:
C : 0.25
max_iter : 20
elapsed time: 3.134761356999661
----------------------------------
atas

Training f1-score: 0.968204134366925
Cross validation f1-score: 0.40982164077752314
Dummy classifier f1-score:  0.050970873786407765
Best parameters:
C : 0.5
max_iter : 10
elapsed time: 3.8503641550014436
----------------------------------
badan

Training f1-score: 1.0
Cross validation f1-score: 0.7653465829615256
Dummy classifier f1-score:  0.27255985267034993
Best parameters:
C : 0.25
max_iter : 20
elapsed time: 0.8700690500008932
----------------------------------
baru

Training f1-score: 0.9936531335094076
Cross validation f1-score: 0.5516477281583665
Dummy classifier f1-score:  0.284037558685446
Best parameters:
C : 8.0
max_iter : 10
elapsed time: 1.8814094570007
----------------------------------
berat

Training f1-score: 0.9913871635610


Training f1-score: 0.9869275461380724
Cross validation f1-score: 0.8753261680847887
Dummy classifier f1-score:  0.3347826086956522
Best parameters:
C : 0.25
max_iter : 10
elapsed time: 2.0833364819991402
----------------------------------
lebat

Training f1-score: 0.9940143263664017
Cross validation f1-score: 0.9260558541987113
Dummy classifier f1-score:  0.3920265780730897
Best parameters:
C : 0.5
max_iter : 20
elapsed time: 2.2163405170031183
----------------------------------
lingkungan

Training f1-score: 1.0
Cross validation f1-score: 0.605326407796996
Dummy classifier f1-score:  0.2222222222222222
Best parameters:
C : 1.0
max_iter : 40
elapsed time: 1.7503375449996383
----------------------------------
mata

Training f1-score: 0.9947201336675022
Cross validation f1-score: 0.6449668081727578
Dummy classifier f1-score:  0.11228070175438598
Best parameters:
C : 0.25
max_iter : 20
elapsed time: 1.7648956609991728
----------------------------------
membawa

Training f1-score: 0.95699

In [136]:
# Persist the per-word scores.
train_result.to_csv('iacobacci_better_impl3.csv', index=False)

# Grow the Dataset

### Select which ambiguous word to grow

In [284]:
# Show how many training rows each sense of the chosen word has.
chosen_word = 'berat'
portion = train_raw.query('kata == "{}"'.format(chosen_word))
portion_senses = list(portion.sense)
for s in set(portion_senses):
    print(s, ':', portion_senses.count(s))

5201 : 35
5202 : 20
5204 : 11
5203 : 12
5206 : 17


### Select sense to grow

In [299]:
# Senses of `chosen_word` that are allowed to receive new examples.
grow_sense = ['5204', '5203', '5206', '5202', '5201']

### Grow!

In [286]:
# Load the unlabelled sentences for the chosen word and featurise them with
# the fitted extractor. `grow` reads both names as globals.
untagged_dataset = pd.read_csv('../semi supervised dataset/{}_untagged.csv'.format(chosen_word))
X_untagged = iaco.transform(untagged_dataset)

IMS feature extraction finished, now working on word embeddings.. Done!


In [335]:
def grow(w, senses, target_growth):
    """Pick the most confident predictions on the untagged data for word ``w``.

    Reads the globals ``classifier``, ``label_transformer``, ``X_untagged`` and
    ``untagged_dataset``, and writes the predicted senses into
    ``untagged_dataset['sense']``.

    Args:
        w: ambiguous word whose classifier is used.
        senses: sense labels that may be selected.
        target_growth: maximum number of rows selected per sense.

    Returns:
        ``(selected rows of untagged_dataset, their feature rows, their encoded labels)``,
        ordered by decreasing confidence.
    """
    model = classifier[w]
    encoder = label_transformer[w]

    scores = model.decision_function(X_untagged)
    encoded = model.predict(X_untagged)
    n_rows = len(X_untagged)

    # Confidence is the absolute decision value: the single score when the
    # decision function is one-dimensional, otherwise the score of the
    # predicted class.
    if scores.ndim == 1:
        confidences = [abs(scores[row]) for row in range(n_rows)]
    else:
        confidences = [abs(scores[row][encoded[row]]) for row in range(n_rows)]

    decoded = encoder.inverse_transform(encoded)
    untagged_dataset['sense'] = decoded

    ranked = sorted(
        zip(decoded, confidences, range(n_rows)),
        key=lambda item: item[1],
        reverse=True,
    )

    quota = dict.fromkeys(senses, 0)
    selected = []
    for sense, _, row in ranked:
        if sense not in quota or quota[sense] >= target_growth:
            continue
        quota[sense] += 1
        selected.append(row)

    chosen_rows = untagged_dataset.iloc[selected]
    return chosen_rows, X_untagged[selected], encoder.transform(untagged_dataset.iloc[selected].sense)
        

In [312]:
# Select up to 100 of the most confident predictions for each listed sense.
new_dataset, new_X, new_y = grow(chosen_word, grow_sense, 100)

elapsed time: 0.03571163500055263


In [314]:
# Save the automatically labelled rows for the chosen word.
new_dataset.to_csv('../semi supervised dataset/{}_growed.csv'.format(chosen_word))

In [293]:
# Preview the selected rows together with their predicted `sense`.
new_dataset.head()

Unnamed: 0,kata,discourse_id,kalimat,pos_tags,clean,targetpos_ori,targetpos_clean,targetpos_pos_tag,sense
140,berat,24301,posthardcore kembang di amerika serikat khusus...,NNP VB IN NNP NNP Z RB IN NNP CC NNP NNP NNP Z...,posthardcore kembang amerika serikat khusus ch...,26,15,29,5206
450,berat,200743,walaupun sangat tarik dengan dunia televisi ge...,SC RB VB IN NN NN Z VB NN NN NNP PR VB RB IN N...,tarik dunia televisi gemar berat musik jazz ka...,7,4,8,5206
845,berat,902080,horan juga ungkap kalau dia rupa gemar berat m...,NN RB VB SC PRP VB NN NN NN JJ Z VB NNP NNP Z ...,horan rupa gemar berat musik swing frank sinat...,7,3,7,5206
524,berat,266058,film ini kisah tentang orang anggur kelas bera...,NN PR VB IN NND NN NN JJ VB NNP NNP Z NNP NNP Z Z,film kisah orang anggur kelas berat nama tino ...,7,5,7,5206
856,berat,980501,pada tanggal somenumber juni somenumber james ...,IN NN CD NNP CD Z NNP VB NNP SC RB NEG VB SC V...,tanggal juni james kalah max kalah juara kelas...,16,8,17,5206


### Re-train Classifier

In [331]:
# Refit the chosen word's classifier on its original training rows.
# NOTE(review): the grown examples (new_X / new_y) are not appended here.
target_indexes = list(train_raw.query('kata == "{}"'.format(chosen_word)).index)
X = list(X_train[target_indexes])
y = list(y_train[target_indexes])
classifier[chosen_word] = classifier[chosen_word].fit(X, y)

In [333]:
import pickle 

# Automated Growing

In [357]:
# Words that the automated growing loop skips.
exception_words = '''
dalam atas
'''.split()

# Numeric suffix of the model snapshot file; incremented after every word.
cnt = 100

for w in ambiguous_word:
    if w in exception_words:
        continue
    print(w)
    portion = train_raw.query('kata == "{}"'.format(w))
    # Number of labelled training examples available for each sense of `w`.
    labelled_senses = list(portion.sense)
    sense_count = {s: labelled_senses.count(s) for s in set(labelled_senses)}
    # grow() appears to read `untagged_dataset` and `X_untagged` from the
    # notebook namespace (they are never passed as arguments), so they are
    # rebound for every word before it is called.
    untagged_dataset = pd.read_csv('../semi supervised dataset/{}_untagged.csv'.format(w))
    X_untagged = iaco.transform(untagged_dataset)
    target_indexes = list(portion.index)
    new_dataset = None
    for t in range(50):
        # Only senses with fewer than `t` labelled examples are grown in this
        # round; the target passed to grow() rises together with `t`.
        growing_sense = [s for s in sense_count if sense_count[s] < t]
        new_dataset, new_X, new_y = grow(w, growing_sense, t)
        X = [*X_train[target_indexes], *new_X]
        y = [*y_train[target_indexes], *new_y]
        # Store the refit model under the word being processed. Assigning it
        # to classifier[chosen_word] would rebind that entry to this word's
        # estimator, because fit() returns the estimator itself.
        classifier[w] = classifier[w].fit(X, y)
    # Only the selection made in the last round is written out.
    new_dataset.to_csv('../semi supervised dataset/{}_growed.csv'.format(w))
    # Snapshot the whole classifier dict after each word; the context manager
    # guarantees the pickle is flushed and the handle is closed.
    with open('semi_supervisedv{}.model'.format(cnt), 'wb') as filehandler:
        pickle.dump(classifier, filehandler)
    cnt += 1

kepala
IMS feature extraction finished, now working on word embeddings.. Done!
buah
IMS feature extraction finished, now working on word embeddings.. Done!
mengeluarkan
IMS feature extraction finished, now working on word embeddings.. Done!
mengejar
IMS feature extraction finished, now working on word embeddings.. Done!
memecahkan
IMS feature extraction finished, now working on word embeddings.. Done!
panas
IMS feature extraction finished, now working on word embeddings.. Done!
rapat
IMS feature extraction finished, now working on word embeddings.. Done!
nilai
IMS feature extraction finished, now working on word embeddings.. Done!
pembagian
IMS feature extraction finished, now working on word embeddings.. Done!
jaringan
IMS feature extraction finished, now working on word embeddings.. Done!
asing
IMS feature extraction finished, now working on word embeddings.. Done!
menjaga
IMS feature extraction finished, now working on word embeddings.. Done!
berat
IMS feature extraction finished, n

# Actual Test

In [358]:
# Extract features for the whole test set once and report how long it took;
# the prediction cells below index into X_test row by row.
begin = time.perf_counter()
X_test = iaco.transform(test_raw)
print('elapsed time:', time.perf_counter() - begin)

IMS feature extraction finished, now working on word embeddings.. Done!
elapsed time: 198.3335736709996


## Pure Machine Learning

In [361]:
# Predict a sense for every test row with the per-word classifiers and write
# one `id,kata,sense` line per row.
# The context manager closes (and flushes) the result file even when a
# prediction raises half-way through the loop.
with open('../semisupervised_wsd_no_mwe_no_ner_7.csv', 'w') as res_file:
    begin = time.perf_counter()

    for i in range(len(test_raw)):
        row = test_raw.iloc[i]
        # Each word has its own classifier and its own label encoder.
        prediction = classifier[row.kata].predict(np.array([X_test[i]]))
        prediction = label_transformer[row.kata].inverse_transform(prediction)
        prediction = prediction[0]
        res_file.write('{},{},{}\n'.format(row.id, row.kata, prediction))
        # '\r' rewrites the same console line so progress does not scroll.
        sys.stdout.write("\rProgress: {0:.2f} % | Time elapsed: {1}".format(
            i/len(test_raw)*100, time.perf_counter() - begin
        ))
        sys.stdout.flush()

Progress: 99.99 % | Time elapsed: 28.56120296200242655

## + MWE Corpus Knowledge

In [359]:
import json
from preprocessor import stemmer

In [360]:
# Same prediction loop as the pure machine-learning run, followed by a
# rule-based override for sentences that contain a known multi-word expression.
# Both files are opened through context managers so their handles are always
# closed, even when a prediction raises.
with open('mwe.json') as mwe_file:
    mwe_corpus = json.load(mwe_file)

with open('../semisupervised_wsd_rulebased_mwe_no_ner_2.csv', 'w') as res_file:
    begin = time.perf_counter()

    for i in range(len(test_raw)):
        row = test_raw.iloc[i]
        prediction = classifier[row.kata].predict(np.array([X_test[i]]))
        prediction = label_transformer[row.kata].inverse_transform(prediction)
        prediction = str(prediction[0])

        # MWE: if the stemmed form of any expression listed for this word
        # occurs in the sentence, keep the first two characters of the
        # predicted sense and append '0x' (presumably the label convention for
        # MWE usages — confirm against the task's answer format).
        if row.kata in mwe_corpus and any(
            stemmer.stem(mwe) in row.kalimat for mwe in mwe_corpus[row.kata]
        ):
            prediction = prediction[:2] + '0x'

        res_file.write('{},{},{}\n'.format(row.id, row.kata, prediction))
        # '\r' rewrites the same console line so progress does not scroll.
        sys.stdout.write("\rProgress: {0:.2f} % | Time elapsed: {1}".format(
            i/len(test_raw)*100, time.perf_counter() - begin
        ))
        sys.stdout.flush()

Progress: 99.99 % | Time elapsed: 31.7843216750006834

In [342]:
# Load the MWE corpus for ad-hoc inspection; the context manager closes the
# file handle once the JSON has been parsed.
with open('mwe.json') as mwe_file:
    mwe = json.load(mwe_file)

In [344]:
# Scratch check: [:2] keeps the first two characters of a string, the same
# slice the MWE override applies to a predicted sense.
'mwe'[:2]

'mw'

In [34]:
# Pick the third test row (position 2) for a manual spot check.
r = test_raw.iloc[2]

In [35]:
# Display the selected row.
r

Unnamed: 0                                                           2
id                                                                  41
kata                                                             asing
kalimat              warga negara asing atau warga negara makmur ya...
pos_tags             NN NN JJ CC NN NN NN SC NN RB NEG CD MD VB NNP...
clean                warga negara asing warga negara makmur kepala ...
targetpos_clean                                                      2
targetpos_ori                                                        2
targetpos_pos_tag                                                    2
Name: 2, dtype: object

In [36]:
# Spot check: encoded label predicted by the 'asing' classifier for test row 2.
classifier['asing'].predict([X_test[2]])

array([1])

In [37]:
# Decode encoded label 0 back to its sense id for the word 'asing'.
# NOTE(review): the previous cell's recorded prediction is label 1, while this
# decodes label 0 — confirm which label was meant to be inspected.
label_transformer['asing'].inverse_transform([0])

array(['5301'], dtype=object)

In [4]:
# Scratch check of DataFrame.rename(columns=...): maps column 'a' to 'c' and 'b' to 'd'.
pd.DataFrame({'a': [1,2], 'b': [3,2]}).rename(columns={'a':'c', 'b':'d'})

Unnamed: 0,c,d
0,1,3
1,2,2
