In [1]:
import sys
import string
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from preprocessor import create_stop_words_remover, remove_punctuation
from rule import classify_from_existing_rule
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_validate

In [2]:
DECISION_MIN_EVIDENCE = 2
DECISION_MIN_STRENGTH = 10

def get_pos_tag(pos_tags, targetpos, L, R):
    return ' '.join(pos_tags.split()[max(0, targetpos+L):min(len(pos_tags.split()), targetpos+R+1)])

def get_pos_tag_with_targetword(clean, targetpos_clean, pos_tags, targetpos_pos, chosen_tag_offset):
    return (
        clean.split()[targetpos_clean] + ' ' + pos_tags.split()[targetpos_pos + chosen_tag_offset] if chosen_tag_offset > 0
        else pos_tags.split()[targetpos_pos + chosen_tag_offset] + ' ' + clean.split()[targetpos_clean]
    )


def get_collocation(clean, ori, targetpos, targetpos_ori, L, R):
    tokens = clean.split()
    tokens_ori = ori.split()
    clean_shift = targetpos_ori - targetpos
    L = max(0, targetpos+L)
    R = min(len(tokens)-1, targetpos+R)
    collocation = []
    if L < targetpos and targetpos < R:
        reversd = []
        for i in range(targetpos-1, L-1, -1):
            if '.' in tokens_ori[i+clean_shift] or ',' in tokens_ori[i+clean_shift]:
                break
            if tokens[i] == 'somecoordinatingconjunction':
                reversd.append(tokens[i])
                break
            reversd.append(tokens[i])
        collocation = [*reversd[::-1]]
        for i in range(targetpos+1, R):
            if '.' in tokens_ori[i+clean_shift-1] or ',' in tokens_ori[i+clean_shift-1]:
                break
            if tokens[i] == 'somecoordinatingconjunction' or tokens[i] == 'yang':
                break
            collocation.append(tokens[i])
    else:
        reversd = []
        if L >= targetpos:
            for i in range(L, R+1):
                if '.' in tokens_ori[i-1+clean_shift] or ',' in tokens_ori[i-1+clean_shift]:
                    break
                if tokens[i] == 'somecoordinatingconjunction' or tokens[i] == 'yang':
                    break
                collocation.append(tokens[i])
        else:
            for i in range(R, L-1, -1):
                if '.' in tokens_ori[i+clean_shift] or ',' in tokens_ori[i+clean_shift]:
                    break
                if tokens[i] == 'somecoordinatingconjunction':
                    reversd.append(tokens[i])
                    break
                reversd.append(tokens[i])
            collocation = [*reversd[::-1]]
    return ' '.join(list(collocation))

extract_collocation = lambda L, R:\
    lambda clean, pos, ori, pos_ori, _, __: get_collocation(clean, ori, pos, pos_ori, L, R)
extract_surrounding_words = lambda window_size:\
    lambda clean, pos, ori, pos_ori, _, __: get_collocation(clean, ori, pos, pos_ori, -window_size, window_size).split()
extract_pos_tags = lambda L, R:\
    lambda _, __, ___, ____, postag, postag_pos: get_pos_tag(postag, postag_pos, L, R)

class Decision:

    def __init__(self, idx, prob, factor, verdict, evidence_count, alpha=5.0, beta=2.0, gamma=0.4):
        self.idx = idx
        self.prob = prob
        self.evidence_count = evidence_count
        self.factor = factor
        self.verdict =  verdict
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

    def get_strength(self):
        return np.power(self.prob * self.alpha, self.beta) + (self.evidence_count * self.gamma)
    
    def __eq__(self, other):
        return hash(self) == hash(other)
    
    def __ne__(self, other):
        return not self.__eq__(other)
    
    def __hash__(self):
        return hash(self.__repr__())
    
    def __repr__(self):
        return '[{}]: {} -> {} ({})'.format(self.idx, self.factor, self.verdict, self.get_strength())

remove_punctuation = lambda s: s.translate(str.maketrans('', '', string.punctuation))

def mwe_entity_aware_decision_list_predict(ori, clean, possible_senses, mwe, entities, dl, features):
    ori_no_punc = remove_punctuation(ori)
    for p in mwe:
        if p in clean:
            return [s for s in possible_senses if s[-1] == 'x'][0]
    for p in entities.keys():
        if p in ori_no_punc:
            return entities[p]
    return decision_list_predict(dl, features)

def decision_list_predict(dl, features):
    for decision in dl:
        if features[decision.idx] == decision.factor and decision.get_strength() >= DECISION_MIN_STRENGTH:
            return decision.verdict
    return None

def get_detailed_progress(annotated_senses, senses):
    res = ''
    for s in senses:
        res += '{}: {} |'.format(s, annotated_senses.count(s))
    return res

def one_sense_per_discourse_classification(discourses, senses):
    discourse_sense = {d: set() for d in set(discourses) if d is not None}
    new_senses = []
    for i in range(len(discourses)):
        if discourses[i] is not None and senses[i] is not None:
            discourse_sense[discourses[i]].add(senses[i])
    for i in range(len(discourses)):
        if discourses[i] is not None and senses[i] is None and len(discourse_sense[discourses[i]]) == 1:
            new_senses.append(next(iter(discourse_sense[discourses[i]])))
        else:
            new_senses.append(senses[i])
    return new_senses

# attempt to classify a multi word expression or proper nouns should be cancelled
def classification_should_be_cancelled(ori, pos_ori, postag, pos_postag, sense):
    if sense is None:
        return False
    sense = str(sense)
    return (
        postag.split()[pos_postag] == 'NNP'
    )

def detect_mwe(clean, targetpos, sense):
    target_sense = [s for s in set(sense) if s[-1] == 'x']
    possible_count = 0
    possibles = dict()
    for i in range(len(clean)):
        if sense[i] not in target_sense:
            continue
        possible_count += 1
        tokens = clean[i].split()
        pos = targetpos[i]
        possibles_i = set()
        if pos-1 >= 0:
            possibles_i.add(' '.join(tokens[pos-1:pos+1]))
        if pos+2 <= len(tokens):
            possibles_i.add(' '.join(tokens[pos:pos+2]))
        if pos-1 >= 0 and pos+2 <= len(tokens):
            possibles_i.add(' '.join(tokens[pos-1:pos+2]))
        for p in possibles_i:
            if 'some' in p or 'yang' in p:
                continue
            if p not in possibles.keys():
                possibles[p] = 0
            possibles[p] += 1
    least_mwe_freq = max(2, possible_count // 10)
    return {p for p in possibles.keys() if possibles[p] >= least_mwe_freq}
    
def detect_entities(ori, targetpos, sense):
    target_sense = [s for s in set(sense) if s[-1] in 'abcdx']
    entities = dict()
    for i in range(len(ori)):
        if sense[i] not in target_sense:
            continue
        pos = targetpos[i]
        tokens = ori[i].split()
        if tokens[pos][0] not in string.ascii_uppercase:
            continue
        entity = tokens[pos]
        j = pos + 1

        while j < len(tokens) and tokens[j][0] in string.ascii_uppercase:
            if ',' in tokens[j-1]:
                break
            entity = entity + ' ' + tokens[j]
            j += 1
        j = pos - 1
        while j > 0 and tokens[j][0] in string.ascii_uppercase:
            entity = tokens[j] + ' ' + entity
            j -= 1
            if ',' in tokens[j]:
                break
        if ' ' in entity:
            entities[remove_punctuation(entity)] = sense[i]
    return entities
    
def feature_freq(possible_senses, dataset_sense, dataset_features):
    feature_num = len(dataset_features[0])
    feature_count_per_sense = {s: [dict() for i in range(feature_num)] for s in set(dataset_sense)}
    feature_count = [dict() for i in range(feature_num)]
    for s, features in zip(dataset_sense, dataset_features):
        for f in range(feature_num):
            obs = features[f]
            if type(obs) == list:
                for word in obs:
                    if word not in feature_count_per_sense[s][f].keys():
                        feature_count_per_sense[s][f][word] = 0
                    feature_count_per_sense[s][f][word] += 1
                    if word not in feature_count[f].keys():
                        feature_count[f][word] = 0
                    feature_count[f][word] += 1
            else:
                if obs not in feature_count_per_sense[s][f].keys():
                    feature_count_per_sense[s][f][obs] = 0
                feature_count_per_sense[s][f][obs] += 1
                if obs not in feature_count[f].keys():
                    feature_count[f][obs] = 0
                feature_count[f][obs] += 1
    return feature_count_per_sense, feature_count

def feature_prob(possible_senses, feature_count_per_sense, feature_count):
    feature_num = len(feature_count)
    feature_prob_per_sense = {s: [dict() for i in range(feature_num)] for s in possible_senses}

    for f in range(feature_num):
        for s in possible_senses:
            for obs in feature_count_per_sense[s][f].keys():
                feature_prob_per_sense[s][f][obs] = feature_count_per_sense[s][f][obs] / feature_count[f][obs]
    return feature_prob_per_sense

def build_decision_list(
    possible_senses, feature_count_per_sense, feature_prob_per_sense, sense,
    alpha=5, beta=2, gamma=0.4
):
    feature_num = len(feature_count_per_sense[next(iter(possible_senses))])
    decision_list = []
    waiting_list = dict()
    decision_min_evidence = {
        s: max(DECISION_MIN_EVIDENCE, sense.count(s) // 400)
        for s in possible_senses
    }

    for s in possible_senses:
        for f in range(feature_num):
            for factor in feature_prob_per_sense[s][f].keys():
                if feature_count_per_sense[s][f][factor] < decision_min_evidence[s]:
                    continue
                if len(factor) < 2:
                    continue
                decision = Decision(
                    f, feature_prob_per_sense[s][f][factor], factor, s, feature_count_per_sense[s][f][factor],
                    alpha, beta, gamma
                )
                if feature_prob_per_sense[s][f][factor] >= .5:
                    decision_list.append(decision)
                else:
                    if (f, factor) in waiting_list.keys():
                        if decision.get_strength() > waiting_list[(f, factor)].get_strength():
                            waiting_list[(f, factor)] = decision
                    else:
                        waiting_list[(f,factor)] = decision


    for f, factor in waiting_list:
        decision_list.append(waiting_list[(f, factor)])

    return sorted(decision_list, key=lambda d: d.get_strength(), reverse=True)
                

def yarowski(seed, residual, extract_dataset_features, max_iter=999, alpha=5, beta=2, gamma=0.4, ospd=False, verbose=True):
    seed_kalimat = list(seed.kalimat)
    seed_sense = [str(s) for s in list(seed.sense)]
    seed_targetpos_clean = list(seed.targetpos_clean)
    seed_targetpos_ori = list(seed.targetpos_ori)
    seed_targetpos_pos = list(seed.targetpos_pos_tag)
    seed_pos = list(seed.pos_tags)
    seed_clean = list(seed.clean)
    seed_features = extract_dataset_features(
        seed_clean, seed_targetpos_clean, seed_kalimat, seed_targetpos_ori, seed_pos, seed_targetpos_pos
    )  

    residual_kalimat = list(residual.kalimat)
    residual_discourse = list(residual.discourse_id)
    residual_targetpos_clean = list(residual.targetpos_clean)
    residual_targetpos_ori = list(residual.targetpos_ori)
    residual_targetpos_pos = list(residual.targetpos_pos_tag)
    residual_pos = list(residual.pos_tags)
    residual_clean = list(residual.clean)
    residual_sense = [None for i in residual_clean]
    residual_features = extract_dataset_features(
        residual_clean, residual_targetpos_clean, residual_kalimat, residual_targetpos_ori,
        residual_pos, residual_targetpos_pos
    )
    

    
    prev_seed_size = len(seed_sense)
    
    iteration = 0
    mwe = []
    entities = []
    
    while (prev_seed_size != len(seed_sense) or iteration == 0) and iteration < max_iter:
        iteration += 1
        prev_seed_size = len(seed_sense)
        
        possible_senses = set(seed_sense)
        
        if verbose:
            sys.stdout.write('\rIteration: {} | Seed size: {} | {}'.format(iteration, prev_seed_size, get_detailed_progress(seed_sense, possible_senses)))
            sys.stdout.flush()
    
        feature_count_per_sense, feature_count = feature_freq(possible_senses, seed_sense, seed_features)
         
        feature_prob_per_sense = feature_prob(possible_senses, feature_count_per_sense, feature_count)
                    
        decision_list = build_decision_list(
            possible_senses, feature_count_per_sense, feature_prob_per_sense, seed_sense,
            alpha=alpha, beta=beta, gamma=gamma
        )

        residual_sense = [
            decision_list_predict(
                decision_list, f
            ) for ori, clean, sense, f in zip(
                residual_kalimat, residual_clean, residual_sense, residual_features
            )
        ]
        residual_sense = [
            None if classification_should_be_cancelled(ori, pos_ori, postag, pos_postag, sense) else sense
            for ori, pos_ori, postag, pos_postag, sense in zip(
                residual_kalimat, residual_targetpos_ori, residual_pos, residual_targetpos_pos, residual_sense
            )
        ]
        seed_sense = [
            decision_list_predict(
                decision_list, f
            ) for ori, clean, sense, f in zip(
                seed_kalimat, seed_clean, seed_sense, seed_features
            )
        ]
        seed_sense = [
            None if classification_should_be_cancelled(ori, pos_ori, postag, pos_postag, sense) else sense
            for ori, pos_ori, postag, pos_postag, sense in zip(
                seed_kalimat, seed_targetpos_ori, seed_pos, seed_targetpos_pos, seed_sense
            )
        ]

        for i in range(len(seed_sense)):
            rule_based_classification =  classify_from_existing_rule(seed_clean[i], seed_targetpos_clean[i])
            if rule_based_classification is not None:
                seed_sense[i] = rule_based_classification
                
        for i in range(len(residual_sense)):
            rule_based_classification =  classify_from_existing_rule(residual_clean[i], residual_targetpos_clean[i])
            if rule_based_classification is not None:
                residual_sense[i] = rule_based_classification
        
        if ospd:
            residual_sense = one_sense_per_discourse_classification(residual_discourse, residual_sense)
        
        
        new_seed_kalimat = []
        new_seed_discourse = []
        new_seed_targetpos_clean = []
        new_seed_targetpos_ori = []
        new_seed_targetpos_pos = []
        new_seed_pos = []
        new_seed_clean = []
        new_seed_sense = []
        new_seed_features = []
        
        new_residual_kalimat = []
        new_residual_discourse = []
        new_residual_targetpos_clean = []
        new_residual_targetpos_ori = []
        new_residual_targetpos_pos = []
        new_residual_pos = []
        new_residual_clean = []
        new_residual_sense = []
        new_residual_features = []
        
        for i in range(len(seed_sense)):
            if seed_sense[i]:
                new_seed_kalimat.append(seed_kalimat[i])
                new_seed_sense.append(seed_sense[i])
                new_seed_targetpos_clean.append(seed_targetpos_clean[i])
                new_seed_targetpos_ori.append(seed_targetpos_ori[i])
                new_seed_clean.append(seed_clean[i])
                new_seed_features.append(seed_features[i])
                new_seed_pos.append(seed_pos[i])
                new_seed_targetpos_pos.append(seed_targetpos_pos[i])
            else:
                new_residual_kalimat.append(seed_kalimat[i])
                new_residual_sense.append(seed_sense[i])
                new_residual_targetpos_clean.append(seed_targetpos_clean[i])
                new_residual_targetpos_ori.append(seed_targetpos_ori[i])
                new_residual_clean.append(seed_clean[i])
                new_residual_discourse.append(None)
                new_residual_sense.append(None)
                new_residual_features.append(seed_features[i])
                new_residual_pos.append(seed_pos[i])
                new_residual_targetpos_pos.append(seed_targetpos_pos[i])
                
        seed_kalimat = new_seed_kalimat
        seed_targetpos_clean = new_seed_targetpos_clean
        seed_targetpos_ori = new_seed_targetpos_ori
        seed_clean = new_seed_clean
        seed_sense = new_seed_sense
        seed_features = new_seed_features
        seed_targetpos_pos = new_seed_targetpos_pos
        seed_pos = new_seed_pos
        
        for i in range(len(residual_sense)):
            if residual_sense[i]:
                seed_kalimat.append(residual_kalimat[i])
                seed_sense.append(residual_sense[i])
                seed_targetpos_clean.append(residual_targetpos_clean[i])
                seed_targetpos_ori.append(residual_targetpos_ori[i])
                seed_clean.append(residual_clean[i])
                seed_features.append(residual_features[i])
                seed_pos.append(residual_pos[i])
                seed_targetpos_pos.append(residual_targetpos_pos[i])
            else:
                new_residual_kalimat.append(residual_kalimat[i])
                new_residual_sense.append(residual_sense[i])
                new_residual_targetpos_clean.append(residual_targetpos_clean[i])
                new_residual_targetpos_ori.append(residual_targetpos_ori[i])
                new_residual_clean.append(residual_clean[i])
                new_residual_discourse.append(residual_discourse[i])
                new_residual_sense.append(None)
                new_residual_features.append(residual_features[i])
                new_residual_pos.append(residual_pos[i])
                new_residual_targetpos_pos.append(residual_targetpos_pos[i])
                
        residual_kalimat = new_residual_kalimat
        residual_discourse = new_residual_discourse
        residual_targetpos_clean = new_residual_targetpos_clean
        residual_targetpos_ori = new_residual_targetpos_ori
        residual_clean = new_residual_clean
        residual_sense = new_residual_sense
        residual_features = new_residual_features
        residual_targetpos_pos = new_residual_targetpos_pos
        residual_pos = new_residual_pos

         
            
    if verbose:    
        sys.stdout.write('\rIteration: {} | Seed size: {} | {}\n'.format(iteration, len(seed_sense), get_detailed_progress(seed_sense, possible_senses)))
        sys.stdout.flush()
    
    return pd.DataFrame({
        'kalimat': seed_kalimat,
        'sense': seed_sense,
        'clean': seed_clean,
        'targetpos_clean': seed_targetpos_clean,
        'targetpos_ori': seed_targetpos_ori
    }), pd.DataFrame({
        'kalimat': residual_kalimat,
        'discourse_id': residual_discourse,
        'clean': residual_clean,
        'targetpos_clean': residual_targetpos_clean,
        'targetpos_ori': residual_targetpos_ori
    }), decision_list


In [3]:
class YarowskiWSD(BaseEstimator, ClassifierMixin):

    def __init__(
        self, untagged_src=None, context_window=3,
        ospd=False, max_iter=1,
        alpha=5, beta=2, gamma=.4, verbose=False
    ):
        if untagged_src is None or type(untagged_src) != str:
            raise ValueError('untagged_src must be the file name of untagged dataset')

        self._residual = pd.read_csv(untagged_src)
        
        if not self._is_residual_data_format_valid():
            raise ValueError(
                'Residual dataset format not valid; ' +
                'Please refer to "Semi Supervised Learning.ipynb" to see the valid data format"'
            )
            
        self.untagged_src = untagged_src
        self.context_window = context_window
        self.ospd = ospd
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.max_iter = max_iter
        self.verbose = verbose
        
    def _is_residual_data_format_valid(self):
        residual_columns = [ 
            'kata',
            'discourse_id',
            'kalimat',
            'clean',
            'targetpos_clean',
            'targetpos_ori',
            'pos_tags',
            'targetpos_pos_tag'
        ]
        return (
            type(self._residual) == pd.DataFrame and
            np.all([c in self._residual.columns for c in residual_columns])
        )

    def fit(self, X, y=None):

        if not self._is_seed_format_valid(X):
            raise ValueError(
                'Seed dataset format not valid; ' +
                'Please refer to "Semi Supervised Learning.ipynb" to see the valid data format"'
            )
            
        seed = X
        
        feature_extractors = [
            extract_collocation(-2, -1),
            extract_collocation(1, 2),
            extract_collocation(-1, 1),
            extract_surrounding_words(self.context_window),
            extract_pos_tags(0, 0),
            extract_pos_tags(0, 1),
            extract_pos_tags(-1, 0),
        ]

        self._extract_dataset_features = lambda dataset_clean, dataset_pos, dataset_ori, dataset_pos_ori, dataset_postag, dataset_targetpos_pos: [
            [feature_extractor(clean, pos, ori, pos_ori, postag, pos_postag) for feature_extractor in feature_extractors]
            for clean, pos, ori, pos_ori, postag, pos_postag in zip(dataset_clean, dataset_pos, dataset_ori, dataset_pos_ori, dataset_postag, dataset_targetpos_pos)
        ]
        
        (
            self._result_seed,
            self._result_residual,
            self._decision_list,
        ) = yarowski(
            seed, self._residual, self._extract_dataset_features, max_iter=self.max_iter,
            alpha=self.alpha, beta=self.beta, gamma=self.gamma, ospd=self.ospd, verbose=self.verbose
        )

        self._possible_senses = set(self._result_seed.sense)
        most_count = 0
        senses = list(self._result_seed.sense)
        for s in self._possible_senses:
            count = senses.count(s)
            if count > most_count:
                self._major_sense = s
                most_count = count
        
        return self
    
    def _is_seed_format_valid(self, X):
        seed_columns = [
            'kata',
            'sense',
            'kalimat',
            'clean',
            'targetpos_clean',
            'targetpos_ori',
            'pos_tags',
            'targetpos_pos_tag'
        ]
        return (
            type(X) == pd.DataFrame and
            np.all([c in X.columns for c in seed_columns])
        )

    def predict(self, X, y=None):
        try:
            getattr(self, "_decision_list")
        except AttributeError:
            raise RuntimeError("You must train classifer before predicting data!")

        if not self._is_prediction_input_valid(X):
            raise ValueError(
                'Prediction input format not valid; ' +
                'Please refer to "Semi Supervised Learning.ipynb" to see the valid data format"'
            )
            
        X_kalimat = list(X.kalimat)
        X_clean = list(X.clean)
        X_pos_tags = list(X.pos_tags)
        X_targetpos_ori = list(X.targetpos_ori)
        X_targetpos_clean = list(X.targetpos_clean)
        X_targetpos_pos = list(X.targetpos_pos_tag)
            
        return([
            self._yarowski_predict(
                ori, clean, postag, pos_ori, pos_clean, pos_postag
            )
            for ori, clean, postag, pos_ori, pos_clean, pos_postag in zip(
                X_kalimat, X_clean, X_pos_tags, X_targetpos_ori, X_targetpos_clean, X_targetpos_pos
            )
        ])
    
    def _yarowski_predict(self, ori, clean, pos_tags, targetpos_ori, targetpos_clean, targetpos_pos):
        rule_based_classification = classify_from_existing_rule(clean, targetpos_clean)
        if rule_based_classification:
            return int(rule_based_classification)
        features = self._extract_dataset_features([clean], [targetpos_clean], [ori], [targetpos_ori], [pos_tags], [targetpos_pos])[0]
        decision_list_prediction =  decision_list_predict(
            self._decision_list, features
        )
        if classification_should_be_cancelled(ori, targetpos_ori, pos_tags, targetpos_pos, decision_list_prediction):
            decision_list_prediction = None
        if decision_list_prediction is None:
            decision_list_prediction = self._major_sense
        return int(decision_list_prediction)
    
    def _is_prediction_input_valid(self, X):
        columns = [
            'kalimat',
            'clean',
            'targetpos_clean',
            'targetpos_ori',
            'pos_tags',
            'targetpos_pos_tag'
        ]
        return (
            type(X) == pd.DataFrame and
            np.all([c in X.columns for c in columns])
        )        

In [17]:
chosen_word = 'mata'
seed = pd.read_csv('../unsupervised dataset/{}_seed.csv'.format(chosen_word))

In [19]:
disambiguator = YarowskiWSD(
    untagged_src='../unsupervised dataset/{}_clean.csv'.format(chosen_word), 
    max_iter=10,
    ospd=True
)
clf = GridSearchCV(disambiguator, {}, cv=5, n_jobs=7, iid=False, scoring='f1_macro')

In [20]:
clf.fit(seed, seed.sense)
clf.best_score_



0.3413515406162465

In [22]:
disambiguator = YarowskiWSD(
    untagged_src='../unsupervised dataset/{}_clean.csv'.format(chosen_word), 
    max_iter=10,
    ospd=False,
    verbose=True
)
disambiguator.fit(seed)
pred = np.array(disambiguator.predict(seed))
accuracy_score(list(seed.sense), pred)

Iteration: 3 | Seed size: 2201 | 1001: 2153 |1006: 31 |1004: 12 |1005: 5 |4 |1006: 31 |1005: 5 |


0.448

In [9]:
pred == np.array(list(seed.sense))

array([ True, False, False,  True,  True,  True, False, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False,  True, False, False, False, False, False,  True,
       False,  True, False,  True,  True,  True, False, False,  True,
        True,  True,  True,  True, False, False,  True, False,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True, False,  True, False,  True,
        True, False,  True,  True, False, False,  True, False,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True, False,
        True,  True, False, False, False,  True, False, False, False,
        True,  True,

In [10]:
for k in clf.best_estimator_._result_seed.query('sense == "1602"').kalimat:
    print(k)
    print()

NameError: name 'clf' is not defined

In [23]:
for k in disambiguator._result_seed.query('sense == "1001"').kalimat:
    print(k)
    print()

Lukisan-lukisan di dinding goa atau di dinding-dinding karang itu antara lain yang berupa cap-cap tangan, babi rusa, burung, manusia, perahu, lambang matahari, lukisan mata dan sebagainya.

Dahulu kemungkinan nama Narmada digunakan untuk menamai nama mata air yang membentuk beberapa kolam dan sebuah sungai di tempat tersebut.

Larutan dengan konsentrasi yang tinggi dapat menyebabkan luka mata parah.

Apabila tidak ada sinar dari sumber cahaya yang dipantulkan oleh suatu benda dan kemudian tiba di mata maka benda tersebut dikatakan tidak terlihat.

Semua jenis kucing siam memiliki pola dan warna bulu yang sama dengan mata berwarna biru.

Ngaren terletak pada sisi utara kecamatan ngadirejo, mata pencaharian penduduknya masih didominasi oleh oleh bidang pertanian.

Pada awal abad ke-8, Sceat merupakan mata uang pertama yang diedarkan.

Namun di mata para gerilyawan, ketiganya lebih bersikap sebagai kepala suku yang otoriter ketimbang pemimpin pasukan perlawanan yang ahli mengatur strategi


Salah satu mata pencaharian masyarakat di Kabupaten Waropen adalah Pertanian.

Ein Gedi terkenal dengan goa, mata air, dan keanekaragaman flora dan fauna yang kaya.

Selain kolam dengan ikan dewanya yang jinak, di sudut barat pemandian ini juga terdapat tujuh sumber mata air yang dikeramatkan yang bernama Tujuh Sumur.

Tujuh mata air itu terletak mengelilingi sebuah petilasan yang konon merupakan petilasan Prabu Siliwangi ketika ia beristirahat sekembalinya dari perang melawan Kasultanan Mataram.

Semakin kecil bayi semakin sulit untuk mengalihkan perhatian pada hal lain, karena gerak motorik dan mata masih terbatas.

Tanpa peduli usia, Iroh mampu menjadi pemikat hati perempuan dalam keadaan tertentu, dan terlihat main mata dengan berbagai perempuan dalam banyak episode, dan disapa tampan berkali-kali.

Mata Pencaharian Penduduk Desa Tanjung Harapan.

Sebuah suku yang seragam, Suku Air biasanya memiliki rambut coklat dengan mata biru atau cokelat, meskipun terkadang beberapa orang mem


com/sites/robinandrews/2018/12/22/indonesias-krakatoa-just-made-a-deadly-tsunami-heres-everything-you-need-to-know/#708023bc4c3e&lt;/ref&gt; Setelah peristiwa tersebut, dia menerbitkan sebuah laporan saksi mata, dengan gambar-gambar yang memberi pandangan yang tak tertandingi tentang tsunami dan dampaknya.

Disekitar Telaga ada beberapa patung kera putih dan pemandangan yang tak akan merugikan mata anda, Ladang yang rindang serta alam berbukit melingkari 'Telaga Madiredo'.

Umbu saren merupakan sumber mata air yang dimanfaatkan sebagai tempat wisata pemandian,sekaligus irigasi sawah sekitar umbul.

Sebelum dan selama abad ke-19, perdagangan internasional menggunakan mata uang yang dinilai dengan emas.

Sejak runtuhnya rezim nilai tukar tetap dan standar emas dan sistem kurs mengambang mengikuti Smithsonian Agreement pada tahun 1971, sebagian besar mata uang di seluruh dunia tidak lagi dipatok dengan dolar Amerika Serikat.

Beberapa mata uang dunia masih dipatok terhadap dolar.

Hanya 

In [1]:
import nltk
from nltk.tag import CRFTagger

In [2]:
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')


In [5]:
token = nltk.word_tokenize('saya makan nasi bersama Budi tadi pagi')
tag = ct.tag_sents([token])

In [6]:
tag[0]

[('saya', 'PRP'),
 ('makan', 'VB'),
 ('nasi', 'NN'),
 ('bersama', 'IN'),
 ('Budi', 'NNP'),
 ('tadi', 'VB'),
 ('pagi', 'NN')]