## Sesi 13

# POS Tagging

Pada pembuatan POS Tagging kali ini, algoritma yang akan digunakan adalah Random Forest.

In [None]:
!pip install nlp_id  # needed for its tokenizer only, not for its POS tagger

In [None]:
import nltk
import os
import pickle
import warnings
import wget
from nlp_id.tokenizer import Tokenizer
from nltk.tree import Tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Fetch the Punkt data; presumably required by nltk.sent_tokenize, which
# get_pos_tag calls to split text into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Shared nlp_id word tokenizer; get_pos_tag uses it to split each sentence into tokens.
tokenizer = Tokenizer()

In [None]:
# Location of the POS-tagging dataset that read_dataset parses.
dataset_path = "/content/dataset_postag.txt"

In [None]:
def read_dataset(dataset_path=None):
    """Parse a tab-separated POS-tagging dataset into sentences and tags.

    Each non-blank line holds one token: the word in the first column and
    its tag in the second, separated by a tab. A blank (or whitespace-only)
    line ends the current sentence.

    Args:
        dataset_path: Path of the dataset file, read as UTF-8.

    Returns:
        A ``(sentences, tags)`` tuple of two parallel lists:
        ``sentences[i]`` is the list of words of the i-th sentence and
        ``tags[i]`` is the list of their tags, in the same order.

    Raises:
        ValueError: If a non-blank line has no tab-separated tag column.
    """
    with open(dataset_path, encoding="utf-8") as f:
        raw_lines = f.read().split("\n")

    sentences, tags = [], []
    words, word_tags = [], []

    for line_no, line in enumerate(raw_lines, start=1):
        if not line.strip():
            # Sentence boundary: store what was collected, if anything.
            if words:
                sentences.append(words)
                tags.append(word_tags)
            words, word_tags = [], []
            continue

        columns = line.split("\t")
        if len(columns) < 2:
            raise ValueError(
                "line {} of {!r} has no tab-separated tag: {!r}".format(
                    line_no, dataset_path, line
                )
            )
        words.append(columns[0])
        word_tags.append(columns[1])

    # The file may end without a trailing blank line; keep the last sentence.
    if words:
        sentences.append(words)
        tags.append(word_tags)

    return sentences, tags

In [None]:
# Preview the parsed dataset as the cell's output.
read_dataset(dataset_path)

In [None]:
# Parallel lists: the words of each sentence and the tags of those words.
sentences, tags = read_dataset(dataset_path)

In [None]:
def features(sentence, index):
    """Return the feature dict describing the token at ``index``.

    Args:
        sentence: List of word strings, ``[w1, w2, ...]``.
        index: Position in ``sentence`` of the word to describe.

    Returns:
        A dict of string and boolean features: the word itself, its casing,
        its prefixes and suffixes of up to three characters, and the
        neighbouring words up to two positions away on each side ("" where
        the sentence has no such neighbour).
    """

    def starts_upper(token):
        # True when upper-casing the first character leaves it unchanged,
        # so digits and punctuation count as "capitalized" as well.
        return token[0].upper() == token[0]

    word = sentence[index]
    lowered = word.lower()
    tail = word[1:]
    final_index = len(sentence) - 1
    at_start = index == 0
    at_end = index == final_index
    before = "" if at_start else sentence[index - 1]
    after = "" if at_end else sentence[index + 1]

    return {
        "word": word,
        "is_first": at_start,
        "is_last": at_end,
        "is_capitalized": starts_upper(word),
        "is_all_caps": word.upper() == word,
        "is_all_lower": lowered == word,
        "has_hyphen": "-" in word,
        "is_numeric": word.isdigit(),
        "capitals_inside": tail.lower() != tail,
        "prefix-1": word[0],
        "prefix-1-lower": word[0].lower(),
        "prefix-2": word[:2],
        "prefix-2-lower": word[:2].lower(),
        "prefix-3": word[:3],
        "prefix-3-lower": word[:3].lower(),
        "suffix-1": word[-1],
        "suffix-1-lower": word[-1].lower(),
        "suffix-2": word[-2:],
        "suffix-2-lower": word[-2:].lower(),
        "suffix-3": word[-3:],
        "suffix-3-lower": word[-3:].lower(),
        "lowercase_word": lowered,
        "prev_word": before,
        "next_word": after,
        "prev_word_is_capitalized": False if at_start else starts_upper(before),
        "next_word_is_capitalized": False if at_end else starts_upper(after),
        "2-prev-word": "" if index <= 1 else sentence[index - 2],
        "2-next-word": "" if index >= final_index - 1 else sentence[index + 2],
    }

In [None]:
def transform_to_dataset(sentences, tags):
    """Flatten tagged sentences into per-token training rows.

    Args:
        sentences: List of sentences, each a list of words.
        tags: List parallel to ``sentences``; ``tags[i][j]`` is the tag of
            ``sentences[i][j]``.

    Returns:
        ``(X, y)`` where ``X`` holds one ``features`` dict per token and
        ``y`` holds the matching tag, both in reading order.
    """
    X, y = [], []

    for sent_no, words in enumerate(sentences):
        for position in range(len(words)):
            X.append(features(words, position))
            y.append(tags[sent_no][position])

    return X, y

In [None]:
# POS-tagging pipeline: DictVectorizer turns feature dicts into a sparse
# matrix, which a 15-tree random forest then classifies.
clf = Pipeline(
    steps=[
        ("vectorizer", DictVectorizer(sparse=True)),
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=15,
                criterion="gini",
                random_state=2020,
            ),
        ),
    ]
)

In [None]:
def train(sentences, tags):
    """Fit the module-level ``clf`` pipeline in place.

    Args:
        sentences: Training inputs handed to ``clf.fit`` as ``X``. In this
            notebook they are the per-token feature dicts produced by
            ``transform_to_dataset``, not raw sentences.
        tags: Target labels handed to ``clf.fit`` as ``y``, one per input.

    Returns:
        None; the fitted state lives in ``clf``.
    """
    clf.fit(X=sentences, y=tags)

In [None]:
# Flatten the per-sentence lists into per-token feature dicts and tags.
# NOTE: this rebinds `sentences` and `tags`, so re-running this cell without
# first re-running the read_dataset cell feeds already-transformed data back in.
sentences, tags = transform_to_dataset(sentences, tags)

In [None]:
# Fit the pipeline on the per-token rows built in the previous cell.
train(sentences, tags)

In [None]:
def save_model(model_path, model=None):
    """Pickle a model to ``model_path``.

    Args:
        model_path: Destination file path; an existing file is overwritten.
        model: Object to serialise. Defaults to the module-level ``clf``
            pipeline, which is what the one-argument call saves.

    Returns:
        None.
    """
    if model is None:
        model = clf
    # The context manager closes the file even if pickling fails part-way.
    with open(model_path, "wb") as pickle_out:
        pickle.dump(model, pickle_out)

In [None]:
# Persist the trained pipeline so that it can be reloaded later.
model_path = "/content/postagger_model.pkl"
save_model(model_path)

In [None]:
def load_model(model_path):
    """Load and return the object pickled at ``model_path``.

    Args:
        model_path: Path of a pickle file, such as one written by
            ``save_model``.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that come
    # from a trusted source.
    with open(model_path, "rb") as pickle_in:
        return pickle.load(pickle_in)

In [None]:
# Reload the pickled pipeline; get_pos_tag predicts with this `model`.
model = load_model(model_path)

In [None]:
def get_pos_tag(text):
    """Tag every token of ``text`` with a part-of-speech label.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into tokens with the module-level ``tokenizer``. Tags are
    predicted by the module-level ``model``, except that tokens listed in
    ``symbols`` are always tagged "SYM".

    Args:
        text: Raw input text.

    Returns:
        A list of ``(token, tag)`` tuples in reading order, across all
        sentences.
    """
    # A set gives constant-time membership tests for the symbol override.
    symbols = {'!', '&', '(', ')', '*', '?', ',', '.', '<', '>', '/', ':', ';',
               '[', ']', '\\', '^', '`', '{', '}', '|', '~', '"', '“', "'"}
    result = []
    for sent in nltk.sent_tokenize(text):
        tokenized_word = tokenizer.tokenize(sent)
        # Skip sentences that yield no tokens: there is nothing to tag and
        # an empty batch must not be handed to the model.
        if not tokenized_word:
            continue
        tags = model.predict(
            [
                features(tokenized_word, index)
                for index in range(len(tokenized_word))
            ]
        )
        for token, tag in zip(tokenized_word, tags):
            result.append((token, "SYM" if token in symbols else tag))
    return result

In [None]:
# Sample Indonesian sentence used to try out the tagger.
text = "Lionel Messi pergi ke pasar di daerah Jakarta Pusat."

In [None]:
# Show the (token, tag) pairs predicted for the sample sentence.
get_pos_tag(text)

[('Lionel', 'NNP'),
 ('Messi', 'NNP'),
 ('pergi', 'VB'),
 ('ke', 'IN'),
 ('pasar', 'NN'),
 ('di', 'IN'),
 ('daerah', 'NN'),
 ('Jakarta', 'NNP'),
 ('Pusat', 'NNP'),
 ('.', 'SYM')]

# NER

Pada pembuatan NER kali ini, tool yang digunakan adalah spaCy.

In [None]:
import pickle
import spacy
import random
from spacy.util import minibatch, compounding
from spacy import load, displacy
from spacy.training.example import Example

In [None]:
# Load the pickled NER training data. Later cells unpack each item as a
# (text, annotations) pair and read annotations.get("entities").
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open('/content/dataset_ner_spacy.pickle', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Start from a blank Indonesian ("id") pipeline and add an NER component.
nlp = spacy.blank("id")
nlp.add_pipe('ner')
# The recorded cell output shows that this call returns an optimizer, which is
# not kept here.
# NOTE(review): in spaCy v3 `begin_training` appears to be a deprecated alias
# of `initialize` - confirm against the installed version.
nlp.begin_training()

<thinc.optimizers.Optimizer at 0x7fe597766340>

In [None]:
# Handle to the NER component, used below to register entity labels.
ner = nlp.get_pipe("ner")

In [None]:
# Register every entity label found in the training data with the NER
# component. All entities of each item are visited, so a label that never
# appears as the first entity of an item is registered too.
# Each entity is presumably a (start, end, label) triple; only index 2 is read.
for _, annotations in data:
    for ent in annotations.get("entities") or ():
        ner.add_label(ent[2])

In [None]:
# Pipes named here are left enabled; every other pipe in the pipeline is
# collected into unaffected_pipes.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [
    name for name in nlp.pipe_names if name not in pipe_exceptions
]

In [None]:
# Train the model while the pipes listed in unaffected_pipes are disabled.
with nlp.disable_pipes(*unaffected_pipes):

    # 30 passes over the training data
    for iteration in range(30):

        # reshuffle the examples before every pass
        random.shuffle(data)
        losses = {}
        # split the data into batches with spaCy's minibatch helper
        batches = minibatch(data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            # one Example per text, pairing its Doc with its annotations
            example = [
                Example.from_dict(nlp.make_doc(text), annotation)
                for text, annotation in zip(texts, annotations)
            ]
            nlp.update(
                example,
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )

        print("Losses at iteration {}".format(iteration), losses)

In [None]:
# Smoke test: run the trained pipeline on one sentence and print the
# entities it finds, first as spans and then as (text, label) pairs.
doc = nlp("Lionel Messi pergi ke pasar di daerah Jakarta Pusat.")

print(doc.ents)
print("----")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

(Lionel Messi, Jakarta Pusat)
----
Entities [('Lionel Messi', 'PERSON'), ('Jakarta Pusat', 'LOCATION')]
