In [1]:
import nltk
import os
import pickle
import warnings
import wget
from nlp_id.tokenizer import Tokenizer
from nltk.tree import Tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hammam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
tokenizer = Tokenizer()

In [3]:
dataset_path = './dataset_postag.txt'

In [4]:
def read_dataset(dataset_path=None, encoding='utf-8'):
    """Read a POS-tagged corpus into parallel lists of sentences and tags.

    The file is expected to hold one ``token<TAB>tag`` pair per line, with an
    empty line separating consecutive sentences.

    dataset_path: path of the text file to read.
    encoding: text encoding used to decode the file.

    Returns ``(sentences, tags)`` where ``sentences[i]`` is the list of tokens
    of the i-th sentence and ``tags[i]`` is the list of tags aligned with it.

    Raises IndexError if a non-empty line contains no tab character.
    """
    with open(dataset_path, 'r', encoding=encoding) as f:
        lines = f.read().split("\n")

    sentences, tags = [], []
    current_tokens, current_tags = [], []

    for line in lines:
        if line != "":
            fields = line.split("\t")
            current_tokens.append(fields[0])  # the token
            current_tags.append(fields[1])  # its tag
            continue

        # An empty line closes the sentence collected so far (if any).
        if current_tokens:
            sentences.append(current_tokens)
            tags.append(current_tags)
        current_tokens, current_tags = [], []

    # The file may end without a trailing blank line; without this flush the
    # last sentence would be silently dropped.
    if current_tokens:
        sentences.append(current_tokens)
        tags.append(current_tags)

    return sentences, tags

In [5]:
# Use the canonical codec name 'iso-8859-1'; the previous spelling
# 'iso=8859-1' only resolved because Python normalizes punctuation in
# codec names during lookup.
read_dataset(dataset_path, encoding='iso-8859-1')

([['Ditjen',
   'Bea',
   'dan',
   'Cukai',
   'Kementerian',
   'Keuangan',
   '(',
   'Kemenkeu',
   ')',
   'memastikan',
   'ada',
   'nya',
   '18',
   'kotak',
   'selundupan',
   'yang',
   'dibawa',
   'melalui',
   'pesawat',
   'baru',
   'Airbus',
   'A330-900',
   'Neo',
   'milik',
   'Garuda',
   'Indonesia',
   '.'],
  ['Pengambilan',
   'foto',
   'dilakukan',
   'dari',
   'kepala',
   'perahu',
   'sehingga',
   'menghasilkan',
   'foto',
   'yang',
   'sangat',
   'menarik',
   '.'],
  ['Vice',
   'President',
   'Corporate',
   'Secretary',
   'Garuda',
   'Indonesia',
   ',',
   'M',
   '.'],
  ['Di',
   'sana',
   ',',
   'penumpang',
   'mengungkapkan',
   'atau',
   'men-declare',
   'barang',
   'bawaan',
   ',',
   'termasuk',
   'yang',
   'disebut',
   'sparepart',
   'Harley',
   'dan',
   'sepeda',
   'Brompton',
   '.'],
  ['Kita', '(', 'penumpang', ')', 'self', 'declare', '.'],
  ['Untuk',
   'clean',
   'and',
   'jerk',
   ',',
   'sosok',
   'berusia

In [6]:
sentence, tags = read_dataset(dataset_path, encoding='iso-8859-1')

In [7]:
def features(sentence, index):
    """Build the feature dict describing one token of a tokenized sentence.

    sentence: list of word strings, [w1, w2, ...]
    index: position in ``sentence`` of the word to describe

    The result holds the word itself, casing/shape flags, 1-3 character
    prefixes and suffixes (raw and lowercased), and the neighbouring words up
    to two positions away ("" where there is no such neighbour).
    """
    word = sentence[index]
    lowered = word.lower()
    first_char, last_char = word[0], word[-1]
    tail = word[1:]

    final_index = len(sentence) - 1
    at_start = index == 0
    at_end = index == final_index

    # Immediate neighbours: empty string / False at the sentence boundaries.
    if at_start:
        prev_word, prev_capitalized = "", False
    else:
        prev_word = sentence[index - 1]
        prev_capitalized = prev_word[0].upper() == prev_word[0]

    if at_end:
        next_word, next_capitalized = "", False
    else:
        next_word = sentence[index + 1]
        next_capitalized = next_word[0].upper() == next_word[0]

    # Neighbours two positions away.
    two_back = "" if index <= 1 else sentence[index - 2]
    two_ahead = "" if index >= final_index - 1 else sentence[index + 2]

    return {
        "word": word,
        "is_first": at_start,
        "is_last": at_end,
        "is_capitalized": first_char.upper() == first_char,
        "is_all_caps": word.upper() == word,
        "is_all_lower": lowered == word,
        "has_hyphen": "-" in word,
        "is_numeric": word.isdigit(),
        "capitals_inside": tail.lower() != tail,
        "prefix-1": first_char,
        "prefix-1-lower": first_char.lower(),
        "prefix-2": word[:2],
        "prefix-2-lower": word[:2].lower(),
        "prefix-3": word[:3],
        "prefix-3-lower": word[:3].lower(),
        "suffix-1": last_char,
        "suffix-1-lower": last_char.lower(),
        "suffix-2": word[-2:],
        "suffix-2-lower": word[-2:].lower(),
        "suffix-3": word[-3:],
        "suffix-3-lower": word[-3:].lower(),
        "lowercase_word": lowered,
        "prev_word": prev_word,
        "next_word": next_word,
        "prev_word_is_capitalized": prev_capitalized,
        "next_word_is_capitalized": next_capitalized,
        "2-prev-word": two_back,
        "2-next-word": two_ahead,
    }

In [8]:
def transform_to_dataset(sentences, tags):
    """Flatten tagged sentences into per-token training examples.

    sentences: list of sentences, each a list of words.
    tags: list of tag lists; ``tags[i][j]`` is the tag of ``sentences[i][j]``.

    Returns ``(X, y)``: ``X`` is the list of feature dicts produced by
    ``features`` for every word, and ``y`` is the list of matching tags, both
    in corpus order.
    """
    X, y = [], []

    for sentence_idx, words in enumerate(sentences):
        for index in range(len(words)):
            X.append(features(words, index))
            y.append(tags[sentence_idx][index])

    return X, y

In [9]:
# Model: turn each token's feature dict into a sparse vector, then classify
# the token with a (single-tree, fixed-seed) random forest.
clf = Pipeline(
    steps=[
        ('vectorizer', DictVectorizer(sparse=True)),
        (
            'classifier',
            RandomForestClassifier(
                n_estimators=1,
                criterion='gini',
                random_state=2020,
                verbose=2,
            ),
        ),
    ]
)

In [10]:
def train(senteces, tags):
    """Fit the module-level ``clf`` pipeline in place.

    senteces: training samples passed to ``clf.fit`` as X.
    tags: target labels passed to ``clf.fit`` as y.

    Returns None; the fitted estimator is the global ``clf``.
    """
    clf.fit(senteces, tags)

In [11]:
# NOTE(review): this rebinds `tags` from the per-sentence tag lists returned
# by read_dataset to the flat per-token label list returned by
# transform_to_dataset, so this cell is not idempotent — re-run the cell that
# loads the dataset before executing this one a second time.
senteces, tags = transform_to_dataset(sentence, tags)

In [12]:
train(senteces, tags)

building tree 1 of 1


In [13]:
def save_model(model_path, model=None):
    """Pickle a model to ``model_path``.

    model_path: destination file path; it is opened in binary write mode, so
        an existing file is overwritten.
    model: object to serialize. When omitted, the module-level ``clf``
        pipeline is saved, which is what a one-argument call always did.

    Returns None.
    """
    if model is None:
        model = clf
    # The context manager closes the handle even if pickling fails part-way.
    with open(model_path, 'wb') as pickle_out:
        pickle.dump(model, pickle_out)

In [14]:
model_path = './postager_model.pkl'
save_model(model_path)

In [15]:
def load_model(model_path):
    """Load and return the object pickled at ``model_path``.

    model_path: path of a pickle file, e.g. one written by ``save_model``.

    Only load files you trust: unpickling can execute arbitrary code.
    """
    # The context manager guarantees the file handle is closed after loading
    # (the handle was previously left open).
    with open(model_path, 'rb') as pickle_in:
        return pickle.load(pickle_in)

In [17]:
model = load_model(model_path)

In [18]:
display(model)

In [19]:
# Tokens that are always tagged "SYM", whatever the model predicts for them.
# NOTE(review): the opening curly quote '“' is listed but the closing '”' is
# not — confirm whether that is intentional.
_SYMBOL_TOKENS = frozenset([
    '!', '&', '(', ')', '*', '?', ',', '.', '<', '>', '/', ':', ';',
    '[', ']', '\\', '^', '`', '{', '}', '|', '~', '"', '“', "'",
])


def get_pos_tag(text):
    """Tag every token of ``text`` with a part-of-speech label.

    The text is split into sentences with ``nltk.sent_tokenize``, each
    sentence is tokenized with the module-level ``tokenizer``, and the
    module-level ``model`` predicts one tag per token from its ``features``.
    Tokens listed in ``_SYMBOL_TOKENS`` are tagged "SYM" regardless of the
    prediction.

    text: raw input string.

    Returns a list of ``(token, tag)`` tuples in reading order.
    """
    result = []
    for sent in nltk.sent_tokenize(text):
        tokens = tokenizer.tokenize(sent)
        # Skip sentences that produce no tokens so the model is never asked
        # to predict on an empty batch.
        if not sent or len(tokens) == 0:
            continue
        predicted = model.predict(
            [features(tokens, index) for index in range(len(tokens))]
        )
        for token, tag in zip(tokens, predicted):
            result.append(
                (token, "SYM" if token in _SYMBOL_TOKENS else tag)
            )
    return result

In [20]:
text = "Lionel Messi pergi ke pasar di daerah Jakarta Pusat."

In [21]:
get_pos_tag(text)

[('Lionel', 'NNP'),
 ('Messi', 'NNP'),
 ('pergi', 'VB'),
 ('ke', 'IN'),
 ('pasar', 'NN'),
 ('di', 'IN'),
 ('daerah', 'NN'),
 ('Jakarta', 'NNP'),
 ('Pusat', 'NNP'),
 ('.', 'SYM')]

In [22]:
get_pos_tag('Ronaldo Ikut capres')

[('Ronaldo', 'NNP'), ('Ikut', 'NNP'), ('capres', 'NN')]