# POS TAGGING BAHASA INDONESIA

In [40]:
from pprint import pprint
from nltk.tag import hmm
import dill
from nltk.tag.hmm  import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
from nltk.probability import LidstoneProbDist
from nltk.tokenize import WordPunctTokenizer
import string

In [4]:
fname = "Indonesian_Manually_Tagged_Corpus.tsv"
fopen = open(fname,'r')
fdata = fopen.readlines()

In [6]:
sentences = []
sentence = []
for word in fdata:
    dt = word.replace('\n','').split('\t')
    if(len(dt) == 1):
        sentences.append(sentence)
        sentence = []
    else:
        sentence.append((dt[0],dt[1]))

In [11]:
print('All Sentences : %d ' % (len(sentences)))
cutoff = int(.8 * len(sentences))
training_sentences = sentences[:cutoff]
test_sentences = sentences[cutoff:]
print('Training Sentences : %d ' % (len(training_sentences)))
print('Testing Sentences : %d ' % (len(test_sentences)))

All Sentences : 10029 
Training Sentences : 8023 
Testing Sentences : 2006 


In [13]:
print('Training Start')
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(training_sentences, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
print('Training Completed')

Training Start
Training Completed


In [14]:
print('Testing Start')
tagger.test(test_sentences, verbose=True)
print('Testing Completed')

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
Entropy: 9.99519217699052

------------------------------------------------------------
Test: Satu/CD orang/NN tewas/VB dan/CC belasan/CD lain/JJ -nya/PRP luka-luka/NN ketika/SC para/DT petani/NN coca/FW terlibat/VB bentrok/VB dengan/SC sejumlah/CD pemuda/NN dari/IN kalangan/NN menengah/JJ saat/NN unjuk rasa/NN soal/NN rencana/NN otonomi/NN regional/JJ oleh/IN gubernur/NN setempat/NN ./Z

Untagged: Satu orang tewas dan belasan lain -nya luka-luka ketika para petani coca terlibat bentrok dengan sejumlah pemuda dari kalangan menengah saat unjuk rasa soal rencana otonomi regional oleh gubernur setempat .

HMM-tagged: Satu/CD orang/NN tewas/VB dan/CC belasan/RB lain/JJ -nya/PRP luka-luka/NN ketika/SC para/DT petani/NN coca/SC terlibat/VB bentrok/VB dengan/IN sejumlah/CD pemuda/CD dari/IN kalangan/NN menengah/JJ saat/NN unjuk rasa/NN soal/NN rencana/NN otonomi/NN regional/JJ oleh/IN gubernur/NN setempat/NN ./Z

Entropy

In [19]:
# save model
filename = 'tagger_model.dill'
with open(filename, 'wb') as f:
    dill.dump(tagger, f)

In [20]:
# load model
with open(filename, 'rb') as f:
    hmm_tagger = dill.load(f)

In [27]:
# checking tag
def xcheck_tag(word,tag):
    punc = list(string.punctuation)
    punc.append('.')
    punc.append(',')
    punc.append('"')
    punc.append("'")
    
    dates = ['Januari','Februari','Maret','April','Mei','Juni','Juli','Agustus','September','Oktober','November','Desember',\
            'Jan','Feb','Mar','Apr','Mei','Jun','Jul','Agt','Sep','Okt','Nov','Des',\
            'januari','februari','maret','april','mei','juni','juli','agustus','september','oktober','november','desember',\
            'Senin','Selasa','Rabu','Kamis','Jumat','Sabtu','Minggu'
        ]
    
    if(word in dates):
        tag = 'DATE'
    
    if(word in punc):
        tag = 'Z'
        
    if(tag == 'CD' and word.isdigit()):
        tag = 'CD'
        
    if(tag in ['SYM','Z','CD','MD'] and word.upper() != word and hasNumbers(word) == False \
      and word[-3:] not in ['nya','kah','lah']):
        tag = 'NNP'
    
    if(tag == 'NN' and word[:1].upper() == word):
        tag = 'NNP'
        
    if(tag == 'NNP' and word.lower() == word):
        tag = 'NN'
    
    if(tag == 'NNP' and len(word) == 1):
        tag = 'NN'
        
    if(tag == 'FW' and word.lower() == word):
        tag = 'NN'
        
    return word,tag

In [130]:
kalimat = "Andi selalu menjadi anak bawang di kelasnya."

word_punct_tokenizer = WordPunctTokenizer()
kalimat_token = word_punct_tokenizer.tokenize(kalimat)
kalimat_token

['Andi', 'selalu', 'menjadi', 'anak', 'bawang', 'di', 'kelasnya', '.']

In [131]:
pos_tagging = hmm_tagger.tag(kalimat_token)
pos_tagging

[('Andi', 'NNP'),
 ('selalu', 'RB'),
 ('menjadi', 'VB'),
 ('anak', 'NN'),
 ('bawang', 'JJ'),
 ('di', 'IN'),
 ('kelasnya', 'NNP'),
 ('.', 'Z')]

In [132]:
final_tagged = []
for pt in pos_tagging:
  w,t = xcheck_tag(pt[0], pt[1])
  final_tagged.append((w,t))
  

In [133]:
final_tagged

[('Andi', 'NNP'),
 ('selalu', 'RB'),
 ('menjadi', 'VB'),
 ('anak', 'NN'),
 ('bawang', 'JJ'),
 ('di', 'IN'),
 ('kelasnya', 'NN'),
 ('.', 'Z')]

# CHUNKING

In [77]:
from nltk.tokenize import WordPunctTokenizer
from nltk import sent_tokenize
from nltk import word_tokenize, RegexpParser, ne_chunk
from collections import defaultdict
import string
import nltk

In [95]:
def chunk(kalimat):
    grammar = ["CHUNK: {<NN>{2,}}", "CHUNK: {<NN><CD>}", "CHUNK: {<CD><NN>}", "CHUNK: {<NNP><NN>}", "CHUNK: {<VB><NN>}",
               "CHUNK: {<VB><JJ>}", "CHUNK: {<VB><CD>}", "CHUNK: {<JJ><NN>}", "CHUNK: {<NN><JJ>}", "CHUNK: {<JJ><JJ>}"]
    
    result = []

    for i in grammar:
      chunkParser = RegexpParser(i)
      tree = chunkParser.parse(kalimat)
      for subtree in tree.subtrees():
          if(subtree.label()=="CHUNK"):
              tampung_entity = []
              for se in subtree.leaves():
                  tampung_entity.append(se[0])
              result.append(' '.join(tampung_entity))
    return result

In [122]:
def chunk(kalimat):
    grammar = ["CHUNK: {<NN>{2,}}", "CHUNK: {<NN><CD>}", "CHUNK: {<CD><NN>}", "CHUNK: {<NNP><NN>}", "CHUNK: {<VB><NN>}",
               "CHUNK: {<VB><JJ>}", "CHUNK: {<VB><CD>}", "CHUNK: {<JJ><NN>}", "CHUNK: {<NN><JJ>}", "CHUNK: {<JJ><JJ>}"]
    
    extract = []

    for i in grammar:
      cp = nltk.RegexpParser(i)
      result = cp.parse(kalimat)

      leaves = [chunk.leaves() for chunk in result if ((type(chunk) == nltk.tree.Tree) and chunk.label() == 'CHUNK')]
      noun_bigram_groups = [list(nltk.bigrams([w for w, t in leaf])) for leaf in leaves]

      ph = [' '.join(nouns) for group in noun_bigram_groups for nouns in group]
      extract = extract + ph

    return extract

In [123]:
ftag = [('Anak', 'NN'),
 ('kutu', 'NN'),
 ('buku', 'NN'),
 ('itu', 'PR'),
 ('menjuarai', 'VB'),
 ('lomba', 'NN'),
 ('matematika', 'PR'),
 ('.', 'Z')]

In [134]:
frasa = chunk(final_tagged)
frasa

['menjadi anak', 'anak bawang']