In [1]:
import pickle
import spacy
import random
from spacy.util import minibatch, compounding
from spacy import load, displacy

In [2]:
# Read the pickled training examples from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# come from a trusted source - confirm its provenance.
with open('ner_spacy_fmt_datasets.pickle', 'rb') as dataset_file:
    ner_spacy_fmt_datasets = pickle.load(dataset_file)

In [None]:
# Start from an empty pipeline for the "id" language code (the sample
# sentences used further down are Indonesian).
nlp = spacy.blank("id")


In [None]:
# Build the 'ner' component and attach it to the pipeline.
ner_component = nlp.create_pipe('ner')
nlp.add_pipe(ner_component)


In [None]:
# Start training on the pipeline; the value returned by begin_training() is discarded.
# NOTE(review): this call runs before the entity labels are registered on the
# 'ner' component in a later cell - confirm that ordering is intended.
nlp.begin_training()

In [None]:
import random
from spacy.util import minibatch, compounding

In [None]:
# Keep a handle on the pipeline's "ner" component so labels can be added to it.
ner = nlp.get_pipe("ner")

In [None]:
# Register every entity label that occurs in the training data on the NER
# component. Each example is unpacked as (text, annotations) and each entity is
# indexed so that ent[2] is its label.
# All entities of an example are visited, not just the first one, so a label
# that only ever appears as a later entity is still registered.
seen_labels = set()
for _, annotations in ner_spacy_fmt_datasets:
    # `or ()` tolerates examples whose "entities" entry is missing or None.
    for ent in annotations.get("entities") or ():
        label = ent[2]
        if label not in seen_labels:  # call add_label once per distinct label
            seen_labels.add(label)
            ner.add_label(label)

In [9]:
# Pipe names that must stay enabled during training; every other pipe
# currently in the pipeline is collected so it can be disabled.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = []
for pipe_name in nlp.pipe_names:
    if pipe_name not in pipe_exceptions:
        unaffected_pipes.append(pipe_name)

In [10]:
# Train the model with every pipe listed in `unaffected_pipes` disabled.
with nlp.disable_pipes(*unaffected_pipes):
    # 30 passes over the training examples.
    for epoch in range(30):
        # Shuffle the examples in place before each pass.
        random.shuffle(ner_spacy_fmt_datasets)
        epoch_losses = {}
        # Split the examples into batches with spaCy's minibatch helper; the
        # batch sizes are produced by compounding(4.0, 32.0, 1.001).
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for examples in minibatch(ner_spacy_fmt_datasets, size=batch_sizes):
            # Turn the list of (text, annotations) pairs into two parallel tuples.
            texts, annotations = zip(*examples)
            # drop=0.5 is the dropout rate (makes it harder to memorise the
            # data); losses are accumulated into `epoch_losses`.
            nlp.update(texts, annotations, drop=0.5, losses=epoch_losses)

        print("Losses at iteration {}".format(epoch), epoch_losses)

Losses at iteration 0 {'ner': 45910.67175821347}
Losses at iteration 1 {'ner': 44738.910790195376}
Losses at iteration 2 {'ner': 44312.70587632521}
Losses at iteration 3 {'ner': 44141.957184502346}
Losses at iteration 4 {'ner': 43960.33697548103}
Losses at iteration 5 {'ner': 44131.965996890154}
Losses at iteration 6 {'ner': 43771.28000052126}
Losses at iteration 7 {'ner': 43777.94996309585}
Losses at iteration 8 {'ner': 43668.77543842289}
Losses at iteration 9 {'ner': 43243.99606156396}
Losses at iteration 10 {'ner': 43331.28969894315}
Losses at iteration 11 {'ner': 43564.74727642169}
Losses at iteration 12 {'ner': 43278.27375075633}
Losses at iteration 13 {'ner': 43406.35800886545}
Losses at iteration 14 {'ner': 43166.70321452029}
Losses at iteration 15 {'ner': 42833.017318742066}
Losses at iteration 16 {'ner': 43176.65926181083}
Losses at iteration 17 {'ner': 42930.17800784978}
Losses at iteration 18 {'ner': 43062.6399440933}
Losses at iteration 19 {'ner': 42981.88144029334}
Losses 

In [12]:
# Quick check: run the freshly trained pipeline on one news paragraph and
# print the recognised entities, first as spans, then as (text, label) pairs.
sample_text = "SELUBUNG yang menyelimuti kasus penembakan yang menewaskan Pendeta Yeremia Zanambani di Kabupaten Intan Jaya, Papua kian terkuak. Hasil investigasi Tim Gabungan Pencari Fakta (TGPF) kasus tersebut menyatakan bahwa penembakan di Intan Jaya diduga dilakukan oleh aparat keamanan."
doc = nlp(sample_text)
print(doc.ents)
entity_pairs = [(span.text, span.label_) for span in doc.ents]
print("Entities", entity_pairs)

(SELUBUNG, Pendeta Yeremia Zanambani, Kabupaten Intan, Gabungan Pencari Fakta (TGPF, Intan Jaya)
Entities [('SELUBUNG', 'ORGANIZATION'), ('Pendeta Yeremia Zanambani', 'PERSON'), ('Kabupaten Intan', 'LOCATION'), ('Gabungan Pencari Fakta (TGPF', 'ORGANIZATION'), ('Intan Jaya', 'LOCATION')]


In [14]:
# Write the trained pipeline to a checkpoint directory (path is relative to
# the current working directory).
from pathlib import Path

checkpoint_name = 'nlp_id_checkpoint_2020_10_26'
output_dir = Path(checkpoint_name)
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to nlp_id_checkpoint_2020_10_26


In [2]:
# Load a previously saved pipeline from its checkpoint directory into
# `nlp_updated`. The directory name is a plain string here and is the same
# name the save cell writes to.
output_dir = 'nlp_id_checkpoint_2020_10_26'
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)


Loading from nlp_id_checkpoint_2020_10_26


In [7]:
# Run the reloaded pipeline on a sample sentence and list (text, label) pairs.
sample_text = "Kementerian Perhubungan tidak mewajibkan rapid test COVID-19 untuk perjalanan darat lintas daerah, kecuali untuk tujuan Bali. Termasuk, dalam periode cuti bersama."
doc = nlp_updated(sample_text)
entity_pairs = [(span.text, span.label_) for span in doc.ents]
print("Entities", entity_pairs)

Entities [('Bali', 'LOCATION')]


In [8]:
# Visualise the entities of the current `doc` with displaCy's "ent" style.
displacy.render(doc, style="ent")

In [11]:
# Run the reloaded pipeline on a second sample and list (text, label) pairs.
sample_text = "Empat saksi terkait korupsi proyek infrastruktur fiktif yang dikerjakan PT Waskita Karya (Persero) Tbk absen dari panggilan KPK hari ini. Seorang di antaranya mantan Bupati Wakatobi, Hugua."
doc = nlp_updated(sample_text)
entity_pairs = [(span.text, span.label_) for span in doc.ents]
print("Entities", entity_pairs)

Entities [('PT Waskita Karya', 'ORGANIZATION'), ('KPK', 'ORGANIZATION'), ('Hugua', 'PERSON')]


In [12]:
# Visualise the entities of the current `doc` with displaCy's "ent" style.
displacy.render(doc, style="ent")