In [1]:
import spacy
from spacy.pipeline import EntityRuler
import json
import glob

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever top-level object the file holds).
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.loads(handle.read())

def write_data(file, data):
    """Dump ``data`` as JSON, indented by four spaces, into ``file``.

    Args:
        file: Destination path; any existing content is overwritten.
        data: JSON-serialisable object to store.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(obj=data, fp=handle, indent=4)


In [25]:
# Generating rules
def generate_ruler(patterns, name):
    """Build a blank English pipeline containing only an entity ruler and save it.

    The pipeline is written to ``models/{name}_ent_ruler`` and the ruler's
    patterns are additionally exported to
    ``models/{name}_ent_ruler/entity_ruler/patterns.jsonl``.

    Args:
        patterns: Patterns handed unchanged to ``ruler.add_patterns``
            (presumably a list of ``{"label": ..., "pattern": ...}`` dicts, as
            produced by ``create_training_data`` in this notebook).
        name: Prefix used to build the model directory name.
    """
    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    model_dir = f"models/{name}_ent_ruler"
    # Save the whole pipeline first: this creates the model directory and its
    # ``entity_ruler`` sub-directory. Exporting the JSONL file before those
    # directories exist fails on a fresh checkout.
    nlp.to_disk(model_dir)
    ruler.to_disk(f"{model_dir}/entity_ruler/patterns.jsonl")

In [4]:
def create_training_data(file, type):
    """Turn a JSON list of terms into entity-ruler pattern dictionaries.

    Args:
        file: Path of a JSON file; each item it yields becomes one pattern.
        type: Entity label attached to every pattern (the parameter name
            shadows the built-in but is part of the existing interface).

    Returns:
        A list of ``{"label": type, "pattern": item}`` dicts, one per item, in
        file order.
    """
    return [{"label": type, "pattern": entry} for entry in load_data(file)]

In [5]:
def test_ent_ruler(ruler, corpus):
    nlp = spacy.load(ruler)
    with open (corpus, "r", encoding="utf-8") as f:
        corpus = f.read()
    with open ("temp/results.txt", "w", encoding="utf-8") as f:
        doc = nlp(corpus)
        for ent in doc.ents:
            f.write(f"{ent.text}, {ent.label_}\n")

In [30]:
def create_training_set(corpus, ent_ruler_model, output_file, prodigy=False):
    """Annotate a corpus with a saved pipeline and store the result as JSON.

    The corpus is split on newlines; every non-blank, stripped line is run
    through the pipeline, and lines with at least one entity are kept.

    Args:
        corpus: Path of the UTF-8 text file to annotate.
        ent_ruler_model: Path of the pipeline directory passed to ``spacy.load``.
        output_file: Path of the JSON file to write (4-space indent).
        prodigy: When true, emit ``{"text": ..., "spans": [{"start", "end",
            "label", "text"}, ...]}`` records; otherwise emit
            ``[text, {"entities": [(start, end, label), ...]}]`` pairs.

    Side effects:
        Prints the number of examples kept and writes ``output_file``.
    """
    nlp = spacy.load(ent_ruler_model)

    # Read everything up front so the corpus file is not held open while the
    # pipeline runs.
    with open(corpus, "r", encoding="utf-8") as f:
        segments = f.read().split("\n")

    training_examples = []
    for segment in segments:
        segment = segment.strip()
        if not segment:
            # A blank line cannot contain entities; skip the pipeline call.
            continue
        doc = nlp(segment)
        if prodigy:
            spans = [
                {"start": ent.start_char, "end": ent.end_char, "label": ent.label_, "text": ent.text}
                for ent in doc.ents
            ]
            if spans:
                training_examples.append({"text": segment, "spans": spans})
        else:
            entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
            if entities:
                training_examples.append([segment, {"entities": entities}])

    print(len(training_examples))
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(training_examples, f, indent=4)



In [18]:
# Build one list of entity-ruler patterns per label from its JSON term list.
person_patterns = create_training_data(
    file="latin_data/all_names_declined.json", type="PERSON"
)
groups_patterns = create_training_data(
    file="latin_data/groups_declined.json", type="GROUP"
)
places_patterns = create_training_data(
    file="latin_data/places_declined.json", type="LOCATION"
)

In [19]:
# Merge the three per-label pattern lists into one, keeping PERSON, GROUP, LOCATION order.
all_patterns = [*person_patterns, *groups_patterns, *places_patterns]

In [26]:
# Save the combined rule-based pipeline under models/latin_loc_per_group_ent_ruler.
generate_ruler(patterns=all_patterns, name="latin_loc_per_group")

In [27]:
# Sanity-check the saved ruler against the corpus; results are written to temp/results.txt.
test_ent_ruler(
    ruler="models/latin_loc_per_group_ent_ruler",
    corpus="latin_data/corpus.txt",
)

In [32]:
# Auto-annotate the corpus with the ruler and store it in spaCy's (text, entities) JSON layout.
create_training_set(
    corpus="latin_data/corpus.txt",
    ent_ruler_model="models/latin_loc_per_group_ent_ruler",
    output_file="training_data/training_set_spacy.json",
    prodigy=False,
)

388


In [33]:
from spacy.tokens import DocBin

In [36]:
# Reload the auto-annotated examples written by create_training_set.
all_docs = load_data(file="training_data/training_set_spacy.json")


In [37]:
# Inspect the first example to confirm the [text, {"entities": [...]}] layout.
print (all_docs[0])

['[1] Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur. Hi omnes lingua, institutis, legibus inter se differunt. Gallos ab Aquitanis Garumna flumen, a Belgis Matrona et Sequana dividit. Horum omnium fortissimi sunt Belgae, propterea quod a cultu atque humanitate provinciae longissime absunt, minimeque ad eos mercatores saepe commeant atque ea quae ad effeminandos animos pertinent important, proximique sunt Germanis, qui trans Rhenum incolunt, quibuscum continenter bellum gerunt. Qua de causa Helvetii quoque reliquos Gallos virtute praecedunt, quod fere cotidianis proeliis cum Germanis contendunt, cum aut suis finibus eos prohibent aut ipsi in eorum finibus bellum gerunt. Eorum una, pars, quam Gallos obtinere dictum est, initium capit a flumine Rhodano, continetur Garumna flumine, Oceano, finibus Belgarum, attingit etiam ab Sequanis et Helvetiis flumen Rhenum, vergit ad septentriones. Belgae 

In [38]:
# The first 200 examples form the training split.
train_docs = all_docs[:200]

In [39]:
# Everything after the first 200 examples is held out as the validation split.
valid_docs = all_docs[200:]

In [50]:
# Convert the training examples into spaCy Doc objects and collect them in a DocBin.
from tqdm import tqdm

train_db = DocBin()
nlp = spacy.blank("en")  # blank pipeline: only its tokenizer is used here
for text, annot in tqdm(train_docs):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps character offsets inward to token boundaries;
        # char_span returns None when no valid token span remains.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    doc.ents = ents
    train_db.add(doc)

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 609.77it/s]


In [51]:
# Convert the validation examples into spaCy Doc objects and collect them in
# their own DocBin, kept separate from the training DocBin.
valid_db = DocBin()
from tqdm import tqdm
nlp = spacy.blank("en")  # blank pipeline: only its tokenizer is used here
for text, annot in tqdm(valid_docs):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps character offsets inward to token boundaries;
        # char_span returns None when no valid token span remains.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    doc.ents = ents
    # Add to valid_db, not train_db: otherwise the validation file is saved
    # empty and the validation documents leak into the training set.
    valid_db.add(doc)

100%|███████████████████████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 648.33it/s]


In [52]:
# Write the training DocBin to disk as ./training_data/train_hs.spacy.
train_db.to_disk("./training_data/train_hs.spacy")

In [53]:

# Write the validation DocBin to disk as ./training_data/valid_hs.spacy.
valid_db.to_disk("./training_data/valid_hs.spacy")

In [54]:
# Expand base_config.cfg into a fully populated training config saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values

2021-07-01 11:43:01.032686: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll



[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [57]:
# Train the pipeline described by config.cfg; checkpoints are written under ./output.
# NOTE(review): no --paths.train / --paths.dev overrides are passed, so the data
# paths must already be set inside config.cfg -- confirm they point at
# ./training_data/train_hs.spacy and ./training_data/valid_hs.spacy.
!python -m spacy train config.cfg --output ./output

[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     81.36    0.00    0.00    0.00    0.00
  0     200          9.41   1744.06    0.00    0.00    0.00    0.00
  1     400         24.45    481.48    0.00    0.00    0.00    0.00
  1     600         41.73    239.30    0.00    0.00    0.00    0.00
  2     800         74.30    178.93    0.00    0.00    0.00    0.00
  2    1000         58.22    115.73    0.00    0.00    0.00    0.00
  3    1200         72.91    110.32    0.00    0.00    0.00    0.00
  4    1400         99.68    140.03    0.00    0.00    0.00    0.00
  5    1600         91.67    108.83    0.00    0.00    0.00    0.00
[+] Saved pipeline to output directory
output\model-last


2021-07-01 11:44:33.292158: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
[2021-07-01 11:44:36,020] [INFO] Set up nlp object from config
[2021-07-01 11:44:36,020] [INFO] Pipeline: ['tok2vec', 'ner']
[2021-07-01 11:44:36,020] [INFO] Created vocabulary
[2021-07-01 11:44:37,555] [INFO] Added vectors: en_core_web_lg
[2021-07-01 11:44:37,555] [INFO] Finished initializing nlp object
[2021-07-01 11:44:39,595] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [59]:
# Load the trained pipeline from the training output directory.
nlp = spacy.load("output/model-best")

In [69]:
# Read the whole of latin_data/livy_01.txt into memory as one string.
with open ("latin_data/livy_01.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [70]:
# Run the loaded pipeline over the full text in a single call.
doc = nlp(text)

In [71]:
# List every entity found in the text together with its label.
for ent in doc.ents:
    print (ent.text, ent.label_)

Troia LOCATION
Troianos GROUP
Achivos GROUP
Paphlagonia GROUP
Pylaemene GROUP
Troiam GROUP
Troia GROUP
Veneti GROUP
Macedoniam PERSON
Aenea PERSON
Troianos GROUP
Aeneas PERSON
Ascanium PERSON
Rutulique GROUP
Rutulis PERSON
Aeneas PERSON
Aboriginum GROUP
Aeneas PERSON
Numicum PERSON
Etruscis GROUP
Etruscis LOCATION
Latinisque PERSON
Silvium PERSON
Silvium GROUP
Prisci PERSON
Capys GROUP
Romulus PERSON
Amulium GROUP
Romani GROUP
Romulus PERSON
Remus GROUP
Romulus PERSON
Remus PERSON
Remumque GROUP
Romulus PERSON
Remus PERSON
Remo PERSON
Remus PERSON
Romulus PERSON
Evandro GROUP
Cacus PERSON
Hercules PERSON
Cacus PERSON
Sibyllae PERSON
Italiam LOCATION
Evander PERSON
Hercules PERSON
Hercules PERSON
Pinarii PERSON
Evandro GROUP
Romulus PERSON
Etruscis GROUP
Romulus PERSON
Romulus PERSON
Romulus PERSON
T. PERSON
Crustuminique PERSON
Tatius PERSON
Caeninum GROUP
Romulus PERSON
Romulus PERSON
Romani GROUP
Romanos GROUP
Crustuminos GROUP
Utroque PERSON
Romam GROUP
Sabini PERSON
Sabini PERSON
R