In [1]:
import spacy
import glob
from spacy.language import Language
from spacy.tokens import Span
from collections import Counter
import re
import string
import toml

from spacy import displacy


In [2]:
def create_mwt(text, label, tokenizer):
    """Turn a (possibly multi-word) term into a token-pattern entry.

    Args:
        text: The term to match.
        label: Label stored alongside the pattern.
        tokenizer: Callable that splits ``text`` into objects exposing ``.text``.

    Returns:
        A dict ``{"pattern": [...], "label": label}`` with one token spec per
        token. The exact token ``"the"`` is matched case-insensitively
        (``LOWER``); every other token must match verbatim (``TEXT``).
    """
    token_specs = [
        {"LOWER": tok.text} if tok.text == "the" else {"TEXT": tok.text}
        for tok in tokenizer(text)
    ]
    return {"pattern": token_specs, "label": label}

In [3]:
def open_file(file):
    """Read an asset file and derive a label from its file name.

    Args:
        file: Path to a UTF-8 text file; both ``/`` and ``\\`` separators
            are accepted when extracting the file name.

    Returns:
        A ``(data, label)`` tuple. ``data`` is the list of lines with the
        first line dropped; ``label`` is the upper-cased file name with
        ``.TXT`` removed.
    """
    with open(file, "r", encoding="utf-8") as handle:
        lines = handle.read().splitlines()
    entries = lines[1:]  # the first line is not treated as an entry

    basename = file.replace("\\", "/").split("/")[-1]
    label = basename.upper().replace(".TXT", "")
    return entries, label

In [4]:
def create_patterns(file, tokenizer, label=""):
    """Build token patterns (plus common variants) for every entry in ``file``.

    For each non-blank entry the following patterns are produced:
      * the entry as written;
      * if it contains ``(``: the text before the parenthesis and the text
        inside it, each as a separate pattern;
      * if the text before the parenthesis contains the standalone word
        ``USS`` or ``HMS``: that text with the prefix replaced by ``the``.

    Args:
        file: Path of the asset file (read via ``open_file``).
        tokenizer: Tokenizer passed through to ``create_mwt``.
        label: Label for all patterns; when empty, the label derived from
            the file name is used.

    Returns:
        List of pattern dicts as produced by ``create_mwt``.
    """
    data, file_label = open_file(file)
    if label == "":
        label = file_label

    patterns = []

    def add(text):
        # Blank text carries nothing to match, so never emit a pattern for it.
        if text:
            patterns.append(create_mwt(text, label, tokenizer))

    # Whole-word match only, so e.g. "USSR" is not rewritten to "theR".
    ship_prefix = r"\b(?:USS|HMS)\b"

    for d in data:
        if not d.strip():
            continue  # skip blank lines in the asset file
        add(d)
        head = d.split("(")[0].strip()
        if "(" in d:
            add(head)
            add(d.split("(")[1].replace(")", "").strip())
        if re.search(ship_prefix, head):
            add(re.sub(ship_prefix, "the", head))
    return patterns

In [5]:
def create_military_patterns(file, tokenizer):
    """Build token patterns for military-unit entries listed in ``file``.

    For each non-blank entry the following patterns are produced:
      * the entry as written;
      * if it starts with a digit: ``"the "`` followed by its first
        whitespace-separated word (e.g. ``"the 10th"``);
      * if it contains ``(``: the text before the parenthesis.

    Args:
        file: Path of the asset file (read via ``open_file``); the label is
            derived from the file name.
        tokenizer: Tokenizer passed through to ``create_mwt``.

    Returns:
        List of pattern dicts as produced by ``create_mwt``.
    """
    data, label = open_file(file)
    patterns = []
    for d in data:
        # A blank line would make ``d[0]`` raise IndexError; skip it.
        if not d.strip():
            continue
        patterns.append(create_mwt(d, label, tokenizer))
        if d[0].isdigit():
            patterns.append(create_mwt("the " + d.split()[0].strip(), label, tokenizer))
        if "(" in d:
            head = d.split("(")[0].strip()
            # An entry that starts with "(" has no head text to match.
            if head:
                patterns.append(create_mwt(head, label, tokenizer))
    return patterns

In [6]:
def patterns_dir(directory, mode="", label=""):
    """Collect patterns from every file matching a glob expression.

    Args:
        directory: Glob expression passed to ``glob.glob``.
        mode: ``""`` to use ``create_patterns`` or ``"military"`` to use
            ``create_military_patterns``. Any other value produces no
            patterns.
        label: Forwarded to ``create_patterns`` (ignored in military mode).

    Returns:
        A single flat list with the patterns of all matched files.
    """
    tokenizer = spacy.blank("en")
    collected = []
    for path in glob.glob(directory):
        if mode == "":
            collected.extend(create_patterns(path, tokenizer, label))
        elif mode == "military":
            collected.extend(create_military_patterns(path, tokenizer))
    return collected

In [7]:
def regex_patterns(file, extra=""):
    """Build one alternation regex from the entries of a text file.

    The first line of the file is skipped. Each remaining non-blank line is
    wrapped in its own group and the groups are joined with ``|`` inside an
    outer group; ``extra`` is appended after that outer group. Entries are
    inserted verbatim (not escaped), so they may themselves contain regex
    syntax.

    Blank lines are ignored: an empty ``()`` alternative would match the
    empty string and make the whole prefix optional.

    Args:
        file: Path to a UTF-8 text file with one entry per line.
        extra: Regex source appended after the alternation group.

    Returns:
        The regex source string. It is also printed.
    """
    with open(file, "r", encoding="utf-8") as f:
        entries = f.read().splitlines()[1:]
    alternatives = "|".join(f"({entry})" for entry in entries if entry.strip())
    patterns = f"({alternatives}){extra}"
    print(patterns)
    return patterns

In [8]:
@Language.component("clean_spans")
def clean_spans(doc):
    """Normalize the spans stored under ``doc.spans["ruler"]``.

    Two passes are applied:
      1. A leading "the" (any casing) is removed from each span by moving
         the span start one token forward.
      2. Among spans that share the same start token, only the span(s) with
         the greatest end are kept. Spans with identical start, end and
         text (e.g. differing only in label) are all kept.

    Args:
        doc: The spaCy ``Doc`` being processed; must already have a
            ``"ruler"`` span group.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` replaced.
    """
    # Pass 1: strip a leading "the" from each span.
    trimmed = []
    for span in doc.spans["ruler"]:
        if span[0].text.lower() == "the":
            span.start = span.start + 1
        trimmed.append(span)
    doc.spans["ruler"] = trimmed

    # Pass 2: for every start index used by more than one span, remember the
    # furthest end (and its text). Starts are counted after the trimming
    # above, since that can make previously distinct spans coincide.
    start_counts = Counter(span.start for span in doc.spans["ruler"])
    longest = {}
    for span in doc.spans["ruler"]:
        if start_counts[span.start] > 1:
            best = longest.get(span.start)
            # Key on the start index: the dict is keyed by start, so the
            # membership test must use the start as well, otherwise the
            # last span seen would always win regardless of length.
            if best is None or best[0] < span.end:
                longest[span.start] = [span.end, span.text]

    final_spans = [
        span
        for span in doc.spans["ruler"]
        if span.start not in longest
        or [span.end, span.text] == longest[span.start]
    ]

    doc.spans["ruler"] = final_spans
    return doc

@Language.component("military_personnel")
def military_personel(doc):
    """Add ``MILITARY_PERSONNEL`` spans for "<rank> <Capitalized Name...>".

    The rank alternatives come from ``assets/military_ranks/american/army.txt``
    (via ``regex_patterns``); the rank must be followed by one or more
    space-separated capitalized words, which may contain periods. Matching
    is done sentence by sentence and the new spans are appended to the
    existing ``doc.spans["ruler"]`` group.

    Args:
        doc: The spaCy ``Doc`` being processed; needs sentence boundaries
            and an existing ``"ruler"`` span group.

    Returns:
        The same ``doc`` with the extended ``"ruler"`` span group.
    """
    # Raw string: same regex as before, without the invalid "\." escape.
    military_pattern = regex_patterns("assets/military_ranks/american/army.txt",
                                      extra=r" [A-Z][a-z\.]*( [A-Z][a-z\.]*)*")
    spans = list(doc.spans["ruler"])
    for sent in doc.sents:
        for match in re.finditer(military_pattern, sent.text):
            # Match offsets are relative to the sentence; shift to doc offsets.
            start_char, end_char = match.span()
            span = doc.char_span(start_char + sent.start_char,
                                 end_char + sent.start_char)
            # char_span yields None when the offsets do not line up with
            # token boundaries; such matches cannot be turned into a span.
            if span is None:
                continue
            end = span.end
            # Drop the final token when the matched text ends in punctuation
            # (e.g. a sentence-final period captured by the name pattern).
            if span.text[-1] in string.punctuation:
                end -= 1
            spans.append(Span(doc, span.start, end, label="MILITARY_PERSONNEL"))
    doc.spans["ruler"] = spans
    return doc

In [9]:
@Language.component("clean_tank")
def clean_tank(doc):
    """Trim a trailing "tank"/"tanks" word from spans labelled ``TANK``.

    Every span in ``doc.spans["ruler"]`` is kept; ``TANK`` spans whose last
    whitespace-separated word is exactly ``tank`` or ``tanks`` have their
    end moved back by one token.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` reassigned.
    """
    trailing_words = ["tank", "tanks"]
    kept = []
    for candidate in doc.spans["ruler"]:
        is_tank = candidate.label_ == "TANK"
        if is_tank and candidate.text.split()[-1] in trailing_words:
            candidate.end -= 1
        kept.append(candidate)
    doc.spans["ruler"] = kept
    return doc

In [1]:
# Metadata copied into the saved model's meta below.
pipeline_data = toml.load("./project.toml")["pipeline_data"]

# Build the rule patterns for each asset category.
ships = patterns_dir("assets/ships/american/*.txt")
military_units = patterns_dir("assets/military_units/american/*.txt", mode="military")
tanks = patterns_dir("assets/tanks/american/*.txt")
planes = patterns_dir("assets/planes/american/*.txt")
# Weapons get their own variable: assigning them to `planes` discarded the
# plane patterns and left `weapons` undefined for the concatenation below.
weapons = patterns_dir("assets/weapons/american/*.txt")
battles = patterns_dir("assets/battles/*.txt", label="BATTLE")
all_patterns = ships+military_units+tanks+planes+battles+weapons

# Assemble the pipeline: sentence splitting, rule-based spans, then the
# custom components that add to and clean up doc.spans["ruler"].
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

ship_ruler = nlp.add_pipe("span_ruler")
ship_ruler.add_patterns(all_patterns)
nlp.add_pipe("military_personnel")
nlp.add_pipe("clean_spans")
nlp.add_pipe("clean_tank")
for name, val in pipeline_data.items():
    nlp.meta[name] = val
nlp.to_disk("./models/ww2spacy")

NameError: name 'toml' is not defined

In [None]:
# Smoke test: run the pipeline on a sample sentence and render the spans
# stored under doc.spans["ruler"] inline in the notebook.
text = "The P-35 flew in WW2 at Battle of Point 175. The 10th Armored Division was led by General William H. H. Morris. It contained 24 Sherman tanks. John Sherman is a false positive."
doc = nlp(text)
displacy.render(doc, style="span", jupyter=True, options = {"spans_key": "ruler"})