In [212]:
import spacy
import glob
from spacy.language import Language
from spacy.tokens import Span
from collections import Counter
import re
import string
import toml
from num2words import num2words

from spacy import displacy

In [2]:
def create_mwt(text, label, tokenizer):
    """Build a token-pattern entry for a (possibly multi-word) term.

    Args:
        text: The term to turn into a pattern.
        label: Value stored under the ``"label"`` key of the result.
        tokenizer: Callable that splits ``text`` into objects exposing a
            ``.text`` attribute.

    Returns:
        A dict ``{"pattern": [...], "label": label}`` holding one token spec
        per token: ``{"LOWER": "the"}`` for the exact token ``"the"`` and
        ``{"TEXT": token}`` for every other token.
    """
    token_specs = [
        {"LOWER": tok.text} if tok.text == "the" else {"TEXT": tok.text}
        for tok in tokenizer(text)
    ]
    return {"pattern": token_specs, "label": label}

In [3]:
def open_file(file):
    """Read a term list and derive its label from the file name.

    Args:
        file: Path to a UTF-8 text file. The first line is skipped.

    Returns:
        A tuple ``(lines, label)`` where ``lines`` is every line after the
        first (without line endings) and ``label`` is the upper-cased file
        name with ``".TXT"`` removed.
    """
    with open(file, "r", encoding="utf-8") as handle:
        all_lines = handle.read().splitlines()
    entries = all_lines[1:]

    # Normalise Windows separators so the final path component can be taken.
    basename = file.replace("\\", "/").split("/")[-1]
    label = basename.upper().replace(".TXT", "")
    return entries, label

In [200]:
def create_patterns(file, tokenizer, label=""):
    """Create span patterns for every entry of a term-list file.

    Args:
        file: Path handed to ``open_file``.
        tokenizer: Tokenizer forwarded to ``create_mwt``.
        label: Label for all patterns. When left as ``""`` the label derived
            from the file name by ``open_file`` is used instead.

    Returns:
        A list of pattern dicts. Each entry yields a pattern for the full
        line; entries containing ``"("`` also yield the text before the
        parenthesis and the text inside it; entries containing ``"USS"`` or
        ``"HMS"`` also yield the pre-parenthesis text with that prefix
        replaced by ``"the"``.
    """
    entries, file_label = open_file(file)
    if label == "":
        label = file_label

    collected = []
    for entry in entries:
        variants = [entry]
        if "(" in entry:
            pieces = entry.split("(")
            variants.append(pieces[0].strip())
            variants.append(pieces[1].replace(")", "").strip())
        if "USS" in entry or "HMS" in entry:
            ship_name = entry.split("(")[0].strip()
            variants.append(ship_name.replace("USS", "the").replace("HMS", "the"))
        collected.extend(create_mwt(variant, label, tokenizer) for variant in variants)
    return collected

In [240]:
def create_military_patterns(file, tokenizer):
    """Create span patterns for military unit names listed in a file.

    For every non-blank entry the following patterns are produced:

    * the entry as written;
    * when the first word is a numeric ordinal (digits followed by
      ``st``/``nd``/``rd``/``th``): ``"the <Spelled-Out Ordinal>"`` and the
      entry with that first word replaced by the spelled-out ordinal;
    * when the first word starts with a digit: ``"the <first word>"``;
    * when the entry contains ``"("``: the text before the parenthesis.

    Args:
        file: Path handed to ``open_file``; the label comes from the file name.
        tokenizer: Tokenizer forwarded to ``create_mwt``.

    Returns:
        A list of pattern dicts as built by ``create_mwt``.
    """
    data, label = open_file(file)
    patterns = []
    for d in data:
        words = d.split()
        if not words:
            # A blank line has no first word to inspect and nothing to match.
            continue
        designation = words[0]

        # Only a purely numeric prefix counts as an ordinal, so ordinary words
        # that merely end in "st"/"th" (e.g. "First", "West") are left alone.
        ordinal = re.fullmatch(r"([0-9]+)(?:st|nd|rd|th)", designation)
        if ordinal:
            try:
                spelled = num2words(int(ordinal.group(1)), ordinal=True).title()
            except (ValueError, OverflowError, TypeError):
                # Best effort: keep the numeric forms if spelling-out fails.
                spelled = ""
            if spelled:
                patterns.append(create_mwt(f"the {spelled}", label, tokenizer))
                # Swap the whole leading token (digits *and* suffix) once, so
                # "6th Division" becomes "Sixth Division", not "Sixthth ...".
                patterns.append(
                    create_mwt(d.replace(designation, spelled, 1), label, tokenizer)
                )

        patterns.append(create_mwt(d, label, tokenizer))
        if designation[0].isdigit():
            patterns.append(create_mwt("the " + designation, label, tokenizer))
        if "(" in d:
            before_paren = d.split("(")[0].strip()
            if before_paren:
                patterns.append(create_mwt(before_paren, label, tokenizer))
    return patterns

In [6]:
def patterns_dir(directory, mode="", label=""):
    """Collect patterns from every file matching a glob expression.

    Args:
        directory: Glob expression passed to ``glob.glob``.
        mode: ``""`` to use ``create_patterns`` or ``"military"`` to use
            ``create_military_patterns``. Any other value yields no patterns.
        label: Label forwarded to ``create_patterns`` (ignored in
            ``"military"`` mode).

    Returns:
        A single list with the patterns of all matched files.
    """
    tokenizer = spacy.blank("en")
    collected = []
    for path in glob.glob(directory):
        if mode == "":
            collected.extend(create_patterns(path, tokenizer, label))
        elif mode == "military":
            collected.extend(create_military_patterns(path, tokenizer))
    return collected

In [42]:
@Language.component("clean_spans")
def clean_spans(doc):
    """Strip leading "the" from ruler spans and resolve same-start overlaps.

    Reads and rewrites ``doc.spans["ruler"]``:

    1. Any span whose first token is "the" (case-insensitive) has its start
       moved one token to the right.
    2. Among spans that share a start token, only those matching the longest
       one (same end and text) are kept. Spans that tie for longest are all
       kept. Spans with a unique start are left untouched.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` replaced.
    """
    # Remove a leading "the" from spans.
    trimmed = []
    for span in doc.spans["ruler"]:
        if span[0].text.lower() == "the":
            span.start = span.start + 1
        trimmed.append(span)
    doc.spans["ruler"] = trimmed

    # Count how many spans begin at each token so overlaps can be detected
    # in a single pass.
    start_counts = Counter(span.start for span in doc.spans["ruler"])

    # For every contested start, remember the furthest-reaching span.
    # The lookup is keyed by start offset; a strict "<" keeps the first span
    # seen when several have equal length.
    longest = {}
    for span in doc.spans["ruler"]:
        if start_counts[span.start] > 1:
            best = longest.get(span.start)
            if best is None or best[0] < span.end:
                longest[span.start] = [span.end, span.text]

    final_spans = [
        span
        for span in doc.spans["ruler"]
        if span.start not in longest
        or [span.end, span.text] == longest[span.start]
    ]

    doc.spans["ruler"] = final_spans
    return doc

In [43]:
@Language.component("clean_tank")
def clean_tank(doc):
    """Drop a trailing "tank"/"tanks" word from spans labelled ``TANK``.

    Every span in ``doc.spans["ruler"]`` is kept; ``TANK`` spans whose last
    whitespace-separated word is ``"tank"`` or ``"tanks"`` have their end
    moved one token to the left.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` reassigned.
    """
    adjusted = []
    for candidate in doc.spans["ruler"]:
        is_tank = candidate.label_ == "TANK"
        if is_tank and candidate.text.split()[-1] in ["tank", "tanks"]:
            candidate.end = candidate.end - 1
        adjusted.append(candidate)
    doc.spans["ruler"] = adjusted
    return doc

In [44]:
def regex_patterns(file, extra=""):
    """Build one regular-expression alternation from the lines of a file.

    Args:
        file: Path to a UTF-8 text file. The first line is skipped; every
            remaining non-blank line is used verbatim as one alternative.
        extra: Regex text appended after the closing parenthesis.

    Returns:
        The string ``"(line1|line2|...)" + extra``. The expression is also
        printed.
    """
    with open(file, "r", encoding="utf-8") as f:
        data = f.read().splitlines()[1:]

    # Skip blank lines: an empty alternative would match the empty string and
    # make the whole group optional.
    alternatives = [d for d in data if d.strip()]
    joined = "|".join(alternatives)
    patterns = f"({joined}){extra}"
    print(patterns)
    return patterns

In [None]:
@Language.component("find_ghetto")
def find_ghetto(doc):
    """Label the capitalised word directly before "ghetto" as ``GHETTO``.

    For each token whose lower-cased text is ``"ghetto"`` and that has a
    preceding token starting with an upper-case character, a one-token span
    covering that preceding token is appended to ``doc.spans["ruler"]``.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` reassigned.
    """
    original_ents = list(doc.spans["ruler"])
    for i, token in enumerate(doc):
        # i == 0 has no preceding token; doc[-1] would wrap to the last token
        # and Span(doc, -1, 0) is out of bounds.
        if i == 0 or token.text.lower() != "ghetto":
            continue
        prev_token = doc[i - 1]
        if prev_token.text[0].isupper():
            original_ents.append(Span(doc, i - 1, i, label="GHETTO"))
    doc.spans["ruler"] = original_ents
    return doc

In [238]:
@Language.component("clean_ships")
def clean_ships(doc):
    """Keep ``CRUISER``/``BATTLESHIP`` spans only when nautical context is near.

    For every such span, the document text from 25 characters before the span
    to 25 characters after it (clamped to the document bounds) is searched for
    any of the hit words as a case-sensitive substring. Spans without a hit
    are dropped; spans with other labels are always kept.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` replaced.
    """
    hit_words = ["crew", "sea", "marine", "water", "ship", "boat", "vessel", "aboard", "captain", "sail"]
    # Number of characters inspected on each side of a span. Kept constant so
    # one span's window cannot leak into the next.
    window = 25
    doc_text = doc.text
    original_ents = list(doc.spans["ruler"])
    new_ents = []
    for span in original_ents:
        if span.label_ in ["CRUISER", "BATTLESHIP"]:
            context_start = max(span.start_char - window, 0)
            context_end = min(span.end_char + window, len(doc_text))
            window_text = doc_text[context_start:context_end]
            if any(hit in window_text for hit in hit_words):
                new_ents.append(span)
        else:
            new_ents.append(span)
    doc.spans["ruler"] = new_ents
    return doc

@Language.component("military_personnel")
def military_personel(doc):
    """Add ``MILITARY_PERSONNEL`` spans for "<rank> <Capitalised Name...>" text.

    A regular expression is built by ``regex_patterns`` from
    ``assets/military_ranks/american/army.txt`` and extended so that the rank
    must be followed by one or more space-separated capitalised words. Each
    match in ``doc.text`` is converted to a token span and appended to
    ``doc.spans["ruler"]``.

    Args:
        doc: The document being processed.

    Returns:
        The same ``doc`` with ``doc.spans["ruler"]`` reassigned.
    """
    # NOTE(review): regex_patterns is called on every document, so the rank
    # file is opened and the pattern rebuilt each time this component runs.
    military_pattern = regex_patterns("assets/military_ranks/american/army.txt",
                                              extra=" [A-Z][a-z\.]*( [A-Z][a-z\.]*)*")
    # NOTE(review): `text` and `new_ents` (and `name` below) are assigned but
    # never read in this function.
    text = doc.text
    new_ents = []
    original_ents = list(doc.spans["ruler"])

    for match in re.finditer(military_pattern, doc.text):
        start, end = match.span()
        # "expand" snaps the character range outward to whole tokens.
        span = doc.char_span(start, end, alignment_mode="expand")
        if span != None:
            # Drop the final token when the span text ends in punctuation.
            if span.text[-1] in string.punctuation:
                span.end = span.end-1
            start, end, name = span.start, span.end, span.text
            tmp_span = Span(doc, start, end, label="MILITARY_PERSONNEL")
            # Debug output.
            print(tmp_span)
            for i, token in enumerate(tmp_span):
                # Only tokens from the fourth onward are considered, and only
                # when the token two positions earlier is not a substring of
                # the pattern text (with backslashes removed).
                if i > 2 and doc[(tmp_span.start+i)-2].text not in military_pattern.replace("\\", ""):
                    print(token.text)
                    # A sentence start inside the span: cut the span so it
                    # ends at token index start+i-1 (exclusive).
                    # NOTE(review): this mutates tmp_span while it is being
                    # iterated — confirm the intended end offset.
                    if token.is_sent_start == True:
                        print("START SENT: ", token, (tmp_span.start+i)-2)
                        tmp_span.end=tmp_span.start+i-1
                    
            original_ents.append(tmp_span)
    doc.spans["ruler"] = original_ents
    return doc

# Metadata copied into the saved model's meta at the end of this cell.
pipeline_data = toml.load("./project.toml")["pipeline_data"]

# Pattern sets, one per asset folder. Labels come from the file names unless
# an explicit label is given.
ships = patterns_dir("assets/ships/american/*.txt")
military_units = patterns_dir("assets/military_units/american/*.txt", mode="military")
tanks = patterns_dir("assets/tanks/american/*.txt")
planes = patterns_dir("assets/planes/american/*.txt")
weapons = patterns_dir("assets/weapons/american/*.txt")
battles = patterns_dir("assets/battles/*.txt", label="BATTLE")
operations = patterns_dir("assets/military_operations/*.txt", label="OPERATION")
camps = patterns_dir("assets/holocaust/*.txt")

all_patterns = [
    *ships,
    *military_units,
    *tanks,
    *planes,
    *battles,
    *weapons,
    *operations,
    *camps,
]

# Assemble the pipeline: sentence boundaries, the span ruler carrying every
# pattern, then the custom components in the order they must run.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

ship_ruler = nlp.add_pipe("span_ruler")
ship_ruler.add_patterns(all_patterns)

for component in (
    "military_personnel",
    "find_ghetto",
    "clean_spans",
    "clean_tank",
    "clean_ships",
):
    nlp.add_pipe(component)

for name, val in pipeline_data.items():
    nlp.meta[name] = val

nlp.to_disk("./models/ww2spacy")

6 Sixth
9 Ninth
11 Eleventh
13 Thirteenth
15 Fifteenth
17 Seventeenth
18 Eighteenth
21 Twenty-First
101 One Hundred And First
135 One Hundred And Thirty-Fifth
1 First
3 Third
4 Fourth
5 Fifth
6 Sixth
7 Seventh
8 Eighth
9 Ninth
10 Tenth
11 Eleventh
12 Twelfth
13 Thirteenth
14 Fourteenth
15 Fifteenth
16 Sixteenth
18 Eighteenth
19 Nineteenth
20 Twentieth
21 Twenty-First
39 Thirty-Ninth
12 Twelfth
15 Fifteenth
19 Nineteenth
184 One Hundred And Eighty-Fourth
267 Two Hundred And Sixty-Seventh
285 Two Hundred And Eighty-Fifth
291 Two Hundred And Ninety-First
305 Three Hundred And Fifth
313 Three Hundred And Thirteenth
320 Three Hundred And Twentieth
551 Five Hundred And Fifty-First
555 Five Hundred And Fifty-Fifth
717 Seven Hundred And Seventeenth
758 Seven Hundred And Fifty-Eighth
761 Seven Hundred And Sixty-First
763 Seven Hundred And Sixty-Third
789 Seven Hundred And Eighty-Ninth
2671 Two Thousand, Six Hundred And Seventy-First
6888 Six Thousand, Eight Hundred And Eighty-Eighth
1 First
1 F

In [239]:
# Smoke test: run the assembled pipeline on a sample passage and render the
# spans stored under doc.spans["ruler"] inline in the notebook.
text = "the Sixth Airborne Division Combat Command-B's lead Sherman tanks, tank destroyers and half-tracks entered Bastogne 18 December 1944. These were the first combat troops to reach the threatened town. CCB's commander, Col. William L. Roberts, split his command to form a crescent-shaped arc facing eastward five miles from the city. A task force commanded by Maj. William R. Desobry went north to Noville, while a similar group under Lt. Col. Henry T. Cherry wheeled east to Longvilly. Lt. Col. James O'Hara's group shifted southeast to Bras. "
doc = nlp(text)
displacy.render(doc, style="span", jupyter=True, options = {"spans_key": "ruler"})

(Private|PV1|Pvt|Pvt\.|Private First Class|Pfc|Pfc\.|Corporal|Cpl|Cpl\.|Sergeant|Sgt|Sgt\.|Staff Sergeant|SSG|S\/Sgt|Staff Sgt\.|Staff Sgt|Sergeant First Class|SFC|T\/Sgt|First Sergeant|1SG|1sg|1st Sgt|1st Sgt\.|Master Sergeant|MSG|m\/Sgt|m \Sgt|M Sgt\.|Second Lieutenant|2lt|2Lt\.|2Lt|First Lieutenant|1Lt|2Lt\.|2Lt|Lt\.|Lieutenant|Captain|Cap|Cpt|Capt|Cpt\.|Capt\.|Cap\.|Major|Maj|Maj\.|Lieutenant Colonel|LTC|Lt Colonel|Lt\. Colonel|Lt\. Col\.|Colonel|Col|Col\.|General|Gen|Gen\.|Brigadier General|Brigadier Gen|Brig\. Gen\.|Brigadier Gen\.|Major General|Major Gen|Maj\. Gen\.|Lieutenant General|Lt\. Gen\.|Lt Gen) [A-Z][a-z\.]*( [A-Z][a-z\.]*)*
Col. William L. Roberts
Roberts
Maj. William R. Desobry
Desobry
Lt. Col. Henry T. Cherry
Cherry
Lt. Col. James O'Hara


In [199]:
# Package the pipeline saved in ./models/ww2spacy into ./models/packaged,
# bundling ww2spacy/ww2spacy_components.py via --code; --force overwrites
# existing output.
!python -m spacy package ./models/ww2spacy ./models/packaged --code ww2spacy/ww2spacy_components.py --force

running sdist
running egg_info
creating en_ww2spacy.egg-info
writing en_ww2spacy.egg-info\PKG-INFO
writing dependency_links to en_ww2spacy.egg-info\dependency_links.txt
writing entry points to en_ww2spacy.egg-info\entry_points.txt
writing requirements to en_ww2spacy.egg-info\requires.txt
writing top-level names to en_ww2spacy.egg-info\top_level.txt
writing manifest file 'en_ww2spacy.egg-info\SOURCES.txt'
reading manifest file 'en_ww2spacy.egg-info\SOURCES.txt'
reading manifest template 'MANIFEST.in'
writing manifest file 'en_ww2spacy.egg-info\SOURCES.txt'
running check
creating en_ww2spacy-0.0.8
creating en_ww2spacy-0.0.8\en_ww2spacy
creating en_ww2spacy-0.0.8\en_ww2spacy.egg-info
creating en_ww2spacy-0.0.8\en_ww2spacy\en_ww2spacy-0.0.8
creating en_ww2spacy-0.0.8\en_ww2spacy\en_ww2spacy-0.0.8\span_ruler
creating en_ww2spacy-0.0.8\en_ww2spacy\en_ww2spacy-0.0.8\vocab
copying files to en_ww2spacy-0.0.8...
copying MANIFEST.in -> en_ww2spacy-0.0.8
copying README.md -> en_ww2spacy-0.0.8
copy

