In [1]:
import spacy
import pandas as pd
from spacy import displacy
import srsly
from spacy.language import Language
from spacy.tokens import Span

In [2]:
df = pd.read_feather("../data/full_data_00-00-05")
df.head(1)

Unnamed: 0,index,full_name,lastname,firstname,description,place,homeland,province,long,lat,hrv,org,date,object_id,coordinates,full_orgs,old_org,age,gender
0,1,Thabo Simon Aaron,AARON,Thabo Simon,An ANCYL member who was shot and severely inju...,[Bethulie],[nan],[Orange Free State],25.97552,-30.50329,"[shoot, injure]",[ANCYL (African National Congress Youth League...,[1991-04-17],1,"[25.97552, -30.50329]",[ANCYL (African National Congress Youth League...,"[ANC, ANCYL, Police, SAP]",22,[Unknown]


In [3]:
orgs = df.full_orgs.tolist()
orgs[:1]

[array(['ANCYL (African National Congress Youth League)',
        'ANC (African National Congress)',
        'ANCYL (African National Congress Youth League)',
        'ANC (African National Congress)', 'Police',
        'SAP (South African Police)', 'SAP (South African Police)'],
       dtype=object)]

In [7]:
unique_orgs = ["MK"]
homelands = []
provinces = ["OFS"]
places = ["Lephoi", "Cape", "White River"]
hrvs = []
for idx, row in df.iterrows():
    # print(type(row.full_orgs))
    org = row.full_orgs.tolist()
    for o in org:
        unique_orgs.append(o)
        if "(" in o:
            first, second, *_ = o.split("(")
            first = first.strip()
            second = second.replace(")", "").strip()
            unique_orgs.append(first)
            unique_orgs.append(second)
    for homeland in row.homeland.tolist():
        if homeland not in homelands and homeland != "nan":
            homelands.append(homeland)
    for province in row.province.tolist():
        if province not in provinces and province != "nan":
            provinces.append(province)
    for place in row.place.tolist():
        if place not in places and place != "nan":
            places.append(place)
    try:
        for hrv in row.hrv.tolist():
            if hrv not in hrvs and hrv != "nan":
                hrvs.append(hrv)
    except:
        Exception
new = list(set(unique_orgs))
new.sort()
print(len(homelands), len(provinces), len(places), len(hrvs))

10 6 1149 35


In [8]:
def create_patterns(data, label, lemma=False):
    if lemma == False:
        return [{"label": label, "pattern": pattern} for pattern in data]
    else:
        all_patterns =[]
        for item in data:
            lemma_patterns = []
            for pattern in item.split():
                lemma_patterns.append({"LEMMA": pattern})
            all_patterns.append(lemma_patterns)
        return [{"label": label, "pattern": pattern} for pattern in all_patterns]

In [9]:
# hrvs

In [10]:
nlp = spacy.load("en_core_web_sm")
hrv_lemmas = ["burn to death", "kill", "rape", "petrol bomb", "stone to death", "kick", "shoot dead", "stab to death"]
for hrv in hrvs:
    doc = nlp(hrv)
    hrv_lemmas.append(doc[0].lemma_)
print(hrv_lemmas)
hrv_lemmas.remove("convict")

['kill', 'rape', 'petrol bomb', 'stone to death', 'kick', 'shoot dead', 'stab to death', 'shoot', 'injure', 'beat', 'blind', 'stab', 'stone', 'kill', 'burn', 'bomb', 'detain', 'assault', 'torture', 'arrest', 'convict', 'necklace', 'steal', 'imprison', 'destroy', 'interrogate', 'abduct', 'sexual', 'explode', 'mutilate', 'execute', 'intimidate', 'poison', 'frame', 'suffocate', 'drown', 'incarcerate', 'deprive', 'confine', 'shock', 'expose', 'electrocute']


In [11]:
def load_list(file):
    with open(file, "r", encoding="utf-8") as f:
        data = f.read().splitlines()
    return data
weapons = load_list("../assets/weapon.txt")
events = load_list("../assets/event.txt")

In [12]:
patterns = create_patterns(new, "ORG")

In [13]:
patterns = patterns+create_patterns(weapons, "WEAPON", lemma=True)
patterns = patterns+create_patterns(events, "EVENT", lemma=True)
patterns = patterns+create_patterns(homelands, "HOMELAND")
patterns = patterns+create_patterns(provinces, "PROVINCE")
patterns = patterns+create_patterns(places, "GPE")
patterns = patterns+create_patterns(hrv_lemmas, "HRV", lemma=True)

In [14]:
create_patterns(events, "EVENT", lemma=True)

[{'label': 'EVENT', 'pattern': [{'LEMMA': 'Soweto'}, {'LEMMA': 'uprising'}]},
 {'label': 'EVENT',
  'pattern': [{'LEMMA': 'trojan'}, {'LEMMA': 'horse'}, {'LEMMA': 'incident'}]},
 {'label': 'EVENT',
  'pattern': [{'LEMMA': 'Trojan'}, {'LEMMA': 'Horse'}, {'LEMMA': 'Incident'}]},
 {'label': 'EVENT',
  'pattern': [{'LEMMA': 'Church'}, {'LEMMA': 'Street'}, {'LEMMA': 'Bombing'}]},
 {'label': 'EVENT',
  'pattern': [{'LEMMA': '1985'}, {'LEMMA': 'school'}, {'LEMMA': 'boycotts'}]}]

In [15]:
nlp = spacy.load("en_core_web_sm")

In [16]:
@Language.component("keep_dates")
def keep_dates(doc):
    ents = []
    for ent in doc.ents:
        if ent.label_ == "DATE":
            ents.append(ent)
    doc.ents = ents
    return doc

In [17]:
@Language.component("find_adverb")
def find_adverb(doc):
    ents = []
    for ent in doc.ents:
        if ent.label_ == "HRV":
            if doc[ent.start-1].pos_ == "ADV" and doc[ent.start-1].text != "then":
                new_ent = Span(doc, ent.start-1, ent.end, label=ent.label_)
                ents.append(new_ent)
            elif doc[ent.start+1].pos_ == "ADV":
                new_ent = Span(doc, ent.start, ent.end+1, label=ent.label_)
                ents.append(new_ent)
            else:
                ents.append(ent)
        else:
            ents.append(ent)
    doc.ents = ents
    return doc

In [18]:
@Language.component("police_stations")
def police_stations(doc):
    ents = list(doc.ents)
    
    for chunk in doc.noun_chunks:
        if "police station" in chunk.text.lower():
            new_ent = Span(doc, chunk.start, chunk.end, label="POLICE_STATION")
            ents.append(new_ent)
    doc.ents = ents
    return doc

In [19]:
nlp.add_pipe("keep_dates")
nlp.add_pipe("police_stations")

<function __main__.police_stations(doc)>

In [20]:
ruler = nlp.add_pipe("entity_ruler")

In [21]:
ruler.add_patterns(patterns)

In [22]:
nlp.add_pipe("find_adverb")

<function __main__.find_adverb(doc)>

In [25]:
nlp.to_disk("../models/vol7_heuristic")

In [24]:
doc = nlp(df.description.tolist()[38])
displacy.render(doc, style="ent", jupyter=True)

In [140]:
for chunk in doc.noun_chunks:
    print(chunk)

A cleric
chairperson
the Pietermaritzburg Council
Churches
who
Imbali
Pietermaritzburg
4 May
his child
school
A named Inkatha member
the murder
appeal


In [19]:
# phrases = []
# for text in df.description.tolist():
#     doc = nlp(text)
#     for i, token in enumerate(doc):
#         if token.text == "with":
#             # print(" ".join([t.text for t in token.subtree]))
#             phrases.append(token.subtree)

In [None]:
# len(phrases)

In [87]:
def get_annotations(text, name):
    annotated_data = []

    annotations = [{"start": 0, "end": len(name)-1,
                                "label": "VICTIM", "token_start": 0,
                                "token_end": len(name.split())-1}, {"start": 0, "end": len(name)-1,
                                "label": "PERSON", "token_start": 0,
                                "token_end": len(name.split())-1}]
    doc = nlp(text)
    for span in doc.ents:
        # if span.label_ in ["HRV", "PROVINCE", "HOMELAND", "GPE", "ORG", "DATE", "EVENT"]:
        annotations.append({"start": span.start_char, "end": span.end_char,
                                "label": span.label_, "token_start": span.start,
                                "token_end": span.end-1})
    for sent in doc.sents:
        annotations.append({"start": sent.start_char, "end": sent.end_char,
                                "label": "VIOLENT_ACT", "token_start": sent.start,
                                "token_end": sent.end-1})
    # print(annotations)
    result = [dict(tupleized) for tupleized in set(tuple(item.items()) for item in annotations)]
    # print(result)
    annotated_data.append({"text": doc.text, "spans": result})
    return annotated_data

In [88]:
prodigy_data = []
for idx, row in df.iterrows():
    name = row.full_name
    d = row.description
    if "Was" == d[:3]:
        d = f"{name} {d[0].lower()}{d[1:]}"
    elif "A" == d[0]:
        d = f"{name} was {d[0].lower()}{d[1:]}"
    prodigy_data = prodigy_data+get_annotations(d, name)
    if len(prodigy_data) > 50:
        break

In [None]:
# prodigy_data

In [89]:
srsly.write_jsonl("C:\\Users\wma22\\OneDrive\\Documents\\GitHub\\bap-vol7-gold\\to_annotate\\vol7_prodigy.jsonl", prodigy_data)

In [None]:
prodigy_data[0]

In [None]:
# python -m prodigy spans.manual vol7 blank:en ./to_annotate/vol7_prodigy.jsonl --label HRV,HOMELAND,PROVINCE,ORG,DATE,WEAPON,VIOLENT_ACT,GPE,EVENT,VICTIM,PERSON,POLICE_STATION