In [15]:
import spacy
from spacy.matcher import Matcher
from spacy.lang.en import English
import pandas as pd
import random
import os
from tqdm import tqdm
random.seed(2022)

In [8]:
# Directory that holds the input CSV file(s) (relative path).
DATA_PATH = "../data/csvfiles/scraped_web"
# Name of the CSV file inside DATA_PATH that the next cell loads.
filename = "beta_announcements.csv"
# List the directory contents to confirm the file is present.
os.listdir(DATA_PATH)

['beta_announcements.csv']

In [9]:
# Read the announcements CSV into a DataFrame and preview its first rows.
df = pd.read_csv(f"{DATA_PATH}/{filename}")
df.head()

Unnamed: 0.1,Unnamed: 0,URL,text,title,published_date,summary
0,0,https://www.futureflight.aero/news-article/202...,The U.S. Army is to support flight testing of ...,U.S. Army Looks To Enlist Beta's Alia eVTOL wi...,2022-01-31 00:00:00,
1,1,https://www.cae.com/news-events/press-releases...,• CAE and BETA to partner and create best in c...,CAE,,
2,2,https://evtol.com/features/why-evtol-developer...,When many people contemplate the benefits of a...,Why eVTOL developer Beta gives its employees f...,,
3,3,https://evtol.com/video/martine-rothblatt-flie...,"Martine Rothblatt, CEO of the biotechnology co...",Martine Rothblatt Flies Beta’s Alia,,
4,4,https://www.burlingtonfreepress.com/story/news...,Update: This story has been updated to reflect...,Ready for takeoff: Vermont nonprofit wants to ...,2021-06-08 00:00:00,


In [10]:
# Name of the pretrained spaCy pipeline to load; when set to None the next
# cell creates a blank English pipeline instead.
model = "en_core_web_trf"

In [11]:
# Load the pretrained pipeline named by `model`, or fall back to a blank
# English pipeline when no model name is configured.
if model is not None:
    nlp = spacy.load(model)  # load existing spacy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  # create blank Language class
    print("Created blank 'en' model")

# Make sure the pipeline has an NER component and keep a handle to it.
if 'ner' not in nlp.pipe_names:
    # spaCy v3 (the only series that ships en_core_web_trf) registers
    # components by their string name; passing a component object, as the
    # v2-style create_pipe()/add_pipe(obj) pair did, raises an error there.
    # add_pipe() returns the newly created component.
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

Loaded model 'en_core_web_trf'


In [25]:

# Pull the article bodies and titles out of the DataFrame as plain Python lists.
bodies = [*df['text']]
titles = [*df['title']]

In [26]:
def check_str(texts):
    """Keep only the string entries of ``texts`` and report how many were dropped.

    Parameters
    ----------
    texts : sequence
        Items to filter; must support ``len()`` and iteration. Non-string
        entries (for example NaN cells read from a CSV) are discarded.

    Returns
    -------
    tuple
        ``(strs, count_, count_perc)`` where ``strs`` is the list of string
        entries in their original order, ``count_`` is the number of
        discarded entries and ``count_perc`` is ``count_ / len(texts)``
        (a fraction between 0 and 1, or ``0.0`` for empty input).
    """
    # isinstance() also accepts str subclasses, unlike an exact type comparison.
    strs = [item for item in texts if isinstance(item, str)]
    total = len(texts)
    count_ = total - len(strs)
    # Guard against empty input, which would otherwise divide by zero.
    count_perc = count_ / total if total else 0.0
    return strs, count_, count_perc

# Drop every non-string entry from both lists (only the filtered list is kept).
# NOTE(review): the two lists are filtered independently, so bodies[i] and
# titles[i] may no longer come from the same row -- confirm that is acceptable.
bodies, titles = (check_str(column)[0] for column in (bodies, titles))
# print([type(bodies[i]) for i in range(len(bodies))])

In [29]:
# Process texts as a stream, much faster than calling
# nlp() on each text
# Process the texts as a stream with nlp.pipe(), which is much faster than
# calling nlp() on each text individually. Wrapping the result in list()
# consumes the stream so every document is processed in this cell.
# t_docs holds the processed titles, b_docs the processed article bodies.
t_docs = list(nlp.pipe(titles))
b_docs = list(nlp.pipe(bodies))

  0%|          | 0/25 [14:55<?, ?it/s]


In [71]:
def group_ents(doc, print_ = False):
    """Summarise the named entities of a single processed document.

    Parameters
    ----------
    doc : object exposing ``ents``
        A processed document; each entity must provide ``text``, ``label_``,
        ``start_char`` and ``end_char``.
    print_ : bool, optional
        When True, print one line per entity with its text, character
        offsets, label and the label's description.

    Returns
    -------
    dict or None
        ``None`` (after printing a notice) when the document has no
        entities. Otherwise a dict with three keys:

        * ``"label_counts"``   -- label -> number of entities with that label
        * ``"ent_counts"``     -- entity text -> number of occurrences
        * ``"ents_per_label"`` -- label -> list of entity texts, in document
          order and including repeats
    """
    if not doc.ents:
        print("No entities in doc")
        return None

    d = {"label_counts" : {}, "ent_counts" : {}, "ents_per_label" : {}}
    for ent in doc.ents:
        if print_:
            # spacy.explain() returns None for labels that are not in its
            # glossary (e.g. custom labels); fall back to a placeholder so
            # building the line cannot fail on a None value.
            explanation = spacy.explain(ent.label_) or "no description available"
            print("=================================================")
            print(ent.text + " - " + str(ent.start_char) + " - " + str(ent.end_char) +\
                " - " + ent.label_ + " - " + explanation)
            print("=================================================")
        d["label_counts"][ent.label_] = d["label_counts"].get(ent.label_, 0) + 1
        d["ent_counts"][ent.text] = d["ent_counts"].get(ent.text, 0) + 1
        # setdefault creates the per-label list on first sight of a label.
        d["ents_per_label"].setdefault(ent.label_, []).append(ent.text)
    return d

# Try the helper on the first title: show its text, then its entity summary.
print(t_docs[0])
group_ents(t_docs[0])

U.S. Army Looks To Enlist Beta's Alia eVTOL with Flight Test Contract


{'label_counts': {'ORG': 2, 'PRODUCT': 2},
 'ent_counts': {'U.S. Army': 1, 'Beta': 1, 'Alia': 1, 'eVTOL': 1},
 'ents_per_label': {'ORG': ['U.S. Army', 'Beta'],
  'PRODUCT': ['Alia', 'eVTOL']}}

In [38]:
# Collect the entity tuple of every processed title and every processed body.
t_ent_lsts = [title_doc.ents for title_doc in t_docs]
b_ent_lsts = [body_doc.ents for body_doc in b_docs]

In [40]:
# Inspect the entities found in each title (one tuple per title; () means none).
t_ent_lsts

[(U.S. Army, Beta, Alia, eVTOL),
 (CAE,),
 (),
 (Martine Rothblatt, Alia),
 (Vermont,),
 (Williston Observer,),
 (Amazon, Climate Fund, Fidelity),
 (BTV,),
 (BETA Technologies’, ALIA eVTOL, US, Air Force),
 (BETA, first, 20, Blade),
 (),
 (EVTOL,),
 (),
 (GAMA, Beta Technologies, Toyota Motor North America),
 (),
 (Beta, first),
 (),
 (),
 (Beta Technologies, Vermont),
 (),
 (),
 (),
 (Beta, Ava, the Edward Scissorhands),
 (),
 (),
 (Beta Technologies, eVTOL)]

In [72]:
# Build one entity summary per body document; group_ents() yields None for a
# document that has no entities.
grouped_ents = [group_ents(body_doc) for body_doc in b_docs]

In [88]:
# print(grouped_ents)
def count_dataset_ents(docs_ent_counts):
    """Aggregate per-document entity summaries into dataset-level statistics.

    Parameters
    ----------
    docs_ent_counts : iterable of (dict or None)
        Per-document summaries with the keys ``"label_counts"``,
        ``"ent_counts"`` and ``"ents_per_label"`` (the shape produced by
        ``group_ents``). ``None`` entries, which ``group_ents`` returns for
        documents without entities, are skipped.

    Returns
    -------
    dict
        * ``"label_counts"``  -- label -> number of documents containing it
        * ``"entity_counts"`` -- entity text -> number of documents containing it
        * ``"ent_lists"``     -- label -> all entity texts across documents

    Notes
    -----
    The two count tables add 1 per document in which a key appears; they do
    not sum the per-document counts.
    NOTE(review): if total mention counts were intended, the per-document
    values would have to be added instead -- confirm the intended metric.
    """
    c = {"entity_counts" : {}, "label_counts" : {}, "ent_lists" : {}}
    for doc_counts in docs_ent_counts:
        if doc_counts is None:
            # Document without entities: nothing to aggregate.
            continue
        for l in doc_counts["label_counts"]:
            c["label_counts"][l] = c["label_counts"].get(l, 0) + 1
        for e in doc_counts["ent_counts"]:
            c["entity_counts"][e] = c["entity_counts"].get(e, 0) + 1
        for label, texts in doc_counts["ents_per_label"].items():
            # Extend a list owned by `c`; storing the document's own list here
            # would let later extensions mutate that document's summary.
            c["ent_lists"].setdefault(label, []).extend(texts)
    return c

# Aggregate the per-document summaries of the article bodies into dataset-level tables.
c = count_dataset_ents(grouped_ents)

In [90]:
# For each entity label, the number of body documents in which it appears.
c['label_counts']

{'ORG': 25,
 'PRODUCT': 23,
 'DATE': 24,
 'FAC': 12,
 'GPE': 23,
 'QUANTITY': 19,
 'PERSON': 25,
 'EVENT': 2,
 'LAW': 3,
 'CARDINAL': 23,
 'ORDINAL': 15,
 'TIME': 15,
 'WORK_OF_ART': 6,
 'NORP': 3,
 'MONEY': 8,
 'LOC': 9,
 'PERCENT': 6}

In [98]:
# c['ent_lists']
from collections import Counter
# Every ORG entity text collected across the body documents.
all_orgs = c['ent_lists']["ORG"]
# Sanity check: how many ORG mentions there are and the type of one element.
# NOTE(review): the stored output reports <class 'list'> for an element, while
# the current group_ents() appends plain strings -- the output looks stale;
# re-run from a fresh kernel to confirm.
print(len(all_orgs), type(all_orgs[20]))
# print(Counter(c['ent_lists']["ORG"]))
# c['ent_lists']["PERCENT"]

1357 <class 'list'>


In [56]:
# Display the per-document entity summaries (one entry per body document).
grouped_ents

[{'label_counts': {'ORG': 15,
   'PRODUCT': 6,
   'DATE': 4,
   'FAC': 1,
   'GPE': 4,
   'QUANTITY': 2,
   'PERSON': 3,
   'EVENT': 1,
   'LAW': 1,
   'CARDINAL': 1},
  'ent_counts': {'The U.S. Army': 1,
   'Beta Technologies': 1,
   'Alia 250': 1,
   'today': 1,
   'Army': 2,
   'Beta': 3,
   'the U.S. Air Force': 1,
   'last year': 1,
   'Alia': 3,
   'Agility Prime': 2,
   'Burlington International Airport': 1,
   'Vermont': 1,
   'Plattsburg': 1,
   'New York': 1,
   '205 miles': 1,
   '8,000 feet': 1,
   'Kyle Clark': 1,
   'Will Roper': 1,
   'U.S.': 1,
   'the Air Force': 1,
   'the Reagan National Defense Forum': 1,
   'December 5, 2021': 1,
   "the Air Force's": 1,
   'Charles Brown': 1,
   'Afwerx': 1,
   'FAA': 1,
   'Part 23 type': 1,
   '2024': 1,
   'UPS': 1,
   '150': 1,
   'United Therapeutics': 1,
   'Blade Urban Air Mobility': 1}},
 {'label_counts': {'ORG': 34,
   'PRODUCT': 3,
   'DATE': 7,
   'PERSON': 2,
   'ORDINAL': 2,
   'TIME': 1,
   'QUANTITY': 1,
   'WORK_OF

In [None]:
# Re-read the raw title column; this replaces the filtered `titles` list
# built earlier in the notebook.
# NOTE(review): entries are not re-checked with check_str() here, so a
# non-string title (e.g. NaN) could end up in `ts` -- confirm the column has
# no missing values.
titles = list(df['title'])

# Shuffle in place, then draw 10 titles without replacement for hand labelling.
# NOTE(review): the outcome depends on the state of the `random` module, which
# is seeded only once in the import cell; re-running this cell yields a
# different `ts`, while the labels in `train_strs` are tied to the order of `ts`.
random.shuffle(titles)
ts = random.sample(titles, 10)
for t in ts:
    print(t)

In [None]:
# Hand-written entity labels for the 10 sampled titles: each title maps to an
# {entity text: label} dict (labels used: COMP, PARTNER, TECH); {} means the
# title has no labelled entities.
# Each entity text must occur verbatim in its title (including the apostrophe
# style), otherwise patt_finder() raises ValueError when offsets are computed.
# NOTE(review): keys are positional (ts[0]..ts[9]), so these labels only line
# up with the sample they were written for -- confirm `ts` is reproduced exactly.
train_strs = {ts[0] : {"Beta" : "COMP", "Amazon's Climate Fund" : "PARTNER", "Fidelity" : "PARTNER"},
              ts[1] : {},
             ts[2] : {"Beta" : "COMP"},
             ts[3] : {"BETA Technologies":"COMP", "ALIA":"TECH"},
             ts[4] : {"Beta":"COMP", "Ava":"TECH"},
             ts[5] : {},
             ts[6] : {"Beta":"COMP", "Alia":"TECH"},
             ts[7] : {"Beta Technologies":"COMP"},
             ts[8] : {"Beta":"COMP"},
             ts[9] : {"Beta":"COMP", "Alia":"TECH"}}

In [None]:
def patt_finder(substr, string):
    """Locate the first occurrence of ``substr`` inside ``string``.

    Returns a ``(start, end)`` pair of character offsets, with ``end``
    exclusive. Raises ``ValueError`` (from ``str.index``) when ``substr``
    does not occur in ``string``.
    """
    begin = string.index(substr)
    return begin, begin + len(substr)

# Turn the {entity text: label} annotations into character-offset triples, so
# that each entry of train_data is [sentence, [(start, end, label), ...]].
train_data = []
for sentence, sent_dict in train_strs.items():
    spans = []
    for ent_text, ent_str in sent_dict.items():
        start_idx, end_idx = patt_finder(ent_text, sentence)
        spans.append((start_idx, end_idx, ent_str))
    train_data.append([sentence, spans])

In [None]:
# Inspect the converted annotations: one [title, [(start, end, label), ...]] entry per title.
train_data

In [None]:
# Wrap each span list in an {"entities": ...} dict, then freeze every
# [text, annotations] pair into a tuple.
temp = [[text, {"entities" : spans}] for text, spans in train_data]
TRAINING_DATA = list(map(tuple, temp))
TRAINING_DATA

In [None]:
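# NOTE: the draft training loop below uses the spaCy v2 call
# nlp.update(texts, annotations); spaCy v3 (needed for en_core_web_trf)
# expects a list of Example objects instead. `path_to_model` is also not
# defined in the cells shown in this notebook.
# # Loop for 10 iterations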
# for i in range(10):
#     # Shuffle the training data
#     random.shuffle(TRAINING_DATA)
#     # Create batches and iterate over them
#     for batch in spacy.util.minibatch(TRAINING_DATA, size=3):
#         # Split the batch in texts and annotations
#         texts = [text for text, annotation in batch]
#         annotations = [annotation for text, annotation in batch]
#         # Update the model
#         nlp.update(texts, annotations)
#         # Save the model
#         nlp.to_disk(path_to_model)