In [1]:
import re
import ast
import json
import pandas as pd

import spacy 
from spacy.displacy import render
from spacy.tokens import DocBin
from spacy.util import filter_spans

import warnings
warnings.filterwarnings('ignore')

2023-05-17 12:30:46.682010: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
!python -m spacy download en_core_web_sm -q

[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl#egg=en_core_web_sm==3.1.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


### Load the dataset

In [3]:
df = pd.read_csv("../data/assembled.csv", index_col=0)
clean_bodies = pd.read_csv("../word2vec/clean_bodies.csv", index_col=0)

### Load both the provided and the found entities

In [4]:
token_patterns = []  # single token patterns
multi_patterns = []  # multi-token patterns

with open("entities_collection/patterns_provided.jsonl", "r") as f:
    for line in f:
        json_line = json.loads(line)
        this_pattern = json_line["pattern"][0]
        if "TEXT" in this_pattern.keys():
            multi_patterns.append(json_line)
        elif len(this_pattern['lower'].split(' ')) > 1:
            this_pattern_text = this_pattern['lower']
            this_pattern_regex = {"label": json_line['label'], "pattern": [{"TEXT": {"REGEX": fr"\b({this_pattern_text})\b"}}]}
            multi_patterns.append(this_pattern_regex)
        else:
            token_patterns.append(json_line)
            
with open("entities_collection/patterns_found.jsonl", "r") as f:
    for line in f:
        json_line = json.loads(line)
        this_pattern = json_line["pattern"][0]
        if "TEXT" in this_pattern.keys():
            multi_patterns.append(json_line)
        elif len(this_pattern['lower'].split(' ')) > 1:
            this_pattern_text = this_pattern['lower']
            this_pattern_regex = {"label": json_line['label'], "pattern": [{"TEXT": {"REGEX": fr"\b({this_pattern_text})\b"}}]}
            multi_patterns.append(this_pattern_regex)
        else:
            token_patterns.append(json_line)

In [5]:
len(multi_patterns), len(token_patterns)

(3188, 3240)

In [6]:
token_patterns[:5], 

([{'label': 'SOURCE', 'pattern': [{'lower': 'grb200613a'}]},
  {'label': 'SOURCE', 'pattern': [{'lower': 'grb200622a'}]},
  {'label': 'SOURCE', 'pattern': [{'lower': 'grb200623a'}]},
  {'label': 'SOURCE', 'pattern': [{'lower': 'grb200623b'}]},
  {'label': 'SOURCE', 'pattern': [{'lower': 'grb200625a'}]}],)

In [7]:
token_patterns[-10:-5]

[{'label': 'TELESCOPE', 'pattern': [{'lower': 'lamost'}]},
 {'label': 'TELESCOPE', 'pattern': [{'lower': 'spitzer'}]},
 {'label': 'TELESCOPE', 'pattern': [{'lower': 'aristarchos'}]},
 {'label': 'TELESCOPE', 'pattern': [{'lower': 'planck'}]},
 {'label': 'TELESCOPE', 'pattern': [{'lower': '74\xa0inch'}]}]

In [8]:
multi_patterns[:5]

[{'label': 'SOURCE',
  'pattern': [{'TEXT': {'REGEX': '\\b(IceCube|IC|GRB|FRB|PKS|Mrk|HAWC|MAXI|GW)([ -]?)([0-9\\.\\-\\+]{2,}[A-Z]?)\\b'}}]},
 {'label': 'SOURCE',
  'pattern': [{'TEXT': {'REGEX': '\\b(AT) *?([0-9]{4}[a-z]{3})\\b'}}]},
 {'label': 'SOURCE',
  'pattern': [{'TEXT': {'REGEX': '\\b(ZTF)([0-9]{2}[a-z]{7})\\\\b'}}]},
 {'label': 'SOURCE',
  'pattern': [{'TEXT': {'REGEX': '\\b(irregular variable)\\b'}}]},
 {'label': 'SOURCE',
  'pattern': [{'TEXT': {'REGEX': '\\b(sx phe variable)\\b'}}]}]

In [9]:
multi_patterns[-10:-5]

[{'label': 'TELESCOPE',
  'pattern': [{'TEXT': {'REGEX': '\\b(telescopio amici osservatorio astrofisico di arcetri)\\b'}}]},
 {'label': 'TELESCOPE',
  'pattern': [{'TEXT': {'REGEX': '\\b(griffith observatory)\\b'}}]},
 {'label': 'TELESCOPE',
  'pattern': [{'TEXT': {'REGEX': '\\b(gran ecuatorial observatorio astronómico nacional)\\b'}}]},
 {'label': 'TELESCOPE',
  'pattern': [{'TEXT': {'REGEX': '\\b(himalayan chandra telescope)\\b'}}]},
 {'label': 'TELESCOPE',
  'pattern': [{'TEXT': {'REGEX': '\\b(steward observatory 60" cassegrain telescope)\\b'}}]}]

In [10]:
# single-token patterns can be loaded to spacy directly

In [11]:
multi_patterns_lower_temp = multi_patterns[3:]
# the first 3 multi-patterns are not in lower-case
multi_patterns_temp = ast.literal_eval(str(multi_patterns[:3]).lower())

In [12]:
# will throw all the gamma-related stuff from the TELESCOPE labels, they are creating lots of noise
multi_patterns_lower = []
for pat in multi_patterns_lower_temp:
    if pat['label'] == 'TELESCOPE':
        regex = pat['pattern'][0]['TEXT']['REGEX']
        if 'gamma' in str(regex):
            print('-1 gamma')
            continue
    multi_patterns_lower.append(pat)

-1 gamma
-1 gamma
-1 gamma
-1 gamma
-1 gamma
-1 gamma
-1 gamma


In [13]:
multi_patterns_lower += multi_patterns_temp

In [25]:
len(multi_patterns_lower)

3181

### For each telegram body perform a search for an entity and get a corresponding span

In [26]:
def get_ner_span(body):
    res = list()
    doc = nlp(body)
    
    # token re matches
    for ent in doc.ents:
        res.append((ent.start_char, ent.end_char, ent.label_))
    
    # full text re matches LOWER CASE (cleaned bodies)
    for pattern in multi_patterns_lower:
        try:
            label = pattern['label']

            for match in re.finditer(pattern['pattern'][0]['TEXT']['REGEX'], doc.text.lower()):
                start, end = match.span()
                span = doc.char_span(start, end)
                # This is a Span object or None if match doesn't map to valid token sequence
                if span is not None:
                    span.label_ = label
                    res.append((span.start_char, span.end_char, span.label_))
        except:
            pass  # broken RE patterns may happen
                
    return res

In [27]:
nlp = spacy.load("en_core_web_sm")

In [28]:
# remove default NER and replace with single-token entity ruler for labeling
nlp.remove_pipe("ner")
nlp.add_pipe("entity_ruler").add_patterns(token_patterns)

#### Test entity ruler single-token NER

In [29]:
test_body = df.body[4]

In [30]:
render(nlp(test_body), style="ent")  

#### Find all spans including the multi-token ones

In [31]:
df['ner_spans'] = df.body.apply(get_ner_span)

### Convert the spans to the NER training data format (both spacy binary and prodigy jsonl)
(the overlapping spans are fixed)

In [32]:
nlp = spacy.blank('en')
db = DocBin()

for i, r in df.iterrows():
    text = r.body
    annotations = r.ner_spans
    if annotations:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            ents.append(span)
        doc.ents = filter_spans(ents)
        db.add(doc)

        
db.to_disk("training_data/spacy_train.spacy")

In [33]:
# and also save it to prodigy so we can read and correct labels later if needed

doc_bin = DocBin().from_disk("training_data/spacy_train.spacy")  
examples = []  # examples in Prodigy format
for doc in doc_bin.get_docs(nlp.vocab):
    spans = [{"start": ent.start_char, "end": ent.end_char, "label": ent.label_} for ent in doc.ents]
    examples.append({"text": doc.text, "spans": spans})
    
with open('training_data/prodigy_train.jsonl', 'w') as f:
    for line in examples:
        f.write(f"{json.dumps(line)}\n")

In [34]:
# save ruler-based found entities for the future comparison
df[['ner_spans']].to_csv("rb_ents.csv", index=True)

In [35]:
# df.iloc[12355][['ner_spans']]

ner_spans    [(10, 16, TELESCOPE)]
Name: 2432_atel, dtype: object