In [96]:
import pandas as pd
import numpy as np
import ast
import spacy

In [153]:
import warnings
warnings.filterwarnings('ignore')

### Load the trained NER model and the data
Check the model performance on a couple of examples

In [50]:
# vectors_128_d = spacy.load('../word2vec/floret-128/')

In [51]:
astro_nlp = spacy.load('astro_ner/model-best/')

In [52]:
df = pd.read_csv("../data/assembled.csv", index_col=0)
bodies = pd.read_csv("../word2vec/clean_bodies.csv", index_col=0)

In [65]:
df['body'] = df['body'].apply(lambda x: x.replace("gamma -", 'gamma-'))

In [66]:
spacy.displacy.render(astro_nlp(df.body[23]), style='ent')

In [243]:
spacy.displacy.render(astro_nlp(df.body[246]), style='ent')

In [72]:
spacy.displacy.render(astro_nlp(df.body[12347]), style='ent')

In [64]:
spacy.displacy.render(astro_nlp(df.body[12355]), style='ent')

### Use NER to fetch all entities from the texts, and compare the results with the RB approach
Bad news:
- `gamma` is being constantly labelled as TELESCOPE, even tho I deleted all mentions of 'gamma' in telescope patterns used for labelling and in most of the cases it aint the mention of a telescope. I have to remove this entity in future.

In [75]:
def get_ents_with_ner(text: str):
    res = list()
    doc = astro_nlp(text)
    
    # ents found by NER
    for ent in doc.ents:
        res.append(doc.char_span(ent.start_char, ent.end_char, ent.label_))
        
    return res

In [76]:
df['ner_ents'] = df.body.apply(get_ents_with_ner)

In [112]:
df['ner_telescope'] = df['ner_ents'].apply(lambda x: set([e.text.lower() for e in x if e.label_ == 'TELESCOPE']))
df['ner_source'] = df['ner_ents'].apply(lambda x: set([e.text.lower() for e in x if e.label_ == 'SOURCE']))

In [113]:
df.head()

Unnamed: 0_level_0,body,subject,date,from,refs,ner_ents,ner_telescope,ner_source
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3_atel,In addendum of ATEL #2: Additional Information...,Improved Coordinates for GB971227,1997-12-28,rutledge@rosat.mpe-garching.mpg.de,[],"[(Gamma), (BeppoSAX), (BeppoSAX), (BeppoSAX), ...","{gamma, bepposax}",{}
2_atel,The following message was emailed to me this e...,GB971227,1997-12-28,rutledge@rosat.mpe-garching.mpg.de,[],"[(GRB), (BeppoSAX), (BeppoSAX), (BeppoSAX)]",{bepposax},{grb}
4_atel,The recent detection of delayed Gamma ray burs...,The Probable Connection Between Relativistic S...,1998-01-06,rutledge@rosat.mpe-garching.mpg.de,[],"[(Gamma), (GRB), (Goodman), (Gamma), (gamma), ...","{fermi, fast, most, goodman, gamma}",{grb}
5_atel,The optical transient (IAUC # 6788 ) of GRB 97...,GRB 971214,1998-01-12,rutledge@rosat.mpe-garching.mpg.de,[],[(GRB)],{},{grb}
6_atel,GRB980109 field was observed by the OGLE colla...,GRB980109,1998-01-18,rutledge@rosat.mpe-garching.mpg.de,[],[(OGLE)],{ogle},{}


In [244]:
df.tail()

Unnamed: 0_level_0,body,subject,date,from,refs,ner_ents,ner_telescope,ner_source
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16033_atel,"The Large Area Telescope (LAT), one of the two...",Fermi LAT detection of increasing gamma-ray ac...,2023-05-10,federica.giacchino@roma2.infn.it,"['2980_atel', '2966_atel']","[(Fermi), (Gamma), (gamma), (radio, source), (...","{gamma, fermi}","{radio source, blazar}"
16034_atel,We report the following classification from a ...,Spectroscopic Classification of an optical tra...,2023-05-12,crojasbr@ucsc.edu,[],"[(Lick), (Shane, telescope), (ATLAS), (galaxy)...","{shane telescope, lick, atlas}","{galaxy, supernova}"
16035_atel,"The Large Area Telescope (LAT), one of the two...",Fermi-LAT detection of enhanced gamma-ray acti...,2023-05-12,Denis.bernard@in2p3.fr,[],"[(Fermi), (Gamma), (gamma), (quasar), (Fermi),...","{gamma, fermi}",{quasar}
16036_atel,Our spectroscopic monitoring of the developmen...,Continued spectroscopic monitoring of V1716 Sc...,2023-05-12,shore@df.unipi.it,"['16019_atel', '16018_atel', '16007_atel', '16...","[(most), (Most)]",{most},{}
16037_atel,We report pre-discovery detection of AT 2023hr...,"Pre-discovery detection of AT 2023hrq, a super...",2023-05-14,vinko@astro.as.utexas.edu,[],"[(Pan), (galaxy), (Pan), (ATLAS), (supernova),...",{atlas},"{pan, galaxy, supernova}"


In [245]:
# load all ents found with RB approach and get entities from the spans

In [246]:
df_rb_ents = pd.read_csv("rb_ents.csv", index_col=0).join(df)[['body', 'ner_spans']]
df_rb_ents.head()

Unnamed: 0_level_0,body,ner_spans
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1
3_atel,In addendum of ATEL #2: Additional Information...,"[(113, 118, 'TELESCOPE'), (249, 257, 'TELESCOP..."
2_atel,The following message was emailed to me this e...,"[(144, 147, 'SOURCE'), (203, 211, 'TELESCOPE')..."
4_atel,The recent detection of delayed Gamma ray burs...,"[(32, 37, 'TELESCOPE'), (49, 52, 'SOURCE'), (2..."
5_atel,The optical transient (IAUC # 6788 ) of GRB 97...,"[(40, 43, 'SOURCE'), (40, 50, 'SOURCE')]"
6_atel,GRB980109 field was observed by the OGLE colla...,"[(0, 9, 'SOURCE'), (36, 40, 'TELESCOPE')]"


In [247]:
df_rb_ents['ner_spans'] = df_rb_ents['ner_spans'].apply(ast.literal_eval)

In [248]:
df_rb_ents['rb_telescope'] = df_rb_ents.apply(lambda x: set([x.body[span_start: span_end].lower() for (span_start, span_end, label )
                                                         in x.ner_spans if label=='TELESCOPE']), axis=1)
df_rb_ents['rb_source'] = df_rb_ents.apply(lambda x: set([x.body[span_start: span_end].lower() for (span_start, span_end, label) 
                                                         in x.ner_spans if label=='SOURCE']), axis=1)

In [249]:
df_rb_ents.head()

Unnamed: 0_level_0,body,ner_spans,rb_telescope,rb_source
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3_atel,In addendum of ATEL #2: Additional Information...,"[(113, 118, TELESCOPE), (249, 257, TELESCOPE),...","{gamma, bepposax}",{}
2_atel,The following message was emailed to me this e...,"[(144, 147, SOURCE), (203, 211, TELESCOPE), (4...",{bepposax},{grb}
4_atel,The recent detection of delayed Gamma ray burs...,"[(32, 37, TELESCOPE), (49, 52, SOURCE), (290, ...","{fermi, fast, most, goodman, gamma}",{grb}
5_atel,The optical transient (IAUC # 6788 ) of GRB 97...,"[(40, 43, SOURCE), (40, 50, SOURCE)]",{},"{grb, grb 971214}"
6_atel,GRB980109 field was observed by the OGLE colla...,"[(0, 9, SOURCE), (36, 40, TELESCOPE)]",{ogle},{grb980109}


In [250]:
df_to_compare = df[['ner_telescope', 'ner_source']].join(df_rb_ents[['rb_telescope', 'rb_source']])

In [251]:
df_to_compare

Unnamed: 0_level_0,ner_telescope,ner_source,rb_telescope,rb_source
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3_atel,"{gamma, bepposax}",{},"{gamma, bepposax}",{}
2_atel,{bepposax},{grb},{bepposax},{grb}
4_atel,"{fermi, fast, most, goodman, gamma}",{grb},"{fermi, fast, most, goodman, gamma}",{grb}
5_atel,{},{grb},{},"{grb, grb 971214}"
6_atel,{ogle},{},{ogle},{grb980109}
...,...,...,...,...
16033_atel,"{gamma, fermi}","{radio source, blazar}","{gamma, fermi}","{radio source, blazar}"
16034_atel,"{shane telescope, lick, atlas}","{galaxy, supernova}","{shane telescope, lick}","{galaxy, supernova, atlas}"
16035_atel,"{gamma, fermi}",{quasar},"{gamma, fermi}",{quasar}
16036_atel,{most},{},{most},{}


In [252]:
df_to_compare['t_not_found_by_ner'] = df_to_compare.apply(lambda x: x.rb_telescope.difference(x.ner_telescope), axis=1)
df_to_compare['t_not_found_by_rb'] = df_to_compare.apply(lambda x: x.ner_telescope.difference(x.rb_telescope), axis=1)

df_to_compare['s_not_found_by_ner'] = df_to_compare.apply(lambda x: x.rb_source.difference(x.ner_source), axis=1)
df_to_compare['s_not_found_by_rb'] = df_to_compare.apply(lambda x: x.ner_source.difference(x.rb_source), axis=1)

In [255]:
len(set(df_to_compare.explode('t_not_found_by_ner')['t_not_found_by_ner'].tolist())), len(set(df_to_compare.explode('t_not_found_by_rb')['t_not_found_by_rb'].tolist()))

(131, 101)

In [256]:
len(set(df_to_compare.explode('s_not_found_by_ner')['s_not_found_by_ner'].tolist())), len(set(df_to_compare.explode('s_not_found_by_rb')['s_not_found_by_rb'].tolist()))

(947, 307)

In [124]:
df_to_compare['t_not_found_by_ner_cnt'] = df_to_compare['t_not_found_by_ner'].apply(len)
df_to_compare['t_not_found_by_rb_cnt'] = df_to_compare['t_not_found_by_rb'].apply(len)

df_to_compare['s_not_found_by_ner_cnt'] = df_to_compare['s_not_found_by_ner'].apply(len)
df_to_compare['s_not_found_by_rb_cnt'] = df_to_compare['s_not_found_by_rb'].apply(len)

In [132]:
df_to_compare.t_not_found_by_ner_cnt.sum(), df_to_compare.t_not_found_by_rb_cnt.sum()

(3962, 442)

In [133]:
df_to_compare['s_not_found_by_ner_cnt'].sum(), df_to_compare['s_not_found_by_rb_cnt'].sum()

(3692, 654)

In [139]:
df_to_compare.ner_telescope.apply(len).sum(), df_to_compare.rb_telescope.apply(len).sum()

(70163, 73683)

In [140]:
df_to_compare.ner_source.apply(len).sum(), df_to_compare.rb_source.apply(len).sum()

(71782, 74820)

In [143]:
df_to_compare['t_not_found_by_ner'].explode().value_counts()

gamma                        2880
swift                         136
hubble                        135
hale                          131
chandra x-ray observatory     121
                             ... 
einstein observatory            1
osn 1.5m                        1
flwo 1.5m tillinghast           1
hubble space telescope          1
pansstarrs                      1
Name: t_not_found_by_ner, Length: 130, dtype: int64

In [144]:
df_to_compare['t_not_found_by_rb'].explode().value_counts()

atlas                       246
neil gehrels                 20
faulkes north                14
nasa infrared telescope      11
gamma ray telescope           8
                           ... 
~t=70s                        1
pulkovo                       1
faulkes  telescope north      1
idra                          1
south array                   1
Name: t_not_found_by_rb, Length: 100, dtype: int64

In [145]:
df_to_compare['s_not_found_by_ner'].explode().value_counts()

star            787
grb             394
atlas           226
x-ray binary    141
cloud           106
               ... 
at2021lkg         1
grb210507.04      1
grb210506.78      1
at2019wzl         1
4u 1626-67        1
Name: s_not_found_by_ner, Length: 946, dtype: int64

In [146]:
df_to_compare['s_not_found_by_rb'].explode().value_counts()

long burst                38
atlas                     28
young stellar             26
red nova                  22
optical nova              18
                          ..
reddened nova              1
at2016jah                  1
at2016izg                  1
at2016jbc                  1
nonastrophysical event     1
Name: s_not_found_by_rb, Length: 306, dtype: int64

### Conclusion: RB approach produces more precise names and captures more entities. I will combine the entities found by both approaches, removing the duplicates and cleaning the suspicious entities (including `gamma` as telescope name)

In [210]:
df_ents = df[['ner_telescope', 'ner_source']].join(df_rb_ents[['rb_telescope', 'rb_source']])

In [211]:
df_ents['telescope'] = df_ents.apply(lambda x: x.ner_telescope.union(x.rb_telescope), axis=1)
df_ents['source'] = df_ents.apply(lambda x: x.ner_source.union(x.rb_source), axis=1)

df_ents = df_ents[['telescope', 'source']]

In [212]:
# these will be thrown out
df_ents['telescopes_weird'] = df_ents['telescope'].apply(lambda x: [t for t in x if t != 'gamma' and 
                                                                    any(i in t for i in ['+', '/', ','])
                                                                    or all([i.isnumeric() for i in t])])
df_ents['telescopes_weird'].explode().value_counts()

epessto+                                                                                                                33
https://gcn.gsfc.nasa.gov/other/555263883.fermi                                                                          2
06/05/2008                                                                                                               1
http://gcn.gsfc.nasa.gov/notices_s/610953/ba/                                                                            1
ryle (1973, nature                                                                                                       1
cecube,                                                                                                                  1
https://heasarc.gsfc.nasa.gov/ftp/fermi/data/gbm/triggers/2020/bn201214672/quicklook/glg_lc_medres34_bn201214672.gif     1
r=17.96+/0.03                                                                                                            1
08:55:46.848,596

In [213]:
df_ents['telescope'] = df_ents['telescope'].apply(lambda x: [t for t in x if t != 'gamma' 
                                                             and all(not i in t for i in ['+', '/', ','])
                                                             or not all([i.isnumeric() for i in t])])
df_ents['telescope'] = df_ents['telescope'].apply(lambda x: [y.strip() for y in x])
df_ents['telescope'].explode().value_counts()  # 354 unique telescopes

swift                              18902
fermi                               7941
gamma                               7176
master                              3653
integral                            3101
                                   ...  
james clark maxwell telescope          1
ohp 1.93                               1
wisep                                  1
united states naval observatory        1
pansstarrs                             1
Name: telescope, Length: 364, dtype: int64

In [214]:
# these will be thrown out
df_ents['sources_weird'] = df_ents['source'].apply(lambda x: [s for s in x if any([i in s for i in ['+', ',']])])
df_ents['sources_weird'].explode().value_counts()

host?+grb                               1
p=12.457+/-0.002                        1
09:03:44.446,59630.37760,21.630,0.23    1
Name: sources_weird, dtype: int64

In [215]:
df_ents['source'] = df_ents['source'].apply(lambda x: [s for s in x if all(not i in s for i in ['+', ','])])
df_ents['source'] = df_ents['source'].apply(lambda x: [y.strip() for y in x])
df_ents['source'].explode().value_counts()  # 1725 unique sources

grb                       24807
total                      6218
galaxy                     5669
star                       4371
supernova                  3636
                          ...  
at2021iyr                     1
at2021hrj                     1
at2021jze                     1
at2021ita                     1
nonastrophysical event        1
Name: source, Length: 1725, dtype: int64

In [218]:
df_ents['telescope'] = df_ents['telescope'].apply(lambda x: x if len(x) else ['telescope not found'])
df_ents['source'] = df_ents['source'].apply(lambda x: x if len(x) else ['source not found'])

In [219]:
df_ents[['telescope', 'source']].to_csv("name_entities.csv", index=True)