In [255]:
import pandas as pd
import numpy as np
import ast
import re
import subprocess

In [256]:
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so this applies to all warnings raised by later cells.
warnings.filterwarnings('ignore')

### Load ATels

In [257]:
# subprocess.run("python load_atel.py --parse_from 15745 --csv_filename atel.csv", shell=True)

In [303]:
# Read the ATel dump; the first CSV column becomes the DataFrame index.
atel_df = pd.read_csv("atel.csv", index_col=0)

In [304]:
# Show the last rows of the loaded ATel frame as a quick sanity check.
atel_df.tail()

Unnamed: 0_level_0,title,date,authors,credential_certification,subjects,body,related_ids
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16033,Fermi LAT detection of increasing gamma-ray ac...,10 May 2023; 17:31 UT,Federica Giacchino (INFN Sezione Roma TorVerga...,Federica Giacchino (federica.giacchino@roma2.i...,"Gamma Ray, >GeV, AGN, Blazar, Quasar","The Large Area Telescope (LAT), one of the two...","['16033', '2980', '2966']"
16034,Spectroscopic Classification of an optical tra...,12 May 2023; 16:26 UT,"C. Rojas-Bravo, K. Taggart, R. J. Foley (UCSC)",Cesar Rojas Bravo (crojasbr@ucsc.edu),"Optical, Supernovae",We report the following classification from a ...,[]
16035,Fermi-LAT detection of enhanced gamma-ray acti...,12 May 2023; 18:28 UT,"Denis Bernard (LLR, Ecole Polytechnique & CNR...",Denis Bernard (Denis.bernard@in2p3.fr),"Gamma Ray, >GeV, Request for Observations, AGN...","The Large Area Telescope (LAT), one of the two...",[]
16036,Continued spectroscopic monitoring of V1716 Sc...,12 May 2023; 23:54 UT,"Steve Shore (Univ. Pisa), Stephane Charbonnel,...",S. N. Shore (shore@df.unipi.it),"Cataclysmic Variable, Nova, Transient",Our spectroscopic monitoring of the developmen...,"['16036', '16019', '16018', '16007', '16006', ..."
16037,"Pre-discovery detection of AT 2023hrq, a super...",14 May 2023; 20:56 UT,"A. Horti-David, K. Sarneczky, J. Vinko (Konkol...",Jozsef Vinko (vinko@astro.as.utexas.edu),"Optical, Supernovae, Transient",We report pre-discovery detection of AT 2023hr...,[]


### Load GCNs

In [260]:
# subprocess.run("python load_gcn.py --csv_filename gcn.csv", shell=True)

In [288]:
# Read the GCN dump; the first CSV column becomes the DataFrame index.
gcn_df = pd.read_csv("gcn.csv", index_col=0)

# The reference columns are stored as Python list literals; parse them back
# into real lists (literal_eval only accepts literals, never arbitrary code).
gcn_df['atel_refs'] = gcn_df['atel_refs'].apply(ast.literal_eval)
gcn_df['gcn_refs'] = gcn_df['gcn_refs'].apply(ast.literal_eval)

# Keep only the calendar date of each telegram.
# The method has to be *called*: `x.date` without parentheses stores the bound
# method object instead of a `datetime.date`. Missing timestamps are passed
# through as NaT so that the dropna below can remove them.
gcn_df['date'] = pd.to_datetime(gcn_df['date']).apply(
    lambda ts: ts.date() if pd.notna(ts) else pd.NaT
)

# Drop telegrams without a usable date (reassign instead of mutating in place).
gcn_df = gcn_df.dropna(subset=['date'])

In [289]:
# Display the parsed GCN frame.
gcn_df

Unnamed: 0_level_0,body,subject,date,from,gcn_refs,atel_refs,type
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
31,"S. V. Zharikov, V. V. Sokolov, SAO RAS and Yu....",GRB970508 SAO RAS optical observations,1998-03-26,sokolov@relay.sao.ru,[],[],gcn
32,Comparison of optical R-band observations of t...,GRB980326 optical observations,1998-03-28,paulgr@astro.uva.nl,[],[],gcn
33,GRB 980326: Optical Transient Confirmed: A. C...,GRB980326 optical observations,1998-03-29,srk@astro.caltech.edu,[],[],gcn
34,"Bruce Grossan, Robert Knop, Saul Perlmutter (L...",GRB980326 optical observations,1998-03-30,bruce@singu.lbl.gov,[],[],gcn
35,"Addendum to GCN #34 Bruce Grossan, Robert Kno...",GRB980326 optical observations,1998-03-31,bruce@singu.lbl.gov,[34],[],gcn
...,...,...,...,...,...,...,...
9995,We observed the afterglow of Fermi GRB 091003 ...,GRB 091003: WHT ACAM observations,2009-10-05,kw113@star.le.ac.uk,"[9985, 9986, 9983]",[],gcn
9996,Skynet observed the Swift/BAT localization of ...,GRB 090926B: Skynet/PROMPT Observations,2009-10-06,haislip@physics.unc.edu,"[9944, 9935]",[],gcn
9997,We observed the position of Fermi GRB 091003 (...,GRB 091003: Lick observations and possible SDS...,2009-10-06,dperley@astro.berkeley.edu,"[9986, 9987, 9990, 9995, 983]",[],gcn
9998,The long GRB 091003A (Fermi-GBM trigger 276237...,GRB 091003A: Suzaku WAM observation of the pro...,2009-10-07,kenta0514@astro.miyazaki-u.ac.jp,"[9985, 9983]",[],gcn


In [291]:
# Date range covered by the GCN telegrams, as an (earliest, latest) tuple.
gcn_df['date'].min(), gcn_df['date'].max()

(datetime.date(1998, 3, 26), datetime.date(2023, 4, 16))

### Prepare ATels so they match the newer format and have all necessary data

In [305]:
atel_df = (
    atel_df
    # these two columns are not carried over into the unified layout
    .drop(columns=["authors", "subjects"])
    # use the same column names as the GCN frame
    .rename(columns={"credential_certification": "from", "related_ids": "atel_refs", 'title': "subject"})
    # discard rows missing any of the fields the later cells rely on
    .dropna(subset=['atel_refs', 'body', 'from'])
)

In [306]:
# NOTE(review): `[Circ.]+?` is a character class (any of 'C', 'i', 'r', 'c', '.'),
# not the literal text "Circ."; the pattern is left untouched here so that the
# set of matched fragments does not change.
gcn_re = r"[\[\s(](?:GCN|gcn).?:?\s?[Circ.]+?\s?([\d\s,]+)|[\[\s(](?:GCN|gcn).?:?\s?#?([\d\s,#]+)"


def extract_gcn_refs(body):
    """Return the GCN numbers referenced in one telegram body.

    Parameters
    ----------
    body : str
        Text of the telegram.

    Returns
    -------
    list of int
        Referenced GCN numbers, without duplicates, in order of first appearance.
    """
    numbers = []
    # findall yields one tuple per match holding both capture groups; the group
    # belonging to the alternative that did not match is an empty string.
    for captured_groups in re.findall(gcn_re, body):
        for captured in captured_groups:
            pieces = [piece.replace('#', '').strip() for piece in captured.split(',')]
            try:
                parsed = [int(piece) for piece in pieces if piece]
            except ValueError:
                # A piece such as "1234 5678" is not a single number; the whole
                # fragment is skipped. Only the expected conversion error is
                # caught, so unrelated failures are no longer silenced.
                continue
            numbers.extend(parsed)
    # dict.fromkeys removes repeats across all matches while keeping order.
    return list(dict.fromkeys(numbers))


# search for GCN refs in ATels
gcn_refs = [extract_gcn_refs(body) for body in atel_df['body']]

atel_df['gcn_refs'] = gcn_refs

In [307]:
# Related ATel ids are stored as the text of a Python list; turn each cell
# into its string form and evaluate it back into a real list.
atel_df['atel_refs'] = atel_df['atel_refs'].map(str).map(ast.literal_eval)
# Tag every row with its telegram source.
atel_df['type'] = 'atel'

In [308]:
# leave only the email of submitter(s)

atel_from_re = r"\((.*)\)"

def search_from_email_atel(from_field):
    """Extract the submitter e-mail(s) from an ATel credential string.

    Parameters
    ----------
    from_field : str
        Credential text, e.g. ``"Name (name@host)"``.

    Returns
    -------
    str
        The content of the first parenthesised group that contains an ``@``;
        otherwise the content between the first ``(`` and the last ``)``;
        otherwise the first space-separated token containing ``@``;
        otherwise an empty string.
    """
    # Look at parenthesised groups one at a time. The greedy pattern alone would
    # span from the first '(' to the last ')', so "A (Univ) (a@x.edu)" used to
    # yield "Univ) (a@x.edu" instead of the address.
    for group in re.findall(r"\(([^()]*)\)", from_field):
        if '@' in group:
            return group
    # No group holds an address: keep the previous behaviour unchanged.
    exact_match = re.findall(atel_from_re, from_field)
    if exact_match:
        return exact_match[0]
    for part in from_field.split(' '):
        if '@' in part:
            return part
    return ''

# Replace each credential string with the value extracted by search_from_email_atel.
atel_df['from'] = atel_df['from'].map(search_from_email_atel)

In [313]:
# Parse the submission timestamps and keep only the calendar date of each one.
atel_df['date'] = pd.to_datetime(atel_df['date']).map(lambda timestamp: timestamp.date())

In [314]:
# Display the prepared ATel frame.
atel_df

Unnamed: 0_level_0,subject,date,from,body,atel_refs,gcn_refs,type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,QPOs in 4U 1626-67,1997-12-17,,The low-mass X-ray binary pulsar 4U 1626-67 sh...,[],[],atel
2,GB971227,1997-12-28,rutledge@rosat.mpe-garching.mpg.de,The following message was emailed to me this e...,[],[],atel
3,Improved Coordinates for GB971227,1997-12-28,rutledge@rosat.mpe-garching.mpg.de,In addendum of ATEL #2: Additional Information...,[],[],atel
4,The Probable Connection Between Relativistic S...,1998-01-06,rutledge@rosat.mpe-garching.mpg.de,The recent detection of delayed Gamma ray burs...,[],[],atel
5,GRB 971214,1998-01-12,rutledge@rosat.mpe-garching.mpg.de,The optical transient (IAUC # 6788 ) of GRB 97...,"[7, 5]",[],atel
...,...,...,...,...,...,...,...
16033,Fermi LAT detection of increasing gamma-ray ac...,2023-05-10,federica.giacchino@roma2.infn.it,"The Large Area Telescope (LAT), one of the two...","[16033, 2980, 2966]",[],atel
16034,Spectroscopic Classification of an optical tra...,2023-05-12,crojasbr@ucsc.edu,We report the following classification from a ...,[],[],atel
16035,Fermi-LAT detection of enhanced gamma-ray acti...,2023-05-12,Denis.bernard@in2p3.fr,"The Large Area Telescope (LAT), one of the two...",[],[],atel
16036,Continued spectroscopic monitoring of V1716 Sc...,2023-05-12,shore@df.unipi.it,Our spectroscopic monitoring of the developmen...,"[16036, 16019, 16018, 16007, 16006, 16005, 160...",[],atel


In [315]:
# The related-id lists point in both directions (and can include the telegram
# itself); keep only references to ATels with a smaller number than the row's own.
atel_df['atel_refs'] = [
    [ref for ref in refs if int(ref) < atel_number]
    for atel_number, refs in zip(atel_df.index, atel_df['atel_refs'])
]

In [316]:
# Date range covered by the ATels, as an (earliest, latest) tuple.
atel_df['date'].min(), atel_df['date'].max()

(datetime.date(1997, 12, 17), datetime.date(2023, 5, 14))

### Remove the GCN and ATel telegram duplicates using cosine similarity search

In [317]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [318]:
# Similarity cut-off for the ATel/GCN comparison: pairs scoring above this
# value are treated as probable duplicates.
duplicates_similarity_threshold = 0.9

In [319]:
# Raw body texts of both telegram sources, as arrays.
gcn_bodies, atel_bodies = gcn_df['body'].values, atel_df['body'].values

In [320]:
# Fit a single vectorizer on GCN and ATel bodies together, so that both
# sources are later transformed with the same fitted model.
documents = np.concatenate([gcn_bodies, atel_bodies])
tfidf = TfidfVectorizer()
tfidf.fit(documents)

In [321]:
# TF-IDF representation of the ATel bodies (one row per ATel).
atel_tf = tfidf.transform(atel_bodies)
atel_tf.shape

(16036, 188564)

In [322]:
# TF-IDF representation of the GCN bodies (one row per GCN).
gcn_tf = tfidf.transform(gcn_bodies)
gcn_tf.shape

(32448, 188564)

In [323]:
# Pairwise cosine similarity: rows follow the ATels, columns follow the GCNs.
# NOTE(review): with the shapes recorded in this notebook (16036 x 32448) a
# dense float64 result would be roughly 4 GB -- consider `dense_output=False`
# if memory is tight; confirm against the scikit-learn documentation.
sim_m = cosine_similarity(atel_tf, gcn_tf)
sim_m.shape

(16036, 32448)

In [324]:
# (row positions, column positions) of all pairs scoring above the threshold.
probable_duplicates = np.nonzero(sim_m > duplicates_similarity_threshold)

In [325]:
# Split the pair positions into their ATel (row) and GCN (column) parts.
atel_index_to_throw, gcn_index_to_throw = probable_duplicates

In [326]:
# Translate row positions into ATel index labels. An ATel that matched several
# GCNs appears once per match, so the length counts pairs.
atel_to_throw = atel_df.index[atel_index_to_throw]
len(atel_to_throw)

258

In [327]:
# Remove the ATels flagged as duplicates; the GCN copy of each telegram stays.
atel_df = atel_df.drop(index=atel_to_throw)

### Combine the data into one DataFrame

In [328]:
# Common column layout shared by both sources.
cols = ['telegram_no', 'body', 'subject', 'date', 'from', 'gcn_refs', 'atel_refs', 'type']

# ATels: move the index into a string `telegram_no` column and keep the common columns.
atel_df = (
    atel_df
    .reset_index()
    .rename(columns={'index': 'telegram_no'})
    .assign(telegram_no=lambda frame: frame['telegram_no'].map(str))
    [cols]
)

# GCNs: same treatment; here the index column is called `number`.
gcn_df = (
    gcn_df
    .reset_index()
    .rename(columns={'number': 'telegram_no'})
    .assign(telegram_no=lambda frame: frame['telegram_no'].map(str))
    [cols]
)

In [329]:
# Build a unified index of the form `<telegram_number>_<telegram_source>`, e.g. `31_gcn`.
df = pd.concat([gcn_df, atel_df])
df['telegram_index'] = [
    number + '_' + source
    for number, source in zip(df['telegram_no'], df['type'])
]
df = df.drop(columns=['telegram_no', 'type']).set_index('telegram_index')

In [330]:
# Show the first row of the combined frame to check the new index.
df.head(1)

Unnamed: 0_level_0,body,subject,date,from,gcn_refs,atel_refs
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31_gcn,"S. V. Zharikov, V. V. Sokolov, SAO RAS and Yu....",GRB970508 SAO RAS optical observations,1998-03-26,sokolov@relay.sao.ru,[],[]


In [331]:
# Merge both reference columns into one list per telegram: GCN references
# first, then ATel references, each suffixed with its source.
df['refs'] = [
    [str(ref) + '_gcn' for ref in gcn_side] + [str(ref) + '_atel' for ref in atel_side]
    for gcn_side, atel_side in zip(df['gcn_refs'], df['atel_refs'])
]
df = df.drop(columns=['gcn_refs', 'atel_refs'])
df.head()

Unnamed: 0_level_0,body,subject,date,from,refs
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
31_gcn,"S. V. Zharikov, V. V. Sokolov, SAO RAS and Yu....",GRB970508 SAO RAS optical observations,1998-03-26,sokolov@relay.sao.ru,[]
32_gcn,Comparison of optical R-band observations of t...,GRB980326 optical observations,1998-03-28,paulgr@astro.uva.nl,[]
33_gcn,GRB 980326: Optical Transient Confirmed: A. C...,GRB980326 optical observations,1998-03-29,srk@astro.caltech.edu,[]
34_gcn,"Bruce Grossan, Robert Knop, Saul Perlmutter (L...",GRB980326 optical observations,1998-03-30,bruce@singu.lbl.gov,[]
35_gcn,"Addendum to GCN #34 Bruce Grossan, Robert Kno...",GRB980326 optical observations,1998-03-31,bruce@singu.lbl.gov,[34_gcn]


In [334]:
# Write the combined telegrams ordered by date, keeping the unified index as the first column.
df.sort_values(by='date').to_csv(path_or_buf='assembled.csv', index=True)

In [335]:
# Re-read the file just written, drop rows lacking a date, sender or body,
# and overwrite it.
# NOTE(review): presumably the round-trip through CSV is deliberate -- fields
# written as empty (e.g. an empty `from`) would come back as missing values
# and be removed here; confirm before simplifying to a plain dropna on `df`.
(
    pd.read_csv("assembled.csv", index_col=0)
    .dropna(subset=['date', 'from', 'body'])
    .to_csv('assembled.csv', index=True)
)