### I will convert all the extracted entities to vectors using pre-trained Floret embeddings and some baseline approaches

In [191]:
import ast
import os
import warnings

import numpy as np
import pandas as pd
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

warnings.filterwarnings('ignore')

In [211]:
# Load the extracted named entities; the first CSV column is used as the row index.
ENTITIES_CSV = "name_entities.csv"
df = pd.read_csv(ENTITIES_CSV, index_col=0)

In [212]:
def _parse_entity_list(cell):
    """Return the entity list held in `cell` with every entry stripped of surrounding whitespace.

    `cell` is either the string form of a Python list (as read from the CSV)
    or an already-parsed list. Accepting both makes this cell safe to re-run:
    passing an already-parsed list to `ast.literal_eval` raises.
    """
    entities = ast.literal_eval(cell) if isinstance(cell, str) else cell
    return [entity.strip() for entity in entities]


# Parse and clean both entity columns in a single pass each.
for column in ('telescope', 'source'):
    df[column] = df[column].apply(_parse_entity_list)

In [213]:
# Inspect the last five rows to check that both columns now hold lists.
df.tail(5)

Unnamed: 0_level_0,telescope,source
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1
16033_atel,"[gamma, fermi]","[radio source, blazar]"
16034_atel,"[shane telescope, atlas, lick]","[atlas, galaxy, supernova]"
16035_atel,"[gamma, fermi]",[quasar]
16036_atel,[most],[source not found]
16037_atel,[atlas],"[atlas, pan, galaxy, supernova]"


#### Get vectors using Floret 128D embeddings

In [155]:
# Load the 128D Floret pipeline from its local model directory.
FLORET_128_DIR = "../word2vec/floret-128/"
floret_128 = spacy.load(FLORET_128_DIR)

In [156]:
# Entity strings recur across rows (e.g. "grb", "fermi"), so embed each distinct
# string once and reuse the result instead of re-running the pipeline every time.
_floret_128_cache = {}


def _mean_floret_128_vector(entities):
    """Return the element-wise mean of the Floret-128 document vectors of `entities`.

    An empty list yields a zero vector of the model's dimensionality. `np.mean`
    over an empty list would instead give a scalar NaN (its RuntimeWarning is
    hidden because warnings are globally ignored in this notebook), and a scalar
    cannot be expanded into fixed-width vector columns later on.
    """
    if not entities:
        return np.zeros(floret_128.vocab.vectors_length, dtype=np.float32)
    for entity in entities:
        if entity not in _floret_128_cache:
            _floret_128_cache[entity] = floret_128(entity).vector
    return np.mean([_floret_128_cache[entity] for entity in entities], axis=0)


df['telescope_vec'] = df['telescope'].apply(_mean_floret_128_vector)
df['source_vec'] = df['source'].apply(_mean_floret_128_vector)

In [157]:
# Preview the first five rows, now including the two vector columns.
df.head(5)

Unnamed: 0_level_0,telescope,source,telescope_vec,source_vec
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3_atel,"[gamma, bepposax]",[source not found],"[-0.62745, 0.63778496, 0.03205499, -0.78914, -...","[-1.9836334, 3.0164, -0.70897007, -0.34840167,..."
2_atel,[bepposax],[grb],"[-2.7991, -0.43983, 0.62377, -0.20258, 3.0079,...","[-3.876, 0.32272, -0.59962, 1.377, 0.60387, -1..."
4_atel,"[fermi, fast, most, goodman, gamma]",[grb],"[-2.48042, 1.125766, 0.45803404, 0.48099598, -...","[-3.876, 0.32272, -0.59962, 1.377, 0.60387, -1..."
5_atel,[telescope not found],"[grb, grb 971214]","[-2.2696667, 2.5010467, -0.7204434, -0.3446496...","[-3.372875, 0.33710748, -0.060314983, 0.56595,..."
6_atel,[ogle],[grb980109],"[-3.4514, 0.089772, 0.6605, 0.29436, -0.77596,...","[-2.0772, 0.59179, 0.88244, -0.72861, -0.00203..."


In [158]:
# Expand each vector column into one numeric column per dimension and place the
# telescope block (labels 0-127) next to the source block (labels 128-255).
telescope_part = pd.DataFrame(df['telescope_vec'].tolist(), index=df.index, columns=list(range(128)))
source_part = pd.DataFrame(df['source_vec'].tolist(), index=df.index, columns=list(range(128, 256)))
pd.concat([telescope_part, source_part], axis=1)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3_atel,-0.627450,0.637785,0.032055,-0.789140,-0.801100,-0.778650,-0.415307,1.459350,-1.224885,-0.282865,...,-1.665767,2.601700,-1.637177,-1.036830,0.364245,1.445063,0.833967,2.841267,-2.536283,0.211777
2_atel,-2.799100,-0.439830,0.623770,-0.202580,3.007900,1.376200,-0.792310,1.859000,-0.465370,-1.560800,...,-4.171000,0.963690,-0.135300,-1.626400,1.131600,0.922140,-0.832150,0.953610,-2.339500,4.451700
4_atel,-2.480420,1.125766,0.458034,0.480996,-3.262056,-0.718174,0.487979,1.721258,-0.638792,-2.638626,...,-4.171000,0.963690,-0.135300,-1.626400,1.131600,0.922140,-0.832150,0.953610,-2.339500,4.451700
5_atel,-2.269667,2.501047,-0.720443,-0.344650,1.228233,-1.479640,-0.541160,1.016717,-0.678800,-2.994867,...,-3.614550,0.216543,-0.777000,-1.012408,0.580325,0.903885,-0.527263,0.909035,-2.082050,2.950850
6_atel,-3.451400,0.089772,0.660500,0.294360,-0.775960,0.364430,2.822000,0.382400,7.093900,-2.604800,...,-3.683900,0.738540,-1.723400,-0.424600,-0.223810,1.120200,-0.117950,2.520900,-2.457300,-0.342590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16033_atel,-1.014200,1.525500,-0.531200,-0.490710,-3.279150,-2.449200,0.971497,1.522700,-1.557550,-0.445865,...,-0.038665,1.337900,-0.736241,-0.533847,-0.612908,2.077898,-0.817168,3.577675,-2.047270,-0.616167
16034_atel,-2.568383,-2.105408,-0.976705,-0.292000,-3.832566,-0.418271,2.245010,0.847725,1.014745,-1.946300,...,-0.596403,2.689300,-0.179680,0.257307,-1.500307,1.933533,-1.848997,2.721500,-3.665167,-0.414981
16035_atel,-1.014200,1.525500,-0.531200,-0.490710,-3.279150,-2.449200,0.971497,1.522700,-1.557550,-0.445865,...,-1.081700,-0.965560,-1.963200,-0.365870,-0.392640,-0.252850,-2.692200,2.120900,-3.491100,0.494700
16036_atel,-1.252200,0.882520,2.376100,-1.059200,-2.270500,0.772170,1.144500,2.178300,1.840200,-3.422000,...,-1.665767,2.601700,-1.637177,-1.036830,0.364245,1.445063,0.833967,2.841267,-2.536283,0.211777


In [159]:
# Save the telescope (columns 0-127) and source (columns 128-255) vectors side
# by side, keeping the frame's index as the first CSV column.
# `to_csv` does not create missing parent directories, so make sure it exists.
os.makedirs("ent_vectors", exist_ok=True)
telescope_part = pd.DataFrame(df['telescope_vec'].tolist(), index=df.index, columns=list(range(128)))
source_part = pd.DataFrame(df['source_vec'].tolist(), index=df.index, columns=list(range(128, 256)))
pd.concat([telescope_part, source_part], axis=1).to_csv("ent_vectors/ent_floret_128.csv", index=True)

#### Get vectors using Floret 256D embeddings

In [160]:
# Load the 256D Floret pipeline from its local model directory.
FLORET_256_DIR = "../word2vec/floret-256/"
floret_256 = spacy.load(FLORET_256_DIR)

In [161]:
# Entity strings recur across rows (e.g. "grb", "fermi"), so embed each distinct
# string once and reuse the result instead of re-running the pipeline every time.
_floret_256_cache = {}


def _mean_floret_256_vector(entities):
    """Return the element-wise mean of the Floret-256 document vectors of `entities`.

    An empty list yields a zero vector of the model's dimensionality. `np.mean`
    over an empty list would instead give a scalar NaN (its RuntimeWarning is
    hidden because warnings are globally ignored in this notebook), and a scalar
    cannot be expanded into fixed-width vector columns later on.
    """
    if not entities:
        return np.zeros(floret_256.vocab.vectors_length, dtype=np.float32)
    for entity in entities:
        if entity not in _floret_256_cache:
            _floret_256_cache[entity] = floret_256(entity).vector
    return np.mean([_floret_256_cache[entity] for entity in entities], axis=0)


# NOTE: this overwrites the vector columns previously filled by the 128D model.
df['telescope_vec'] = df['telescope'].apply(_mean_floret_256_vector)
df['source_vec'] = df['source'].apply(_mean_floret_256_vector)

In [162]:
# Save the telescope (columns 0-255) and source (columns 256-511) vectors side
# by side, keeping the frame's index as the first CSV column.
# `to_csv` does not create missing parent directories, so make sure it exists.
os.makedirs("ent_vectors", exist_ok=True)
telescope_part = pd.DataFrame(df['telescope_vec'].tolist(), index=df.index, columns=list(range(256)))
source_part = pd.DataFrame(df['source_vec'].tolist(), index=df.index, columns=list(range(256, 512)))
pd.concat([telescope_part, source_part], axis=1).to_csv("ent_vectors/ent_floret_256.csv", index=True)

#### CountVectorizer 

In [194]:
# Keep only the raw entity lists for the count-based baseline. `.copy()` gives
# an independent frame, so the column assignments that follow do not write into
# a slice of the previous `df` (which triggers SettingWithCopyWarning).
df = df[['telescope', 'source']].copy()

In [195]:
# Flatten each entity list into one space-separated string for the vectorizers.
# Multi-word entities are merged with their neighbours, e.g. "grb grb 971214".
for list_col, text_col in (('telescope', 'telescopes'), ('source', 'sources')):
    df[text_col] = df[list_col].map(' '.join)

In [196]:
# Preview the first five rows with the new space-joined text columns.
df.head(5)

Unnamed: 0_level_0,telescope,source,telescopes,sources
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3_atel,"[gamma, bepposax]",[source not found],gamma bepposax,source not found
2_atel,[bepposax],[grb],bepposax,grb
4_atel,"[fermi, fast, most, goodman, gamma]",[grb],fermi fast most goodman gamma,grb
5_atel,[telescope not found],"[grb, grb 971214]",telescope not found,grb grb 971214
6_atel,[ogle],[grb980109],ogle,grb980109


In [199]:
# Bag-of-words counts over the space-joined telescope names (default tokenisation).
vectorizer_t = CountVectorizer()
X_t = vectorizer_t.fit_transform(df['telescopes'])
X_t.shape[1]  # vocabulary size == number of count columns

458

In [200]:
# Bag-of-words counts over the space-joined source names (default tokenisation).
vectorizer_s = CountVectorizer()
X_s = vectorizer_s.fit_transform(df['sources'])
X_s.shape[1]  # vocabulary size == number of count columns

1654

In [210]:
# Save the dense telescope counts followed by the dense source counts. Columns
# are numbered consecutively: telescope features first, then source features.
# `to_csv` does not create missing parent directories, so make sure it exists.
os.makedirs("ent_vectors", exist_ok=True)
n_telescope, n_source = X_t.shape[1], X_s.shape[1]
telescope_counts = pd.DataFrame(X_t.toarray(), index=df.index, columns=list(range(n_telescope)))
source_counts = pd.DataFrame(X_s.toarray(), index=df.index,
                             columns=list(range(n_telescope, n_telescope + n_source)))
# NOTE(review): "end_cnt_vec.csv" looks like a typo for "ent_cnt_vec.csv" (the
# other outputs use the "ent_" prefix); kept as-is in case a downstream reader
# already depends on this name. Confirm before renaming.
pd.concat([telescope_counts, source_counts], axis=1).to_csv("ent_vectors/end_cnt_vec.csv", index=True)