### Convert the extracted topics to vectors using pre-trained Floret embeddings and a count-vectorizer (bag-of-words) approach

In [2]:
# Data handling; ast parses the list literals stored in the GCN topics CSV,
# spacy loads the pre-trained floret vectors.
import pandas as pd
import numpy as np
import ast
import spacy

# CountVectorizer builds the bag-of-words vectors; TfidfVectorizer is not used
# in the cells visible here.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import warnings
# NOTE(review): this silences every warning category from every library for the
# rest of the session, which can hide real problems (e.g. numpy's empty-mean
# RuntimeWarning). Consider narrowing it to the specific warnings to ignore.
warnings.filterwarnings('ignore')

In [10]:
# ATel topics are stored as one comma-separated string per telegram.
# Strip every item after splitting: with "a, b" a plain split(',') yields ' b',
# and that padding would become part of the text handed to the embedding model,
# making ' b' and 'b' look like different topics.
atel_df = pd.read_csv("../topics/atel_with_topics.csv", index_col=0)
atel_df['topics'] = atel_df['topics'].apply(
    lambda x: [topic.strip() for topic in x.split(',')]
)

# GCN topics are parsed with ast.literal_eval (presumably stored as Python list
# literals — confirm against the CSV), which evaluates literals only.
gcn_df = pd.read_csv("../topics/gcn_with_topics.csv", index_col=0)
gcn_df['topics'] = gcn_df['topics'].apply(ast.literal_eval)

# Stack both sources and keep only the topics column; rows stay keyed by the
# index column read from each CSV.
df = pd.concat((atel_df, gcn_df))[['topics']]

In [11]:
# Preview the combined ATel + GCN topics frame (pandas elides the middle rows).
df

Unnamed: 0_level_0,topics
telegram_index,Unnamed: 1_level_1
2_atel,"[gamma ray, gamma-ray burst]"
3_atel,"[gamma ray, gamma-ray burst]"
4_atel,"[optical, gamma ray, a comment, gamma-ray b..."
5_atel,"[optical, gamma-ray burst]"
6_atel,"[optical, gamma-ray burst]"
...,...
33590_gcn,"[gamma-ray burst, transient]"
33585_gcn,"[transient, variables]"
33623_gcn,"[optical, gamma-ray burst, a comment]"
33617_gcn,"[optical, gamma-ray burst]"


floret 128-D

In [12]:
# Load the spaCy pipeline stored in the floret-128 directory; calling it on a
# string returns a Doc whose .vector is used as the topic embedding below.
floret_128_dir = "../word2vec/floret-128/"
floret_128 = spacy.load(floret_128_dir)

In [13]:
# Embed every distinct topic string once and reuse the result. The same topics
# recur across many telegrams, so this avoids re-running the spaCy pipeline for
# each occurrence while producing exactly the same vectors.
topic_vectors = {
    topic: floret_128(topic).vector
    for topic in {t for topics in df['topics'] for t in topics}
}

# Fallback for a telegram with no topics: np.mean([]) would give a scalar NaN,
# which makes the rows ragged and breaks the DataFrame built from this column.
empty_topics_vec = np.zeros(floret_128.vocab.vectors_length, dtype='float32')

# A telegram's vector is the element-wise mean of its topics' vectors.
df['topics_vec'] = df['topics'].apply(
    lambda x: np.mean([topic_vectors[t] for t in x], axis=0) if len(x) else empty_topics_vec
)

In [15]:
# Expand the per-telegram vectors into one numeric column per dimension
# (columns are labelled 0..127), keeping the telegram index.
floret_rows = df['topics_vec'].tolist()
floret_columns = list(range(128))
df_vec_floret = pd.DataFrame(floret_rows, index=df.index, columns=floret_columns)
df_vec_floret.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2_atel,0.316386,1.311309,0.068209,-0.266096,-3.898285,-2.250049,-0.829309,0.65351,-0.633522,-2.878241,...,-2.910473,-0.480244,-2.594925,-1.670457,-0.252598,-1.16398,1.966611,-0.15636,-2.899865,1.878706
3_atel,0.316386,1.311309,0.068209,-0.266096,-3.898285,-2.250049,-0.829309,0.65351,-0.633522,-2.878241,...,-2.910473,-0.480244,-2.594925,-1.670457,-0.252598,-1.16398,1.966611,-0.15636,-2.899865,1.878706
4_atel,-0.897859,1.043075,0.28859,-0.05266,-1.890191,-2.154824,-0.705334,1.174327,-0.554532,-1.962018,...,-2.174857,0.73806,-1.957508,-1.366832,0.154198,0.577788,0.720608,1.295994,-3.383379,0.634673
5_atel,-1.647911,1.837809,-0.023241,-0.29375,-1.305915,-1.430249,-0.196183,0.63697,-0.255237,-2.937783,...,-1.140748,0.576851,-1.7953,-1.112982,0.16575,1.02217,0.595235,0.509335,-2.59414,0.567046
6_atel,-1.647911,1.837809,-0.023241,-0.29375,-1.305915,-1.430249,-0.196183,0.63697,-0.255237,-2.937783,...,-1.140748,0.576851,-1.7953,-1.112982,0.16575,1.02217,0.595235,0.509335,-2.59414,0.567046


In [16]:
# Persist the floret topic vectors, with the telegram index as the first column.
floret_csv_path = "topics_vectors/topics_floret_128.csv"
df_vec_floret.to_csv(floret_csv_path, index=True)

CountVectorizer

In [17]:
# Collapse each telegram's topic list into one space-separated string, the
# input format CountVectorizer expects.
# The column is overwritten in place, so guard against re-running the cell:
# joining an already-joined string would insert a space between every character.
df['topics'] = df['topics'].apply(lambda x: x if isinstance(x, str) else ' '.join(x))

In [19]:
# Fit a default bag-of-words model on the joined topic strings. With the
# default settings the text is split into lower-cased word tokens, so the
# vocabulary consists of individual words rather than whole topic phrases.
vectorizer_t = CountVectorizer()
X_t = vectorizer_t.fit_transform(df['topics'])

# Vocabulary size, i.e. the number of columns in X_t.
vocabulary_terms = vectorizer_t.get_feature_names_out()
len(vocabulary_terms)

75

In [20]:
# Preview the dense count matrix; columns are labelled by feature position
# (0..n_features-1), not by the vocabulary term.
count_columns = list(range(X_t.shape[1]))
pd.DataFrame(X_t.toarray(), index=df.index, columns=count_columns)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2_atel,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3_atel,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4_atel,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5_atel,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6_atel,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33590_gcn,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
33585_gcn,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
33623_gcn,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33617_gcn,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Write the dense count matrix to CSV with positional column labels and the
# telegram index as the first column.
count_vectors_df = pd.DataFrame(
    X_t.toarray(),
    index=df.index,
    columns=list(range(X_t.shape[1])),
)
count_vectors_df.to_csv("topics_vectors/topics_cnt_vec.csv", index=True)