In [673]:
import pandas as pd
import numpy as np
import ast
import re
import datetime
from functools import reduce

In [674]:
atel_df = pd.read_csv("../topics/atel_with_topics.csv", index_col=0)
atel_df['topics'] = atel_df['topics'].apply(lambda x: x.split(','))

gcn_df = pd.read_csv("../topics/gcn_with_topics.csv", index_col=0)
gcn_df['topics'] = gcn_df['topics'].apply(ast.literal_eval)

all_df = pd.read_csv('../data/assembled.csv', index_col=0)

labels = pd.read_csv('../data/labels.csv', index_col=0)

In [675]:
df = pd.concat((atel_df, gcn_df)).join(all_df).join(labels)
df = df[['topics', 'date', 'n_cited']]
df['date'] = pd.to_datetime(df['date'])
df['topics'] = df.topics.apply(lambda x: [y.strip() for y in x])
df = df.reset_index().set_index("date").sort_index()
df

Unnamed: 0_level_0,telegram_index,topics,n_cited
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-12-28,2_atel,"[gamma ray, gamma-ray burst]",0
1997-12-28,3_atel,"[gamma ray, gamma-ray burst]",0
1998-01-06,4_atel,"[optical, gamma ray, a comment, gamma-ray burst]",0
1998-01-12,5_atel,"[optical, gamma-ray burst]",0
1998-01-18,6_atel,"[optical, gamma-ray burst]",0
...,...,...,...
2023-05-10,16033_atel,"[gamma ray, gev, agn, blazar, quasar]",0
2023-05-12,16034_atel,"[optical, supernovae]",0
2023-05-12,16035_atel,"[gamma ray, gev, request for observations, agn...",0
2023-05-12,16036_atel,"[cataclysmic variable, nova, transient]",0


### Calc each topic's `weight` to represent it's relative frequency and potential interest to it

In [733]:
# ignore_topics = ['a comment', 'request for observations']  # these topics weight will be set to 0
ignore_topics = []

In [734]:
def get_topic_weight(topics):
    total = len(topics)
    unique_topics, cnts = np.unique(topics, return_counts=True)
    topic_weight = {}

    for t, t_cnt in zip(unique_topics, cnts):
        if t not in ignore_topics:   
            topic_weight[t] = (1 / t_cnt) * (total / len(unique_topics))
        else:
            topic_weight[t] = 0
    return topic_weight

In [735]:
topic_weights = get_topic_weight(df.explode('topics').apply(lambda x: x.topics.strip(), axis=1).values)

In [736]:
topic_weights = sorted(topic_weights.items(), key=lambda x:x[1])

In [737]:
topic_weights

[('transient', 0.10678524347510467),
 ('optical', 0.13429001457589904),
 ('x-ray', 0.16041747745994683),
 ('gamma-ray burst', 0.18876533669166334),
 ('gamma ray', 0.26281253009487565),
 ('supernovae', 0.44586981261760694),
 ('variables', 0.4664462361788905),
 ('gev', 0.544418303890815),
 ('binary', 0.7477418745275889),
 ('agn', 0.7887912131722681),
 ('radio', 0.8164758072836068),
 ('infra-red', 0.823587566211899),
 ('nova', 0.846907872911918),
 ('neutron star', 0.8994010887229666),
 ('black hole', 0.9397717692042796),
 ('blazar', 1.2574638130193687),
 ('cataclysmic variable', 1.329318888049047),
 ('pulsar', 1.3427155969528852),
 ('request for observations', 1.4417846277167476),
 ('ultra-violet', 1.5250510656337921),
 ('vhe', 2.018697071727375),
 ('quasar', 2.17605653166158),
 ('a comment', 2.254729344729345),
 ('star', 2.53559528386518),
 ('neutrinos', 2.5488244766505637),
 ('tev', 3.7103141115799345),
 ('gravitational waves', 5.758640762570035),
 ('magnetar', 8.963755804734399),
 ('mi

### Create time-series features: how many telegrams with these topics appeared in the last year (and month and all-time) before the current publication, and how many cications these topics received over the same periods of time

In [681]:
topics_df = df.explode("topics").rename(columns={'topics': 'topic'})
topics_df

Unnamed: 0_level_0,telegram_index,topic,n_cited
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-12-28,2_atel,gamma ray,0
1997-12-28,2_atel,gamma-ray burst,0
1997-12-28,3_atel,gamma ray,0
1997-12-28,3_atel,gamma-ray burst,0
1998-01-06,4_atel,optical,0
...,...,...,...
2023-05-12,16036_atel,nova,0
2023-05-12,16036_atel,transient,0
2023-05-14,16037_atel,optical,0
2023-05-14,16037_atel,supernovae,0


In [685]:
columns = ['telegram_index', 'topic', 'date', 'topics_prev_posts_all', 'topics_prev_posts_month', 'topics_prev_posts_year', 
           'topics_prev_cited_all', 'topics_prev_cited_month', 'topics_prev_cited_year']
dfs = []

for topic, data in topics_df.groupby("topic"):

    print(topic)
    topics_features_df = pd.DataFrame([], columns=columns, index=[])
    topics_features_df['telegram_index'] = data.telegram_index.values
    topics_features_df['topic'] = topic
    topics_features_df['date'] = data.index
    
    # all previous posts with this topic
    topics_features_df['topics_prev_posts_all'] = data.reset_index().index.values
    # last month posts
    topics_features_df['topics_prev_posts_month'] = (data.telegram_index.rolling('30D').count() - 1).astype(np.int32).values
    # last year
    topics_features_df['topics_prev_posts_year'] = (data.telegram_index.rolling('365D').count() - 1).astype(np.int32).values
    
    # this topic's citations in other telegrams all time
    topics_features_df['topics_prev_cited_all'] = (data['n_cited'].cumsum() - data['n_cited']).astype(np.int32).values
    # over the last month, year
    topics_features_df['topics_prev_cited_month'] = (data.n_cited.rolling('30D').sum() - data.n_cited).astype(np.int32).values
    topics_features_df['topics_prev_cited_year'] = (data.n_cited.rolling('365D').sum() - data.n_cited).astype(np.int32).values    
    
    dfs.append(topics_features_df)

a comment
agn
asteroid
asteroid  binary
binary
black hole
blazar
cataclysmic variable
comet
cosmic rays
direct collapse event
exoplanet
far-infra-red
fast radio burst
gamma ray
gamma-ray burst
gev
globular cluster
gravitational lensing
gravitational waves
infra-red
magnetar
meteor
microlensing event
millimeter
near-earth object
neutrinos
neutron star
nova
optical
planet
planet  minor
potentially hazardous asteroid
pre-main-sequence star
pulsar
quasar
radio
request for observations
soft gamma-ray repeater
solar system object
star
sub-millimeter
supernova remnant
supernovae
tev
the sun
tidal disruption event
transient
uhe
ultra-violet
variables
vhe
x-ray
young stellar object


In [686]:
topics_features = pd.concat(dfs).groupby('telegram_index').agg(list).rename(columns={'topic': 'topics'})

In [687]:
len(topics_features)

48279

In [688]:
topics_features.head()

Unnamed: 0_level_0,topics,date,topics_prev_posts_all,topics_prev_posts_month,topics_prev_posts_year,topics_prev_cited_all,topics_prev_cited_month,topics_prev_cited_year
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10000_atel,"[optical, supernovae]","[2017-01-25 00:00:00, 2017-01-25 00:00:00]","[13862, 4160]","[135, 83]","[1596, 759]","[32883, 6128]","[240, 31]","[3169, 825]"
10000_gcn,"[transient, variables]","[2009-10-08 00:00:00, 2009-10-08 00:00:00]","[6293, 1457]","[70, 17]","[1079, 212]","[23836, 3234]","[341, 34]","[3732, 487]"
10001_atel,"[nova, optical, transient]","[2017-01-25 00:00:00, 2017-01-25 00:00:00, 201...","[2052, 13865, 16215]","[19, 138, 180]","[251, 1599, 1801]","[12899, 32883, 62759]","[204, 240, 333]","[1676, 3169, 4573]"
10001_gcn,"[optical, transient, variables]","[2009-10-08 00:00:00, 2009-10-08 00:00:00, 200...","[5051, 6295, 1458]","[44, 72, 18]","[704, 1081, 213]","[9856, 23836, 3234]","[101, 341, 34]","[1517, 3732, 487]"
10002_atel,"[nova, transient, ultra-violet]","[2017-01-25 00:00:00, 2017-01-25 00:00:00, 201...","[2054, 16224, 1333]","[21, 189, 6]","[253, 1810, 102]","[12954, 62815, 4213]","[259, 389, 8]","[1731, 4629, 365]"


### Calculate each topic relative weight (as it was done before) in a time-series manner

In [689]:
topics_weights = []
topics_history = []
for i, r in df.iterrows():
    topics_history.extend(r.topics)
    topics_weights.append(get_topic_weight(topics_history))

In [690]:
df['weights'] = topics_weights

In [691]:
df['topics_weights'] = df.apply(lambda x: [x['weights'][t] for t in x['topics']], axis=1)

In [692]:
# these are all-time cumulative weights:

In [693]:
df.head(3)

Unnamed: 0_level_0,telegram_index,topics,n_cited,weights,topics_weights
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1997-12-28,2_atel,"[gamma ray, gamma-ray burst]",0,"{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","[1.0, 1.0]"
1997-12-28,3_atel,"[gamma ray, gamma-ray burst]",0,"{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","[1.0, 1.0]"
1998-01-06,4_atel,"[optical, gamma ray, a comment, gamma-ray burst]",0,"{'a comment': 0, 'gamma ray': 0.66666666666666...","[2.0, 0.6666666666666666, 0, 0.6666666666666666]"


In [694]:
df.tail(3)

Unnamed: 0_level_0,telegram_index,topics,n_cited,weights,topics_weights
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-05-12,16035_atel,"[gamma ray, gev, request for observations, agn...",0,"{'a comment': 0, 'agn': 0.7887613124426902, 'a...","[0.26280256765328047, 0.5443976666116339, 0, 0..."
2023-05-12,16036_atel,"[cataclysmic variable, nova, transient]",0,"{'a comment': 0, 'agn': 0.7887762628074791, 'a...","[1.3292936927857564, 0.8468918210322429, 0.106..."
2023-05-14,16037_atel,"[optical, supernovae, transient]",0,"{'a comment': 0, 'agn': 0.7887912131722681, 'a...","[0.13429001457589904, 0.44586981261760694, 0.1..."


In [695]:
# do the same in a year and month ranges

In [696]:
topics_history_month = [df.iloc[0].topics]
dates = [df.index[0]]

topics_weights_month = [get_topic_weight(topics_history_month[0])]

for i, r in df.iloc[1:].iterrows():
    # print(f'Current date: {i}')
    # print(f'Current min date: {dates[0]}')
    
    while len(dates) and pd.Timedelta(i - dates[0]).days >= 30:
        # print(f'Diff between {i} and {dates[0]} is more than 30 days, removing the earliest date and its elements')
        dates.pop(0)
        topics_history_month.pop(0)
    
    topics_history_month.append(r.topics)
    topics_weights_month.append(get_topic_weight(reduce(lambda x, y: x+y, topics_history_month)))
    dates.append(i)

In [697]:
topics_history_year = [df.iloc[0].topics]
dates = [df.index[0]]

topics_weights_year = [get_topic_weight(topics_history_year[0])]

for i, r in df.iloc[1:].iterrows():

    while len(dates) and pd.Timedelta(i - dates[0]).days >= 365:
        dates.pop(0)
        topics_history_year.pop(0)
    
    topics_history_year.append(r.topics)
    topics_weights_year.append(get_topic_weight(reduce(lambda x, y: x+y, topics_history_year)))
    dates.append(i)

In [698]:
df['weights_month'] = topics_weights_month
df['weights_year'] = topics_weights_year

In [699]:
df['topics_weights_month'] = df.apply(lambda x: [x['weights_month'][t] for t in x['topics']], axis=1)
df['topics_weights_year'] = df.apply(lambda x: [x['weights_year'][t] for t in x['topics']], axis=1)

In [700]:
df.head()

Unnamed: 0_level_0,telegram_index,topics,n_cited,weights,topics_weights,weights_month,weights_year,topics_weights_month,topics_weights_year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1997-12-28,2_atel,"[gamma ray, gamma-ray burst]",0,"{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","[1.0, 1.0]","{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","[1.0, 1.0]","[1.0, 1.0]"
1997-12-28,3_atel,"[gamma ray, gamma-ray burst]",0,"{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","[1.0, 1.0]","{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","{'gamma ray': 1.0, 'gamma-ray burst': 1.0}","[1.0, 1.0]","[1.0, 1.0]"
1998-01-06,4_atel,"[optical, gamma ray, a comment, gamma-ray burst]",0,"{'a comment': 0, 'gamma ray': 0.66666666666666...","[2.0, 0.6666666666666666, 0, 0.6666666666666666]","{'a comment': 0, 'gamma ray': 0.66666666666666...","{'a comment': 0, 'gamma ray': 0.66666666666666...","[2.0, 0.6666666666666666, 0, 0.6666666666666666]","[2.0, 0.6666666666666666, 0, 0.6666666666666666]"
1998-01-12,5_atel,"[optical, gamma-ray burst]",0,"{'a comment': 0, 'gamma ray': 0.83333333333333...","[1.25, 0.625]","{'a comment': 0, 'gamma ray': 0.83333333333333...","{'a comment': 0, 'gamma ray': 0.83333333333333...","[1.25, 0.625]","[1.25, 0.625]"
1998-01-18,6_atel,"[optical, gamma-ray burst]",0,"{'a comment': 0, 'gamma ray': 1.0, 'gamma-ray ...","[1.0, 0.6000000000000001]","{'a comment': 0, 'gamma ray': 1.0, 'gamma-ray ...","{'a comment': 0, 'gamma ray': 1.0, 'gamma-ray ...","[1.0, 0.6000000000000001]","[1.0, 0.6000000000000001]"


In [701]:
df.tail()

Unnamed: 0_level_0,telegram_index,topics,n_cited,weights,topics_weights,weights_month,weights_year,topics_weights_month,topics_weights_year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-05-10,16033_atel,"[gamma ray, gev, agn, blazar, quasar]",0,"{'a comment': 0, 'agn': 0.7889387368525995, 'a...","[0.26281450927254374, 0.5444747180767987, 0.78...","{'a comment': 0, 'agn': 0.8452380952380951, 'b...","{'a comment': 0, 'agn': 0.6992344497607654, 'a...","[0.461038961038961, 1.0142857142857142, 0.8452...","[0.27264925373134324, 0.3313832199546485, 0.69..."
2023-05-12,16034_atel,"[optical, supernovae]",0,"{'a comment': 0, 'agn': 0.7889487064453417, 'a...","[0.1342868342547624, 0.44590665517183087]","{'a comment': 0, 'agn': 0.8084415584415584, 'b...","{'a comment': 0, 'agn': 0.7100980392156864, 'a...","[0.228021978021978, 2.2232142857142856]","[0.1760145808019441, 1.316909090909091]"
2023-05-12,16035_atel,"[gamma ray, gev, request for observations, agn...",0,"{'a comment': 0, 'agn': 0.7887613124426902, 'a...","[0.26280256765328047, 0.5443976666116339, 0, 0...","{'a comment': 0, 'agn': 0.7559523809523809, 'b...","{'a comment': 0, 'agn': 0.7071219512195123, 'a...","[0.45357142857142857, 0.9071428571428571, 0, 0...","[0.27196998123827393, 0.3347806004618938, 0, 0..."
2023-05-12,16036_atel,"[cataclysmic variable, nova, transient]",0,"{'a comment': 0, 'agn': 0.7887762628074791, 'a...","[1.3292936927857564, 0.8468918210322429, 0.106...","{'a comment': 0, 'agn': 0.7648809523809523, 'b...","{'a comment': 0, 'agn': 0.7074146341463415, 'a...","[0.8344155844155845, 0.5736607142857143, 0.269...","[1.7472289156626508, 0.9063750000000002, 0.109..."
2023-05-14,16037_atel,"[optical, supernovae, transient]",0,"{'a comment': 0, 'agn': 0.7887912131722681, 'a...","[0.13429001457589904, 0.44586981261760694, 0.1...","{'a comment': 0, 'agn': 0.7464285714285714, 'b...","{'a comment': 0, 'agn': 0.7201000000000001, 'a...","[0.23325892857142858, 1.4928571428571429, 0.28...","[0.17627906976744187, 1.3092727272727274, 0.10..."


In [702]:
topics_df = df.set_index("telegram_index").join(topics_features.drop(columns=['topics', 'date']))

In [703]:
topics_df = topics_df.drop(columns=['weights', 'weights_month', 'weights_year'])\
    .rename(columns={'topics_weights': 'weights_all_time'})

In [704]:
topics_df.tail()

Unnamed: 0_level_0,topics,n_cited,weights_all_time,topics_weights_month,topics_weights_year,topics_prev_posts_all,topics_prev_posts_month,topics_prev_posts_year,topics_prev_cited_all,topics_prev_cited_month,topics_prev_cited_year
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
16033_atel,"[gamma ray, gev, agn, blazar, quasar]",0,"[0.26281450927254374, 0.5444747180767987, 0.78...","[0.461038961038961, 1.0142857142857142, 0.8452...","[0.27264925373134324, 0.3313832199546485, 0.69...","[3714, 2329, 11151, 5382, 1346]","[11, 6, 21, 9, 3]","[208, 157, 535, 440, 62]","[13319, 10347, 28270, 13805, 6529]","[20, 2, 22, 22, 1]","[415, 418, 803, 884, 244]"
16034_atel,"[optical, supernovae]",0,"[0.1342868342547624, 0.44590665517183087]","[0.228021978021978, 2.2232142857142856]","[0.1760145808019441, 1.316909090909091]","[21825, 6572]","[38, 3]","[822, 109]","[52183, 9357]","[87, 0]","[1257, 50]"
16035_atel,"[gamma ray, gev, request for observations, agn...",0,"[0.26280256765328047, 0.5443976666116339, 0, 0...","[0.45357142857142857, 0.9071428571428571, 0, 0...","[0.27196998123827393, 0.3347806004618938, 0, 0...","[3715, 2330, 11152, 5383, 2032]","[11, 6, 19, 9, 5]","[204, 155, 532, 432, 65]","[13319, 10347, 28270, 13805, 11460]","[19, 1, 20, 21, 2]","[375, 374, 766, 833, 153]"
16036_atel,"[cataclysmic variable, nova, transient]",0,"[1.3292936927857564, 0.8468918210322429, 0.106...","[0.8344155844155845, 0.5736607142857143, 0.269...","[1.7472289156626508, 0.9063750000000002, 0.109...","[2204, 3460, 27447]","[10, 15, 33]","[82, 159, 1328]","[5145, 18923, 93659]","[13, 34, 96]","[106, 298, 2824]"
16037_atel,"[optical, supernovae, transient]",0,"[0.13429001457589904, 0.44586981261760694, 0.1...","[0.23325892857142858, 1.4928571428571429, 0.28...","[0.17627906976744187, 1.3092727272727274, 0.10...","[21826, 6573, 27448]","[31, 4, 25]","[816, 109, 1318]","[52183, 9357, 93659]","[43, 0, 42]","[1235, 50, 2787]"


### Calculate the average and weighted average of previous posts and citations. Averaging is needed to level the cases of different amount of topics found in telegrams, weighting will (probably) help to accent the least frequent topics

In [705]:
# raw avg: both citations and posts

In [706]:
topics_df['topics_prev_posts_all'] = topics_df['topics_prev_posts_all'].apply(np.mean)
topics_df['topics_prev_posts_month'] = topics_df['topics_prev_posts_month'].apply(np.mean)
topics_df['topics_prev_posts_year'] = topics_df['topics_prev_posts_year'].apply(np.mean)

In [707]:
topics_df['topics_prev_cited_all_raw'] = topics_df['topics_prev_cited_all'].apply(np.mean)
topics_df['topics_prev_cited_month_raw'] = topics_df['topics_prev_cited_month'].apply(np.mean)
topics_df['topics_prev_cited_year_raw'] = topics_df['topics_prev_cited_year'].apply(np.mean)

In [708]:
# weighted avg: citations only, to emphasize the least frequent topics citations'

In [709]:
topics_df['topics_prev_cited_all_weighted'] = topics_df.apply(lambda x: 
                                                              np.mean([v*w for v,w in zip(x.topics_prev_cited_all, x.weights_all_time)]), 
                                                              axis=1)
topics_df['topics_prev_cited_month_weighted'] = topics_df.apply(lambda x: 
                                                              np.mean([v*w for v,w in zip(x.topics_prev_cited_month, x.topics_weights_month)]), 
                                                              axis=1)
topics_df['topics_prev_cited_year_weighted'] = topics_df.apply(lambda x: 
                                                              np.mean([v*w for v,w in zip(x.topics_prev_cited_year, x.topics_weights_year)]), 
                                                              axis=1)

In [710]:
topics_df = topics_df.drop(columns=['topics_prev_cited_all', 'topics_prev_cited_month', 'topics_prev_cited_year',
                                   'weights_all_time', 'topics_weights_month', 'topics_weights_year', 'n_cited'])

In [711]:
topics_df.head()

Unnamed: 0_level_0,topics,topics_prev_posts_all,topics_prev_posts_month,topics_prev_posts_year,topics_prev_cited_all_raw,topics_prev_cited_month_raw,topics_prev_cited_year_raw,topics_prev_cited_all_weighted,topics_prev_cited_month_weighted,topics_prev_cited_year_weighted
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2_atel,"[gamma ray, gamma-ray burst]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3_atel,"[gamma ray, gamma-ray burst]",1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4_atel,"[optical, gamma ray, a comment, gamma-ray burst]",1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5_atel,"[optical, gamma-ray burst]",2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
6_atel,"[optical, gamma-ray burst]",3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [712]:
topics_df.tail()

Unnamed: 0_level_0,topics,topics_prev_posts_all,topics_prev_posts_month,topics_prev_posts_year,topics_prev_cited_all_raw,topics_prev_cited_month_raw,topics_prev_cited_year_raw,topics_prev_cited_all_weighted,topics_prev_cited_month_weighted,topics_prev_cited_year_weighted
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
16033_atel,"[gamma ray, gev, agn, blazar, quasar]",4784.4,10.0,280.4,14454.0,13.4,552.8,12601.804514,12.851571,439.359896
16034_atel,"[optical, supernovae]",14198.5,20.5,465.5,30770.0,43.5,653.5,5589.919222,9.918956,143.547891
16035_atel,"[gamma ray, gev, request for observations, agn...",4922.4,10.0,277.6,15440.2,12.6,500.2,6886.397802,5.598367,191.680316
16036_atel,"[cataclysmic variable, nova, transient]",11037.0,19.333333,523.0,39242.333333,47.666667,1076.0,10955.507968,18.755944,254.486625
16037_atel,"[optical, supernovae, transient]",18615.666667,20.0,747.666667,51733.0,28.333333,1357.333333,7060.352929,7.362609,195.825805


how the feature should work

In [713]:
# optical, star, direct COLLAPSE

n_posts = [2000, 1000, 1]
n_cited = [200, 50, 70]

In [714]:
n_posts_raw = np.mean(n_posts)
n_posts_raw

1000.3333333333334

In [715]:
n_cited_raw = np.mean(n_cited)
n_cited_raw

106.66666666666667

In [716]:
# raw citation rate does not emphasize that there is an interesting topic 

In [717]:
citation_rate_raw = n_cited_raw/n_posts_raw
citation_rate_raw

0.10663112295901367

In [718]:
# let's add WEIGHTS!

weights = [0.1, 0.2, 400]   # low weights for generic topics, and huge weight for topic that appeared only once before

In [719]:
n_cited_weighted = np.mean([n*w for n,w in zip(n_cited, weights)])
n_cited_weighted

9343.333333333334

In [720]:
# now the citation rate will indicate that the telegram with this topics combination might be interesting 

citation_rate_weighted = n_cited_weighted/n_posts_raw
citation_rate_weighted 

9.340219926691104

- Calculate the citation rates (raw and weighted)
- Monthly and yearly activity fractions with respect to the total
- Monthly and yearly citation fractions with respect to the total

In [721]:
topics_df['topics_activity_frac_month'] = topics_df['topics_prev_posts_month']/topics_df['topics_prev_posts_all']
topics_df['topics_activity_frac_year'] = topics_df['topics_prev_posts_year']/topics_df['topics_prev_posts_all']

In [722]:
topics_df['topics_citation_frac_month'] = topics_df['topics_prev_cited_month_raw']/topics_df['topics_prev_cited_all_raw']
topics_df['topics_citation_frac_year'] = topics_df['topics_prev_cited_year_raw']/topics_df['topics_prev_cited_all_raw']

In [723]:
# weighted citation rate
topics_df['topics_citation_rate_all_weighted'] = topics_df['topics_prev_cited_all_weighted']/topics_df['topics_prev_posts_all']
topics_df['topics_citation_rate_month_weighted'] = topics_df['topics_prev_cited_month_weighted']/topics_df['topics_prev_posts_month']
topics_df['topics_citation_rate_year_weighted'] = topics_df['topics_prev_cited_year_weighted']/topics_df['topics_prev_posts_year']

In [724]:
# raw citation rate
topics_df['topics_citation_rate_all_raw'] = topics_df['topics_prev_cited_all_raw']/topics_df['topics_prev_posts_all']
topics_df['topics_citation_rate_month_raw'] = topics_df['topics_prev_cited_month_raw']/topics_df['topics_prev_posts_month']
topics_df['topics_citation_rate_year_raw'] = topics_df['topics_prev_cited_year_raw']/topics_df['topics_prev_posts_year']

In [725]:
topics_df.fillna(0, inplace=True)

In [726]:
topics_df.head()

Unnamed: 0_level_0,topics,topics_prev_posts_all,topics_prev_posts_month,topics_prev_posts_year,topics_prev_cited_all_raw,topics_prev_cited_month_raw,topics_prev_cited_year_raw,topics_prev_cited_all_weighted,topics_prev_cited_month_weighted,topics_prev_cited_year_weighted,topics_activity_frac_month,topics_activity_frac_year,topics_citation_frac_month,topics_citation_frac_year,topics_citation_rate_all_weighted,topics_citation_rate_month_weighted,topics_citation_rate_year_weighted,topics_citation_rate_all_raw,topics_citation_rate_month_raw,topics_citation_rate_year_raw
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2_atel,"[gamma ray, gamma-ray burst]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3_atel,"[gamma ray, gamma-ray burst]",1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4_atel,"[optical, gamma ray, a comment, gamma-ray burst]",1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_atel,"[optical, gamma-ray burst]",2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6_atel,"[optical, gamma-ray burst]",3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [727]:
topics_df.to_csv("topics_features.csv", index=True)