In [167]:
import pandas as pd
import numpy as np
import ast

In [168]:
# Load the citation labels and the assembled telegram table (the first CSV
# column of each file becomes the index), then attach the labels to the
# telegrams with an index-aligned left join.
labels = pd.read_csv("../data/labels.csv", index_col=0)
df = pd.read_csv('../data/assembled.csv', index_col=0).join(labels)

### Create some other features that may help indicate how likely a telegram is to be cited in the future:
- What fraction of all telegrams posted BEFORE the given telegram appeared during the preceding year / month? And what was the citation rate during those same periods? For instance, if a period's activity fraction or citation rate is relatively large, some well-known event was probably ongoing (a very clear sky that month, or a meteor shower lasting the whole year). Telegrams posted during such events might get more citations in the future.

- How many references does this telegram contain? If the author refers to many other posts, the work may be more substantial and may itself be referenced more frequently.

- Content length. What if short telegrams are less informative and are less likely to contain important or well-described observations? 

In [169]:
# Parse the date strings and make the timestamp the index: the time-based
# rolling windows used further down ('365D' / '30D') need a DatetimeIndex.
# reset_index() runs first so the previous index is kept as a regular column
# instead of being discarded by set_index().
# The stable sort enforces chronological order, which the rolling windows
# (monotonic index required) and the positional "telegrams so far" counter
# both rely on; for a CSV that is already ordered by date it changes nothing.
df = (
    df.assign(date=pd.to_datetime(df['date']))
      .reset_index()
      .set_index('date')
      .sort_index(kind='mergesort')
)

In [170]:
# Activity features: how many telegrams precede the current one over all
# time, within the trailing 365 days and within the trailing 30 days.

# Count on a constant numeric series instead of on the whole frame:
# df.rolling(...).count() aggregates every column (text columns included)
# only to read a single result column, and rolling aggregations over
# object-dtype columns are rejected by recent pandas releases.
# The "- 1" removes the current telegram, which its own window contains.
row_marker = pd.Series(1.0, index=df.index)
telegrams_cnt_year = row_marker.rolling('365D').count() - 1
telegrams_cnt_month = row_marker.rolling('30D').count() - 1
# Positional counter 0..n-1, i.e. the number of rows above each row.
telegrams_cnt_all = np.arange(len(df))

# Share of all earlier telegrams that fall into the trailing window.
# On the first row the denominator is 0, so the result is not a finite number.
df['activity_frac_year'] = telegrams_cnt_year/telegrams_cnt_all
df['activity_frac_month'] = telegrams_cnt_month/telegrams_cnt_all

In [171]:
# Citation features over the same periods as the activity counters.
# Every total excludes the current telegram's own n_cited: n_cited is dropped
# from the exported feature table later on, so it must not leak back in
# through these aggregates, and the matching telegram counters exclude the
# current row as well.
n_cited = df['n_cited']
telegrams_citations_year = n_cited.rolling('365D').sum() - n_cited
telegrams_citations_month = n_cited.rolling('30D').sum() - n_cited
# Running total of citations of all rows above the current one. A plain
# cumsum() would include the current row, unlike the two windowed sums above.
telegrams_citations_all = n_cited.cumsum() - n_cited

# citation rate == average citations per telegram in the window
# (a moving average that leaves out the current row)
df['citation_rate_year'] = telegrams_citations_year/telegrams_cnt_year
df['citation_rate_month'] = telegrams_citations_month/telegrams_cnt_month

# share of all earlier citations that belongs to the trailing year / month
df['citation_frac_year'] = telegrams_citations_year/telegrams_citations_all
df['citation_frac_month'] = telegrams_citations_month/telegrams_citations_all

In [172]:
# Average citations per telegram up to each row: the running citation total
# divided element-wise by the positional telegram counter.
telegrams_avg_citations_all = telegrams_citations_all.div(telegrams_cnt_all)

In [173]:
# Windowed citation rate relative to the running overall rate
# (values above 1 mean the window is cited more than the running average).
df['citation_rate_frac_year'] = df['citation_rate_year'].div(telegrams_avg_citations_all)
df['citation_rate_frac_month'] = df['citation_rate_month'].div(telegrams_avg_citations_all)

# Keep the running overall rate itself as a feature, too.
df['citation_rate_all'] = telegrams_avg_citations_all

In [174]:
# Number of references each telegram contains: `refs` holds the textual form
# of a Python literal, so parse it first and then take its length.
df['refs_count'] = df['refs'].map(ast.literal_eval).map(len)

In [175]:
# Length of the telegram body, as returned by len() for each entry.
df['telegram_len'] = df['body'].map(len)

In [176]:
# Calendar month (1-12) taken from the datetime index; the analysis of the
# labels suggested it might be an indicator as well.
df['month'] = df.index.month.values

In [177]:
# Feature table keyed by telegram id: remove the raw text/metadata columns and
# the label columns (n_cited, citation_class), keeping only engineered features.
df_f = (
    df.drop(['body', 'subject', 'from', 'refs', 'n_cited', 'citation_class'], axis=1)
      .set_index("telegram_index")
)

In [178]:
# The ratio features divide by counters that are 0 for the earliest rows,
# which produces NaN (0/0) or +/-inf (x/0). fillna() alone only catches NaN,
# so map the infinities to NaN first; every undefined ratio then becomes 0
# and no non-finite value reaches the exported CSV.
df_f = df_f.replace([np.inf, -np.inf], np.nan).fillna(0)
df_f.tail(10)

Unnamed: 0_level_0,activity_frac_year,activity_frac_month,citation_rate_year,citation_rate_month,citation_frac_year,citation_frac_month,citation_rate_frac_year,citation_rate_frac_month,citation_rate_all,refs_count,telegram_len,month
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
16028_atel,0.047049,0.002341,1.978864,1.60177,0.032042,0.001291,0.681044,0.551264,2.905633,0,1110,5
16029_atel,0.047069,0.002362,1.977993,1.587719,0.032042,0.001291,0.680758,0.546439,2.905573,0,2431,5
16030_atel,0.046881,0.002155,1.982766,1.634615,0.031992,0.001212,0.682415,0.562591,2.905513,21,1659,5
16031_atel,0.046714,0.001968,1.98714,1.747368,0.03195,0.001184,0.683935,0.60141,2.905452,0,2303,5
16032_atel,0.046734,0.001989,1.986259,1.729167,0.03195,0.001184,0.683646,0.595158,2.905392,0,1020,5
16033_atel,0.046568,0.001699,1.992883,1.804878,0.031943,0.001055,0.68594,0.62123,2.905332,2,1808,5
16034_atel,0.046214,0.001554,1.978485,1.933333,0.031472,0.001034,0.680998,0.665457,2.905272,0,582,5
16035_atel,0.046234,0.001574,1.977599,1.907895,0.031472,0.001034,0.680707,0.656715,2.905212,0,1605,5
16036_atel,0.046254,0.001595,1.976713,1.883117,0.031472,0.001034,0.680416,0.648199,2.905152,8,2244,5
16037_atel,0.045942,0.001264,1.970243,1.032787,0.031158,0.000449,0.678204,0.355509,2.905091,0,1948,5


In [179]:
# Persist the feature table, writing the index as the first CSV column.
df_f.to_csv(
    path_or_buf="other_features.csv",
    index=True,
)