In [216]:
import pandas as pd
import ast
import numpy as np

In [217]:
# Read the assembled telegrams and their labels; the first CSV column is the key of both.
df = pd.read_csv("../data/assembled.csv", index_col=0)
labels = pd.read_csv("../data/labels.csv", index_col=0)

# Attach the labels, drop the columns that are not used below and turn the
# key back into an ordinary column.
df = (
    df
    .join(labels)
    .drop(columns=['refs', 'citation_class'])
    .reset_index()
)

# Index the rows by parsed date (the time-based rolling windows below need it).
df['date'] = pd.to_datetime(df['date'])
df = df.set_index("date")

In [218]:
# Preview the first rows of the date-indexed frame.
df.head()

Unnamed: 0_level_0,telegram_index,body,subject,from,n_cited
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1997-12-28,3_atel,In addendum of ATEL #2: Additional Information...,Improved Coordinates for GB971227,rutledge@rosat.mpe-garching.mpg.de,0
1997-12-28,2_atel,The following message was emailed to me this e...,GB971227,rutledge@rosat.mpe-garching.mpg.de,0
1998-01-06,4_atel,The recent detection of delayed Gamma ray burs...,The Probable Connection Between Relativistic S...,rutledge@rosat.mpe-garching.mpg.de,0
1998-01-12,5_atel,The optical transient (IAUC # 6788 ) of GRB 97...,GRB 971214,rutledge@rosat.mpe-garching.mpg.de,0
1998-01-18,6_atel,GRB980109 field was observed by the OGLE colla...,GRB980109,rutledge@rosat.mpe-garching.mpg.de,0


In [219]:
# Number of rows in the joined frame.
len(df)

48279

### author-based feature ideas:
 1. num of publications: all time, last 5 years, last year, last month
 2. citation rate: all time, last 5 years, last year, last month
 3. What are the author's most popular topics?
 4. How much more cited is this author compared to the others?
 
 It will potentially help to include the author's popularity (all-time and recent) and the author's publication history.
 
 These features must be extracted in a time-series manner

In [222]:
columns = ['telegram_index', 'author_prev_posts_all', 'author_prev_posts_month', 'author_prev_posts_year',
           'author_prev_cited_all', 'author_prev_cited_month', 'author_prev_cited_year']
dfs = []


def _prior_window_total(values, window):
    """Sum `values` over the trailing time `window` ending at each row, excluding that row.

    Parameters
    ----------
    values : pandas.Series
        Numeric series on a monotonic datetime index, without missing values.
    window : str
        pandas time offset such as '30D'.

    Returns
    -------
    numpy.ndarray
        int32 array aligned with `values`.
    """
    return (values.rolling(window).sum() - values).astype(np.int32).values


for author, data in df.groupby("from"):

    print(author)

    # Time-based rolling windows require a monotonic date index. A stable sort
    # keeps the existing order of posts that share the same date.
    data = data.sort_index(kind='mergesort')

    # One marker per post: counting these (instead of non-null `body` values,
    # which is what rolling(...).count() on `body` does) keeps posts with a
    # missing body from being skipped.
    posts = pd.Series(1, index=data.index)
    cited = data['n_cited']

    author_features_df = pd.DataFrame({
        'telegram_index': data['telegram_index'].values,
        # all author's previous posts: the position of the post in the author's history
        'author_prev_posts_all': np.arange(len(data), dtype=np.int64),
        # previous posts within the last month / year
        'author_prev_posts_month': _prior_window_total(posts, '30D'),
        'author_prev_posts_year': _prior_window_total(posts, '365D'),
        # citations collected by the author's previous posts, all time
        'author_prev_cited_all': (cited.cumsum() - cited).astype(np.int32).values,
        # ... and within the last month / year
        'author_prev_cited_month': _prior_window_total(cited, '30D'),
        'author_prev_cited_year': _prior_window_total(cited, '365D'),
    }, columns=columns)

    dfs.append(author_features_df)

(khurley@sunspot.ssl.berkeley.edu)
20934203@student.uwa.edu.au
245487@mail.muni.cz
30.v@mail.ru
500025@mail.muni.cz
A.J.Levan@warwick.ac.uk
A.J.vanderHorst@uva.nl
A.P.Kamble@uva.nl
A.S.Parikh@uva.nl
A.Scaife@soton.ac.uk
A.Shaw@soton.ac.uk
Agottlieb7@gmail.com
Aitor.Ibarra@sciops.esa.int
Alain.Klotz@free.fr
Alain.Maury@obs-azur.fr
Alexander.J.VanDerHorst@nasa.gov
Ann.M.Parsons@nasa.gov
Anthony.Rushton@Manchester.ac.uk
AprajitaHajela2015@u.northwestern.edu
Ascension.Camero@uv.es
BCLee@LBL.gov
Boris.Gaensicke@warwick.ac.uk
Brammer@stsci.edu
Bruce_Grossan@lbl.gov
Bwgref@srl.caltech.edu
C.Inserra@soton.ac.uk
Carlo.Ferrigno@unige.ch
Celia.Sanchez@sciops.esa.int
Colleen.Wilson@nasa.gov
Craig.Markwardt@nasa.gov
D.M.Russell@uva.nl
D.T.H.Steeghs@warwick.ac.uk
DANIELE.FARGION@ROMA1.INFN.IT
Daniele.Fargion@roma1.infn.it
David.J.Thompson@nasa.gov
David.L.Band@nasa.gov
David.M.Palmer.1@gsfc.nasa.gov
Delphine.Porquet@astro.u-strasbg.fr
Denis.bernard@in2p3.fr
Dieter.Horns@mpi-hd.mpg.de
E.Breedt@warwic

In [223]:
# Stack the per-author frames into one table and key it by telegram id.
author_features = pd.concat(dfs)
author_features = author_features.set_index("telegram_index")
author_features.head()

Unnamed: 0_level_0,author_prev_posts_all,author_prev_posts_month,author_prev_posts_year,author_prev_cited_all,author_prev_cited_month,author_prev_cited_year
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
53_gcn,0,0,0,0,0,0
24549_gcn,0,0,0,0,0,0
32622_gcn,0,0,0,0,0,0
32624_gcn,1,1,1,0,0,0
32629_gcn,2,2,2,0,0,0


- What is the fraction of the author's activity in the last month/year compared to total activity? 

- What is the author's citation rate during these periods? (n_cited/n_posts)

- What is author's citation fraction of the last month/year compared to total? (n_cited_period/n_cited_all)

In [224]:
# Share of the author's earlier posts that fall inside the trailing month / year.
# An author's first post has no earlier posts, so 0/0 gives NaN here.
author_features['author_activity_frac_month'] = (
    author_features['author_prev_posts_month'].div(author_features['author_prev_posts_all'])
)
author_features['author_activity_frac_year'] = (
    author_features['author_prev_posts_year'].div(author_features['author_prev_posts_all'])
)

In [225]:
# Citations per earlier post, over all time and over the trailing month / year.
# Periods without earlier posts divide by zero.
author_features['author_citation_rate_all'] = (
    author_features['author_prev_cited_all'].div(author_features['author_prev_posts_all'])
)
author_features['author_citation_rate_month'] = (
    author_features['author_prev_cited_month'].div(author_features['author_prev_posts_month'])
)
author_features['author_citation_rate_year'] = (
    author_features['author_prev_cited_year'].div(author_features['author_prev_posts_year'])
)

In [226]:
# Share of the author's earlier citations that come from the trailing month / year.
author_features['author_citation_frac_month'] = (
    author_features['author_prev_cited_month'].div(author_features['author_prev_cited_all'])
)
author_features['author_citation_frac_year'] = (
    author_features['author_prev_cited_year'].div(author_features['author_prev_cited_all'])
)

In [227]:
# Ratios with a zero numerator and denominator came out as NaN; record them as 0.
author_features = author_features.fillna(0)

In [228]:
# Inspect the last rows, now including the derived ratio columns.
author_features.tail()

Unnamed: 0_level_0,author_prev_posts_all,author_prev_posts_month,author_prev_posts_year,author_prev_cited_all,author_prev_cited_month,author_prev_cited_year,author_activity_frac_month,author_activity_frac_year,author_citation_rate_all,author_citation_rate_month,author_citation_rate_year,author_citation_frac_month,author_citation_frac_year
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
13890_atel,2,1,2,4,1,4,0.5,1.0,2.0,1.0,2.0,0.25,1.0
14207_atel,3,0,3,4,0,4,0.0,1.0,1.333333,0.0,1.333333,0.0,1.0
14323_atel,4,0,4,4,0,4,0.0,1.0,1.0,0.0,1.0,0.0,1.0
14943_atel,5,0,2,4,0,0,0.0,0.4,0.8,0.0,0.0,0.0,0.0
14967_atel,6,1,3,6,2,2,0.166667,0.5,1.0,2.0,0.666667,0.333333,0.333333


Find each author's most popular topics.

In [229]:
# ATel topics are read as one comma-separated string per telegram; split it into a list.
atel_df = pd.read_csv("../topics/atel_with_topics.csv", index_col=0)
atel_df['topics'] = atel_df['topics'].map(lambda raw: raw.split(','))

# GCN topics are read as the text of a Python literal (presumably a list); parse it.
gcn_df = pd.read_csv("../topics/gcn_with_topics.csv", index_col=0)
gcn_df['topics'] = gcn_df['topics'].map(ast.literal_eval)

# Re-key the telegrams by telegram_index and attach the columns of both topic tables.
df = (
    df
    .reset_index()
    .set_index("telegram_index")
    .join(pd.concat((atel_df, gcn_df)))
)

In [230]:
def _clean_topics(raw_topics):
    """Return one telegram's topics as a list of stripped, non-empty strings.

    A missing value (None or the float NaN that the join leaves for a telegram
    absent from the topic tables) yields an empty list instead of raising.
    Entries that are blank after stripping, e.g. from a trailing comma, are dropped.
    """
    if raw_topics is None or isinstance(raw_topics, float):
        return []
    stripped = (topic.strip() for topic in raw_topics)
    return [topic for topic in stripped if topic]


df['topics'] = df['topics'].apply(_clean_topics)
# Keep only the columns needed for the per-author topic history.
df = df[['date', 'from', 'topics']]

In [231]:
# Move telegram_index back into a regular column and key the rows by date again.
df = df.reset_index()
df = df.set_index("date")
df.head()

Unnamed: 0_level_0,telegram_index,from,topics
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-12-28,3_atel,rutledge@rosat.mpe-garching.mpg.de,"[gamma ray, gamma-ray burst]"
1997-12-28,2_atel,rutledge@rosat.mpe-garching.mpg.de,"[gamma ray, gamma-ray burst]"
1998-01-06,4_atel,rutledge@rosat.mpe-garching.mpg.de,"[optical, gamma ray, a comment, gamma-ray burst]"
1998-01-12,5_atel,rutledge@rosat.mpe-garching.mpg.de,"[optical, gamma-ray burst]"
1998-01-18,6_atel,rutledge@rosat.mpe-garching.mpg.de,"[optical, gamma-ray burst]"


In [238]:
columns = ['author', 'telegram_index', 'author_most_frequent_topics']
author_topic_dfs = []


def _running_top_topics(topic_lists, top_n=3):
    """Return, for each post in order, the `top_n` most frequent topics seen so far.

    The topics of the current post are counted before ranking, so the result
    for a post includes that post itself. Counts are updated incrementally
    instead of recounting the whole history for every post. Topics are ranked
    by descending count; equal counts are ordered alphabetically, which makes
    the tie order deterministic.

    Parameters
    ----------
    topic_lists : iterable of list of str
        Topics of each post, in the order the posts should be processed.
    top_n : int, default 3
        Number of topics to keep per post.

    Returns
    -------
    list of numpy.ndarray
        One array of at most `top_n` topic names per post.
    """
    counts = {}
    history = []
    for post_topics in topic_lists:
        # append current post's topics to all of the author's topics
        for topic in post_topics:
            counts[topic] = counts.get(topic, 0) + 1
        # sort the topics by frequency, then by name for ties
        ranked = sorted(counts, key=lambda topic: (-counts[topic], topic))
        history.append(np.array(ranked[:top_n]))
    return history


for author, data in df.groupby("from"):
    # One row per post of this author, in the order the posts appear in `df`.
    this_topics_df = data[['telegram_index']].assign(
        author=author,
        author_most_frequent_topics=_running_top_topics(data['topics']),
    )[columns]
    author_topic_dfs.append(this_topics_df)

In [239]:
# Stack the per-author topic frames, key them by telegram id and attach the
# numeric author features computed earlier.
author_df = pd.concat(author_topic_dfs)
author_df = author_df.set_index("telegram_index").join(author_features)
author_df.tail(10)

Unnamed: 0_level_0,author,author_most_frequent_topics,author_prev_posts_all,author_prev_posts_month,author_prev_posts_year,author_prev_cited_all,author_prev_cited_month,author_prev_cited_year,author_activity_frac_month,author_activity_frac_year,author_citation_rate_all,author_citation_rate_month,author_citation_rate_year,author_citation_frac_month,author_citation_frac_year
telegram_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
14610_atel,zwk@umich.edu,"[optical, supernovae, transient]",68,0,0,73,0,0,0.0,0.0,1.073529,0.0,0.0,0.0,0.0
7649_atel,zyan@shao.ac.cn,"[binary, black hole, x-ray]",0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10137_atel,zyan@shao.ac.cn,"[black hole, x-ray, binary]",1,0,0,40,0,0,0.0,0.0,40.0,0.0,0.0,0.0,0.0
13629_atel,zylin@astro.ncu.edu.tw,"[comet, optical]",0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13886_atel,zylin@astro.ncu.edu.tw,"[comet, optical, solar system object]",1,0,1,3,0,3,0.0,1.0,3.0,0.0,3.0,0.0,1.0
13890_atel,zylin@astro.ncu.edu.tw,"[comet, optical, solar system object]",2,1,2,4,1,4,0.5,1.0,2.0,1.0,2.0,0.25,1.0
14207_atel,zylin@astro.ncu.edu.tw,"[comet, optical, solar system object]",3,0,3,4,0,4,0.0,1.0,1.333333,0.0,1.333333,0.0,1.0
14323_atel,zylin@astro.ncu.edu.tw,"[comet, optical, solar system object]",4,0,4,4,0,4,0.0,1.0,1.0,0.0,1.0,0.0,1.0
14943_atel,zylin@astro.ncu.edu.tw,"[comet, optical, solar system object]",5,0,2,4,0,0,0.0,0.4,0.8,0.0,0.0,0.0,0.0
14967_atel,zylin@astro.ncu.edu.tw,"[comet, optical, solar system object]",6,1,3,6,2,2,0.166667,0.5,1.0,2.0,0.666667,0.333333,0.333333


In [240]:
# Write the combined author features to disk; the telegram_index index is kept as a column.
# NOTE(review): author_most_frequent_topics holds numpy arrays, which are written
# in their string form — confirm that downstream readers can parse it.
author_df.to_csv("author_features.csv", index=True)