In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import time
import networkx as nx
from itertools import combinations

Import Inspire meta data

In [2]:
%%time 
path = '/Users/NewUser/hep_records.json'
raw_data = pd.read_json(path, lines=True, orient='columns')

CPU times: user 1min 37s, sys: 2min 46s, total: 4min 24s
Wall time: 7min


Remove unnecessary columns and Basic cleaning

In [3]:
dropcol = ['abstract', 'free_keywords', 'standardized_keywords']
raw_data.drop(dropcol, axis=1, inplace=True)

In [4]:
new_data = raw_data.copy()

# basic features

current_year = 2020
new_data['Year'] = new_data.creation_date.str.slice(start=0, stop=4)
new_data['Refs'] = new_data.references.str.len()
new_data['Cites'] = new_data.citations.str.len()
new_data['Authors'] = new_data['authors'] + new_data['co-authors']
new_data['Num_authors'] = new_data.Authors.str.len()

# basic clearning

new_data.dropna(subset=['creation_date'], inplace=True)
new_data.drop(new_data[new_data.Refs < 5].index, inplace=True)
new_data.drop(new_data[new_data.Authors.str.len() == 0].index, inplace=True)

# clean Year

new_data.drop(new_data[(new_data.creation_date.str.slice(start=0, stop=2) != '20')
                             & (new_data.creation_date.str.slice(start=0, stop=2) != '19')].index, inplace=True)
new_data.drop(new_data[(new_data.creation_date.str.slice(start=0, stop=3) == '20 ')
                             | (new_data.creation_date.str.slice(start=0, stop=3) == '19 ')].index, inplace=True)
new_data.drop(new_data[(new_data.creation_date.str.slice(start=0, stop=3) == '20-')
                             | (new_data.creation_date.str.slice(start=0, stop=3) == '19-')].index, inplace=True)
new_data['Year'] = new_data['Year'].astype(int)
new_data['Cites_per_year'] = new_data.Cites / ((current_year - new_data.Year) + 1)

Initial cut on large author papers (e.g. experimental results, white papers, ...)

In [248]:
#plt.hist(np.log10(new_data.Num_authors))
max_authors = 6
trimmed_data = new_data.loc[new_data.Num_authors <= max_authors]
trimmed_data = trimmed_data.set_index('recid')
trimmed_data['recid'] = trimmed_data.index
trimmed_data.head()

Unnamed: 0_level_0,authors,citations,co-authors,creation_date,references,title,Year,Refs,Cites,Authors,Num_authors,Cites_per_year,recid
recid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
51,"[Noyes, H.Pierre]","[107392, 48129, 72835, 50824, 1436908, 114955,...",[],1963,"[47202, 43619, 47204, 46245, 47400, 45692, 166...",Neutron-Proton Scattering Below 20-MeV,1963,19,62,"[Noyes, H.Pierre]",1,1.068966,51
56,"[Thiebaux, Martial L., Jr.]",[],[],1962-11,"[47713, 1475814, 1414721, 9133, 40878, 48497, ...",PION PRODUCTION IN PION PION COLLISIONS,1962,8,0,"[Thiebaux, Martial L., Jr.]",1,0.0,56
61,"[Berman, S.M.]","[1431040, 60929, 1416706, 98820, 56841, 52240,...","[Drell, S.D.]",1963-08,"[46912, 46531, 1672881, 9384, 40203, 46733, 46...",SPECULATIONS ON THE PRODUCTION OF VECTOR MESONS,1963,11,87,"[Berman, S.M., Drell, S.D.]",2,1.5,61
63,"[Berman, S.M.]","[89093, 80998, 372907, 442413, 102990, 63631, ...","[Tsai, Yung-Su]",1963-10,"[47712, 21794, 47687, 21804, 23023, 43541, 484...",INTERMEDIATE BOSON PAIR PRODUCTION AS A MEANS ...,1963,8,12,"[Berman, S.M., Tsai, Yung-Su]",2,0.206897,63
67,"[Bander, Myron]","[60800, 75010, 143237, 279691, 157073, 245270,...",[],1964-01,"[1476836, 47206, 47344, 51, 9019, 47228]",LOW-ENERGY NEUTRON NEUTRON SCATTERING PARAMETERS,1964,6,32,"[Bander, Myron]",1,0.561404,67


Author cleaning, basic cuts

In [17]:
%%time

dfsplit = pd.DataFrame({'recid':np.repeat(trimmed_data.recid.values, trimmed_data.Authors.str.len()),
                        'Author':np.concatenate(trimmed_data.Authors.values)})
authorgroup = dfsplit.groupby('Author')['recid'].apply(list).reset_index(name='recid')

authorgroupcuts = authorgroup.drop(authorgroup[authorgroup.recid.str.len() <= 2].index)

def clean_name(name):
    try:
        lastname = name[:name.index(',')]
        first = name[name.index(',')+2:]
        firsti = first[0]
        middlei = ''
        #try:
        #    middlei = first[first.index(' ')+1] + '.'
        #except:
        #    pass
        #try:
        #    middlei = first[first.index('.')+1] + '.'
        #except:
        #    pass
        newname = lastname + ',' + ' ' + firsti + '.' + middlei
        return newname
    except:
        return None
    
authorgroupcuts['Author']=authorgroupcuts.Author.apply(clean_name)
authorgroupcuts.dropna(subset=['Author'], inplace=True)
authorgroupcutsrec = pd.DataFrame(authorgroupcuts.groupby('Author')['recid'].sum())
authorgroupcutsrec['Author'] = authorgroupcutsrec.index

CPU times: user 1min 22s, sys: 6.65 s, total: 1min 29s
Wall time: 1min 31s


Create aggregate by recid

In [24]:
authorexpand = pd.DataFrame({'recid':np.concatenate(authorgroupcutsrec.recid.values), 
                     'Author':np.repeat(authorgroupcutsrec.Author.values, authorgroupcutsrec.recid.str.len())})
recidrec = pd.DataFrame(authorexpand.groupby('recid')['Author'].apply(list))

df = trimmed_data.join(recidrec).dropna(subset=['Author'])
dfsplit_full = pd.DataFrame({'recid':np.repeat(df.recid.values, df.Author.str.len()),
                        'Author':np.concatenate(df.Author.values), 
                        'Cites':np.repeat(df.Cites.values, df.Author.str.len()),
                       'Year':np.repeat(df.Year.values, df.Author.str.len()),
                       'Num_authors':np.repeat(df.Num_authors.values, df.Author.str.len()),
                       'Cites_per_year':np.repeat(df.Cites_per_year.values, df.Author.str.len())})

Basic feature engineering

In [247]:
df_author = dfsplit_full.groupby('Author').agg({'Cites':['count', 'min', 'max', 'mean'], 
                                               'Year':['min', 'max'], 
                                               'Num_authors':['min', 'max', 'mean'], 
                                               'Cites_per_year':['max', 'mean']})
df_author['Lifespan'] = df_author.Year['max'] - df_author.Year['min'] + 1
df_author['Productivity'] = df_author.Cites['count'] / df_author.Lifespan

Form collaboration network 


In [152]:
G = nx.Graph()
G.add_nodes_from(list(authorexpand.Author.unique()))
for i in df.Author:
    if len(i) >= 2:
        G.add_edges_from(list(combinations(i,2)))

In [249]:
df_collab = pd.DataFrame(index=df_author.index)
df_collab['collab1'] = pd.Series()
df_collab['collab2'] = pd.Series()
df_collab['collab3'] = pd.Series()
df_collab['collab4'] = pd.Series()

Fill in df_collab values using network

In [199]:
%%time 

for i in df_collab.index:
    neighbors = list(G.neighbors(i))
    tempmean = list()
    tempmax = list()
    for j in neighbors:
        tempmean.append(df_author.loc[j].Cites_per_year['mean'])
        tempmax.append(df_author.loc[j].Cites_per_year['max'])
    if len(neighbors) >= 1:
        df_collab.at[i, 'collab1'] = np.mean(tempmean)
        df_collab.at[i, 'collab2'] = max(tempmean)
        df_collab.at[i, 'collab3'] = np.mean(tempmax)
        df_collab.at[i, 'collab4'] = max(tempmax)

CPU times: user 27min 1s, sys: 9.16 s, total: 27min 10s
Wall time: 35min 46s


Join, rename, and export df_full dataframe

In [244]:
df_collab.rename(columns={'collab1':'collab_cpy_mean_mean','collab2':'collab_cpy_mean_max',
                          'collab3':'collab_cpy_max_mean','collab4':'collab_cpy_max_max'}, inplace=True)
df_full = df_author.join(df_collab, on=df_author.index)
df_full.rename(columns=
              {('Cites', 'count'):'Total_papers',
              ('Cites', 'min'):'Cites_min',
               ('Cites', 'max'):'Cites_max',
               ('Cites','mean'):'Cites_mean',
               ('Year', 'min'): 'Year_first',
               ('Year', 'max'): 'Year_last',
               ('Num_authors', 'min'): 'Num_authors_min',
               ('Num_authors', 'max'): 'Num_authors_max',
               ('Num_authors', 'mean'): 'Num_authors_mean',
               ('Cites_per_year', 'max'): 'cpy_max',
               ('Cites_per_year', 'mean'): 'cpy_mean',
               ('Lifespan', '') :'Lifespan',
               ('Productivity', ''): 'Productivity'
              }, inplace=True)
df_full.fillna(0, inplace=True)
df_full['Author'] = df_full.index
df_full.to_csv('df_full.csv', index=False)