In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from gensim.models import KeyedVectors

In [2]:
# Load the labelled training rows (authorID plus the h_index target).
df_train = pd.read_csv(
    './data/train.csv',
    dtype={'authorID': np.int64, 'h_index': np.float32},
)
n_train = len(df_train)

# Load the rows whose h-index has to be predicted.
df_test = pd.read_csv('./data/test.csv', dtype={'authorID': np.int64})
n_test = len(df_test)

In [3]:
# Collaboration graph: space-delimited edge list, node ids parsed as int.
G = nx.read_edgelist(
    'collaboration_network.edgelist',
    delimiter=' ',
    nodetype=int,
)

In [4]:
# Weighted collaboration graph: every edge carries a float "weight" attribute.
WG = nx.read_edgelist(
    "weighted_collaboration_network.edgelist",
    nodetype=int,
    data=(("weight", float),),
)

In [5]:
# Similarity graph, stored as a multiline adjacency list with int node ids.
SG = nx.read_multiline_adjlist(
    "sim_collaboration_network.adjlist",
    nodetype=int,
)

In [6]:
# Map each node of G to its position in G's node iteration order.
nodes = {node_id: position for position, node_id in enumerate(G.nodes())}

In [7]:
# compute graph features for each node (every result is a dict keyed by node id)

# -- unweighted collaboration graph G
avg_neighbor_degree_g = nx.average_neighbor_degree(G)
core_number_g = nx.core_number(G)
page_rank_g = nx.pagerank(G)

# -- weighted collaboration graph WG
# NOTE(review): no `weight=` argument is passed to average_neighbor_degree here,
# so edge weights are presumably ignored for this feature -- confirm intended.
avg_neighbor_degree_wg = nx.average_neighbor_degree(WG)
page_rank_wg = nx.pagerank(WG)

# -- similarity graph SG
avg_neighbor_degree_sg = nx.average_neighbor_degree(SG)
page_rank_sg = nx.pagerank(SG)
eigenvector_centrality_sg = nx.eigenvector_centrality(SG)

In [8]:
# load precomputed features for each node
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at files produced by this project.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


n_papers = _load_pickle("./data/n_papers.pkl")
average_coauthors_n_papers = _load_pickle("./data/average_coauthors_n_papers.pkl")

betweenness_centrality_g = _load_pickle("./data/betweenness_centrality_g.pkl")
betweenness_centrality_wg = _load_pickle("./data/betweenness_centrality_wg.pkl")

clustering_g = _load_pickle("./data/clustering_g.pkl")
clustering_wg = _load_pickle("./data/clustering_wg.pkl")
clustering_sg = _load_pickle("./data/clustering_sg.pkl")

In [9]:
# Node2Vec vectors looked up by str(author id) when building the feature matrices.
node2vec_wg = KeyedVectors.load_word2vec_format(
    './data/node2vec_wg.nodevectors',
)

In [10]:
# Node2Vec vectors looked up by str(paper id) when building the feature matrices.
node2vec_pg = KeyedVectors.load_word2vec_format(
    './data/node2vec_pg.nodevectors',
)

In [11]:
# Node2Vec vectors looked up by str(paper id); presumably the weighted
# counterpart of node2vec_pg -- TODO confirm.
node2vec_wpg = KeyedVectors.load_word2vec_format(
    './data/node2vec_wpg.nodevectors',
)

In [12]:
# Node2Vec vectors looked up by str(author id); not every author is required
# to have an entry (the feature loops check membership first).
node2vec_sg = KeyedVectors.load_word2vec_format(
    './data/node2vec_sg.nodevectors',
)

In [13]:
# read embeddings of abstracts: header-less CSV whose column 0 is renamed to "authorID"
text_embeddings = (
    pd.read_csv("author_embedding_64.csv", header=None)
    .rename(columns={0: "authorID"})
)

In [14]:
# Width of each embedding block appended to the feature matrices.
n_temb = len(text_embeddings.columns) - 1  # every column except "authorID"
n_wg, n_wpg, n_pg, n_sg = (
    vectors.vector_size
    for vectors in (node2vec_wg, node2vec_wpg, node2vec_pg, node2vec_sg)
)

In [15]:
# read the file to create a dictionary with author key and paper list as value
# Each line is parsed as "<authorID>:[<paperID>, <paperID>, ...]"; brackets,
# quotes and newlines are stripped before the ids are converted to int.
papers_set = set()  # not used by the visible cells; kept so the name stays defined
author_paper = {}
_PAPER_LIST_NOISE = str.maketrans("", "", "[]\n'\"")
# `with` guarantees the handle is closed even if a line fails to parse.
with open("./data/author_papers.txt", "r") as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        author_id, raw_papers = line.split(":", 1)
        paper_tokens = (
            token.strip()
            for token in raw_papers.translate(_PAPER_LIST_NOISE).split(",")
        )
        # Skip empty tokens so an author with an empty list ("[]") maps to []
        # instead of failing on int('').
        author_paper[int(author_id)] = [int(token) for token in paper_tokens if token]

In [19]:
# create the training matrix. each node is represented as a vector of features:
# (1-2-3) its degree, (4-5-6) the average degree of its neighbors,
# (7) its core number, (8-9-10) its page rank, (11-12) its betweenness centrality,
# (13-14) its clustering coefficient, (15) its eigenvector centrality,
# (16) the number of written papers (cited), (17) the average number of written papers of its neighbors/coauthors,
# (18-19-20-21) embeddings from Node2Vec, (22) text_embeddings from Doc2Vec

# First column of each embedding block that follows the 17 scalar features.
pg_col = 17
wpg_col = pg_col + n_pg
sg_col = wpg_col + n_wpg
wg_col = sg_col + n_sg
temb_col = wg_col + n_wg

# Index the text embeddings by author once; filtering the whole frame with a
# boolean mask for every row made this loop quadratic in the number of authors.
text_emb_by_author = text_embeddings.set_index("authorID")

X_train = np.zeros((n_train, temb_col + n_temb))
y_train = np.zeros(n_train)
# Iterate over the columns directly: iterrows() would upcast authorID to float
# and yield index labels rather than row positions.
for i, (author_id, h_index) in enumerate(zip(df_train['authorID'], df_train['h_index'])):
    node = int(author_id)
    node_key = str(node)

    # NOTE(review): WG.degree(node) is called without `weight=`, so it is
    # presumably the plain neighbour count -- confirm intended.
    X_train[i, 0] = G.degree(node)
    X_train[i, 1] = WG.degree(node)
    X_train[i, 2] = SG.degree(node)
    X_train[i, 3] = avg_neighbor_degree_g[node]
    X_train[i, 4] = avg_neighbor_degree_wg[node]
    X_train[i, 5] = avg_neighbor_degree_sg[node]
    X_train[i, 6] = core_number_g[node]
    X_train[i, 7] = page_rank_g[node]
    X_train[i, 8] = page_rank_wg[node]
    X_train[i, 9] = page_rank_sg[node]
    X_train[i, 10] = betweenness_centrality_g[node]
    X_train[i, 11] = betweenness_centrality_wg[node]
    X_train[i, 12] = clustering_wg[node]
    X_train[i, 13] = clustering_sg[node]
    X_train[i, 14] = eigenvector_centrality_sg[node]
    X_train[i, 15] = n_papers[node]
    X_train[i, 16] = average_coauthors_n_papers[node]

    # Sum the paper-level embeddings over the author's papers; papers missing
    # from node2vec_pg are skipped, and authors without papers keep zeros.
    for p in author_paper.get(node, ()):
        paper_key = str(p)
        if paper_key in node2vec_pg:
            X_train[i, pg_col:wpg_col] += node2vec_pg[paper_key]
            X_train[i, wpg_col:sg_col] += node2vec_wpg[paper_key]

    # Authors without a similarity-graph embedding keep a zero vector.
    if node_key in node2vec_sg:
        X_train[i, sg_col:wg_col] = node2vec_sg[node_key]
    X_train[i, wg_col:temb_col] = node2vec_wg[node_key]
    X_train[i, temb_col:] = text_emb_by_author.loc[node].to_numpy()
    y_train[i] = h_index

In [36]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error

def custom_loss(y, y_pred, **kwargs):
    """Return the MAE between ``y`` and ``y_pred`` rounded to the nearest integer.

    Extra keyword arguments are accepted but not used.
    """
    rounded_pred = np.round(y_pred)
    return mean_absolute_error(y, rounded_pred)


# Scorer wrapper around custom_loss; it is declared as a loss
# (greater_is_better=False), and the CV cell negates the returned scores.
sk_loss = make_scorer(custom_loss, greater_is_better=False)

In [42]:
# Gradient-boosted regressor (DART boosting) trained with an MAE objective.
reg = LGBMRegressor(
    objective='mae',
    boosting_type='dart',
    n_estimators=10000,
    learning_rate=0.06,
    n_jobs=1,
)

In [43]:
# cross-validation (5 folds, run in parallel) scored on the raw predictions
scores = cross_val_score(
    reg,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=5,
)
print(np.mean(-scores))

3.1968950796566813


In [44]:
# cross-validation (5 folds, run in parallel) scored with the sk_loss scorer
scores = cross_val_score(
    reg,
    X_train,
    y_train,
    cv=5,
    scoring=sk_loss,
    n_jobs=5,
)
print(np.mean(-scores))

3.179254839614701


In [18]:
# create the testing matrix. each node is represented as a vector of features:
# (1-2-3) its degree, (4-5-6) the average degree of its neighbors,
# (7) its core number, (8-9-10) its page rank, (11-12) its betweenness centrality,
# (13-14) its clustering coefficient, (15) its eigenvector centrality,
# (16) the number of written papers (cited), (17) the average number of written papers of its neighbors/coauthors,
# (18-19-20-21) embeddings from Node2Vec, (22) text_embeddings from Doc2Vec

# First column of each embedding block that follows the 17 scalar features.
pg_col = 17
wpg_col = pg_col + n_pg
sg_col = wpg_col + n_wpg
wg_col = sg_col + n_sg
temb_col = wg_col + n_wg

# Index the text embeddings by author once; filtering the whole frame with a
# boolean mask for every row made this loop quadratic in the number of authors.
text_emb_by_author = text_embeddings.set_index("authorID")

X_test = np.zeros((n_test, temb_col + n_temb))
# Iterate over the column directly: iterrows() can upcast authorID to float
# and yields index labels rather than row positions.
for i, author_id in enumerate(df_test['authorID']):
    node = int(author_id)
    node_key = str(node)

    # NOTE(review): WG.degree(node) is called without `weight=`, so it is
    # presumably the plain neighbour count -- confirm intended.
    X_test[i, 0] = G.degree(node)
    X_test[i, 1] = WG.degree(node)
    X_test[i, 2] = SG.degree(node)
    X_test[i, 3] = avg_neighbor_degree_g[node]
    X_test[i, 4] = avg_neighbor_degree_wg[node]
    X_test[i, 5] = avg_neighbor_degree_sg[node]
    X_test[i, 6] = core_number_g[node]
    X_test[i, 7] = page_rank_g[node]
    X_test[i, 8] = page_rank_wg[node]
    X_test[i, 9] = page_rank_sg[node]
    X_test[i, 10] = betweenness_centrality_g[node]
    X_test[i, 11] = betweenness_centrality_wg[node]
    X_test[i, 12] = clustering_wg[node]
    X_test[i, 13] = clustering_sg[node]
    X_test[i, 14] = eigenvector_centrality_sg[node]
    X_test[i, 15] = n_papers[node]
    X_test[i, 16] = average_coauthors_n_papers[node]

    # Sum the paper-level embeddings over the author's papers; papers missing
    # from node2vec_pg are skipped, and authors without papers keep zeros.
    for p in author_paper.get(node, ()):
        paper_key = str(p)
        if paper_key in node2vec_pg:
            X_test[i, pg_col:wpg_col] += node2vec_pg[paper_key]
            X_test[i, wpg_col:sg_col] += node2vec_wpg[paper_key]

    # Authors without a similarity-graph embedding keep a zero vector.
    if node_key in node2vec_sg:
        X_test[i, sg_col:wg_col] = node2vec_sg[node_key]
    X_test[i, wg_col:temb_col] = node2vec_wg[node_key]
    X_test[i, temb_col:] = text_emb_by_author.loc[node].to_numpy()

In [45]:
# Fit on the full training matrix, then predict the test rows.
y_pred = reg.fit(X_train, y_train).predict(X_test)

In [46]:
# Round the predictions to the nearest integer value.
y_pred = y_pred.round()

In [47]:
# post-processing: make sure that the predicted h-index is less than the number of papers (<10)
paper_counts = X_test[:, 15]  # column 15 was filled from n_papers
# Cap the prediction at the paper count, but only for authors with fewer than 10 papers.
over_cap = (paper_counts < 10) & (y_pred > paper_counts)
y_pred[over_cap] = paper_counts[over_cap]
# NOTE(review): negative predictions are set to 1 (not 0) -- confirm intended.
y_pred[y_pred < 0] = 1

In [48]:
# write the predictions to file
# Assign the column directly. The previous `df_test['h_index_pred'].update(...)`
# was an in-place update on a column selection, which does not reach the
# DataFrame under pandas Copy-on-Write, and `np.round_` no longer exists in
# NumPy 2.0 (`np.round` is the supported spelling).
df_test['h_index_pred'] = np.round(y_pred, decimals=3)
df_test.loc[:, ["authorID", "h_index_pred"]].to_csv('test_predictions_32.csv', index=False)