In [10]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

In [2]:
# Training set: one row per author with the target h_index.
# authorID is forced to int64 and h_index to float32 at parse time.
df_train = pd.read_csv(
    '../data/train.csv',
    dtype=dict(authorID=np.int64, h_index=np.float32),
)
n_train = len(df_train)

# Test set: the authors whose h_index has to be predicted.
df_test = pd.read_csv(
    '../data/test.csv',
    dtype=dict(authorID=np.int64),
)
n_test = len(df_test)

In [4]:
# Collaboration graph: space-delimited edge list, node labels parsed as int.
G = nx.read_edgelist(
    '../data/collaboration_network.edgelist',
    nodetype=int,
    delimiter=' ',
)

In [5]:
# Weighted collaboration graph: each edge line carries one extra column,
# stored on the edge as the float attribute "weight".
WG = nx.read_edgelist(
    "../data/weighted_collaboration_network.edgelist",
    data=(("weight", float),),
    nodetype=int,
)

In [6]:
# Similarity graph, stored in networkx's multiline adjacency-list format;
# node labels are parsed as int so they match the authorID values.
SG = nx.read_multiline_adjlist(
    "../data/sim_collaboration_network.adjlist",
    nodetype=int,
)

In [8]:
# Per-node structural features. Every result below is later indexed by node id
# when the feature matrices are assembled.

# PageRank on each of the three graphs.
page_rank_g = nx.pagerank(G)
page_rank_wg = nx.pagerank(WG)
page_rank_sg = nx.pagerank(SG)

# k-core number, computed on the unweighted collaboration graph only.
core_number_g = nx.core_number(G)

# Average degree of each node's neighbours.
# NOTE(review): no `weight=` argument is passed, so the WG variant ignores the
# edge weights and uses topology only -- confirm this is intended.
avg_neighbor_degree_g = nx.average_neighbor_degree(G)
avg_neighbor_degree_wg = nx.average_neighbor_degree(WG)
avg_neighbor_degree_sg = nx.average_neighbor_degree(SG)

In [13]:
# load precomputed features for each node
# Each pickle is later indexed by node id when the feature matrices are built.
# The files are opened in `with` blocks so every handle is closed as soon as
# its content has been read (the handles were previously left open).
# NOTE: pickle.load can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open("../data/n_papers.pkl", "rb") as f:
    n_papers = pickle.load(f)

with open("../data/average_coauthors_n_papers.pkl", "rb") as f:
    average_coauthors_n_papers = pickle.load(f)

with open("../data/betweenness_centrality_g.pkl", "rb") as f:
    betweenness_centrality_g = pickle.load(f)

with open("../data/betweenness_centrality_wg.pkl", "rb") as f:
    betweenness_centrality_wg = pickle.load(f)

In [15]:
# Abstract embeddings: a header-less CSV whose first column (position 0) is
# renamed to "authorID"; the remaining columns are the embedding components.
text_embeddings = (
    pd.read_csv("../data/author_embedding_64.csv", header=None)
    .rename(columns={0: "authorID"})
)
# embedding width = every column except the id column
n_temb = len(text_embeddings.columns) - 1

In [17]:
# create the training matrix. each node is represented as a vector of features:
# (1-2-3) its degree in G / WG / SG, (4-5-6) the average degree of its neighbors, (7) its core number,
# (8-9-10) its page rank, (11-12) its betweenness centrality
# (13) the number of written papers (cited), (14) the average number of written papers of its neighbors/coauthors,
# (15 onwards) the n_temb text-embedding components (Doc2Vec)

# Index the embeddings by author once, so each per-row lookup is an index
# lookup instead of a boolean-mask scan over the whole embedding table
# (the per-row scan made this loop quadratic).
emb_by_author = text_embeddings.set_index("authorID")

X_train = np.zeros((n_train, 14+n_temb))
y_train = np.zeros(n_train)
# enumerate() yields the positional row number, so the fill does not depend on
# df_train's index labels being 0..n-1.
for i, (author, h_index) in enumerate(zip(df_train['authorID'], df_train['h_index'])):
    node = int(author)
    X_train[i, :14] = (
        G.degree(node),
        WG.degree(node),
        SG.degree(node),
        avg_neighbor_degree_g[node],
        avg_neighbor_degree_wg[node],
        avg_neighbor_degree_sg[node],
        core_number_g[node],
        page_rank_g[node],
        page_rank_wg[node],
        page_rank_sg[node],
        betweenness_centrality_g[node],
        betweenness_centrality_wg[node],
        n_papers[node],
        average_coauthors_n_papers[node],
    )
    # raises KeyError if the author has no embedding row
    X_train[i, 14:] = emb_by_author.loc[node].to_numpy()
    y_train[i] = h_index

In [18]:
# create the testing matrix. each node is represented as a vector of features:
# (1-2-3) its degree in G / WG / SG, (4-5-6) the average degree of its neighbors, (7) its core number,
# (8-9-10) its page rank, (11-12) its betweenness centrality
# (13) the number of written papers (cited), (14) the average number of written papers of its neighbors/coauthors,
# (15 onwards) the n_temb text-embedding components (Doc2Vec)

# Index the embeddings by author once, so each per-row lookup is an index
# lookup instead of a boolean-mask scan over the whole embedding table
# (the per-row scan made this loop quadratic).
emb_by_author = text_embeddings.set_index("authorID")

X_test = np.zeros((n_test, 14+n_temb))
# enumerate() yields the positional row number, so the fill does not depend on
# df_test's index labels being 0..n-1.
for i, author in enumerate(df_test['authorID']):
    node = int(author)
    X_test[i, :14] = (
        G.degree(node),
        WG.degree(node),
        SG.degree(node),
        avg_neighbor_degree_g[node],
        avg_neighbor_degree_wg[node],
        avg_neighbor_degree_sg[node],
        core_number_g[node],
        page_rank_g[node],
        page_rank_wg[node],
        page_rank_sg[node],
        betweenness_centrality_g[node],
        betweenness_centrality_wg[node],
        n_papers[node],
        average_coauthors_n_papers[node],
    )
    # raises KeyError if the author has no embedding row
    X_test[i, 14:] = emb_by_author.loc[node].to_numpy()

In [19]:
# LightGBM regressor optimising the 'mae' objective, matching the
# mean-absolute-error metric used for cross-validation; 3000 boosting rounds.
reg = LGBMRegressor(
    n_estimators=3000,
    objective='mae',
)

In [20]:
# 5-fold cross-validation. scikit-learn reports the *negated* MAE (higher is
# better), so the sign is flipped before averaging to print a positive MAE.
scores = cross_val_score(
    reg,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
mae_per_fold = -scores
print(mae_per_fold.mean())

3.489507227809912


In [21]:
# Fit on the full training set, then predict the test authors.
# fit() returns the fitted estimator itself, so the two calls can be chained.
y_pred = reg.fit(X_train, y_train).predict(X_test)

In [22]:
# post-processing: for authors with fewer than 10 papers, cap the predicted
# h-index at their paper count (the prediction may equal it but not exceed it).
#
# The paper counts are read back from feature column 12 of X_test. They are
# kept under their own name so the global `n_papers` dict loaded earlier is not
# overwritten -- rebinding it would break any re-run of the feature cells.
test_paper_counts = X_test[:, 12]
over_cap = (test_paper_counts < 10) & (y_pred > test_paper_counts)
# in-place update of y_pred; applying the cap again is a no-op
y_pred[over_cap] = test_paper_counts[over_cap]

In [23]:
# write the predictions to file
# Assign the rounded predictions directly (row i of y_pred belongs to row i of
# df_test). This creates the column when it is missing and does not rely on a
# chained in-place Series.update(), which has no effect under pandas
# Copy-on-Write. np.round replaces np.round_, which NumPy 2.0 removed.
df_test['h_index_pred'] = np.round(y_pred, decimals=3)
df_test.loc[:,["authorID","h_index_pred"]].to_csv('../predictions/test_predictions.csv', index=False)