In [10]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from gensim.models import KeyedVectors

In [2]:
# Load the labelled training set and record how many authors it contains.
df_train = pd.read_csv(
    '../data/train.csv',
    dtype={'authorID': np.int64, 'h_index': np.float32},
)
n_train = len(df_train)

# Load the test set (authors to predict) and record how many authors it contains.
df_test = pd.read_csv(
    '../data/test.csv',
    dtype={'authorID': np.int64},
)
n_test = len(df_test)

In [3]:
# Load the collaboration graph from a space-delimited edge list;
# node labels are parsed as integers.
G = nx.read_edgelist(
    '../data/collaboration_network.edgelist',
    delimiter=' ',
    nodetype=int,
)

In [4]:
# Load the weighted collaboration graph; every edge line carries one extra
# field that is stored as the float edge attribute "weight".
WG = nx.read_edgelist(
    "../data/weighted_collaboration_network.edgelist",
    nodetype=int,
    data=(("weight", float),),
)

In [5]:
# Load the similarity graph from a multiline adjacency list;
# node labels are parsed as integers.
SG = nx.read_multiline_adjlist(
    "../data/sim_collaboration_network.adjlist",
    nodetype=int,
)

In [6]:
# Map every node of G to its position in G's node iteration order.
nodes = {node_id: position for position, node_id in enumerate(G.nodes())}

In [7]:
# Compute per-node graph features; every result is a dict keyed by node id.

# Collaboration graph G.
avg_neighbor_degree_g = nx.average_neighbor_degree(G)
core_number_g = nx.core_number(G)
page_rank_g = nx.pagerank(G)

# Weighted collaboration graph WG.
# NOTE(review): no `weight=` argument is passed to average_neighbor_degree,
# so the edge weights may be ignored for that feature -- confirm this is intended.
avg_neighbor_degree_wg = nx.average_neighbor_degree(WG)
page_rank_wg = nx.pagerank(WG)

# Similarity graph SG.
avg_neighbor_degree_sg = nx.average_neighbor_degree(SG)
page_rank_sg = nx.pagerank(SG)

In [8]:
# Eigenvector centrality of every node of the similarity graph SG,
# computed with networkx's default settings.
eigenvector_centrality_sg = nx.eigenvector_centrality(SG)

In [254]:
# Load precomputed per-node features from pickle files.
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this project.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed even when
    unpickling raises.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The objects below are later indexed by author/node id when the feature
# matrices are built.
n_papers = _load_pickle("../data/n_papers.pkl")
average_coauthors_n_papers = _load_pickle("../data/average_coauthors_n_papers.pkl")
betweenness_centrality_g = _load_pickle("../data/betweenness_centrality_g.pkl")
betweenness_centrality_wg = _load_pickle("../data/betweenness_centrality_wg.pkl")
clustering_g = _load_pickle("../data/clustering_g.pkl")
clustering_wg = _load_pickle("../data/clustering_wg.pkl")
clustering_sg = _load_pickle("../data/clustering_sg.pkl")
line_embedding = _load_pickle("../data/line_embedding.pkl")

In [229]:
# Load the node2vec vectors from a word2vec-format file; individual vectors
# are later looked up with the node id converted to a string.
node2vec_wv = KeyedVectors.load_word2vec_format('../data/node2vec_emb_.nodevectors')

In [239]:
# Read the abstract embeddings (the CSV has no header row) and label the
# first column, which holds the author id, as "authorID".
text_embeddings = (
    pd.read_csv("../data/author_embedding_64.csv", header=None)
    .rename(columns={0: "authorID"})
)

In [257]:
# Width of the text embedding: every column except "authorID".
n_temb = len(text_embeddings.columns) - 1
# Width of one node2vec vector.
n_wv = node2vec_wv.vector_size

In [258]:
# Create the training matrix. Each author (node) is one row with columns:
#   0-2    degree in G, WG and SG
#   3-5    average neighbor degree in G, WG and SG
#   6      core number in G
#   7-9    page rank in G, WG and SG
#   10-11  betweenness centrality in G and WG
#   12-13  clustering coefficient in WG and SG
#   14     eigenvector centrality in SG
#   15     number of written papers (cited)
#   16     average number of written papers of its neighbors/coauthors
#   17 .. 17+n_wv-1   node2vec embedding
#   17+n_wv ..        text embedding of the author's abstracts

# Index the text embeddings by author once, so every row is a direct lookup
# instead of a boolean scan over the whole frame (which made the loop quadratic).
text_emb_by_author = text_embeddings.set_index("authorID")

X_train = np.zeros((n_train, 17+n_wv+n_temb))
y_train = np.zeros(n_train)
# Iterate positionally over the typed columns: iterrows() builds a Series per
# row (which may upcast the int64 ids to float) and uses index labels as row
# positions, which is only correct for a default 0..n-1 index.
train_ids = df_train['authorID'].to_numpy()
train_targets = df_train['h_index'].to_numpy()
for i, (node, h_index) in enumerate(zip(train_ids, train_targets)):
    node = int(node)
    X_train[i,0] = G.degree(node)
    X_train[i,1] = WG.degree(node)
    X_train[i,2] = SG.degree(node)
    X_train[i,3] = avg_neighbor_degree_g[node]
    X_train[i,4] = avg_neighbor_degree_wg[node]
    X_train[i,5] = avg_neighbor_degree_sg[node]
    X_train[i,6] = core_number_g[node]
    X_train[i,7] = page_rank_g[node]
    X_train[i,8] = page_rank_wg[node]
    X_train[i,9] = page_rank_sg[node]
    X_train[i,10] = betweenness_centrality_g[node]
    X_train[i,11] = betweenness_centrality_wg[node]
    X_train[i,12] = clustering_wg[node]
    X_train[i,13] = clustering_sg[node]
    X_train[i,14] = eigenvector_centrality_sg[node]
    X_train[i,15] = n_papers[node]
    X_train[i,16] = average_coauthors_n_papers[node]
    # node2vec vectors are keyed by the node id as a string
    X_train[i,17:17+n_wv] = node2vec_wv[str(node)]
    X_train[i,17+n_wv:] = text_emb_by_author.loc[node].to_numpy()
    y_train[i] = h_index

In [259]:
# Gradient-boosted regressor trained with the 'mae' objective, using 3000 boosting rounds.
reg = LGBMRegressor(
    objective='mae',
    n_estimators=3000,
)

In [199]:
# Run label: p=1 q=0.5 emb_ (presumably the node2vec settings behind the loaded embedding file -- confirm).
# 5-fold cross-validation. The scorer returns the negated MAE, so the sign is
# flipped before averaging to report a positive error.
scores = cross_val_score(
    reg,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print((-scores).mean())

3.3543957171452603


In [149]:
# Create the testing matrix with exactly the same column layout as X_train:
#   0-2    degree in G, WG and SG
#   3-5    average neighbor degree in G, WG and SG
#   6      core number in G
#   7-9    page rank in G, WG and SG
#   10-11  betweenness centrality in G and WG
#   12-13  clustering coefficient in WG and SG
#   14     eigenvector centrality in SG
#   15     number of written papers (cited)
#   16     average number of written papers of its neighbors/coauthors
#   17 .. 17+n_wv-1   node2vec embedding
#   17+n_wv ..        text embedding of the author's abstracts

# Index the text embeddings by author once, so every row is a direct lookup
# instead of a boolean scan over the whole frame (which made the loop quadratic).
text_emb_by_author = text_embeddings.set_index("authorID")

X_test = np.zeros((n_test, 17+n_wv+n_temb))
# Iterate positionally over the typed id column: iterrows() builds a Series per
# row and uses index labels as row positions, which is only correct for a
# default 0..n-1 index.
for i, node in enumerate(df_test['authorID'].to_numpy()):
    node = int(node)
    X_test[i,0] = G.degree(node)
    X_test[i,1] = WG.degree(node)
    X_test[i,2] = SG.degree(node)
    X_test[i,3] = avg_neighbor_degree_g[node]
    X_test[i,4] = avg_neighbor_degree_wg[node]
    X_test[i,5] = avg_neighbor_degree_sg[node]
    X_test[i,6] = core_number_g[node]
    X_test[i,7] = page_rank_g[node]
    X_test[i,8] = page_rank_wg[node]
    X_test[i,9] = page_rank_sg[node]
    X_test[i,10] = betweenness_centrality_g[node]
    X_test[i,11] = betweenness_centrality_wg[node]
    X_test[i,12] = clustering_wg[node]
    X_test[i,13] = clustering_sg[node]
    X_test[i,14] = eigenvector_centrality_sg[node]
    X_test[i,15] = n_papers[node]
    X_test[i,16] = average_coauthors_n_papers[node]
    # Use the same node2vec lookup as the training matrix. The previous code
    # read from `n2v_embeddings`, which is not defined anywhere in the notebook.
    X_test[i,17:17+n_wv] = node2vec_wv[str(node)]
    X_test[i,17+n_wv:] = text_emb_by_author.loc[node].to_numpy()

In [143]:
# Fit on the full training matrix, then predict the h-index of every test author.
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = reg.fit(X_train, y_train).predict(X_test)

In [144]:
# Post-processing, applied in place on y_pred:
#  1. for authors with fewer than 10 papers, cap the prediction at their
#     paper count (column 15 of X_test);
#  2. replace any prediction that is still negative with 1.
paper_counts = X_test[:, 15]
over_cap = (paper_counts < 10) & (y_pred > paper_counts)
y_pred[over_cap] = paper_counts[over_cap]
y_pred[y_pred < 0] = 1

In [145]:
# Write the predictions to file.
# Assign the rounded predictions as a column directly: this works whether or
# not test.csv already has an 'h_index_pred' column, and avoids updating a
# column through a temporary Series (which does not propagate to df_test under
# pandas copy-on-write). np.round replaces np.round_, removed in NumPy 2.0.
df_test['h_index_pred'] = np.round(y_pred, decimals=3)
df_test.loc[:, ["authorID", "h_index_pred"]].to_csv(
    '../predictions/test_predictions_22.csv',
    index=False,
)