In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from gensim.models import KeyedVectors

In [2]:
# read training data
df_train = pd.read_csv('../data/train.csv', dtype={'authorID': np.int64, 'h_index': np.float32})
n_train = df_train.shape[0]

# read test data
df_test = pd.read_csv('../data/test.csv', dtype={'authorID': np.int64})
n_test = df_test.shape[0]

In [3]:
# read collaboration graph
G = nx.read_edgelist('../data/collaboration_network.edgelist', delimiter=' ', nodetype=int)

In [4]:
# read weighted collaboration graph
WG = nx.read_edgelist("../data/weighted_collaboration_network.edgelist", nodetype=int, data=(("weight", float),))

In [5]:
# read author similarity graph
SG = nx.read_multiline_adjlist("../data/author_similarity_network.adjlist", nodetype=int)

In [6]:
nodes = {k: v for v, k in enumerate(list(G.nodes()))}

In [7]:
# compute graph features for each node
avg_neighbor_degree_wg = nx.average_neighbor_degree(WG)
avg_neighbor_degree_g = nx.average_neighbor_degree(G)
core_number_g = nx.core_number(G)
page_rank_g = nx.pagerank(G)
page_rank_wg = nx.pagerank(WG)
avg_neighbor_degree_sg = nx.average_neighbor_degree(SG)
page_rank_sg = nx.pagerank(SG)
eigenvector_centrality_sg = nx.eigenvector_centrality(SG)

In [8]:
# load precomputed features for each node
f = open("../data/n_papers.pkl", "rb")
n_papers = pickle.load(f)
f.close()

f = open("../data/average_coauthors_n_papers.pkl", "rb")
average_coauthors_n_papers = pickle.load(f)
f.close()

f = open("../data/betweenness_centrality_g.pkl", "rb")
betweenness_centrality_g = pickle.load(f)
f.close()

f = open("../data/betweenness_centrality_wg.pkl", "rb")
betweenness_centrality_wg = pickle.load(f)
f.close()

f = open("../data/clustering_wg.pkl", "rb")
clustering_wg = pickle.load(f)
f.close()

f = open("../data/clustering_sg.pkl", "rb")
clustering_sg = pickle.load(f)
f.close()

In [9]:
# load Node2Vec embeddings obtained from WG
node2vec_wg = KeyedVectors.load_word2vec_format('../data/node2vec_wg.nodevectors')

In [10]:
# load Node2Vec embeddings obtained from SG
n2v_sg = pd.read_csv("../data/author_node2vec_sg.csv", header=None)
n2v_sg = n2v_sg.rename(columns={0: "authorID"})

In [11]:
# load Node2Vec embeddings obtained from PG
n2v_pg = pd.read_csv("../data/author_node2vec_pg.csv", header=None)
n2v_pg = n2v_pg.rename(columns={0: "authorID"})

In [12]:
# load Node2Vec embeddings obtained from WPG
n2v_wpg = pd.read_csv("../data/author_node2vec_wpg.csv", header=None)
n2v_wpg = n2v_wpg.rename(columns={0: "authorID"})

In [13]:
# read embeddings of abstracts obtained with Doc2Vec PV-DM
text_embeddings_dm = pd.read_csv("../data/author_embedding_64_dm.csv", header=None)
text_embeddings_dm = text_embeddings_dm.rename(columns={0: "authorID"})

In [14]:
# read embeddings of abstractsobtained with Doc2Vec PV-DBOW
text_embeddings_dbow = pd.read_csv("../data/author_embedding_64_dbow.csv", header=None)
text_embeddings_dbow = text_embeddings_dbow.rename(columns={0: "authorID"})

In [15]:
# preprocessing: fill nan values by zeros
n2v_sg = n2v_sg.fillna(0)
n2v_pg = n2v_pg.fillna(0)
n2v_wpg = n2v_wpg.fillna(0)
text_embeddings_dm = text_embeddings_dm.fillna(0)
text_embeddings_dbow = text_embeddings_dbow.fillna(0)

In [17]:
n_dim = 64

In [18]:
# create the training matrix. each node is represented as a vector of features:
# (1-2-3) its degree, (4-5-6) the average degree of its neighbors, 
# (7) its core number, (8-9-10) its page rank, (11-12) its betweenness centrality, 
# (13-14) its clustering coefficient, (15) its eigenvector centrality,
# (16) the number of written papers (cited), (17) the average number of written papers of its neighbors/coauthors,
# (18-19-20-21) embeddings from Node2Vec, (22-23) text_embeddings from Doc2Vec
X_train = np.zeros((n_train, 17+6*n_dim))
y_train = np.zeros(n_train)
for i,row in df_train.iterrows():
    node = int(row['authorID'])
    index = nodes[node]
    X_train[i,0] = G.degree(node)
    X_train[i,1] = WG.degree(node)
    X_train[i,2] = SG.degree(node)
    X_train[i,3] = avg_neighbor_degree_g[node]
    X_train[i,4] = avg_neighbor_degree_wg[node]
    X_train[i,5] = avg_neighbor_degree_sg[node]
    X_train[i,6] = core_number_g[node]
    X_train[i,7] = page_rank_g[node]
    X_train[i,8] = page_rank_wg[node]
    X_train[i,9] = page_rank_sg[node]
    X_train[i,10] = betweenness_centrality_g[node]
    X_train[i,11] = betweenness_centrality_wg[node]
    X_train[i,12] = clustering_wg[node]
    X_train[i,13] = clustering_sg[node]
    X_train[i,14] = eigenvector_centrality_sg[node]
    X_train[i,15] = n_papers[node]
    X_train[i,16] = average_coauthors_n_papers[node]
    X_train[i,17:17+n_dim] = n2v_pg[n2v_pg["authorID"] == node].iloc[:,1:]
    X_train[i,17+n_dim:17+2*n_dim] = n2v_wpg[n2v_wpg["authorID"] == node].iloc[:,1:]
    X_train[i,17+2*n_dim:17+3*n_dim] = n2v_sg[n2v_sg["authorID"] == node].iloc[:,1:]
    X_train[i,17+3*n_dim:17+4*n_dim] = node2vec_wg[str(node)]
    X_train[i,17+4*n_dim:17+5*n_dim] = text_embeddings_dm[text_embeddings_dm["authorID"] == node].iloc[:,1:]
    X_train[i,17+5*n_dim:] = text_embeddings_dbow[text_embeddings_dbow["authorID"] == node].iloc[:,1:]
    y_train[i] = row['h_index']

In [19]:
# creating custom loss that takes nearest integer
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error

def custom_loss(y, y_pred, **kwargs):
    return mean_absolute_error(y, np.round(y_pred))

sk_loss = make_scorer(custom_loss, greater_is_better=False)

In [20]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler 
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope

import warnings
warnings.filterwarnings("ignore")

In [21]:
from hyperopt import fmin, tpe, hp, anneal, Trials

In [22]:
def gb_mse_cv(params, cv=5, X=X_train, y=y_train):
    # the function gets a set of variable parameters in "param"
    params = {'num_leaves': int(params['num_leaves']), 
              'max_depth': int(params['max_depth']),
              'n_estimators': int(params['n_estimators']),
              'learning_rate': params['learning_rate'],
              'subsample': params['subsample'],
              'colsample_bytree': params['colsample_bytree'],
              'reg_alpha': params['reg_alpha'],
              'reg_lambda': params['reg_lambda']}
     
    # we use this params to create a new LGBM Regressor
    model = LGBMRegressor(n_jobs=1, **params)
    
    # and then conduct the cross validation with the same folds as before
    score = -cross_val_score(model, X_train, y_train, cv=cv, scoring=sk_loss, n_jobs=5).mean()

    return score

In [23]:
%%time

# possible values of parameters
space = {
    'num_leaves': hp.choice("num_leaves", [16, 32, 64, 128, 256]),
    'learning_rate': hp.uniform("learning_rate", 0.01, 0.1),
    'subsample': hp.uniform("subsample", 0.8, 1),
    'colsample_bytree': hp.uniform("colsample_bytree", 0.8, 1),
    'max_depth': hp.choice("max_depth", [*range(5,20)]),
    'n_estimators': hp.quniform("n_estimators", 1000, 10000, 1000),
    'reg_alpha':  hp.uniform("reg_alpha", 0.001, 1), # regularisation L1
    'reg_lambda': hp.uniform("reg_lambda", 0.001, 1) # regularisation L2
} 

# trials will contain logging information
trials = Trials()

best=fmin(fn=gb_mse_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=50, # maximum number of iterations
          trials=trials, # logging
          verbose = 1)

100%|██████████| 50/50 [7:27:10<00:00, 536.61s/trial, best loss: 3.2366826989619377]   
CPU times: user 4.41 s, sys: 5.27 s, total: 9.68 s
Wall time: 7h 27min 10s


In [24]:
best

{'colsample_bytree': 0.8194330336006158,
 'learning_rate': 0.010467484476003708,
 'max_depth': 14,
 'n_estimators': 8000.0,
 'num_leaves': 1,
 'reg_alpha': 0.789679180967437,
 'reg_lambda': 0.3377356252520094,
 'subsample': 0.9865966548486782}