In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

In [3]:
# read training data
df_train = pd.read_csv('../data/train.csv', dtype={'authorID': np.int64, 'h_index': np.float32})
n_train = df_train.shape[0]

# read test data
df_test = pd.read_csv('../data/test.csv', dtype={'authorID': np.int64})
n_test = df_test.shape[0]

In [4]:
# read collaboration graph
G = nx.read_edgelist('../data/collaboration_network.edgelist', delimiter=' ', nodetype=int)

In [5]:
# read weighted collaboration graph
WG = nx.read_edgelist("../data/weighted_collaboration_network.edgelist", nodetype=int, data=(("weight", float),))

In [32]:
# read similarity graph
SG = nx.read_multiline_adjlist("../data/sim_collaboration_network.adjlist", nodetype=int)

In [19]:
nodes = {k: v for v, k in enumerate(list(G.nodes()))}

In [33]:
avg_neighbor_degree_sg = nx.average_neighbor_degree(SG)
page_rank_sg = nx.pagerank(SG)

In [7]:
# compute graph features for each node
avg_neighbor_degree_wg = nx.average_neighbor_degree(WG)
avg_neighbor_degree_g = nx.average_neighbor_degree(G)
core_number_g = nx.core_number(G)
page_rank_g = nx.pagerank(G)
page_rank_wg = nx.pagerank(WG)

In [58]:
# load precomputed features for each node
f = open("../data/n_papers.pkl", "rb")
n_papers = pickle.load(f)
f.close()

f = open("../data/average_coauthors_n_papers.pkl", "rb")
average_coauthors_n_papers = pickle.load(f)
f.close()

f = open("../data/betweenness_centrality_g.pkl", "rb")
betweenness_centrality_g = pickle.load(f)
f.close()

f = open("../data/betweenness_centrality_wg.pkl", "rb")
betweenness_centrality_wg = pickle.load(f)
f.close()

f = open("../data/clustering_g.pkl", "rb")
clustering_g = pickle.load(f)
f.close()

f = open("../data/clustering_wg.pkl", "rb")
clustering_wg = pickle.load(f)
f.close()

f = open("../data/clustering_sg.pkl", "rb")
clustering_sg = pickle.load(f)
f.close()

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GAE(nn.Module):
    """GAE model"""
    def __init__(self, n_feat, n_hidden_1, n_hidden_2, n_hidden_3, n_hidden_4, dropout):
        super(GAE, self).__init__()

        self.fc1 = nn.Linear(n_feat, n_hidden_1)
        self.fc2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.fc3 = nn.Linear(n_hidden_2, n_hidden_3)
        self.fc4 = nn.Linear(n_hidden_3, n_hidden_4)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj):
        
        h = self.fc1(x_in)
        h = self.relu(torch.mm(adj, h))
        
        h = self.dropout(h)
        
        h = self.fc2(h)
        h = self.relu(torch.mm(adj, h))
        
        h = self.fc3(h)
        h = self.relu(torch.mm(adj, h))
        
        z = self.fc4(h)
        z = torch.mm(adj, z)

        return z

In [15]:
import scipy.sparse as sp
import numpy as np
import torch
import torch.nn as nn

def normalize_adjacency(A):
    
    n = A.shape[0]
    A = A + sp.identity(n)
    degs = A.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D_inv = sp.diags(inv_degs)
    A_normalized = D_inv.dot(A)

    return A_normalized


def sparse_to_torch_sparse(M):
    """Converts a sparse SciPy matrix to a sparse PyTorch tensor"""
    M = M.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((M.row, M.col)).astype(np.int64))
    values = torch.from_numpy(M.data)
    shape = torch.Size(M.shape)
    return torch.sparse.FloatTensor(indices, values, shape)


def loss_function(z, adj, device):
    mse_loss = nn.MSELoss()

    
    indices = adj._indices()

    y_pred = list()
    y = list()

    y_pred.append(torch.sum(torch.mul(z[indices[0,:],:], z[indices[1,:],:]), dim=1))
    y.append(adj._values().to(device))

    random_indices = torch.randint(z.size(0), indices.size())
    y_pred.append(torch.sum(torch.mul(z[random_indices[0,:],:], z[random_indices[1,:],:]), dim=1))
    y.append(torch.zeros(random_indices.size(1)).to(device))

    y_pred = torch.cat(y_pred, dim=0)
    y = torch.cat(y, dim=0)
    
    loss = mse_loss(y_pred, y)
    return loss

In [10]:
# read embeddings of abstracts
text_embeddings = pd.read_csv("../data/author_embedding_64.csv", header=None)
text_embeddings = text_embeddings.rename(columns={0: "authorID"})

In [16]:
n = SG.number_of_nodes()
nodes = {k: v for v, k in enumerate(list(G.nodes()))}
features = np.zeros((n,64))
for i in range(n):
    author = text_embeddings["authorID"][i]
    index = nodes[author]
    features[index,:] = text_embeddings.iloc[i, 1:]

In [17]:
import time
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score


# Initialize device
device = torch.device("cpu")

# Hyperparameters
epochs = 100
n_hidden_1 = 64
n_hidden_2 = 64
n_hidden_3 = 64
n_hidden_4 = 64
learning_rate = 0.01
dropout_rate = 0.5

n = SG.number_of_nodes()
adj = nx.adjacency_matrix(SG) # Obtains the adjacency matrix
adj = normalize_adjacency(adj) # Normalizes the adjacency matrix

# Transforms the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features).to(device)
adj = sparse_to_torch_sparse(adj).to(device)

# Creates the model and specifies the optimizer
model = GAE(features.shape[1], n_hidden_1, n_hidden_2, n_hidden_3, n_hidden_4, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Trains the model
for epoch in range(epochs):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    z = model(features, adj)
    loss = loss_function(z, adj, device)
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss.item()),
              'time: {:.4f}s'.format(time.time() - t))


graph_embeddings = z.detach().cpu().numpy()
print(graph_embeddings.shape)

Epoch: 0001 loss_train: 287.1659 time: 23.4365s
Epoch: 0011 loss_train: 0.1116 time: 21.2820s
Epoch: 0021 loss_train: 0.1077 time: 20.5230s
Epoch: 0031 loss_train: 0.0973 time: 19.9500s
Epoch: 0041 loss_train: 0.0854 time: 22.0450s
Epoch: 0051 loss_train: 0.0747 time: 19.0360s
Epoch: 0061 loss_train: 0.0658 time: 21.6300s
Epoch: 0071 loss_train: 0.0584 time: 20.8820s
Epoch: 0081 loss_train: 0.0523 time: 24.4342s
Epoch: 0091 loss_train: 0.0466 time: 21.6174s
(231239, 64)


In [18]:
n_temb = text_embeddings.shape[1] - 1
n_gemb = graph_embeddings.shape[1]

In [64]:
# create the training matrix. each node is represented as a vector of features:
# (1-2-3) its degree, (4-5-6) the average degree of its neighbors, (7) its core number, 
# (8-9-10) its page rank, (11-12) its betweenness centrality, (13-14-15) its clustering coefficient,
# (16) the number of written papers (cited), (17) the average number of written papers of its neighbors/coauthors,
# (18) text_embeddings from Doc2Vec
X_train = np.zeros((n_train, 17+n_temb+n_gemb))
y_train = np.zeros(n_train)
for i,row in df_train.iterrows():
    node = int(row['authorID'])
    index = nodes[node]
    X_train[i,0] = G.degree(node)
    X_train[i,1] = WG.degree(node)
    X_train[i,2] = SG.degree(node)
    X_train[i,3] = avg_neighbor_degree_g[node]
    X_train[i,4] = avg_neighbor_degree_wg[node]
    X_train[i,5] = avg_neighbor_degree_sg[node]
    X_train[i,6] = core_number_g[node]
    X_train[i,7] = page_rank_g[node]
    X_train[i,8] = page_rank_wg[node]
    X_train[i,9] = page_rank_sg[node]
    X_train[i,10] = betweenness_centrality_g[node]
    X_train[i,11] = betweenness_centrality_wg[node]
    X_train[i,12] = clustering_g[node]
    X_train[i,13] = clustering_wg[node]
    X_train[i,14] = clustering_sg[node]
    X_train[i,15] = n_papers[node]
    X_train[i,16] = average_coauthors_n_papers[node]
    X_train[i,17:17+n_gemb] = graph_embeddings[index,:]
    X_train[i,17+n_gemb:] = text_embeddings[text_embeddings["authorID"] == node].iloc[:,1:]
    y_train[i] = row['h_index']

In [65]:
reg = LGBMRegressor(objective='mae', n_estimators=3000)

In [66]:
# cross-validation
scores = cross_val_score(reg, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print(np.mean(-scores))

3.437653709750124


In [67]:
# create the testing matrix. each node is represented as a vector of features:
# (1-2-3) its degree, (4-5-6) the average degree of its neighbors, (7) its core number, 
# (8-9-10) its page rank, (11-12) its betweenness centrality, (13-14-15) its clustering coefficient,
# (16) the number of written papers (cited), (17) the average number of written papers of its neighbors/coauthors,
# (18) text_embeddings from Doc2Vec
X_test = np.zeros((n_test, 17+n_temb+n_gemb))
for i,row in df_test.iterrows():
    node = int(row['authorID'])
    index = nodes[node]
    X_test[i,0] = G.degree(node)
    X_test[i,1] = WG.degree(node)
    X_test[i,2] = SG.degree(node)
    X_test[i,3] = avg_neighbor_degree_g[node]
    X_test[i,4] = avg_neighbor_degree_wg[node]
    X_test[i,5] = avg_neighbor_degree_sg[node]
    X_test[i,6] = core_number_g[node]
    X_test[i,7] = page_rank_g[node]
    X_test[i,8] = page_rank_wg[node]
    X_test[i,9] = page_rank_sg[node]
    X_test[i,10] = betweenness_centrality_g[node]
    X_test[i,11] = betweenness_centrality_wg[node]
    X_test[i,12] = clustering_g[node]
    X_test[i,13] = clustering_wg[node]
    X_test[i,14] = clustering_sg[node]
    X_test[i,15] = n_papers[node]
    X_test[i,16] = average_coauthors_n_papers[node]
    X_test[i,17:17+n_gemb] = graph_embeddings[index,:]
    X_test[i,17+n_gemb:] = text_embeddings[text_embeddings["authorID"] == node].iloc[:,1:]

In [68]:
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [69]:
# post-processing: make sure that the predicted h-index is less than the number of papers (<10)
for i in range(len(X_test)):
    npapers = X_test[i, 12]
    if npapers < 10 and y_pred[i] > npapers:
        y_pred[i] = npapers

In [70]:
# write the predictions to file
df_test['h_index_pred'].update(pd.Series(np.round_(y_pred, decimals=3)))
df_test.loc[:,["authorID","h_index_pred"]].to_csv('../predictions/test_predictions.csv', index=False)