In [None]:
from tqdm.notebook import trange, tqdm
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from gensim.models.doc2vec import Doc2Vec
from gensim.models import KeyedVectors

# Importing data and extracting features

In [None]:
# Training table: `author` is parsed as int64 and `hindex` as float32.
train_dtypes = {'author': np.int64, 'hindex': np.float32}
df_train = pd.read_csv('train.csv', dtype=train_dtypes)
n_train = len(df_train)

# Test table: only the `author` dtype is pinned here.
test_dtypes = {'author': np.int64}
df_test = pd.read_csv('test.csv', dtype=test_dtypes)
n_test = len(df_test)

In [None]:
def read_graphs(graph):
    """Read a space-delimited edge list whose node labels are integers.

    Parameters
    ----------
    graph : path or file handle accepted by ``nx.read_edgelist``.

    Returns
    -------
    tuple
        ``(G, n_nodes, n_edges)``: the graph plus its node and edge counts.
    """
    G = nx.read_edgelist(graph, delimiter=' ', nodetype=int)
    return G, G.number_of_nodes(), G.number_of_edges()
def read_adj_graphs(graph):
    """Read a multiline adjacency list with integer node labels and return the graph."""
    return nx.read_multiline_adjlist(graph, nodetype=int)
def read_weighted_graphs(graph):
    """Read an edge list with integer node labels and a float ``weight`` per edge."""
    edge_attributes = (("weight", float),)
    return nx.read_edgelist(graph, nodetype=int, data=edge_attributes)

def compute_sim_features(G):
    """Compute four per-node feature mappings for graph ``G``.

    Returns
    -------
    tuple
        ``(pagerank, eigenvector_centrality, triangles, avg_neighbor_degree)``,
        each being whatever the corresponding networkx function returns for ``G``.
    """
    return (
        nx.pagerank(G),
        nx.eigenvector_centrality(G),
        nx.triangles(G),
        nx.average_neighbor_degree(G),
    )

def compute_graph_features(G):
    """Compute six per-node feature mappings for graph ``G``.

    Returns
    -------
    tuple
        ``(core_number, degree_centrality, clustering, avg_neighbor_degree,
        onion_layers, pagerank)``, in exactly that order.
    """
    return (
        nx.core_number(G),
        nx.degree_centrality(G),
        nx.clustering(G),
        nx.average_neighbor_degree(G),
        nx.onion_layers(G),
        nx.pagerank(G),
    )

In [None]:
# Load the three author graphs from disk.
# co_nodes / co_edges are the node and edge counts of the co-authorship graph.
G_coauthor, co_nodes, co_edges = read_graphs("coauthorship.edgelist")
G_sum_sim = read_adj_graphs("sum_sim_authors.adjlist")
G_weighted = read_weighted_graphs("weighted_coauthorship.edgelist")
# Co-authorship graph features, in the order returned by compute_graph_features:
#   co_a = core number, co_b = degree centrality, co_c = clustering,
#   co_d = average neighbor degree, co_e = onion layers, co_f = pagerank
co_a, co_b, co_c, co_d, co_e, co_f = compute_graph_features(G_coauthor)
# Features from compute_sim_features, in its return order:
#   *_a = pagerank, *_b = eigenvector centrality, *_c = triangles,
#   *_d = average neighbor degree
# w_*  are computed on G_weighted, ms_* on G_sum_sim.
w_a, w_b, w_c, w_d = compute_sim_features(G_weighted)
ms_a, ms_b, ms_c, ms_d = compute_sim_features(G_sum_sim)

## Extract co-author h-index data from the graph

In [None]:
"""Extract max, min and mean coauthor hindex"""
# Column-oriented copy of the training table: df_dict[column][row_label] -> value.
df_dict = df_train.to_dict()
# Reverse lookup from author id to the row label it occupies in df_train.
inv_df = dict((author_id, row_label) for row_label, author_id in df_dict["author"].items())


def compute_mean_max_coauthor_hindex(graph, node):
    """Summarise the known h-indices of ``node``'s neighbours in ``graph``.

    Only neighbours that appear in the training table (and therefore have an
    ``hindex`` entry) contribute.

    Returns
    -------
    tuple
        ``(max_hind, min_hind, mean_hind, df_coauth_hindex)`` where the last
        element is the list of neighbour h-indices that were found. When no
        neighbour has a known h-index the three statistics are all ``0``.
    """
    hindex_by_row = df_dict["hindex"]
    df_coauth_hindex = []
    for neighbor in graph.neighbors(node):
        # inv_df.get gives None for authors outside the training table, and
        # looking up None in hindex_by_row gives None again, so they are skipped.
        known_hindex = hindex_by_row.get(inv_df.get(neighbor))
        if known_hindex is not None:
            df_coauth_hindex.append(known_hindex)

    if not df_coauth_hindex:
        return 0, 0, 0, df_coauth_hindex

    return (
        np.max(df_coauth_hindex),
        np.min(df_coauth_hindex),
        np.mean(df_coauth_hindex),
        df_coauth_hindex,
    )

# Parse author_papers.txt, whose lines have the form
#   "<author_id>:<paper_id>-<paper_id>-..."
# Author_paper     maps author id (int) -> list of paper ids (strings)
# Author_paper_num maps author id (int) -> number of paper ids on that line
# The per-author averages over co-authors are computed further down by
# mean_coauthor_paper_count.
Author_paper_num = {}
Author_paper = {}
# `with` closes the handle even if parsing fails (the original never closed it).
with open('author_papers.txt', encoding='utf8') as papers_file:
    # Read at most one line per node of the co-authorship graph, as before.
    for _line_no in tqdm(range(co_nodes)):
        newline = papers_file.readline()
        if not newline:
            # End of file reached before co_nodes lines: stop instead of
            # crashing with an IndexError on the empty string.
            break
        if not newline.strip():
            # Blank line: nothing to parse.
            continue
        author, paper_field = newline.split(':', 1)
        paper_ids = paper_field.strip().split("-")
        Author_paper[int(author)] = paper_ids
        Author_paper_num[int(author)] = len(paper_ids)
    
    
def mean_coauthor_paper_count(nodes, graph=None, author_papers=None):
    """Average number of listed papers over each node's neighbours.

    Parameters
    ----------
    nodes : iterable
        Nodes to compute the average for.
    graph : object with a ``neighbors(node)`` method, optional
        Graph to take neighbours from. Defaults to the module-level
        ``G_coauthor``.
    author_papers : mapping, optional
        Maps a node to its collection of papers. Defaults to the module-level
        ``Author_paper``.

    Returns
    -------
    dict
        ``node -> mean paper count of its neighbours``; ``0`` for a node that
        has no neighbours.

    Raises
    ------
    KeyError
        If a neighbour is missing from ``author_papers``.
    """
    if graph is None:
        graph = G_coauthor
    if author_papers is None:
        author_papers = Author_paper

    mean_co = {}
    for node in nodes:
        # Materialise once instead of rebuilding the neighbour list three times.
        neighbours = list(graph.neighbors(node))
        if not neighbours:
            # The original fell through to the division below and raised
            # ZeroDivisionError for isolated nodes.
            mean_co[node] = 0
            continue
        total_papers = sum(len(author_papers[n]) for n in neighbours)
        mean_co[node] = total_papers / len(neighbours)
    return mean_co

# Mean paper count of the co-authors of every node in the co-authorship graph.
nodes = [graph_node for graph_node in G_coauthor.nodes()]
mean_co = mean_coauthor_paper_count(nodes)

## Import DeepWalk embeddings

In [None]:
# Load the DeepWalk embeddings of the co-authorship graph.
# File layout, as parsed below: a header line "<n_vectors> <dimension>", then
# one line per node, "<node> <v1> <v2> ...".
# NOTE: keys of co_auth_emb are the node labels as *strings*, exactly as read.
co_auth_emb = {}
filename = 'DeepWalkEmbeddings/coauthor.embeddings'
# `with` closes the handle (the original leaked it).
with open(filename, encoding='utf8') as emb_file:
    N_lines, n_dim2 = (int(token) for token in emb_file.readline().split())
    for _line_no in range(N_lines):
        # split() without an argument drops the trailing newline and tolerates
        # repeated spaces, so no empty or "\n" token reaches the float cast.
        fields = emb_file.readline().split()
        if not fields:
            # File is shorter than the header announced (or has a blank line).
            continue
        auth = fields[0]
        rest = fields[1:]
        co_auth_emb[auth] = np.array(rest).astype(np.float32)

## Import Doc2Vec embeddings

In [None]:
# Load two previously saved Doc2Vec models from disk.
# Judging by the file names these are the PV-DM and PV-DBOW variants — TODO confirm
# against the notebook/script that trained them.
model_dv = Doc2Vec.load("d2v_50_new/d2v_model_PV-DM")
model_dbow = Doc2Vec.load("d2v_50_new/d2v_model_PV-DBOW")

In [None]:
# Number of lines read from abstracts.txt (hard-coded in the original cell).
N_ABSTRACTS = 624181

# Collect the paper id of every abstract in file order; the position of an id in
# `ids` is used below as the document index into the Doc2Vec models.
# The original also opened an unused second handle (`linecount`) and never closed
# either one; a single `with` block replaces both.
ids = []
with open('abstracts.txt', encoding='utf8') as abstracts_file:
    for _line_no in tqdm(range(N_ABSTRACTS)):
        newLine = abstracts_file.readline()
        if not newLine:
            # End of file: stop rather than appending empty ids.
            break
        # Everything before the first '----' separator is the paper id.
        ids.append(newLine.split('----', 1)[0])

embedding_id_to_ids = dict(enumerate(ids))
ids_to_embedding_id = {paper_id: emb_id for emb_id, paper_id in embedding_id_to_ids.items()}

# Per-author mean and sum of the document vectors of their listed papers,
# one pair of dicts per Doc2Vec model.
mean_dict_dv = {}
mean_dict_dbow = {}
sum_dict_dv = {}
sum_dict_dbow = {}
for key, paper_ids in tqdm(Author_paper.items()):
    emb = []
    emb2 = []
    for paper_id in paper_ids:
        try:
            code = ids_to_embedding_id[paper_id]
            emb.append(model_dv.dv[code])
            emb2.append(model_dbow.dv[code])
        except (KeyError, IndexError):
            # Paper without an abstract / without a document vector: skip it.
            # Only lookup failures are ignored; any other error now surfaces.
            continue

    # NOTE: for an author with no usable paper, np.mean yields NaN (with a
    # RuntimeWarning) and np.sum yields the scalar 0.0 — unchanged from the
    # original behaviour.
    mean_dict_dv[key] = np.mean(emb, axis=0)
    sum_dict_dv[key] = np.sum(emb, axis=0)

    mean_dict_dbow[key] = np.mean(emb2, axis=0)
    sum_dict_dbow[key] = np.sum(emb2, axis=0)

## Import Node2Vec embeddings

In [None]:
# Load the Node2Vec node vectors stored in word2vec text format.
g_weighted_n2v = KeyedVectors.load_word2vec_format('Node2VecEmb/n2v_g_weighted.nodevectors')
g_sim_n2v = KeyedVectors.load_word2vec_format("Node2VecEmb/n2v_g_sim.nodevectors")

# Map every co-authorship node to its vector in g_sim_n2v, falling back to a zero
# vector for nodes that have no entry there.
g_sim_n2v_ = {}
for g in tqdm(G_coauthor.nodes()):
    try:
        # Keys in the KeyedVectors are strings, graph nodes are ints.
        g_sim_n2v_[g] = g_sim_n2v[str(g)]
    except KeyError:
        # Only a missing key is expected here; the original bare `except` also
        # hid every other error. The fallback length follows the loaded
        # vectors instead of a hard-coded 50.
        g_sim_n2v_[g] = np.zeros(g_sim_n2v.vector_size)

# Building Train, Test datasets

In [None]:
dim = 50
dim_2 = 100
# Number of scalar (non-embedding) columns at the start of each feature row.
N_SCALAR_FEATURES = 22


def build_feature_matrix(df, emb_dim=dim):
    """Build the feature matrix for the authors listed in ``df``.

    The train and test matrices were previously filled by two copy-pasted
    loops; both now go through this function so they cannot drift apart.

    Column layout (``s = N_SCALAR_FEATURES``, ``d = emb_dim``):
      0-5    co-authorship graph: core number, degree centrality, clustering,
             average neighbor degree, onion layer, pagerank
      6-9    G_sum_sim: pagerank, eigenvector centrality, triangles,
             average neighbor degree
      10-13  G_weighted: same four features
      14-16  co-author h-index range (max - min), sum and mean
      17-18  author's paper count, mean paper count of co-authors
      19-21  degree in G_coauthor, G_sum_sim, G_weighted
      s .. s+d        summed document vectors from model_dbow
      s+d .. s+2d     summed document vectors from model_dv
      s+2d .. s+3d    Node2Vec vector from g_weighted_n2v

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``author`` column.
    emb_dim : int
        Length of each of the three embedding blocks.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), 3 * emb_dim + N_SCALAR_FEATURES)``; row ``k``
        belongs to the ``k``-th row of ``df``.

    Raises
    ------
    KeyError
        If an author is missing from one of the feature dictionaries or from
        ``g_weighted_n2v``.
    """
    features = np.zeros((len(df), 3 * emb_dim + N_SCALAR_FEATURES))
    dbow_start = N_SCALAR_FEATURES
    dv_start = dbow_start + emb_dim
    n2v_start = dv_start + emb_dim

    # Iterate over the column directly: DataFrame.iterrows() upcasts every row to
    # a common dtype, turning the integer author ids into floats, and its index
    # label is not guaranteed to be the row position.
    for i, author in enumerate(tqdm(df['author'].to_numpy())):
        node = int(author)
        max_hind, min_hind, mean_hind, df_coauth_hindex = compute_mean_max_coauthor_hindex(G_coauthor, node)

        features[i, :N_SCALAR_FEATURES] = [
            co_a[node], co_b[node], co_c[node], co_d[node], co_e[node], co_f[node],
            ms_a[node], ms_b[node], ms_c[node], ms_d[node],
            w_a[node], w_b[node], w_c[node], w_d[node],
            max_hind - min_hind,
            np.sum(df_coauth_hindex),
            mean_hind,
            Author_paper_num[node],
            mean_co[node],
            G_coauthor.degree(node),
            G_sum_sim.degree(node),
            G_weighted.degree(node),
        ]
        features[i, dbow_start:dv_start] = sum_dict_dbow[node]
        features[i, dv_start:n2v_start] = sum_dict_dv[node]
        features[i, n2v_start:n2v_start + emb_dim] = g_weighted_n2v[str(node)]
    return features


X_train = build_feature_matrix(df_train)
# Target vector, stored as float64 like the zero-initialised array it replaces.
y_train = df_train['hindex'].to_numpy(dtype=np.float64)

X_test = build_feature_matrix(df_test)

In [None]:
# Persist the raw (unscaled) feature matrices and the target as CSV files.
for csv_name, array in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
):
    pd.DataFrame(array).to_csv(csv_name, index=False)

In [None]:
# Replace NaN/inf entries (e.g. authors without any document vector) by finite numbers.
X_train_nan = np.nan_to_num(X_train)
X_test_nan = np.nan_to_num(X_test)

# `scaler` was never created in this notebook, so the cell failed with a
# NameError on a fresh kernel.
scaler = StandardScaler()
# Fit the scaling statistics on the training data only ...
X_train_1 = scaler.fit_transform(X_train_nan)
# ... and reuse them for the test data. Calling fit_transform here would re-fit
# the scaler on the test set and put the two matrices on different scales.
X_test_1 = scaler.transform(X_test_nan)

# Hold out 25% of the labelled data for validation (fixed seed for reproducibility).
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_train_1, y_train, test_size=0.25, random_state=42)

In [None]:
# GridSearchCV helper pipeline (taken from an online example)
from sklearn.model_selection import GridSearchCV
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    """Run an exhaustive grid search and predict on ``X_test_data``.

    Parameters
    ----------
    X_train_data, y_train_data
        Data the grid search is fitted on.
    X_test_data
        Data to predict for with the fitted search object.
    y_test_data
        Accepted for signature compatibility; not used by this function.
    model
        Estimator handed to ``GridSearchCV``.
    param_grid
        Parameter grid handed to ``GridSearchCV``.
    cv
        Cross-validation setting handed to ``GridSearchCV``.
    scoring_fit
        Scoring handed to ``GridSearchCV``.
    do_probabilities : bool
        When true, call ``predict_proba`` instead of ``predict``.

    Returns
    -------
    tuple
        ``(fitted_model, pred)``: the fitted search object and its predictions.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring=scoring_fit,
        verbose=2,
    )
    fitted_model = search.fit(X_train_data, y_train_data)

    # Pick the prediction method once, then apply it.
    predict_fn = fitted_model.predict_proba if do_probabilities else fitted_model.predict
    return fitted_model, predict_fn(X_test_data)

In [None]:
%%time
from lightgbm import LGBMRegressor
model = LGBMRegressor(n_jobs=12)
param_grid = {
    'n_estimators': [7000, 8000, 9000, 10000],
    'max_depth': [10, 12, 14, 20],
    'num_leaves': [16, 24, 32],
    'learning_rate': [0.006, 0.008, 0.01, 0.012, 0.015, 0.02,0.03],
    'reg_lambda': [0.1, 0.3, 0.7]
}
from sklearn.model_selection import RandomizedSearchCV
random_cv = RandomizedSearchCV(
    model, param_grid, n_iter=5, cv=3, scoring="neg_mean_squared_error", n_jobs=-1
)


_ = random_cv.fit(X_train_,y_train_)
# model, pred = algorithm_pipeline(X_train_, X_test_, y_train_, y_test_, model, 
#                                  param_grid, cv=3, scoring_fit='accuracy')

print(random_cv.best_score_)
print(random_cv.best_params_)